From 031a0300f2c94e81598bcc08a9e6de7f10a18d7b Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Thu, 1 Sep 2022 18:07:04 +0200 Subject: [PATCH 0001/1406] ecryptfs: Replace kmap() with kmap_local_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The use of kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) it comes with an overhead, as the mapping space is restricted and protected by a global lock for synchronization, and (2) it requires global TLB invalidation when the kmap's pool wraps around, and it can block until a slot becomes available whenever the mapping space is fully utilized. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. Since kmap_local_page() is safe to use everywhere in fs/ecryptfs, it should be preferred. Therefore, replace kmap() with kmap_local_page() in fs/ecryptfs. Cc: "Venkataramanan, Anirudh" Suggested-by: Ira Weiny Reviewed-by: Ira Weiny Signed-off-by: Fabio M. De Francesco Signed-off-by: Tyler Hicks Link: https://lore.kernel.org/r/20220901160704.25701-1-fmdefrancesco@gmail.com --- fs/ecryptfs/crypto.c | 8 ++++---- fs/ecryptfs/read_write.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index e3f5d7f3c8a0ad..03263ebcccc6bd 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -465,10 +465,10 @@ int ecryptfs_encrypt_page(struct page *page) } lower_offset = lower_offset_for_page(crypt_stat, page); - enc_extent_virt = kmap(enc_extent_page); + enc_extent_virt = kmap_local_page(enc_extent_page); rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset, PAGE_SIZE); - kunmap(enc_extent_page); + kunmap_local(enc_extent_virt); if (rc < 0) { ecryptfs_printk(KERN_ERR, "Error attempting to write lower page; rc = [%d]\n", @@ -514,10 +514,10 @@ int ecryptfs_decrypt_page(struct page *page) BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); lower_offset = lower_offset_for_page(crypt_stat, page); - page_virt = kmap(page); + page_virt = kmap_local_page(page); rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_SIZE, ecryptfs_inode); - kunmap(page); + kunmap_local(page_virt); if (rc < 0) { ecryptfs_printk(KERN_ERR, "Error attempting to read lower page; rc = [%d]\n", diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 60bdcaddcbe57e..5edf027c835906 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -64,11 +64,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, offset = ((((loff_t)page_for_lower->index) << PAGE_SHIFT) + offset_in_page); - virt = kmap(page_for_lower); + virt = kmap_local_page(page_for_lower); rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size); if (rc > 0) rc = 0; - kunmap(page_for_lower); + kunmap_local(virt); return rc; } @@ -253,11 +253,11 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, int rc; offset = ((((loff_t)page_index) << PAGE_SHIFT) + offset_in_page); - virt = kmap(page_for_ecryptfs); + virt = kmap_local_page(page_for_ecryptfs); rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode); if (rc > 0) rc = 0; - kunmap(page_for_ecryptfs); + kunmap_local(virt);
flush_dcache_page(page_for_ecryptfs); return rc; } From c1cc2db216078f9b1e29c991b1b9177c26757162 Mon Sep 17 00:00:00 2001 From: Slark Xiao Date: Fri, 22 Jul 2022 18:02:12 +0800 Subject: [PATCH 0002/1406] ecryptfs: keystore: Fix typo 'the the' in comment Replace 'the the' with 'the' in the comment. Signed-off-by: Slark Xiao Signed-off-by: Tyler Hicks Link: https://lore.kernel.org/r/20220722100212.79490-1-slark_xiao@163.com --- fs/ecryptfs/keystore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 3fe41964c0d8d9..2452d6fd7062d7 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -878,7 +878,7 @@ struct ecryptfs_parse_tag_70_packet_silly_stack { * @filename: This function kmalloc's the memory for the filename * @filename_size: This function sets this to the amount of memory * kmalloc'd for the filename - * @packet_size: This function sets this to the the number of octets + * @packet_size: This function sets this to the number of octets * in the packet parsed * @mount_crypt_stat: The mount-wide cryptographic context * @data: The memory location containing the start of the tag 70 From a3d78fe3e1ae8c6a1901635c54a1a799656f72c8 Mon Sep 17 00:00:00 2001 From: Zipeng Zhang Date: Mon, 20 Mar 2023 10:04:28 +0800 Subject: [PATCH 0003/1406] fs: ecryptfs: comment typo fix Comment typo fix "vitual" -> "virtual". Signed-off-by: Zipeng Zhang Signed-off-by: Tyler Hicks --- fs/ecryptfs/crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 03263ebcccc6bd..c64985bf8c9e34 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1313,7 +1313,7 @@ static int ecryptfs_read_headers_virt(char *page_virt, /** * ecryptfs_read_xattr_region - * @page_virt: The vitual address into which to read the xattr data + * @page_virt: The virtual address into which to read the xattr data * @ecryptfs_inode: The eCryptfs inode * * Attempts to read the crypto metadata from the extended attribute From 1d1472185a8821a14dd796ce3fa6375ed5a9f519 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Thu, 31 Aug 2023 20:39:27 +0800 Subject: [PATCH 0004/1406] ntb: intel: Fix the NULL vs IS_ERR() bug for debugfs_create_dir() The debugfs_create_dir() function returns error pointers. It never returns NULL. So use IS_ERR() to check it. Fixes: e26a5843f7f5 ("NTB: Split ntb_hw_intel and ntb_transport drivers") Signed-off-by: Jinjie Ruan Reviewed-by: Dave Jiang Signed-off-by: Jon Mason --- drivers/ntb/hw/intel/ntb_hw_gen1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ntb/hw/intel/ntb_hw_gen1.c b/drivers/ntb/hw/intel/ntb_hw_gen1.c index 9ab836d0d4f12d..079b8cd7978573 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen1.c +++ b/drivers/ntb/hw/intel/ntb_hw_gen1.c @@ -778,7 +778,7 @@ static void ndev_init_debugfs(struct intel_ntb_dev *ndev) ndev->debugfs_dir = debugfs_create_dir(pci_name(ndev->ntb.pdev), debugfs_dir); - if (!ndev->debugfs_dir) + if (IS_ERR(ndev->debugfs_dir)) ndev->debugfs_info = NULL; else ndev->debugfs_info = From 9341b37ec17a8793e8439e9b18354ba69556b786 Mon Sep 17 00:00:00 2001 From: Max Hawking Date: Sun, 8 Oct 2023 20:45:16 -0700 Subject: [PATCH 0005/1406] ntb_perf: Fix printk format The correct printk format is %pa or %pap, but not %pa[p]. 
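For illustration, a minimal sketch of the difference (show_phys_addr() is a hypothetical helper, not code from this driver): the printk %pa family takes a pointer to a phys_addr_t, and the brackets in the documentation's "%pa[p]" notation only mark the 'p' suffix as optional, so "[p]" must not appear literally in a format string.

    #include <linux/printk.h>
    #include <linux/types.h>

    static void show_phys_addr(phys_addr_t addr)
    {
        pr_info("addr %pa\n", &addr);    /* OK: pointer to phys_addr_t */
        pr_info("addr %pap\n", &addr);   /* OK: explicit phys_addr_t form */
        pr_info("addr %pa[p]\n", &addr); /* wrong: prints a literal "[p]" */
    }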
Fixes: 99a06056124d ("NTB: ntb_perf: Fix address err in perf_copy_chunk") Signed-off-by: Max Hawking Signed-off-by: Jon Mason --- drivers/ntb/test/ntb_perf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c index 553f1f46bc664f..72bc1d017a46ee 100644 --- a/drivers/ntb/test/ntb_perf.c +++ b/drivers/ntb/test/ntb_perf.c @@ -1227,7 +1227,7 @@ static ssize_t perf_dbgfs_read_info(struct file *filep, char __user *ubuf, "\tOut buffer addr 0x%pK\n", peer->outbuf); pos += scnprintf(buf + pos, buf_size - pos, - "\tOut buff phys addr %pa[p]\n", &peer->out_phys_addr); + "\tOut buff phys addr %pap\n", &peer->out_phys_addr); pos += scnprintf(buf + pos, buf_size - pos, "\tOut buffer size %pa\n", &peer->outbuf_size); From 4425c1d9b44ded655d2668e1ce95a62bccf7b21b Mon Sep 17 00:00:00 2001 From: Seamus Connor Date: Fri, 13 Oct 2023 14:11:29 -0700 Subject: [PATCH 0006/1406] configfs: improve item creation performance As the size of a directory increases, item creation slows down. Optimizing access to s_children removes this bottleneck. Since dirents are already pinned into the cache, there is no need to scan the s_children list looking for duplicate items. The configfs_dirent_exists check is moved to a location where it is called only during subsystem initialization. d_lookup will only need to call configfs_lookup in the case where the item in question is not pinned to the dcache. The only items not pinned to the dcache are attributes. These are placed at the front of the s_children list, whilst pinned items are inserted at the back. configfs_lookup stops scanning when it encounters the first pinned entry in s_children. The assumption behind these optimizations is that a given directory will hold few attributes, but potentially many items.
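Condensed from the hunks below, the core of the change is a partitioned insert plus an early-exit scan (a simplified sketch; the real code runs under configfs_dirent_lock and also keeps readdir cursors at the front of the list):

    /* insert: unpinned entries (attributes) go to the front,
     * pinned entries (directories, links) to the back */
    if (sd->s_type & CONFIGFS_PINNED)
        list_add_tail(&sd->s_sibling, &parent_sd->s_children);
    else
        list_add(&sd->s_sibling, &parent_sd->s_children);

    /* lookup: only unpinned entries can match, so the first
     * pinned entry terminates the scan */
    list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
        if (sd->s_type & CONFIGFS_PINNED)
            break;
        /* ... compare unpinned entry names here ... */
    }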
Signed-off-by: Seamus Connor Reviewed-by: Joel Becker Signed-off-by: Christoph Hellwig --- fs/configfs/configfs_internal.h | 4 ++-- fs/configfs/dir.c | 42 +++++++++++++++++++++++++-------- fs/configfs/inode.c | 24 ------------------- 3 files changed, 34 insertions(+), 36 deletions(-) diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index e710a1782382a2..0b969d0eb8ff98 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -55,6 +55,8 @@ struct configfs_dirent { #define CONFIGFS_USET_IN_MKDIR 0x0200 #define CONFIGFS_USET_CREATING 0x0400 #define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR | CONFIGFS_ITEM_BIN_ATTR) +#define CONFIGFS_PINNED \ + (CONFIGFS_ROOT | CONFIGFS_DIR | CONFIGFS_ITEM_LINK) extern struct mutex configfs_symlink_mutex; extern spinlock_t configfs_dirent_lock; @@ -73,8 +75,6 @@ extern int configfs_make_dirent(struct configfs_dirent *, struct dentry *, void *, umode_t, int, struct configfs_fragment *); extern int configfs_dirent_is_ready(struct configfs_dirent *); -extern void configfs_hash_and_remove(struct dentry * dir, const char * name); - extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent); extern int configfs_setattr(struct mnt_idmap *idmap, diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 18677cd4e62f54..7d6cd4b366d5a0 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -207,7 +207,17 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *paren return ERR_PTR(-ENOENT); } sd->s_frag = get_fragment(frag); - list_add(&sd->s_sibling, &parent_sd->s_children); + + /* + * configfs_lookup scans only for unpinned items. s_children is + * partitioned so that configfs_lookup can bail out early. + * CONFIGFS_PINNED and CONFIGFS_NOT_PINNED are not symmetrical. readdir + * cursors still need to be inserted at the front of the list. + */ + if (sd->s_type & CONFIGFS_PINNED) + list_add_tail(&sd->s_sibling, &parent_sd->s_children); + else + list_add(&sd->s_sibling, &parent_sd->s_children); spin_unlock(&configfs_dirent_lock); return sd; @@ -220,10 +230,11 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *paren * * called with parent inode's i_mutex held */ -static int configfs_dirent_exists(struct configfs_dirent *parent_sd, - const unsigned char *new) +static int configfs_dirent_exists(struct dentry *dentry) { - struct configfs_dirent * sd; + struct configfs_dirent *parent_sd = dentry->d_parent->d_fsdata; + const unsigned char *new = dentry->d_name.name; + struct configfs_dirent *sd; list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (sd->s_element) { @@ -289,10 +300,6 @@ static int configfs_create_dir(struct config_item *item, struct dentry *dentry, BUG_ON(!item); - error = configfs_dirent_exists(p->d_fsdata, dentry->d_name.name); - if (unlikely(error)) - return error; - error = configfs_make_dirent(p->d_fsdata, dentry, item, mode, CONFIGFS_DIR | CONFIGFS_USET_CREATING, frag); @@ -451,6 +458,18 @@ static struct dentry * configfs_lookup(struct inode *dir, spin_lock(&configfs_dirent_lock); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + + /* + * s_children is partitioned, see configfs_new_dirent. The first + * pinned item indicates we can stop scanning. + */ + if (sd->s_type & CONFIGFS_PINNED) + break; + + /* + * Note: CONFIGFS_PINNED and CONFIGFS_NOT_PINNED are asymmetric. 
+ * there may be a readdir cursor in this list + */ if ((sd->s_type & CONFIGFS_NOT_PINNED) && !strcmp(configfs_get_name(sd), dentry->d_name.name)) { struct configfs_attribute *attr = sd->s_element; @@ -1875,8 +1894,11 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) if (dentry) { d_add(dentry, NULL); - err = configfs_attach_group(sd->s_element, &group->cg_item, - dentry, frag); + err = configfs_dirent_exists(dentry); + if (!err) + err = configfs_attach_group(sd->s_element, + &group->cg_item, + dentry, frag); if (err) { BUG_ON(d_inode(dentry)); d_drop(dentry); diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index fbdcb3582926a7..9a133fa147f295 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -217,27 +217,3 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) } } -void configfs_hash_and_remove(struct dentry * dir, const char * name) -{ - struct configfs_dirent * sd; - struct configfs_dirent * parent_sd = dir->d_fsdata; - - if (d_really_is_negative(dir)) - /* no inode means this hasn't been made visible yet */ - return; - - inode_lock(d_inode(dir)); - list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { - if (!sd->s_element) - continue; - if (!strcmp(configfs_get_name(sd), name)) { - spin_lock(&configfs_dirent_lock); - list_del_init(&sd->s_sibling); - spin_unlock(&configfs_dirent_lock); - configfs_drop_dentry(sd, dir); - configfs_put(sd); - break; - } - } - inode_unlock(d_inode(dir)); -} From 98988fc8e9edc4da5825b3dd5544880c210a8738 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Thu, 16 Nov 2023 12:52:21 -0800 Subject: [PATCH 0007/1406] zstd: import upstream v1.5.5 Import upstream zstd v1.5.5 to expose upstream's QAT integration. Import from upstream commit 58b3ef79 [0]. This is one commit before the tag v1.5.5-kernel [1], which is signed with upstream's signing key. The next patch in the series imports from v1.5.5-kernel itself; it is kept as a separate patch, rather than folded into this import, because it is a non-trivial change applied to improve the kernel's decompression speed. This commit contains 3 backported patches on top of v1.5.5: two from the Linux copy of zstd, and one from upstream's `dev` branch. In addition to keeping the kernel's copy of zstd up to date, this update was requested by Intel to expose upstream zstd's external match provider API to the kernel, which allows QAT to accelerate the LZ match finding stage. This commit was generated by:

  export ZSTD=/path/to/repo/zstd/
  export LINUX=/path/to/repo/linux/
  cd "$ZSTD/contrib/linux-kernel"
  git checkout v1.5.5-kernel~
  make import LINUX="$LINUX"

I tested and benchmarked this commit on x86-64 with gcc-13.2.1 on an Intel i9-9900K by running my benchmark scripts that measure zstd's performance in btrfs and squashfs compressed filesystems. This commit improves compression speed, especially at higher compression levels, and regresses decompression speed, but the decompression speed regression is addressed by the next patch in the series.

  Component, Level, C. time delta, size delta, D. time delta
  Btrfs    ,     1,         -1.9%,      +0.0%,         +9.5%
  Btrfs    ,     3,         -5.6%,      +0.0%,         +7.4%
  Btrfs    ,     5,         -4.9%,      +0.0%,         +5.0%
  Btrfs    ,     7,         -5.7%,      +0.0%,         +5.2%
  Btrfs    ,     9,         -5.7%,      +0.0%,         +4.0%
  Squashfs ,     1,           N/A,       0.0%,        +11.6%

I also boot tested with a zstd compressed kernel on i386 and aarch64.
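For context, the external match provider hook that QAT plugs into is upstream's block-level sequence producer API. A rough sketch of its shape, based on the v1.5.5 experimental header (qat_produce_sequences is a made-up name here; check zstd.h for the exact signature before relying on it):

  /* A producer fills outSeqs with (offset, litLength, matchLength)
   * sequences for one block and returns how many it wrote, or returns
   * ZSTD_SEQUENCE_PRODUCER_ERROR to make libzstd fall back to its
   * internal match finder (see ZSTD_c_enableSeqProducerFallback). */
  static size_t qat_produce_sequences(void *state,
                      ZSTD_Sequence *outSeqs, size_t outSeqsCapacity,
                      const void *src, size_t srcSize,
                      const void *dict, size_t dictSize,
                      int compressionLevel, size_t windowSize)
  {
      /* hardware-accelerated LZ match finding would go here */
      return ZSTD_SEQUENCE_PRODUCER_ERROR; /* always fall back in this stub */
  }

  /* registration:
   * ZSTD_registerSequenceProducer(cctx, state, qat_produce_sequences); */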
Link: https://github.com/facebook/zstd/commit/58b3ef79eb9f1e6613684ea6e5b89720660ee8b6 Link: https://github.com/facebook/zstd/tree/v1.5.5-kernel Signed-off-by: Nick Terrell --- include/linux/zstd.h | 2 +- include/linux/zstd_errors.h | 23 +- include/linux/zstd_lib.h | 697 +++++-- lib/zstd/Makefile | 2 +- lib/zstd/common/allocations.h | 56 + lib/zstd/common/bits.h | 149 ++ lib/zstd/common/bitstream.h | 53 +- lib/zstd/common/compiler.h | 14 +- lib/zstd/common/cpu.h | 3 +- lib/zstd/common/debug.c | 3 +- lib/zstd/common/debug.h | 3 +- lib/zstd/common/entropy_common.c | 42 +- lib/zstd/common/error_private.c | 12 +- lib/zstd/common/error_private.h | 3 +- lib/zstd/common/fse.h | 89 +- lib/zstd/common/fse_decompress.c | 94 +- lib/zstd/common/huf.h | 222 +-- lib/zstd/common/mem.h | 2 +- lib/zstd/common/portability_macros.h | 26 +- lib/zstd/common/zstd_common.c | 38 +- lib/zstd/common/zstd_deps.h | 16 +- lib/zstd/common/zstd_internal.h | 99 +- lib/zstd/compress/clevels.h | 3 +- lib/zstd/compress/fse_compress.c | 59 +- lib/zstd/compress/hist.c | 3 +- lib/zstd/compress/hist.h | 3 +- lib/zstd/compress/huf_compress.c | 372 ++-- lib/zstd/compress/zstd_compress.c | 1762 ++++++++++++----- lib/zstd/compress/zstd_compress_internal.h | 333 +++- lib/zstd/compress/zstd_compress_literals.c | 155 +- lib/zstd/compress/zstd_compress_literals.h | 25 +- lib/zstd/compress/zstd_compress_sequences.c | 7 +- lib/zstd/compress/zstd_compress_sequences.h | 3 +- lib/zstd/compress/zstd_compress_superblock.c | 47 +- lib/zstd/compress/zstd_compress_superblock.h | 3 +- lib/zstd/compress/zstd_cwksp.h | 149 +- lib/zstd/compress/zstd_double_fast.c | 129 +- lib/zstd/compress/zstd_double_fast.h | 6 +- lib/zstd/compress/zstd_fast.c | 582 ++++-- lib/zstd/compress/zstd_fast.h | 6 +- lib/zstd/compress/zstd_lazy.c | 518 ++--- lib/zstd/compress/zstd_lazy.h | 7 +- lib/zstd/compress/zstd_ldm.c | 11 +- lib/zstd/compress/zstd_ldm.h | 3 +- lib/zstd/compress/zstd_ldm_geartab.h | 3 +- lib/zstd/compress/zstd_opt.c | 187 +- lib/zstd/compress/zstd_opt.h | 3 +- lib/zstd/decompress/huf_decompress.c | 731 ++++--- lib/zstd/decompress/zstd_ddict.c | 9 +- lib/zstd/decompress/zstd_ddict.h | 3 +- lib/zstd/decompress/zstd_decompress.c | 261 ++- lib/zstd/decompress/zstd_decompress_block.c | 283 ++- lib/zstd/decompress/zstd_decompress_block.h | 8 +- .../decompress/zstd_decompress_internal.h | 7 +- lib/zstd/decompress_sources.h | 2 +- lib/zstd/zstd_common_module.c | 5 +- lib/zstd/zstd_compress_module.c | 2 +- lib/zstd/zstd_decompress_module.c | 4 +- 58 files changed, 4748 insertions(+), 2594 deletions(-) create mode 100644 lib/zstd/common/allocations.h create mode 100644 lib/zstd/common/bits.h diff --git a/include/linux/zstd.h b/include/linux/zstd.h index 113408eef6ecef..f109d49f43f80c 100644 --- a/include/linux/zstd.h +++ b/include/linux/zstd.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h index 58b6dd45a969f7..6d5cf55f0bf3e9 100644 --- a/include/linux/zstd_errors.h +++ b/include/linux/zstd_errors.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -17,8 +18,17 @@ /* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ -#define ZSTDERRORLIB_VISIBILITY -#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY +#define ZSTDERRORLIB_VISIBLE + +#ifndef ZSTDERRORLIB_HIDDEN +# if (__GNUC__ >= 4) && !defined(__MINGW32__) +# define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden"))) +# else +# define ZSTDERRORLIB_HIDDEN +# endif +#endif + +#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE /*-********************************************* * Error codes list @@ -43,14 +53,17 @@ typedef enum { ZSTD_error_frameParameter_windowTooLarge = 16, ZSTD_error_corruption_detected = 20, ZSTD_error_checksum_wrong = 22, + ZSTD_error_literals_headerWrong = 24, ZSTD_error_dictionary_corrupted = 30, ZSTD_error_dictionary_wrong = 32, ZSTD_error_dictionaryCreation_failed = 34, ZSTD_error_parameter_unsupported = 40, + ZSTD_error_parameter_combination_unsupported = 41, ZSTD_error_parameter_outOfBound = 42, ZSTD_error_tableLog_tooLarge = 44, ZSTD_error_maxSymbolValue_tooLarge = 46, ZSTD_error_maxSymbolValue_tooSmall = 48, + ZSTD_error_stabilityCondition_notRespected = 50, ZSTD_error_stage_wrong = 60, ZSTD_error_init_missing = 62, ZSTD_error_memory_allocation = 64, @@ -58,11 +71,15 @@ typedef enum { ZSTD_error_dstSize_tooSmall = 70, ZSTD_error_srcSize_wrong = 72, ZSTD_error_dstBuffer_null = 74, + ZSTD_error_noForwardProgress_destFull = 80, + ZSTD_error_noForwardProgress_inputEmpty = 82, /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ ZSTD_error_frameIndex_tooLarge = 100, ZSTD_error_seekableIO = 102, ZSTD_error_dstBuffer_wrong = 104, ZSTD_error_srcBuffer_wrong = 105, + ZSTD_error_sequenceProducer_failed = 106, + ZSTD_error_externalSequences_invalid = 107, ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ } ZSTD_ErrorCode; diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h index 79d55465d5c1d6..8b4ffe649df575 100644 --- a/include/linux/zstd_lib.h +++ b/include/linux/zstd_lib.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -11,23 +12,42 @@ #ifndef ZSTD_H_235446 #define ZSTD_H_235446 -/* ====== Dependency ======*/ +/* ====== Dependencies ======*/ #include /* INT_MAX */ #include /* size_t */ /* ===== ZSTDLIB_API : control library symbols visibility ===== */ -#ifndef ZSTDLIB_VISIBLE +#define ZSTDLIB_VISIBLE + +#ifndef ZSTDLIB_HIDDEN # if (__GNUC__ >= 4) && !defined(__MINGW32__) -# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) # define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) # else -# define ZSTDLIB_VISIBLE # define ZSTDLIB_HIDDEN # endif #endif + #define ZSTDLIB_API ZSTDLIB_VISIBLE +/* Deprecation warnings : + * Should these warnings be a problem, it is generally possible to disable them, + * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. + * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. 
+ */ +#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS +# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */ +#else +# if (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) || defined(__clang__) +# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message))) +# elif (__GNUC__ >= 3) +# define ZSTD_DEPRECATED(message) __attribute__((deprecated)) +# else +# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") +# define ZSTD_DEPRECATED(message) +# endif +#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ + /* ***************************************************************************** Introduction @@ -65,7 +85,7 @@ /*------ Version ------*/ #define ZSTD_VERSION_MAJOR 1 #define ZSTD_VERSION_MINOR 5 -#define ZSTD_VERSION_RELEASE 2 +#define ZSTD_VERSION_RELEASE 5 #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) /*! ZSTD_versionNumber() : @@ -107,7 +127,8 @@ ZSTDLIB_API const char* ZSTD_versionString(void); ***************************************/ /*! ZSTD_compress() : * Compresses `src` content as a single zstd compressed frame into already allocated `dst`. - * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. + * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have + * enough space to successfully compress the data. * @return : compressed size written into `dst` (<= `dstCapacity), * or an error code if it fails (which can be tested using ZSTD_isError()). */ ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, @@ -156,7 +177,9 @@ ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t * "empty", "unknown" and "error" results to the same return value (0), * while ZSTD_getFrameContentSize() gives them separate return values. * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ -ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); +ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize") +ZSTDLIB_API +unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ * `src` should point to the start of a ZSTD frame or skippable frame. @@ -168,8 +191,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize) /*====== Helper functions ======*/ -#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ -ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ +/* ZSTD_compressBound() : + * maximum compressed size in worst case single-pass scenario. + * When invoking `ZSTD_compress()` or any other one-pass compression function, + * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize) + * as it eliminates one potential failure scenario, + * aka not enough room in dst buffer to write the compressed frame. + * Note : ZSTD_compressBound() itself can fail, if @srcSize > ZSTD_MAX_INPUT_SIZE . + * In which case, ZSTD_compressBound() will return an error code + * which can be tested using ZSTD_isError(). + * + * ZSTD_COMPRESSBOUND() : + * same as ZSTD_compressBound(), but as a macro.
+ * It can be used to produce constants, which can be useful for static allocation, + * for example to size a static array on stack. + * Will produce constant value 0 if srcSize too large. + */ +#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00LLU : 0xFF00FF00U) +#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ +/* ZSTD_isError() : + * Most ZSTD_* functions returning a size_t value can be tested for error, + * using ZSTD_isError(). + * @return 1 if error, 0 otherwise + */ ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ @@ -412,6 +457,9 @@ typedef enum { * ZSTD_c_validateSequences * ZSTD_c_useBlockSplitter * ZSTD_c_useRowMatchFinder + * ZSTD_c_prefetchCDictTables + * ZSTD_c_enableSeqProducerFallback + * ZSTD_c_maxBlockSize * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. * note : never ever use experimentalParam? names directly; * also, the enums values themselves are unstable and can still change. @@ -430,7 +478,11 @@ typedef enum { ZSTD_c_experimentalParam12=1009, ZSTD_c_experimentalParam13=1010, ZSTD_c_experimentalParam14=1011, - ZSTD_c_experimentalParam15=1012 + ZSTD_c_experimentalParam15=1012, + ZSTD_c_experimentalParam16=1013, + ZSTD_c_experimentalParam17=1014, + ZSTD_c_experimentalParam18=1015, + ZSTD_c_experimentalParam19=1016 } ZSTD_cParameter; typedef struct { @@ -493,7 +545,7 @@ typedef enum { * They will be used to compress next frame. * Resetting session never fails. * - The parameters : changes all parameters back to "default". - * This removes any reference to any dictionary too. + * This also removes any reference to any dictionary or external sequence producer. * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) * - Both : similar to resetting the session, followed by resetting parameters. @@ -506,7 +558,8 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() * - The function is always blocking, returns when compression is completed. - * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. + * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have + * enough space to successfully compress the data, though it is possible it fails for other reasons. * @return : compressed size written into `dst` (<= `dstCapacity), * or an error code if it fails (which can be tested using ZSTD_isError()). 
*/ @@ -543,13 +596,15 @@ typedef enum { * ZSTD_d_stableOutBuffer * ZSTD_d_forceIgnoreChecksum * ZSTD_d_refMultipleDDicts + * ZSTD_d_disableHuffmanAssembly * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. * note : never ever use experimentalParam? names directly */ ZSTD_d_experimentalParam1=1000, ZSTD_d_experimentalParam2=1001, ZSTD_d_experimentalParam3=1002, - ZSTD_d_experimentalParam4=1003 + ZSTD_d_experimentalParam4=1003, + ZSTD_d_experimentalParam5=1004 } ZSTD_dParameter; @@ -728,8 +783,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output * This following is a legacy streaming API, available since v1.0+ . * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). * It is redundant, but remains fully supported. - * Streaming in combination with advanced parameters and dictionary compression - * can only be used through the new API. ******************************************************************************/ /*! @@ -738,6 +791,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); + * + * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API + * to compress with a dictionary. */ ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); /*! @@ -788,13 +844,31 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer /*===== Streaming decompression functions =====*/ -/* This function is redundant with the advanced API and equivalent to: +/*! ZSTD_initDStream() : + * Initialize/reset DStream state for new decompression operation. + * Call before new decompression operation using same DStream. * + * Note : This function is redundant with the advanced API and equivalent to: * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); * ZSTD_DCtx_refDDict(zds, NULL); */ ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); +/*! ZSTD_decompressStream() : + * Streaming decompression function. + * Call repetitively to consume full input updating it as necessary. + * Function will update both input and output `pos` fields exposing current state via these fields: + * - `input.pos < input.size`, some input remaining and caller should provide remaining input + * on the next call. + * - `output.pos < output.size`, decoder finished and flushed all remaining buffers. + * - `output.pos == output.size`, potentially uncflushed data present in the internal buffers, + * call ZSTD_decompressStream() again to flush remaining data to output. + * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX. + * + * @return : 0 when a frame is completely decoded and fully flushed, + * or an error code, which can be tested using ZSTD_isError(), + * or any other value > 0, which means there is some decoding or flushing to do to complete current frame. + */ ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ @@ -913,7 +987,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); * If @return == 0, the dictID could not be decoded. * This could for one of the following reasons : * - The frame does not require a dictionary to be decoded (most common case). 
- * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information. + * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information. * Note : this use case also happens when using a non-conformant dictionary. * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). * - This is not a Zstandard frame. @@ -925,9 +999,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); * Advanced dictionary and prefix API (Requires v1.4.0+) * * This API allows dictionaries to be used with ZSTD_compress2(), - * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and - * only reset with the context is reset with ZSTD_reset_parameters or - * ZSTD_reset_session_and_parameters. Prefixes are single-use. + * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). + * Dictionaries are sticky, they remain valid when same context is re-used, + * they only reset when the context is reset + * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters. + * In contrast, Prefixes are single-use. ******************************************************************************/ @@ -937,8 +1013,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); * @result : 0, or an error code (which can be tested with ZSTD_isError()). * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, * meaning "return to no-dictionary mode". - * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. - * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). + * Note 1 : Dictionary is sticky, it will be used for all future compressed frames, + * until parameters are reset, a new dictionary is loaded, or the dictionary + * is explicitly invalidated by loading a NULL dictionary. * Note 2 : Loading a dictionary involves building tables. * It's also a CPU consuming operation, with non-negligible impact on latency. * Tables are dependent on compression parameters, and for this reason, @@ -947,11 +1024,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. * In such a case, dictionary buffer must outlive its users. * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() - * to precisely select how dictionary content must be interpreted. */ + * to precisely select how dictionary content must be interpreted. + * Note 5 : This method does not benefit from LDM (long distance mode). + * If you want to employ LDM on some large dictionary content, + * prefer employing ZSTD_CCtx_refPrefix() described below. + */ ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ - * Reference a prepared dictionary, to be used for all next compressed frames. + * Reference a prepared dictionary, to be used for all future compressed frames. * Note that compression parameters are enforced from within CDict, * and supersede any compression parameter previously set within CCtx. * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. 
@@ -970,6 +1051,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); * Decompression will need same prefix to properly regenerate data. * Compressing with a prefix is similar in outcome as performing a diff and compressing it, * but performs much faster, especially during decompression (compression speed is tunable with compression level). + * This method is compatible with LDM (long distance mode). * @result : 0, or an error code (which can be tested with ZSTD_isError()). * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary * Note 1 : Prefix buffer is referenced. It **must** outlive compression. @@ -986,9 +1068,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize); /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ - * Create an internal DDict from dict buffer, - * to be used to decompress next frames. - * The dictionary remains valid for all future frames, until explicitly invalidated. + * Create an internal DDict from dict buffer, to be used to decompress all future frames. + * The dictionary remains valid for all future frames, until explicitly invalidated, or + * a new dictionary is loaded. * @result : 0, or an error code (which can be tested with ZSTD_isError()). * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, * meaning "return to no-dictionary mode". @@ -1012,9 +1094,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s * The memory for the table is allocated on the first call to refDDict, and can be * freed with ZSTD_freeDCtx(). * + * If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary + * will be managed, and referencing a dictionary effectively "discards" any previous one. + * * @result : 0, or an error code (which can be tested with ZSTD_isError()). - * Note 1 : Currently, only one dictionary can be managed. - * Referencing a new dictionary effectively "discards" any previous one. * Special: referencing a NULL DDict means "return to no-dictionary mode". * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. */ @@ -1071,24 +1154,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE #endif -/* Deprecation warnings : - * Should these warnings be a problem, it is generally possible to disable them, - * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. - * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. 
- */ -#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS -# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecation warnings */ -#else -# if (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) || defined(__clang__) -# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message))) -# elif (__GNUC__ >= 3) -# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated)) -# else -# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") -# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API -# endif -#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ - /* ************************************************************************************** * experimental API (static linking only) **************************************************************************************** @@ -1123,6 +1188,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ #define ZSTD_STRATEGY_MIN ZSTD_fast #define ZSTD_STRATEGY_MAX ZSTD_btultra2 +#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */ #define ZSTD_OVERLAPLOG_MIN 0 @@ -1303,7 +1369,7 @@ typedef enum { } ZSTD_paramSwitch_e; /* ************************************* -* Frame size functions +* Frame header and size functions ***************************************/ /*! ZSTD_findDecompressedSize() : @@ -1350,29 +1416,109 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size * or an error code (if srcSize is too small) */ ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); +typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; +typedef struct { + unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ + unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ + unsigned blockSizeMax; + ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ + unsigned headerSize; + unsigned dictID; + unsigned checksumFlag; + unsigned _reserved1; + unsigned _reserved2; +} ZSTD_frameHeader; + +/*! ZSTD_getFrameHeader() : + * decode Frame Header, or requires larger `srcSize`. + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, + * or an error code, which can be tested using ZSTD_isError() */ +ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ +/*! ZSTD_getFrameHeader_advanced() : + * same as ZSTD_getFrameHeader(), + * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); + +/*! ZSTD_decompressionMargin() : + * Zstd supports in-place decompression, where the input and output buffers overlap. + * In this case, the output buffer must be at least (Margin + Output_Size) bytes large, + * and the input buffer must be at the end of the output buffer.
+ * + * _______________________ Output Buffer ________________________ + * | | + * | ____ Input Buffer ____| + * | | | + * v v v + * |---------------------------------------|-----------|----------| + * ^ ^ ^ + * |___________________ Output_Size ___________________|_ Margin _| + * + * NOTE: See also ZSTD_DECOMPRESSION_MARGIN(). + * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or + * ZSTD_decompressDCtx(). + * NOTE: This function supports multi-frame input. + * + * @param src The compressed frame(s) + * @param srcSize The size of the compressed frame(s) + * @returns The decompression margin or an error that can be checked with ZSTD_isError(). + */ +ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize); + +/*! ZSTD_DECOMPRESS_MARGIN() : + * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from + * the compressed frame, compute it from the original size and the blockSizeLog. + * See ZSTD_decompressionMargin() for details. + * + * WARNING: This macro does not support multi-frame input, the input must be a single + * zstd frame. If you need that support use the function, or implement it yourself. + * + * @param originalSize The original uncompressed size of the data. + * @param blockSize The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX). + * Unless you explicitly set the windowLog smaller than + * ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX. + */ +#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)( \ + ZSTD_FRAMEHEADERSIZE_MAX /* Frame header */ + \ + 4 /* checksum */ + \ + ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \ + (blockSize) /* One block of margin */ \ + )) + typedef enum { ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */ } ZSTD_sequenceFormat_e; +/*! ZSTD_sequenceBound() : + * `srcSize` : size of the input buffer + * @return : upper-bound for the number of sequences that can be generated + * from a buffer of srcSize bytes + * + * note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence). + */ +ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize); + /*! ZSTD_generateSequences() : - * Generate sequences using ZSTD_compress2, given a source buffer. + * Generate sequences using ZSTD_compress2(), given a source buffer. * * Each block will end with a dummy sequence * with offset == 0, matchLength == 0, and litLength == length of last literals. * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0) * simply acts as a block delimiter. * - * zc can be used to insert custom compression params. - * This function invokes ZSTD_compress2 + * @zc can be used to insert custom compression params. + * This function invokes ZSTD_compress2(). * * The output of this function can be fed into ZSTD_compressSequences() with CCtx * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters * @return : number of sequences generated */ -ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, - size_t outSeqsSize, const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t +ZSTD_generateSequences( ZSTD_CCtx* zc, + ZSTD_Sequence* outSeqs, size_t outSeqsSize, + const void* src, size_t srcSize); /*! 
ZSTD_mergeBlockDelimiters() : * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals @@ -1388,7 +1534,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); /*! ZSTD_compressSequences() : - * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. + * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst. + * @src contains the entire input (not just the literals). + * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) * The entire source is compressed into a single frame. * @@ -1413,11 +1561,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, * and cannot emit an RLE block that disagrees with the repcode history - * @return : final compressed size or a ZSTD error. + * @return : final compressed size, or a ZSTD error code. */ -ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, - const ZSTD_Sequence* inSeqs, size_t inSeqsSize, - const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t +ZSTD_compressSequences( ZSTD_CCtx* cctx, void* dst, size_t dstSize, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize); /*! ZSTD_writeSkippableFrame() : @@ -1481,8 +1630,11 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. * - * Note 2 : only single-threaded compression is supported. + * Note : only single-threaded compression is supported. * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. + * + * Note 2 : ZSTD_estimateCCtxSize* functions are not compatible with the Block-Level Sequence Producer API at this time. + * Size estimates assume that no external sequence producer is registered. */ ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); @@ -1501,7 +1653,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), * an internal ?Dict will be created, which additional size is not estimated here. - * In this case, get total size by adding ZSTD_estimate?DictSize */ + * In this case, get total size by adding ZSTD_estimate?DictSize + * Note 2 : only single-threaded compression is supported. + * ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. + * Note 3 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time. + * Size estimates assume that no external sequence producer is registered. 
+ */ ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); @@ -1649,22 +1806,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); * This function never fails (wide contract) */ ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); +/*! ZSTD_CCtx_setCParams() : + * Set all parameters provided within @p cparams into the working @p cctx. + * Note : if modifying parameters during compression (MT mode only), + * note that changes to the .windowLog parameter will be ignored. + * @return 0 on success, or an error code (can be checked with ZSTD_isError()). + * On failure, no parameters are updated. + */ +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams); + +/*! ZSTD_CCtx_setFParams() : + * Set all parameters provided within @p fparams into the working @p cctx. + * @return 0 on success, or an error code (can be checked with ZSTD_isError()). + */ +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams); + +/*! ZSTD_CCtx_setParams() : + * Set all parameters provided within @p params into the working @p cctx. + * @return 0 on success, or an error code (can be checked with ZSTD_isError()). + */ +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params); + /*! ZSTD_compress_advanced() : * Note : this function is now DEPRECATED. * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_compress2") +ZSTDLIB_STATIC_API size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, - const void* dict,size_t dictSize, - ZSTD_parameters params); + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + ZSTD_parameters params); /*! ZSTD_compress_usingCDict_advanced() : * Note : this function is now DEPRECATED. * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") +ZSTDLIB_STATIC_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, @@ -1808,13 +1988,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo * Experimental parameter. * Default is 0 == disabled. Set to 1 to enable. * - * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same - * between calls, except for the modifications that zstd makes to pos (the - * caller must not modify pos). This is checked by the compressor, and - * compression will fail if it ever changes. This means the only flush - * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end - * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos) - * MUST not be modified during compression or you will get data corruption. + * Tells the compressor that input data presented with ZSTD_inBuffer + * will ALWAYS be the same between calls. 
+ * Technically, the @src pointer must never be changed, + * and the @pos field can only be updated by zstd. + * However, it's possible to increase the @size field, + * allowing scenarios where more data can be appended after compressions starts. + * These conditions are checked by the compressor, + * and compression will fail if they are not respected. + * Also, data in the ZSTD_inBuffer within the range [src, src + pos) + * MUST not be modified during compression or it will result in data corruption. * * When this flag is enabled zstd won't allocate an input window buffer, * because the user guarantees it can reference the ZSTD_inBuffer until @@ -1822,18 +2005,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also * avoid the memcpy() from the input buffer to the input window buffer. * - * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used. - * That means this flag cannot be used with ZSTD_compressStream(). - * * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using * this flag is ALWAYS memory safe, and will never access out-of-bounds - * memory. However, compression WILL fail if you violate the preconditions. + * memory. However, compression WILL fail if conditions are not respected. * - * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST - * not be modified during compression or you will get data corruption. This - * is because zstd needs to reference data in the ZSTD_inBuffer to find + * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST + * not be modified during compression or it will result in data corruption. + * This is because zstd needs to reference data in the ZSTD_inBuffer to find * matches. Normally zstd maintains its own window buffer for this purpose, - * but passing this flag tells zstd to use the user provided buffer. + * but passing this flag tells zstd to rely on user provided buffer instead. */ #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9 @@ -1878,7 +2058,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo * Without validation, providing a sequence that does not conform to the zstd spec will cause * undefined behavior, and may produce a corrupted block. * - * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for + * With validation enabled, if sequence is invalid (see doc/zstd_compression_format.md for * specifics regarding offset/matchlength requirements) then the function will bail out and * return an error. * @@ -1928,6 +2108,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo */ #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 +/* ZSTD_c_prefetchCDictTables + * Controlled with ZSTD_paramSwitch_e enum. Default is ZSTD_ps_auto. + * + * In some situations, zstd uses CDict tables in-place rather than copying them + * into the working context. (See docs on ZSTD_dictAttachPref_e above for details). + * In such situations, compression speed is seriously impacted when CDict tables are + * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables + * when they are used in-place. + * + * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit. + * For sufficiently large inputs, zstd will by default memcpy() CDict tables + * into the working context, so there is no need to prefetch. 
This parameter is + * targeted at a middle range of input sizes, where a prefetch is cheap enough to be + * useful but memcpy() is too expensive. The exact range of input sizes where this + * makes sense is best determined by careful experimentation. + * + * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable, + * but in the future zstd may conditionally enable this feature via an auto-detection + * heuristic for cold CDicts. + * Use ZSTD_ps_disable to opt out of prefetching under any circumstances. + */ +#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16 + +/* ZSTD_c_enableSeqProducerFallback + * Allowed values are 0 (disable) and 1 (enable). The default setting is 0. + * + * Controls whether zstd will fall back to an internal sequence producer if an + * external sequence producer is registered and returns an error code. This fallback + * is block-by-block: the internal sequence producer will only be called for blocks + * where the external sequence producer returns an error code. Fallback parsing will + * follow any other cParam settings, such as compression level, the same as in a + * normal (fully-internal) compression operation. + * + * The user is strongly encouraged to read the full Block-Level Sequence Producer API + * documentation (below) before setting this parameter. */ +#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17 + +/* ZSTD_c_maxBlockSize + * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). + * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. + * + * This parameter can be used to set an upper bound on the blocksize + * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper + * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make + * compressBound() inaccurate). Only currently meant to be used for testing. + * + */ +#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18 + +/* ZSTD_c_searchForExternalRepcodes + * This parameter affects how zstd parses external sequences, such as sequences + * provided through the compressSequences() API or from an external block-level + * sequence producer. + * + * If set to ZSTD_ps_enable, the library will check for repeated offsets in + * external sequences, even if those repcodes are not explicitly indicated in + * the "rep" field. Note that this is the only way to exploit repcode matches + * while using compressSequences() or an external sequence producer, since zstd + * currently ignores the "rep" field of external sequences. + * + * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in + * external sequences, regardless of whether the "rep" field has been set. This + * reduces sequence compression overhead by about 25% while sacrificing some + * compression ratio. + * + * The default value is ZSTD_ps_auto, for which the library will enable/disable + * based on compression level. + * + * Note: for now, this param only has an effect if ZSTD_c_blockDelimiters is + * set to ZSTD_sf_explicitBlockDelimiters. That may change in the future. + */ +#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19 + /*! ZSTD_CCtx_getParameter() : * Get the requested compression parameter value, selected by enum ZSTD_cParameter, * and store it into int* value. @@ -2084,7 +2337,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete * in the range [dst, dst + pos) MUST not be modified during decompression * or you will get data corruption. 
* - * When this flags is enabled zstd won't allocate an output buffer, because + * When this flag is enabled zstd won't allocate an output buffer, because * it can write directly to the ZSTD_outBuffer, but it will still allocate * an input buffer large enough to fit any compressed block. This will also * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. @@ -2137,6 +2390,17 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete */ #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 +/* ZSTD_d_disableHuffmanAssembly + * Set to 1 to disable the Huffman assembly implementation. + * The default value is 0, which allows zstd to use the Huffman assembly + * implementation if available. + * + * This parameter can be used to disable Huffman assembly at runtime. + * If you want to disable it at compile time you can define the macro + * ZSTD_DISABLE_ASM. + */ +#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5 + /*! ZSTD_DCtx_setFormat() : * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). @@ -2145,6 +2409,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete * such ZSTD_f_zstd1_magicless for example. * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); /*! ZSTD_decompressStream_simpleArgs() : @@ -2181,6 +2446,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") +ZSTDLIB_STATIC_API size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize); @@ -2198,17 +2464,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") +ZSTDLIB_STATIC_API size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); /*! ZSTD_initCStream_advanced() : - * This function is DEPRECATED, and is approximately equivalent to: + * This function is DEPRECATED, and is equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); - * // Pseudocode: Set each zstd parameter and leave the rest as-is. - * for ((param, value) : params) { - * ZSTD_CCtx_setParameter(zcs, param, value); - * } + * ZSTD_CCtx_setParams(zcs, params); * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); * @@ -2218,6 +2482,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") +ZSTDLIB_STATIC_API size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize, ZSTD_parameters params, @@ -2232,15 +2497,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") +ZSTDLIB_STATIC_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); /*! 
ZSTD_initCStream_usingCDict_advanced() : - * This function is DEPRECATED, and is approximately equivalent to: + * This function is DEPRECATED, and is equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); - * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. - * for ((fParam, value) : fParams) { - * ZSTD_CCtx_setParameter(zcs, fParam, value); - * } + * ZSTD_CCtx_setFParams(zcs, fParams); * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); * ZSTD_CCtx_refCDict(zcs, cdict); * @@ -2250,6 +2513,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") +ZSTDLIB_STATIC_API size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, const ZSTD_CDict* cdict, ZSTD_frameParameters fParams, @@ -2274,6 +2538,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") +ZSTDLIB_STATIC_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); @@ -2319,8 +2584,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); * * note: no dictionary will be used if dict == NULL or dictSize < 8 - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x */ +ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions") ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); /*! @@ -2330,8 +2595,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo * ZSTD_DCtx_refDDict(zds, ddict); * * note : ddict is referenced, it must outlive decompression session - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x */ +ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions") ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); /*! @@ -2340,17 +2605,185 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const Z * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); * * re-use decompression parameters from previous init; saves dictionary loading - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x */ +ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions") ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); +/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API ********************* + * + * *** OVERVIEW *** + * The Block-Level Sequence Producer API allows users to provide their own custom + * sequence producer which libzstd invokes to process each block. The produced list + * of sequences (literals and matches) is then post-processed by libzstd to produce + * valid compressed blocks. + * + * This block-level offload API is a more granular complement of the existing + * frame-level offload API compressSequences() (introduced in v1.5.1). 
It offers + * an easier migration story for applications already integrated with libzstd: the + * user application continues to invoke the same compression functions + * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits + * from the specific advantages of the external sequence producer. For example, + * the sequence producer could be tuned to take advantage of known characteristics + * of the input, to offer better speed / ratio, or could leverage hardware + * acceleration not available within libzstd itself. + * + * See contrib/externalSequenceProducer for an example program employing the + * Block-Level Sequence Producer API. + * + * *** USAGE *** + * The user is responsible for implementing a function of type + * ZSTD_sequenceProducer_F. For each block, zstd will pass the following + * arguments to the user-provided function: + * + * - sequenceProducerState: a pointer to a user-managed state for the sequence + * producer. + * + * - outSeqs, outSeqsCapacity: an output buffer for the sequence producer. + * outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). The memory + * backing outSeqs is managed by the CCtx. + * + * - src, srcSize: an input buffer for the sequence producer to parse. + * srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX. + * + * - dict, dictSize: a history buffer, which may be empty, which the sequence + * producer may reference as it parses the src buffer. Currently, zstd will + * always pass dictSize == 0 into external sequence producers, but this will + * change in the future. + * + * - compressionLevel: a signed integer representing the zstd compression level + * set by the user for the current operation. The sequence producer may choose + * to use this information to change its compression strategy and speed/ratio + * tradeoff. Note: the compression level does not reflect zstd parameters set + * through the advanced API. + * + * - windowSize: a size_t representing the maximum allowed offset for external + * sequences. Note that sequence offsets are sometimes allowed to exceed the + * windowSize if a dictionary is present, see doc/zstd_compression_format.md + * for details. + * + * The user-provided function shall return a size_t representing the number of + * sequences written to outSeqs. This return value will be treated as an error + * code if it is greater than outSeqsCapacity. The return value must be non-zero + * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided + * for convenience, but any value greater than outSeqsCapacity will be treated as + * an error code. + * + * If the user-provided function does not return an error code, the sequences + * written to outSeqs must be a valid parse of the src buffer. Data corruption may + * occur if the parse is not valid. A parse is defined to be valid if the + * following conditions hold: + * - The sum of matchLengths and literalLengths must equal srcSize. + * - All sequences in the parse, except for the final sequence, must have + * matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have + * matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0. + * - All offsets must respect the windowSize parameter as specified in + * doc/zstd_compression_format.md. + * - If the final sequence has matchLength == 0, it must also have offset == 0. + * + * zstd will only validate these conditions (and fail compression if they do not + * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence + * validation has a performance cost. 
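+ *
+ * For orientation only, a trivially valid (though non-compressing) producer can
+ * emit the whole block as a single literal run. Its one sequence has
+ * matchLength == 0 and offset == 0, marking the final, all-literal sequence of
+ * the block. The name exampleProducer is illustrative; the signature matches
+ * ZSTD_sequenceProducer_F below:
+ *
+ *     size_t exampleProducer(void* sequenceProducerState,
+ *                            ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
+ *                            const void* src, size_t srcSize,
+ *                            const void* dict, size_t dictSize,
+ *                            int compressionLevel, size_t windowSize)
+ *     {
+ *         (void)sequenceProducerState; (void)src; (void)dict; (void)dictSize;
+ *         (void)compressionLevel; (void)windowSize;
+ *         if (outSeqsCapacity < 1) return ZSTD_SEQUENCE_PRODUCER_ERROR;
+ *         outSeqs[0].offset = 0;
+ *         outSeqs[0].litLength = (unsigned)srcSize;
+ *         outSeqs[0].matchLength = 0;
+ *         outSeqs[0].rep = 0;
+ *         return 1;
+ *     }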
+ * + * If the user-provided function returns an error, zstd will either fall back + * to an internal sequence producer or fail the compression operation. The user can + * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback + * cParam. Fallback compression will follow any other cParam settings, such as + * compression level, the same as in a normal compression operation. + * + * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F + * function by calling + * ZSTD_registerSequenceProducer(cctx, + * sequenceProducerState, + * sequenceProducer) + * This setting will persist until the next parameter reset of the CCtx. + * + * The sequenceProducerState must be initialized by the user before calling + * ZSTD_registerSequenceProducer(). The user is responsible for destroying the + * sequenceProducerState. + * + * *** LIMITATIONS *** + * This API is compatible with all zstd compression APIs which respect advanced parameters. + * However, there are three limitations: + * + * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported. + * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level + * external sequence producer. + * - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some + * cases (see its documentation for details). Users must explicitly set + * ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external + * sequence producer is registered. + * - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default + * whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should + * check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence + * Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog). + * + * Second, history buffers are not currently supported. Concretely, zstd will always pass + * dictSize == 0 to the external sequence producer (for now). This has two implications: + * - Dictionaries are not currently supported. Compression will *not* fail if the user + * references a dictionary, but the dictionary won't have any effect. + * - Stream history is not currently supported. All advanced compression APIs, including + * streaming APIs, work with external sequence producers, but each block is treated as + * an independent chunk without history from previous blocks. + * + * Third, multi-threading within a single compression is not currently supported. In other words, + * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered. + * Multi-threading across compressions is fine: simply create one CCtx per thread. + * + * Long-term, we plan to overcome all three limitations. There is no technical blocker to + * overcoming them. It is purely a question of engineering effort. + */ + +#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1)) + +typedef size_t ZSTD_sequenceProducer_F ( + void* sequenceProducerState, + ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, + int compressionLevel, + size_t windowSize +); + +/*! ZSTD_registerSequenceProducer() : + * Instruct zstd to use a block-level external sequence producer function. + * + * The sequenceProducerState must be initialized by the caller, and the caller is + * responsible for managing its lifetime. This parameter is sticky across + * compressions. 
It will remain set until the user explicitly resets compression
+ * parameters.
+ *
+ * Sequence producer registration is considered to be an "advanced parameter",
+ * part of the "advanced API". This means it will only have an effect on compression
+ * APIs which respect advanced parameters, such as compress2() and compressStream2().
+ * Older compression APIs such as compressCCtx(), which predate the introduction of
+ * "advanced parameters", will ignore any external sequence producer setting.
+ *
+ * The sequence producer can be "cleared" by registering a NULL function pointer. This
+ * removes all limitations described above in the "LIMITATIONS" section of the API docs.
+ *
+ * The user is strongly encouraged to read the full API documentation (above) before
+ * calling this function. */
+ZSTDLIB_STATIC_API void
+ZSTD_registerSequenceProducer(
+  ZSTD_CCtx* cctx,
+  void* sequenceProducerState,
+  ZSTD_sequenceProducer_F* sequenceProducer
+);
+
+
/* *******************************************************************
-* Buffer-less and synchronous inner streaming functions
+* Buffer-less and synchronous inner streaming functions (DEPRECATED)
+*
+* This API is deprecated, and will be removed in a future version.
+* It allows streaming (de)compression with user allocated buffers.
+* However, it is hard to use, and not as well tested as the rest of
+* our API.
*
-* This is an advanced API, giving full control over buffer management, for users which need direct control over memory.
-* But it's also a complex one, with several restrictions, documented below.
-* Prefer normal streaming API for an easier experience.
+* Please use the normal streaming API instead: ZSTD_compressStream2,
+* and ZSTD_decompressStream.
+* If there is functionality you need that it doesn't provide,
+* please open an issue on our GitHub.
********************************************************************* */

/*
@@ -2362,7 +2795,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
  Start by initializing a context.
  Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression.
-  It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx()
  Then, consume your input using ZSTD_compressContinue().
  There are some important considerations to keep in mind when using this advanced function :
@@ -2384,18 +2816,28 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
*/

/*===== Buffer-less streaming compression functions =====*/
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel);
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */
-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */
+ZSTD_DEPRECATED("This function will likely be removed in a future release. 
It is misleading and has very limited utility.") +ZSTDLIB_STATIC_API +size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + +ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ ZSTD_DEPRECATED("use advanced API to access custom parameters") +ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ ZSTD_DEPRECATED("use advanced API to access custom parameters") +ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ /* Buffer-less streaming decompression (synchronous mode) @@ -2408,8 +2850,8 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. Data fragment must be large enough to ensure successful decoding. `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. - @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. - >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. + result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. + >0 : `srcSize` is too small, please provide at least result bytes on next attempt. errorCode, which can be tested using ZSTD_isError(). It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, @@ -2428,7 +2870,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ The most memory efficient way is to use a round buffer of sufficient size. Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), - which can @return an error code if required value is too large for current system (in 32-bits mode). + which can return an error code if required value is too large for current system (in 32-bits mode). In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, up to the moment there is not enough room left in the buffer to guarantee decoding another full block, which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. @@ -2448,7 +2890,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. 
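  For illustration, the resulting decode loop might look like this sketch (dctx,
  src, dst and dstCapacity assumed to be caller-provided; round-buffer rotation
  and error handling elided):

      size_t neededSrcSize;
      ZSTD_decompressBegin(dctx);
      while ((neededSrcSize = ZSTD_nextSrcSizeToDecompress(dctx)) != 0) {
          size_t const regenerated = ZSTD_decompressContinue(dctx, dst, dstCapacity, src, neededSrcSize);
          if (ZSTD_isError(regenerated)) break;
          src = (const char*)src + neededSrcSize;
      }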
- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. It can also be an error code, which can be tested with ZSTD_isError(). @@ -2471,27 +2913,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ */ /*===== Buffer-less streaming decompression functions =====*/ -typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; -typedef struct { - unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ - unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ - unsigned blockSizeMax; - ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ - unsigned headerSize; - unsigned dictID; - unsigned checksumFlag; -} ZSTD_frameHeader; -/*! ZSTD_getFrameHeader() : - * decode Frame Header, or requires larger `srcSize`. - * @return : 0, `zfhPtr` is correctly filled, - * >0, `srcSize` is too small, value is wanted `srcSize` amount, - * or an error code, which can be tested using ZSTD_isError() */ -ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ -/*! ZSTD_getFrameHeader_advanced() : - * same as ZSTD_getFrameHeader(), - * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ -ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); @@ -2502,6 +2924,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); /* misc */ +ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); @@ -2509,11 +2932,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); -/* ============================ */ -/* Block level API */ -/* ============================ */ +/* ========================================= */ +/* Block level API (DEPRECATED) */ +/* ========================================= */ /*! + + This API is deprecated in favor of the regular compression API. + You can get the frame header down to 2 bytes by setting: + - ZSTD_c_format = ZSTD_f_zstd1_magicless + - ZSTD_c_contentSizeFlag = 0 + - ZSTD_c_checksumFlag = 0 + - ZSTD_c_dictIDFlag = 0 + + This API is not as well tested as our normal API, so we recommend not using it. + We will be removing it in a future version. 
If the normal API doesn't provide + the functionality you need, please open a GitHub issue. + Block functions produce and decode raw zstd blocks, without frame metadata. Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. @@ -2524,7 +2959,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); - It is necessary to init context before starting + compression : any ZSTD_compressBegin*() variant, including with dictionary + decompression : any ZSTD_decompressBegin*() variant, including with dictionary - + copyCCtx() and copyDCtx() can be used too - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + If input is larger than a block size, it's necessary to split input data into multiple blocks + For inputs larger than a single block, consider using regular ZSTD_compress() instead. @@ -2541,11 +2975,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); */ /*===== Raw zstd block functions =====*/ +ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); +ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ - #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile index 20f08c644b71a3..464c410b2768c6 100644 --- a/lib/zstd/Makefile +++ b/lib/zstd/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause # ################################################################ -# Copyright (c) Facebook, Inc. +# Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h new file mode 100644 index 00000000000000..05adbbeccaa9b1 --- /dev/null +++ b/lib/zstd/common/allocations.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */
+
+/* This file provides custom allocation primitives
+ */
+
+#define ZSTD_DEPS_NEED_MALLOC
+#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
+
+#include "mem.h" /* MEM_STATIC */
+#define ZSTD_STATIC_LINKING_ONLY
+#include <linux/zstd.h> /* ZSTD_customMem */
+
+#ifndef ZSTD_ALLOCATIONS_H
+#define ZSTD_ALLOCATIONS_H
+
+/* custom memory allocation functions */
+
+MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem)
+{
+    if (customMem.customAlloc)
+        return customMem.customAlloc(customMem.opaque, size);
+    return ZSTD_malloc(size);
+}
+
+MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem)
+{
+    if (customMem.customAlloc) {
+        /* calloc implemented as malloc+memset;
+         * not as efficient as calloc, but next best guess for custom malloc */
+        void* const ptr = customMem.customAlloc(customMem.opaque, size);
+        ZSTD_memset(ptr, 0, size);
+        return ptr;
+    }
+    return ZSTD_calloc(1, size);
+}
+
+MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem)
+{
+    if (ptr!=NULL) {
+        if (customMem.customFree)
+            customMem.customFree(customMem.opaque, ptr);
+        else
+            ZSTD_free(ptr);
+    }
+}
+
+#endif /* ZSTD_ALLOCATIONS_H */
diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h
new file mode 100644
index 00000000000000..aa3487ec4b6a71
--- /dev/null
+++ b/lib/zstd/common/bits.h
@@ -0,0 +1,149 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */ + +#ifndef ZSTD_BITS_H +#define ZSTD_BITS_H + +#include "mem.h" + +MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val) +{ + assert(val != 0); + { + static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, + 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, + 26, 12, 18, 6, 11, 5, 10, 9}; + return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27]; + } +} + +MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val) +{ + assert(val != 0); +# if (__GNUC__ >= 4) + return (unsigned)__builtin_ctz(val); +# else + return ZSTD_countTrailingZeros32_fallback(val); +# endif +} + +MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) { + assert(val != 0); + { + static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29, + 11, 14, 16, 18, 22, 25, 3, 30, + 8, 12, 20, 28, 15, 17, 24, 7, + 19, 27, 23, 6, 26, 5, 4, 31}; + val |= val >> 1; + val |= val >> 2; + val |= val >> 4; + val |= val >> 8; + val |= val >> 16; + return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27]; + } +} + +MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val) +{ + assert(val != 0); +# if (__GNUC__ >= 4) + return (unsigned)__builtin_clz(val); +# else + return ZSTD_countLeadingZeros32_fallback(val); +# endif +} + +MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val) +{ + assert(val != 0); +# if (__GNUC__ >= 4) && defined(__LP64__) + return (unsigned)__builtin_ctzll(val); +# else + { + U32 mostSignificantWord = (U32)(val >> 32); + U32 leastSignificantWord = (U32)val; + if (leastSignificantWord == 0) { + return 32 + ZSTD_countTrailingZeros32(mostSignificantWord); + } else { + return ZSTD_countTrailingZeros32(leastSignificantWord); + } + } +# endif +} + +MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val) +{ + assert(val != 0); +# if (__GNUC__ >= 4) + return (unsigned)(__builtin_clzll(val)); +# else + { + U32 mostSignificantWord = (U32)(val >> 32); + U32 leastSignificantWord = (U32)val; + if (mostSignificantWord == 0) { + return 32 + ZSTD_countLeadingZeros32(leastSignificantWord); + } else { + return ZSTD_countLeadingZeros32(mostSignificantWord); + } + } +# endif +} + +MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val) +{ + if (MEM_isLittleEndian()) { + if (MEM_64bits()) { + return ZSTD_countTrailingZeros64((U64)val) >> 3; + } else { + return ZSTD_countTrailingZeros32((U32)val) >> 3; + } + } else { /* Big Endian CPU */ + if (MEM_64bits()) { + return ZSTD_countLeadingZeros64((U64)val) >> 3; + } else { + return ZSTD_countLeadingZeros32((U32)val) >> 3; + } + } +} + +MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ +{ + assert(val != 0); + return 31 - ZSTD_countLeadingZeros32(val); +} + +/* ZSTD_rotateRight_*(): + * Rotates a bitfield to the right by "count" bits. 
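+ * For example, ZSTD_rotateRight_U32(0x80000001, 1) == 0xC0000000: the
+ * low-order bit wraps around to become the new top bit.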
+ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts + */ +MEM_STATIC +U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { + assert(count < 64); + count &= 0x3F; /* for fickle pattern recognition */ + return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +} + +MEM_STATIC +U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { + assert(count < 32); + count &= 0x1F; /* for fickle pattern recognition */ + return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +} + +MEM_STATIC +U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { + assert(count < 16); + count &= 0x0F; /* for fickle pattern recognition */ + return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); +} + +#endif /* ZSTD_BITS_H */ diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h index feef3a1b1d6002..444dc4f85c649e 100644 --- a/lib/zstd/common/bitstream.h +++ b/lib/zstd/common/bitstream.h @@ -1,7 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* ****************************************************************** * bitstream * Part of FSE library - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * * You can contact the author at : * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy @@ -27,6 +28,7 @@ #include "compiler.h" /* UNLIKELY() */ #include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */ #include "error_private.h" /* error codes and messages */ +#include "bits.h" /* ZSTD_highbit32 */ /*========================================= @@ -122,33 +124,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); /* faster, but works only if nbBits >= 1 */ - - -/*-************************************************************** -* Internal functions -****************************************************************/ -MEM_STATIC unsigned BIT_highbit32 (U32 val) -{ - assert(val != 0); - { -# if (__GNUC__ >= 3) /* Use GCC Intrinsic */ - return __builtin_clz (val) ^ 31; -# else /* Software version */ - static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, - 11, 14, 16, 18, 22, 25, 3, 30, - 8, 12, 20, 28, 15, 17, 24, 7, - 19, 27, 23, 6, 26, 5, 4, 31 }; - U32 v = val; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; -# endif - } -} - /*===== Local Constants =====*/ static const unsigned BIT_mask[] = { 0, 1, 3, 7, 0xF, 0x1F, @@ -178,6 +153,12 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, return 0; } +MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) +{ + assert(nbBits < BIT_MASK_SIZE); + return bitContainer & BIT_mask[nbBits]; +} + /*! BIT_addBits() : * can add up to 31 bits into `bitC`. * Note : does not check for register overflow ! 
*/ @@ -187,7 +168,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32); assert(nbBits < BIT_MASK_SIZE); assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); - bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos; + bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos; bitC->bitPos += nbBits; } @@ -266,7 +247,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); bitD->bitContainer = MEM_readLEST(bitD->ptr); { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; - bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ + bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } } else { bitD->ptr = bitD->start; @@ -294,7 +275,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si default: break; } { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; - bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; + bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ } bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; @@ -325,12 +306,6 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c #endif } -MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) -{ - assert(nbBits < BIT_MASK_SIZE); - return bitContainer & BIT_mask[nbBits]; -} - /*! BIT_lookBits() : * Provides next n bits from local register. * local register is not modified. @@ -377,7 +352,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned n } /*! BIT_readBitsFast() : - * unsafe version; only works only if nbBits >= 1 */ + * unsafe version; only works if nbBits >= 1 */ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) { size_t const value = BIT_lookBitsFast(bitD, nbBits); @@ -408,7 +383,7 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) * This function is safe, it guarantees it will not read beyond src buffer. * @return : status of `BIT_DStream_t` internal register. * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ -MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) +MEM_STATIC FORCE_INLINE_ATTR BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) { if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ return BIT_DStream_overflow; diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h index c42d39faf9bd8f..c437e097557504 100644 --- a/lib/zstd/common/compiler.h +++ b/lib/zstd/common/compiler.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -179,6 +180,17 @@ * Sanitizer *****************************************************************/ +/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an + * abundance of caution, disable our custom poisoning on mingw. 
*/ +#ifdef __MINGW32__ +#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE +#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1 +#endif +#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE +#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1 +#endif +#endif + #endif /* ZSTD_COMPILER_H */ diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h index 0db7b42407eea2..d8319a2bef4ced 100644 --- a/lib/zstd/common/cpu.h +++ b/lib/zstd/common/cpu.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c index bb863c9ea61648..e56ff6464e9187 100644 --- a/lib/zstd/common/debug.c +++ b/lib/zstd/common/debug.c @@ -1,7 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* ****************************************************************** * debug * Part of FSE library - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * * You can contact the author at : * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h index 6dd88d1fbd02ca..da0dbfc614b880 100644 --- a/lib/zstd/common/debug.h +++ b/lib/zstd/common/debug.h @@ -1,7 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* ****************************************************************** * debug * Part of FSE library - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * * You can contact the author at : * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c index fef67056f05240..6cdd82233fb591 100644 --- a/lib/zstd/common/entropy_common.c +++ b/lib/zstd/common/entropy_common.c @@ -1,6 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* ****************************************************************** * Common functions of New Generation Entropy library - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * * You can contact the author at : * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy @@ -19,8 +20,8 @@ #include "error_private.h" /* ERR_*, ERROR */ #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ #include "fse.h" -#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */ #include "huf.h" +#include "bits.h" /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */ /*=== Version ===*/ @@ -38,23 +39,6 @@ const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); } /*-************************************************************** * FSE NCount encoding-decoding ****************************************************************/ -static U32 FSE_ctz(U32 val) -{ - assert(val != 0); - { -# if (__GNUC__ >= 3) /* GCC Intrinsic */ - return __builtin_ctz(val); -# else /* Software version */ - U32 count = 0; - while ((val & 1) == 0) { - val >>= 1; - ++count; - } - return count; -# endif - } -} - FORCE_INLINE_TEMPLATE size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, const void* headerBuffer, size_t hbSize) @@ -102,7 +86,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne * repeat. * Avoid UB by setting the high bit to 1. 
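 * For example, if bitStream ends in binary ...1111 (two more repeat flags),
 * then ~bitStream has 4 trailing zeros, and 4 >> 1 == 2 additional repeats.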
*/ - int repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; + int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; while (repeats >= 12) { charnum += 3 * 12; if (LIKELY(ip <= iend-7)) { @@ -113,7 +97,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne ip = iend - 4; } bitStream = MEM_readLE32(ip) >> bitCount; - repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; + repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; } charnum += 3 * repeats; bitStream >>= 2 * repeats; @@ -178,7 +162,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne * know that threshold > 1. */ if (remaining <= 1) break; - nbBits = BIT_highbit32(remaining) + 1; + nbBits = ZSTD_highbit32(remaining) + 1; threshold = 1 << (nbBits - 1); } if (charnum >= maxSV1) break; @@ -253,7 +237,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, const void* src, size_t srcSize) { U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; - return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0); + return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0); } FORCE_INLINE_TEMPLATE size_t @@ -301,14 +285,14 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats, if (weightTotal == 0) return ERROR(corruption_detected); /* get last non-null symbol weight (implied, total must be 2^n) */ - { U32 const tableLog = BIT_highbit32(weightTotal) + 1; + { U32 const tableLog = ZSTD_highbit32(weightTotal) + 1; if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected); *tableLogPtr = tableLog; /* determine last weight */ { U32 const total = 1 << tableLog; U32 const rest = total - weightTotal; - U32 const verif = 1 << BIT_highbit32(rest); - U32 const lastWeight = BIT_highbit32(rest) + 1; + U32 const verif = 1 << ZSTD_highbit32(rest); + U32 const lastWeight = ZSTD_highbit32(rest) + 1; if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */ huffWeight[oSize] = (BYTE)lastWeight; rankStats[lastWeight]++; @@ -345,13 +329,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, - int bmi2) + int flags) { #if DYNAMIC_BMI2 - if (bmi2) { + if (flags & HUF_flags_bmi2) { return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); } #endif - (void)bmi2; + (void)flags; return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); } diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c index 6d1135f8c37330..a4062d30d1703c 100644 --- a/lib/zstd/common/error_private.c +++ b/lib/zstd/common/error_private.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum code) case PREFIX(version_unsupported): return "Version not supported"; case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter"; case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding"; - case PREFIX(corruption_detected): return "Corrupted block detected"; + case PREFIX(corruption_detected): return "Data corruption detected"; case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; + case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification"; case PREFIX(parameter_unsupported): return "Unsupported parameter"; + case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters"; case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; case PREFIX(init_missing): return "Context should be init first"; case PREFIX(memory_allocation): return "Allocation error : not enough memory"; @@ -38,17 +41,22 @@ const char* ERR_getErrorString(ERR_enum code) case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large"; case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; + case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected"; case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; case PREFIX(dictionary_wrong): return "Dictionary mismatch"; case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; case PREFIX(srcSize_wrong): return "Src size is incorrect"; case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer"; + case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full"; + case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty"; /* following error codes are not stable and may be removed or changed in a future version */ case PREFIX(frameIndex_tooLarge): return "Frame index is too large"; case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; case PREFIX(srcBuffer_wrong): return "Source buffer is wrong"; + case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code"; + case PREFIX(externalSequences_invalid): return "External sequences are not valid"; case PREFIX(maxCode): default: return notErrorCode; } diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h index ca5101e542faad..9a4699a38a881b 100644 --- a/lib/zstd/common/error_private.h +++ b/lib/zstd/common/error_private.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h index 4507043b2287c8..c4e25a21914299 100644 --- a/lib/zstd/common/fse.h +++ b/lib/zstd/common/fse.h @@ -1,7 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* ****************************************************************** * FSE : Finite State Entropy codec * Public Prototypes declaration - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * * You can contact the author at : * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy @@ -50,34 +51,6 @@ FSE_PUBLIC_API unsigned FSE_versionNumber(void); /*< library version number; to be used when checking dll version */ -/*-**************************************** -* FSE simple functions -******************************************/ -/*! FSE_compress() : - Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'. - 'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize). - @return : size of compressed data (<= dstCapacity). - Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! - if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead. - if FSE_isError(return), compression failed (more details using FSE_getErrorName()) -*/ -FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity, - const void* src, size_t srcSize); - -/*! FSE_decompress(): - Decompress FSE data from buffer 'cSrc', of size 'cSrcSize', - into already allocated destination buffer 'dst', of size 'dstCapacity'. - @return : size of regenerated data (<= maxDstSize), - or an error code, which can be tested using FSE_isError() . - - ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!! - Why ? : making this distinction requires a header. - Header management is intentionally delegated to the user layer, which can better manage special cases. -*/ -FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity, - const void* cSrc, size_t cSrcSize); - - /*-***************************************** * Tool functions ******************************************/ @@ -88,20 +61,6 @@ FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */ -/*-***************************************** -* FSE advanced functions -******************************************/ -/*! FSE_compress2() : - Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog' - Both parameters can be defined as '0' to mean : use default value - @return : size of compressed data - Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!! - if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. - if FSE_isError(return), it's an error code. -*/ -FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); - - /*-***************************************** * FSE detailed API ******************************************/ @@ -161,8 +120,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, /*! Constructor and Destructor of FSE_CTable. 
Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ -FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog); -FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct); /*! FSE_buildCTable(): Builds `ct`, which must be already allocated, using FSE_createCTable(). @@ -238,23 +195,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* rBuffer, size_t rBuffSize, int bmi2); -/*! Constructor and Destructor of FSE_DTable. - Note that its size depends on 'tableLog' */ typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ -FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog); -FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt); - -/*! FSE_buildDTable(): - Builds 'dt', which must be already allocated, using FSE_createDTable(). - return : 0, or an errorCode, which can be tested using FSE_isError() */ -FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); - -/*! FSE_decompress_usingDTable(): - Decompress compressed source `cSrc` of size `cSrcSize` using `dt` - into `dst` which must be already allocated. - @return : size of regenerated data (necessarily <= `dstCapacity`), - or an errorCode, which can be tested using FSE_isError() */ -FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); /*! Tutorial : @@ -317,16 +258,6 @@ If there is an error, the function will return an error code, which can be teste unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); /*< same as FSE_optimalTableLog(), which used `minus==2` */ -/* FSE_compress_wksp() : - * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`). - * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable. - */ -#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? 
(1 << (maxTableLog - 2)) : 1024) ) -size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); - -size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits); -/*< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */ - size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); /*< build a fake FSE_CTable, designed to compress always the same symbolValue */ @@ -344,19 +275,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */ -size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits); -/*< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */ - -size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue); -/*< build a fake FSE_DTable, designed to always generate the same symbolValue */ - -#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) +#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) #define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned)) -size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize); -/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */ - size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2); -/*< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. */ +/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`. + * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */ typedef enum { FSE_repeat_none, /*< Cannot use the previous table */ @@ -552,7 +475,7 @@ MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePt /* FSE_getMaxNbBits() : * Approximate maximum cost of a symbol, in bits. - * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) + * Fractional get rounded up (i.e. 
a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) * note 1 : assume symbolValue is valid (<= maxSymbolValue) * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c index 8dcb8ca39767c8..99ce8fa54d0849 100644 --- a/lib/zstd/common/fse_decompress.c +++ b/lib/zstd/common/fse_decompress.c @@ -1,6 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* ****************************************************************** * FSE : Finite State Entropy decoder - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * * You can contact the author at : * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy @@ -24,6 +25,7 @@ #include "error_private.h" #define ZSTD_DEPS_NEED_MALLOC #include "zstd_deps.h" +#include "bits.h" /* ZSTD_highbit32 */ /* ************************************************************** @@ -55,19 +57,6 @@ #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) - -/* Function templates */ -FSE_DTable* FSE_createDTable (unsigned tableLog) -{ - if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; - return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); -} - -void FSE_freeDTable (FSE_DTable* dt) -{ - ZSTD_free(dt); -} - static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) { void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ @@ -127,10 +116,10 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo } } /* Now we spread those positions across the table. - * The benefit of doing it in two stages is that we avoid the the + * The benefit of doing it in two stages is that we avoid the * variable size inner loop, which caused lots of branch misses. * Now we can run through all the positions without any branch misses. - * We unroll the loop twice, since that is what emperically worked best. + * We unroll the loop twice, since that is what empirically worked best. 
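 * (Editorial illustration, not part of the upstream comment: assuming the
 * usual FSE table step of (tableSize>>1) + (tableSize>>3) + 3 -- e.g.
 * 16 + 4 + 3 = 23 for a 32-entry table -- the step is odd and therefore
 * coprime with the power-of-two tableSize, so
 * position = (position + step) & tableMask visits every slot exactly once
 * before wrapping back to 0.)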
*/ { size_t position = 0; @@ -166,7 +155,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo for (u=0; utableLog = 0; - DTableH->fastMode = 0; - - cell->newState = 0; - cell->symbol = symbolValue; - cell->nbBits = 0; - - return 0; -} - - -size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) -{ - void* ptr = dt; - FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr; - void* dPtr = dt + 1; - FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr; - const unsigned tableSize = 1 << nbBits; - const unsigned tableMask = tableSize - 1; - const unsigned maxSV1 = tableMask+1; - unsigned s; - - /* Sanity checks */ - if (nbBits < 1) return ERROR(GENERIC); /* min size */ - - /* Build Decoding Table */ - DTableH->tableLog = (U16)nbBits; - DTableH->fastMode = 1; - for (s=0; sfastMode; - - /* select fast mode (static) */ - if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); - return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); -} - - -size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) -{ - return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0); -} - typedef struct { short ncount[FSE_MAX_SYMBOL_VALUE + 1]; FSE_DTable dtable[]; /* Dynamically sized */ @@ -342,7 +268,8 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( } if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge); - workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog); + assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize); + workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); @@ -382,9 +309,4 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); } - -typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; - - - #endif /* FSE_COMMONDEFS_ONLY */ diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h index 5042ff87030875..8e7943092ed1ae 100644 --- a/lib/zstd/common/huf.h +++ b/lib/zstd/common/huf.h @@ -1,7 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* ****************************************************************** * huff0 huffman codec, * part of Finite State Entropy library - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * * You can contact the author at : * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy @@ -18,99 +19,22 @@ /* *** Dependencies *** */ #include "zstd_deps.h" /* size_t */ - - -/* *** library symbols visibility *** */ -/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual, - * HUF symbols remain "private" (internal symbols for library only). 
- * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */ -#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) -# define HUF_PUBLIC_API __attribute__ ((visibility ("default"))) -#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ -# define HUF_PUBLIC_API __declspec(dllexport) -#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) -# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */ -#else -# define HUF_PUBLIC_API -#endif - - -/* ========================== */ -/* *** simple functions *** */ -/* ========================== */ - -/* HUF_compress() : - * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. - * 'dst' buffer must be already allocated. - * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). - * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. - * @return : size of compressed data (<= `dstCapacity`). - * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! - * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) - */ -HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, - const void* src, size_t srcSize); - -/* HUF_decompress() : - * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', - * into already allocated buffer 'dst', of minimum size 'dstSize'. - * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. - * Note : in contrast with FSE, HUF_decompress can regenerate - * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, - * because it knows size to regenerate (originalSize). - * @return : size of regenerated data (== originalSize), - * or an error code, which can be tested using HUF_isError() - */ -HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, - const void* cSrc, size_t cSrcSize); +#include "mem.h" /* U32 */ +#define FSE_STATIC_LINKING_ONLY +#include "fse.h" /* *** Tool functions *** */ -#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ -HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ +#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ +size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ /* Error Management */ -HUF_PUBLIC_API unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ -HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ +unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ +const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ -/* *** Advanced function *** */ - -/* HUF_compress2() : - * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. - * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . - * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ -HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, - const void* src, size_t srcSize, - unsigned maxSymbolValue, unsigned tableLog); - -/* HUF_compress4X_wksp() : - * Same as HUF_compress2(), but uses externally allocated `workSpace`. 
- * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */ #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) -HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, - const void* src, size_t srcSize, - unsigned maxSymbolValue, unsigned tableLog, - void* workSpace, size_t wkspSize); - -#endif /* HUF_H_298734234 */ - -/* ****************************************************************** - * WARNING !! - * The following section contains advanced and experimental definitions - * which shall never be used in the context of a dynamic library, - * because they are not guaranteed to remain stable in the future. - * Only consider them in association with static linking. - * *****************************************************************/ -#if !defined(HUF_H_HUF_STATIC_LINKING_ONLY) -#define HUF_H_HUF_STATIC_LINKING_ONLY - -/* *** Dependencies *** */ -#include "mem.h" /* U32 */ -#define FSE_STATIC_LINKING_ONLY -#include "fse.h" - /* *** Constants *** */ #define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ @@ -151,25 +75,49 @@ typedef U32 HUF_DTable; /* **************************************** * Advanced decompression functions ******************************************/ -size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ -#ifndef HUF_FORCE_DECOMPRESS_X1 -size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ -#endif -size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< decodes RLE and uncompressed */ -size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< considers RLE and uncompressed as errors */ -size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< considers RLE and uncompressed as errors */ -size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ -size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ -#ifndef HUF_FORCE_DECOMPRESS_X1 -size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ -size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ -#endif +/* + * Huffman flags bitset. + * For all flags, 0 is the default value. + */ +typedef enum { + /* + * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime. + * Otherwise: Ignored. + */ + HUF_flags_bmi2 = (1 << 0), + /* + * If set: Test possible table depths to find the one that produces the smallest header + encoded size. + * If unset: Use heuristic to find the table depth. + */ + HUF_flags_optimalDepth = (1 << 1), + /* + * If set: If the previous table can encode the input, always reuse the previous table. + * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output. 
+ */ + HUF_flags_preferRepeat = (1 << 2), + /* + * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress. + * If unset: Always histogram the entire input. + */ + HUF_flags_suspectUncompressible = (1 << 3), + /* + * If set: Don't use assembly implementations + * If unset: Allow using assembly implementations + */ + HUF_flags_disableAsm = (1 << 4), + /* + * If set: Don't use the fast decoding loop, always use the fallback decoding loop. + * If unset: Use the fast decoding loop when possible. + */ + HUF_flags_disableFast = (1 << 5) +} HUF_flags_e; /* **************************************** * HUF detailed API * ****************************************/ +#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra /*! HUF_compress() does the following: * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") @@ -182,12 +130,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, * For example, it's possible to compress several blocks using the same 'CTable', * or to save and regenerate 'CTable' using external methods. */ -unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); -size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */ -size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); +unsigned HUF_minTableLog(unsigned symbolCardinality); +unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue); +unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace, + size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize); -size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); -size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); +size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); @@ -196,6 +144,7 @@ typedef enum { HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ HUF_repeat_valid /*< Can use the previous table and it is assumed to be valid */ } HUF_repeat; + /* HUF_compress4X_repeat() : * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. * If it uses hufTable it does not modify hufTable or repeat. 
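(A hedged sketch of the call-site idiom implied by the new signature above:
callers that previously passed separate preferRepeat/bmi2/suspectUncompressible
arguments now OR the corresponding HUF_flags_e bits into a single int.
cpu_has_bmi2(), wksp, hufTable, repeat and the other locals below are
hypothetical placeholders, not taken from this patch.)

	int flags = 0;
	if (cpu_has_bmi2())                        /* hypothetical runtime probe */
		flags |= HUF_flags_bmi2;
	flags |= HUF_flags_suspectUncompressible;  /* sample before compressing */

	cSize = HUF_compress4X_repeat(dst, dstCapacity, src, srcSize,
	                              maxSymbolValue, tableLog,
	                              wksp, sizeof(wksp),  /* >= HUF_WORKSPACE_SIZE */
	                              &hufTable, &repeat, flags);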
@@ -206,13 +155,13 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); + HUF_CElt* hufTable, HUF_repeat* repeat, int flags); /* HUF_buildCTable_wksp() : * Same as HUF_buildCTable(), but using externally allocated scratch buffer. * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. */ -#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) +#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192) #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, @@ -238,7 +187,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, const void* src, size_t srcSize, void* workspace, size_t wkspSize, - int bmi2); + int flags); /* HUF_readCTable() : * Loading a CTable saved with HUF_writeCTable() */ @@ -276,32 +225,12 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize); #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9)) #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) -#ifndef HUF_FORCE_DECOMPRESS_X2 -size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize); -size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); -#endif -#ifndef HUF_FORCE_DECOMPRESS_X1 -size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); -size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); -#endif - -size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); -#ifndef HUF_FORCE_DECOMPRESS_X2 -size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); -#endif -#ifndef HUF_FORCE_DECOMPRESS_X1 -size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); -#endif - /* ====================== */ /* single stream variants */ /* ====================== */ -size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); -size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */ -size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); -size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); +size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); /* HUF_compress1X_repeat() : * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. * If it uses hufTable it does not modify hufTable or repeat. 
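(A worked check of the enlarged HUF_CTABLE_WORKSPACE_SIZE_U32 in the hunk
above, using the scratch structures shown later in this patch: a huffNodeTable
of 2 * (HUF_SYMBOLVALUE_MAX + 1) = 512 nodeElt entries at 8 bytes each is
1024 U32s, and RANK_POSITION_TABLE_SIZE = 192 rankPos entries at 4 bytes each
is another 192 U32s; 1024 + 192 = 1216 = (4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192,
versus 512 U32s for the old tree-only workspace.)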
@@ -312,47 +241,28 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); - -size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ -#ifndef HUF_FORCE_DECOMPRESS_X1 -size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ -#endif - -size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); -size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); -#ifndef HUF_FORCE_DECOMPRESS_X2 -size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ -size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ -#endif -#ifndef HUF_FORCE_DECOMPRESS_X1 -size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ -size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ -#endif + HUF_CElt* hufTable, HUF_repeat* repeat, int flags); -size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /*< automatic selection of sing or double symbol decoder, based on DTable */ -#ifndef HUF_FORCE_DECOMPRESS_X2 -size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); -#endif +size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); #ifndef HUF_FORCE_DECOMPRESS_X1 -size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); /*< double-symbols decoder */ #endif /* BMI2 variants. * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. 
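 * (With the flag-based variants below, the equivalent is setting
 * HUF_flags_bmi2 in the int flags argument, e.g. flags |= HUF_flags_bmi2;
 * passing flags = 0 selects the portable default paths.)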
*/ -size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); +size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); #ifndef HUF_FORCE_DECOMPRESS_X2 -size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); +size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); #endif -size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); -size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); +size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); +size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); #ifndef HUF_FORCE_DECOMPRESS_X2 -size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); +size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); #endif #ifndef HUF_FORCE_DECOMPRESS_X1 -size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); +size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); #endif -#endif /* HUF_STATIC_LINKING_ONLY */ +#endif /* HUF_H_298734234 */ diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h index 1d9cc03924ca9a..a7231822b6e32b 100644 --- a/lib/zstd/common/mem.h +++ b/lib/zstd/common/mem.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h index 0e3b2c0a527db7..7ede8cf1ffe57c 100644 --- a/lib/zstd/common/portability_macros.h +++ b/lib/zstd/common/portability_macros.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -12,7 +13,7 @@ #define ZSTD_PORTABILITY_MACROS_H /* - * This header file contains macro defintions to support portability. + * This header file contains macro definitions to support portability. * This header is shared between C and ASM code, so it MUST only * contain macro definitions. It MUST not contain any C code. * @@ -65,7 +66,7 @@ #endif /* - * Only enable assembly for GNUC comptabile compilers, + * Only enable assembly for GNUC compatible compilers, * because other platforms may not support GAS assembly syntax. 
 *
 * Only enable assembly for Linux / MacOS, other platforms may
@@ -90,4 +91,23 @@
  */
 #define ZSTD_ENABLE_ASM_X86_64_BMI2 0
 
+/*
+ * For x86 ELF targets, add .note.gnu.property section for Intel CET in
+ * assembly sources when CET is enabled.
+ *
+ * Additionally, any function that may be called indirectly must begin
+ * with ZSTD_CET_ENDBRANCH.
+ */
+#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \
+    && defined(__has_include)
+# if __has_include(<cet.h>)
+#  include <cet.h>
+#  define ZSTD_CET_ENDBRANCH _CET_ENDBR
+# endif
+#endif
+
+#ifndef ZSTD_CET_ENDBRANCH
+# define ZSTD_CET_ENDBRANCH
+#endif
+
 #endif /* ZSTD_PORTABILITY_MACROS_H */
diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c
index 3d7e35b309b5d1..44b95b25344a1e 100644
--- a/lib/zstd/common/zstd_common.c
+++ b/lib/zstd/common/zstd_common.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,7 +15,6 @@
 * Dependencies
 ***************************************/
 #define ZSTD_DEPS_NEED_MALLOC
-#include "zstd_deps.h"   /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
 #include "error_private.h"
 #include "zstd_internal.h"
@@ -47,37 +47,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); }
 /*! ZSTD_getErrorString() :
  * provides error code string from enum */
 const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); }
-
-
-
-/*=***************************************************************
-* Custom allocator
-****************************************************************/
-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem)
-{
-    if (customMem.customAlloc)
-        return customMem.customAlloc(customMem.opaque, size);
-    return ZSTD_malloc(size);
-}
-
-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem)
-{
-    if (customMem.customAlloc) {
-        /* calloc implemented as malloc+memset;
-         * not as efficient as calloc, but next best guess for custom malloc */
-        void* const ptr = customMem.customAlloc(customMem.opaque, size);
-        ZSTD_memset(ptr, 0, size);
-        return ptr;
-    }
-    return ZSTD_calloc(1, size);
-}
-
-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem)
-{
-    if (ptr!=NULL) {
-        if (customMem.customFree)
-            customMem.customFree(customMem.opaque, ptr);
-        else
-            ZSTD_free(ptr);
-    }
-}
diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h
index 2c34e8a33a1c1b..f931f7d0e29479 100644
--- a/lib/zstd/common/zstd_deps.h
+++ b/lib/zstd/common/zstd_deps.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -105,3 +105,17 @@ static uint64_t ZSTD_div64(uint64_t dividend, uint32_t divisor) {
 #endif /* ZSTD_DEPS_IO */
 #endif /* ZSTD_DEPS_NEED_IO */
+
+/*
+ * Only requested when MSAN is enabled.
+ * Need:
+ * intptr_t
+ */
+#ifdef ZSTD_DEPS_NEED_STDINT
+#ifndef ZSTD_DEPS_STDINT
+#define ZSTD_DEPS_STDINT
+
+/* intptr_t already provided by ZSTD_DEPS_COMMON */
+
+#endif /* ZSTD_DEPS_STDINT */
+#endif /* ZSTD_DEPS_NEED_STDINT */
diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h
index 93305d9b41bba7..7f023e4d47740c 100644
--- a/lib/zstd/common/zstd_internal.h
+++ b/lib/zstd/common/zstd_internal.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -28,7 +29,6 @@
 #include <linux/zstd.h>
 #define FSE_STATIC_LINKING_ONLY
 #include "fse.h"
-#define HUF_STATIC_LINKING_ONLY
 #include "huf.h"
 #include <linux/xxhash.h>  /* XXH_reset, update, digest */
 #define ZSTD_TRACE 0
@@ -83,9 +83,9 @@ typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e;
 #define ZSTD_FRAMECHECKSUMSIZE 4
 #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */)   /* for a non-null block */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */)   /* for a non-null block */
+#define MIN_LITERALS_FOR_4_STREAMS 6
-#define HufLog 12
 typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e;
 #define LONGNBSEQ 0x7F00
@@ -93,6 +93,7 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy
 #define MINMATCH 3
 #define Litbits 8
+#define LitHufLog 11
 #define MaxLit ((1<= length) return; op += 16;
@@ -240,7 +237,6 @@
         COPY16(op, ip);
     } while (op < oend);
-#endif
     }
 }
@@ -289,11 +285,11 @@ typedef enum {
 typedef struct {
     seqDef* sequencesStart;
     seqDef* sequences;      /* ptr to end of sequences */
-    BYTE* litStart;
-    BYTE* lit;              /* ptr to end of literals */
-    BYTE* llCode;
-    BYTE* mlCode;
-    BYTE* ofCode;
+    BYTE* litStart;
+    BYTE* lit;              /* ptr to end of literals */
+    BYTE* llCode;
+    BYTE* mlCode;
+    BYTE* ofCode;
     size_t maxNbSeq;
     size_t maxNbLit;
@@ -301,8 +297,8 @@ typedef struct {
 * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment
 * the existing value of the litLength or matchLength by 0x10000.
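 * (Worked example, editorial: a 70000-byte literal run is stored as
 * 70000 - 0x10000 = 4464 and flagged via longLengthPos/longLengthType,
 * so the reader must add back exactly 0x10000. The change below in
 * ZSTD_getSequenceLength() from += 0xFFFF to += 0x10000 matters for
 * precisely this reason: adding 0xFFFF would recover 69999, one byte short.)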
*/ - ZSTD_longLengthType_e longLengthType; - U32 longLengthPos; /* Index of the sequence to apply long length modification to */ + ZSTD_longLengthType_e longLengthType; + U32 longLengthPos; /* Index of the sequence to apply long length modification to */ } seqStore_t; typedef struct { @@ -321,10 +317,10 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore seqLen.matchLength = seq->mlBase + MINMATCH; if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { if (seqStore->longLengthType == ZSTD_llt_literalLength) { - seqLen.litLength += 0xFFFF; + seqLen.litLength += 0x10000; } if (seqStore->longLengthType == ZSTD_llt_matchLength) { - seqLen.matchLength += 0xFFFF; + seqLen.matchLength += 0x10000; } } return seqLen; @@ -337,72 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` */ typedef struct { + size_t nbBlocks; size_t compressedSize; unsigned long long decompressedBound; } ZSTD_frameSizeInfo; /* decompress & legacy */ const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ -void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ - -/* custom memory allocation functions */ -void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem); -void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem); -void ZSTD_customFree(void* ptr, ZSTD_customMem customMem); - - -MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ -{ - assert(val != 0); - { -# if (__GNUC__ >= 3) /* GCC Intrinsic */ - return __builtin_clz (val) ^ 31; -# else /* Software version */ - static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; - U32 v = val; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - return DeBruijnClz[(v * 0x07C4ACDDU) >> 27]; -# endif - } -} - -/* - * Counts the number of trailing zeros of a `size_t`. - * Most compilers should support CTZ as a builtin. A backup - * implementation is provided if the builtin isn't supported, but - * it may not be terribly efficient. 
- */ -MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val) -{ - if (MEM_64bits()) { -# if (__GNUC__ >= 4) - return __builtin_ctzll((U64)val); -# else - static const int DeBruijnBytePos[64] = { 0, 1, 2, 7, 3, 13, 8, 19, - 4, 25, 14, 28, 9, 34, 20, 56, - 5, 17, 26, 54, 15, 41, 29, 43, - 10, 31, 38, 35, 21, 45, 49, 57, - 63, 6, 12, 18, 24, 27, 33, 55, - 16, 53, 40, 42, 30, 37, 44, 48, - 62, 11, 23, 32, 52, 39, 36, 47, - 61, 22, 51, 46, 60, 50, 59, 58 }; - return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; -# endif - } else { /* 32 bits */ -# if (__GNUC__ >= 3) - return __builtin_ctz((U32)val); -# else - static const int DeBruijnBytePos[32] = { 0, 1, 28, 2, 29, 14, 24, 3, - 30, 22, 20, 15, 25, 17, 4, 8, - 31, 27, 13, 23, 21, 19, 16, 7, - 26, 12, 18, 6, 11, 5, 10, 9 }; - return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; -# endif - } -} +int ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ /* ZSTD_invalidateRepCodes() : diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h index d9a76112ec3afe..6ab8be6532efc0 100644 --- a/lib/zstd/compress/clevels.h +++ b/lib/zstd/compress/clevels.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c index ec5b1ca6d71af6..e46ca6621b488d 100644 --- a/lib/zstd/compress/fse_compress.c +++ b/lib/zstd/compress/fse_compress.c @@ -1,6 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* ****************************************************************** * FSE : Finite State Entropy encoder - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. 
* * You can contact the author at : * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy @@ -26,6 +27,7 @@ #define ZSTD_DEPS_NEED_MALLOC #define ZSTD_DEPS_NEED_MATH64 #include "../common/zstd_deps.h" /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */ +#include "../common/bits.h" /* ZSTD_highbit32 */ /* ************************************************************** @@ -90,7 +92,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, assert(tableLog < 16); /* required for threshold strategy to work */ /* For explanations on how to distribute symbol values over the table : - * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ #ifdef __clang_analyzer__ ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */ @@ -191,7 +193,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, break; default : assert(normalizedCounter[s] > 1); - { U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1); + { U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1); U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut; symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]); @@ -342,21 +344,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, * FSE Compression Code ****************************************************************/ -FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog) -{ - size_t size; - if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; - size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32); - return (FSE_CTable*)ZSTD_malloc(size); -} - -void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); } - /* provides the minimum logSize to safely represent a distribution */ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) { - U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1; - U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; + U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1; + U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2; U32 minBits = minBitsSrc < minBitsSymbols ? 
minBitsSrc : minBitsSymbols; assert(srcSize > 1); /* Not supported, RLE should be used instead */ return minBits; @@ -364,7 +356,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) { - U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus; + U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus; U32 tableLog = maxTableLog; U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); assert(srcSize > 1); /* Not supported, RLE should be used instead */ @@ -532,40 +524,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, return tableLog; } - -/* fake FSE_CTable, for raw (uncompressed) input */ -size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits) -{ - const unsigned tableSize = 1 << nbBits; - const unsigned tableMask = tableSize - 1; - const unsigned maxSymbolValue = tableMask; - void* const ptr = ct; - U16* const tableU16 = ( (U16*) ptr) + 2; - void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1); /* assumption : tableLog >= 1 */ - FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); - unsigned s; - - /* Sanity checks */ - if (nbBits < 1) return ERROR(GENERIC); /* min size */ - - /* header */ - tableU16[-2] = (U16) nbBits; - tableU16[-1] = (U16) maxSymbolValue; - - /* Build table */ - for (s=0; s= 2 + +static size_t showU32(const U32* arr, size_t size) +{ + size_t u; + for (u=0; u= sizeof(HUF_WriteCTableWksp)); + /* check conditions */ if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); @@ -204,16 +264,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, return ((maxSymbolValue+1)/2) + 1; } -/*! HUF_writeCTable() : - `CTable` : Huffman tree to save, using huf representation. - @return : size of saved CTable */ -size_t HUF_writeCTable (void* dst, size_t maxDstSize, - const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog) -{ - HUF_WriteCTableWksp wksp; - return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp)); -} - size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights) { @@ -269,68 +319,64 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue) { - const HUF_CElt* ct = CTable + 1; + const HUF_CElt* const ct = CTable + 1; assert(symbolValue <= HUF_SYMBOLVALUE_MAX); return (U32)HUF_getNbBits(ct[symbolValue]); } -typedef struct nodeElt_s { - U32 count; - U16 parent; - BYTE byte; - BYTE nbBits; -} nodeElt; - /* * HUF_setMaxHeight(): - * Enforces maxNbBits on the Huffman tree described in huffNode. + * Try to enforce @targetNbBits on the Huffman tree described in @huffNode. * - * It sets all nodes with nbBits > maxNbBits to be maxNbBits. Then it adjusts - * the tree to so that it is a valid canonical Huffman tree. + * It attempts to convert all nodes with nbBits > @targetNbBits + * to employ @targetNbBits instead. Then it adjusts the tree + * so that it remains a valid canonical Huffman tree. * * @pre The sum of the ranks of each symbol == 2^largestBits, * where largestBits == huffNode[lastNonNull].nbBits. * @post The sum of the ranks of each symbol == 2^largestBits, - * where largestBits is the return value <= maxNbBits. 
+ * where largestBits is the return value (expected <= targetNbBits). * - * @param huffNode The Huffman tree modified in place to enforce maxNbBits. + * @param huffNode The Huffman tree modified in place to enforce targetNbBits. + * It's presumed sorted, from most frequent to rarest symbol. * @param lastNonNull The symbol with the lowest count in the Huffman tree. - * @param maxNbBits The maximum allowed number of bits, which the Huffman tree + * @param targetNbBits The allowed number of bits, which the Huffman tree * may not respect. After this function the Huffman tree will - * respect maxNbBits. - * @return The maximum number of bits of the Huffman tree after adjustment, - * necessarily no more than maxNbBits. + * respect targetNbBits. + * @return The maximum number of bits of the Huffman tree after adjustment. */ -static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) +static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits) { const U32 largestBits = huffNode[lastNonNull].nbBits; - /* early exit : no elt > maxNbBits, so the tree is already valid. */ - if (largestBits <= maxNbBits) return largestBits; + /* early exit : no elt > targetNbBits, so the tree is already valid. */ + if (largestBits <= targetNbBits) return largestBits; + + DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits); /* there are several too large elements (at least >= 2) */ { int totalCost = 0; - const U32 baseCost = 1 << (largestBits - maxNbBits); + const U32 baseCost = 1 << (largestBits - targetNbBits); int n = (int)lastNonNull; - /* Adjust any ranks > maxNbBits to maxNbBits. + /* Adjust any ranks > targetNbBits to targetNbBits. * Compute totalCost, which is how far the sum of the ranks is * we are over 2^largestBits after adjust the offending ranks. */ - while (huffNode[n].nbBits > maxNbBits) { + while (huffNode[n].nbBits > targetNbBits) { totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); - huffNode[n].nbBits = (BYTE)maxNbBits; + huffNode[n].nbBits = (BYTE)targetNbBits; n--; } - /* n stops at huffNode[n].nbBits <= maxNbBits */ - assert(huffNode[n].nbBits <= maxNbBits); - /* n end at index of smallest symbol using < maxNbBits */ - while (huffNode[n].nbBits == maxNbBits) --n; + /* n stops at huffNode[n].nbBits <= targetNbBits */ + assert(huffNode[n].nbBits <= targetNbBits); + /* n end at index of smallest symbol using < targetNbBits */ + while (huffNode[n].nbBits == targetNbBits) --n; - /* renorm totalCost from 2^largestBits to 2^maxNbBits + /* renorm totalCost from 2^largestBits to 2^targetNbBits * note : totalCost is necessarily a multiple of baseCost */ - assert((totalCost & (baseCost - 1)) == 0); - totalCost >>= (largestBits - maxNbBits); + assert(((U32)totalCost & (baseCost - 1)) == 0); + totalCost >>= (largestBits - targetNbBits); assert(totalCost > 0); /* repay normalized cost */ @@ -339,19 +385,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) /* Get pos of last (smallest = lowest cum. 
count) symbol per rank */ ZSTD_memset(rankLast, 0xF0, sizeof(rankLast)); - { U32 currentNbBits = maxNbBits; + { U32 currentNbBits = targetNbBits; int pos; for (pos=n ; pos >= 0; pos--) { if (huffNode[pos].nbBits >= currentNbBits) continue; - currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */ - rankLast[maxNbBits-currentNbBits] = (U32)pos; + currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */ + rankLast[targetNbBits-currentNbBits] = (U32)pos; } } while (totalCost > 0) { /* Try to reduce the next power of 2 above totalCost because we * gain back half the rank. */ - U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1; + U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1; for ( ; nBitsToDecrease > 1; nBitsToDecrease--) { U32 const highPos = rankLast[nBitsToDecrease]; U32 const lowPos = rankLast[nBitsToDecrease-1]; @@ -391,7 +437,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) rankLast[nBitsToDecrease] = noSymbol; else { rankLast[nBitsToDecrease]--; - if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease) + if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease) rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ } } /* while (totalCost > 0) */ @@ -403,11 +449,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) * TODO. */ while (totalCost < 0) { /* Sometimes, cost correction overshoot */ - /* special case : no rank 1 symbol (using maxNbBits-1); - * let's create one from largest rank 0 (using maxNbBits). + /* special case : no rank 1 symbol (using targetNbBits-1); + * let's create one from largest rank 0 (using targetNbBits). */ if (rankLast[1] == noSymbol) { - while (huffNode[n].nbBits == maxNbBits) n--; + while (huffNode[n].nbBits == targetNbBits) n--; huffNode[n+1].nbBits--; assert(n >= 0); rankLast[1] = (U32)(n+1); @@ -421,7 +467,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) } /* repay normalized cost */ } /* there are several too large elements (at least >= 2) */ - return maxNbBits; + return targetNbBits; } typedef struct { @@ -429,7 +475,7 @@ typedef struct { U16 curr; } rankPos; -typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; +typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)]; /* Number of buckets available for HUF_sort() */ #define RANK_POSITION_TABLE_SIZE 192 @@ -448,8 +494,8 @@ typedef struct { * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. */ #define RANK_POSITION_MAX_COUNT_LOG 32 -#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */ -#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */ +#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */) +#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */) /* Return the appropriate bucket index for a given count. See definition of * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. @@ -457,7 +503,7 @@ typedef struct { static U32 HUF_getIndex(U32 const count) { return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) ? 
count - : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; + : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; } /* Helper swap function for HUF_quickSortPartition() */ @@ -580,7 +626,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy /* Sort each bucket. */ for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { - U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base; + int const bucketSize = rankPosition[n].curr - rankPosition[n].base; U32 const bucketStartIdx = rankPosition[n].base; if (bucketSize > 1) { assert(bucketStartIdx < maxSymbolValue1); @@ -591,6 +637,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy assert(HUF_isSorted(huffNode, maxSymbolValue1)); } + /* HUF_buildCTable_wksp() : * Same as HUF_buildCTable(), but using externally allocated scratch buffer. * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables). @@ -611,6 +658,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) int lowS, lowN; int nodeNb = STARTNODE; int n, nodeRoot; + DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1); /* init for parents */ nonNullRank = (int)maxSymbolValue; while(huffNode[nonNullRank].count == 0) nonNullRank--; @@ -637,6 +685,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) for (n=0; n<=nonNullRank; n++) huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1)); + return nonNullRank; } @@ -674,28 +724,36 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i CTable[0] = maxNbBits; } -size_t HUF_buildCTable_wksp (HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize) +size_t +HUF_buildCTable_wksp(HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, + void* workSpace, size_t wkspSize) { - HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32)); + HUF_buildCTable_wksp_tables* const wksp_tables = + (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32)); nodeElt* const huffNode0 = wksp_tables->huffNodeTbl; nodeElt* const huffNode = huffNode0+1; int nonNullRank; + HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables)); + + DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1); + /* safety checks */ if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) - return ERROR(workSpace_tooSmall); + return ERROR(workSpace_tooSmall); if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) - return ERROR(maxSymbolValue_tooLarge); + return ERROR(maxSymbolValue_tooLarge); ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable)); /* sort, decreasing order */ HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition); + DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1)); /* build tree */ nonNullRank = HUF_buildTree(huffNode, maxSymbolValue); - /* enforce maxTableLog */ + /* determine and enforce maxTableLog */ maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits); if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ @@ -804,7 +862,7 @@ 
FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id #if DEBUGLEVEL >= 1 { size_t const nbBits = HUF_getNbBits(elt); - size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1; + size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1; (void)dirtyBits; /* Middle bits are 0. */ assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); @@ -884,7 +942,7 @@ static size_t HUF_closeCStream(HUF_CStream_t* bitC) { size_t const nbBits = bitC->bitPos[0] & 0xFF; if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ - return (bitC->ptr - bitC->startPtr) + (nbBits > 0); + return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0); } } @@ -1045,9 +1103,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, static size_t HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, const void* src, size_t srcSize, - const HUF_CElt* CTable, const int bmi2) + const HUF_CElt* CTable, const int flags) { - if (bmi2) { + if (flags & HUF_flags_bmi2) { return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); } return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); @@ -1058,28 +1116,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, static size_t HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, const void* src, size_t srcSize, - const HUF_CElt* CTable, const int bmi2) + const HUF_CElt* CTable, const int flags) { - (void)bmi2; + (void)flags; return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); } #endif -size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) -{ - return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); -} - -size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) { - return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); + return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); } static size_t HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, const void* src, size_t srcSize, - const HUF_CElt* CTable, int bmi2) + const HUF_CElt* CTable, int flags) { size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ const BYTE* ip = (const BYTE*) src; @@ -1093,7 +1146,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, op += 6; /* jumpTable */ assert(op <= oend); - { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); if (cSize == 0 || cSize > 65535) return 0; MEM_writeLE16(ostart, (U16)cSize); op += cSize; @@ -1101,7 +1154,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, ip += segmentSize; assert(op <= oend); - { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); if (cSize == 0 || cSize > 65535) return 0; MEM_writeLE16(ostart+2, (U16)cSize); op += cSize; @@ -1109,7 +1162,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, 
ip += segmentSize; assert(op <= oend); - { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); if (cSize == 0 || cSize > 65535) return 0; MEM_writeLE16(ostart+4, (U16)cSize); op += cSize; @@ -1118,7 +1171,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, ip += segmentSize; assert(op <= oend); assert(ip <= iend); - { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) ); if (cSize == 0 || cSize > 65535) return 0; op += cSize; } @@ -1126,14 +1179,9 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, return (size_t)(op-ostart); } -size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) { - return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); -} - -size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) -{ - return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); + return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); } typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; @@ -1141,11 +1189,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; static size_t HUF_compressCTable_internal( BYTE* const ostart, BYTE* op, BYTE* const oend, const void* src, size_t srcSize, - HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2) + HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags) { size_t const cSize = (nbStreams==HUF_singleStream) ? 
- HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) : - HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2); + HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) : + HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags); if (HUF_isError(cSize)) { return cSize; } if (cSize==0) { return 0; } /* uncompressible */ op += cSize; @@ -1168,6 +1216,79 @@ typedef struct { #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ +unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue) +{ + unsigned cardinality = 0; + unsigned i; + + for (i = 0; i < maxSymbolValue + 1; i++) { + if (count[i] != 0) cardinality += 1; + } + + return cardinality; +} + +unsigned HUF_minTableLog(unsigned symbolCardinality) +{ + U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1; + return minBitsSymbols; +} + +unsigned HUF_optimalTableLog( + unsigned maxTableLog, + size_t srcSize, + unsigned maxSymbolValue, + void* workSpace, size_t wkspSize, + HUF_CElt* table, + const unsigned* count, + int flags) +{ + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables)); + + if (!(flags & HUF_flags_optimalDepth)) { + /* cheap evaluation, based on FSE */ + return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); + } + + { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp); + size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp); + size_t maxBits, hSize, newSize; + const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue); + const unsigned minTableLog = HUF_minTableLog(symbolCardinality); + size_t optSize = ((size_t) ~0) - 1; + unsigned optLog = maxTableLog, optLogGuess; + + DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize); + + /* Search until size increases */ + for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) { + DEBUGLOG(7, "checking for huffLog=%u", optLogGuess); + maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize); + if (ERR_isError(maxBits)) continue; + + if (maxBits < optLogGuess && optLogGuess > minTableLog) break; + + hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize); + + if (ERR_isError(hSize)) continue; + + newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize; + + if (newSize > optSize + 1) { + break; + } + + if (newSize < optSize) { + optSize = newSize; + optLog = optLogGuess; + } + } + assert(optLog <= HUF_TABLELOG_MAX); + return optLog; + } +} + /* HUF_compress_internal() : * `workSpace_align4` must be aligned on 4-bytes boundaries, * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ @@ -1177,14 +1298,14 @@ HUF_compress_internal (void* dst, size_t dstSize, unsigned maxSymbolValue, unsigned huffLog, HUF_nbStreams_e nbStreams, void* workSpace, size_t wkspSize, - HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, - const int bmi2, unsigned suspectUncompressible) + HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags) { HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + dstSize; BYTE* op = ostart; + DEBUGLOG(5, "HUF_compress_internal 
(srcSize=%zu)", srcSize); HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); /* checks & inits */ @@ -1198,16 +1319,17 @@ HUF_compress_internal (void* dst, size_t dstSize, if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; /* Heuristic : If old table is valid, use it for small inputs */ - if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { + if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) { return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, - nbStreams, oldHufTable, bmi2); + nbStreams, oldHufTable, flags); } /* If uncompressible data is suspected, do a smaller sampling first */ DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); - if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { size_t largestTotal = 0; + DEBUGLOG(5, "input suspected incompressible : sampling to check"); { unsigned maxSymbolValueBegin = maxSymbolValue; CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); largestTotal += largestBegin; @@ -1224,6 +1346,7 @@ HUF_compress_internal (void* dst, size_t dstSize, if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ } + DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1)); /* Check validity of previous table */ if ( repeat @@ -1232,19 +1355,20 @@ HUF_compress_internal (void* dst, size_t dstSize, *repeat = HUF_repeat_none; } /* Heuristic : use existing table for small inputs */ - if (preferRepeat && repeat && *repeat != HUF_repeat_none) { + if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) { return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, - nbStreams, oldHufTable, bmi2); + nbStreams, oldHufTable, flags); } /* Build Huffman Tree */ - huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); + huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags); { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, maxSymbolValue, huffLog, &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); CHECK_F(maxBits); huffLog = (U32)maxBits; + DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1)); } /* Zero unused symbols in CTable, so we can check it for validity */ { @@ -1263,7 +1387,7 @@ HUF_compress_internal (void* dst, size_t dstSize, if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, - nbStreams, oldHufTable, bmi2); + nbStreams, oldHufTable, flags); } } /* Use the new huffman table */ @@ -1275,46 +1399,20 @@ HUF_compress_internal (void* dst, size_t dstSize, } return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, - nbStreams, table->CTable, bmi2); -} - - -size_t HUF_compress1X_wksp (void* dst, size_t dstSize, - const void* src, size_t srcSize, - unsigned maxSymbolValue, unsigned huffLog, - void* workSpace, size_t wkspSize) -{ - return HUF_compress_internal(dst, dstSize, src, srcSize, - maxSymbolValue, huffLog, HUF_singleStream, - 
workSpace, wkspSize, - NULL, NULL, 0, 0 /*bmi2*/, 0); + nbStreams, table->CTable, flags); } size_t HUF_compress1X_repeat (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize, - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, - int bmi2, unsigned suspectUncompressible) + HUF_CElt* hufTable, HUF_repeat* repeat, int flags) { + DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize); return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, HUF_singleStream, workSpace, wkspSize, hufTable, - repeat, preferRepeat, bmi2, suspectUncompressible); -} - -/* HUF_compress4X_repeat(): - * compress input using 4 streams. - * provide workspace to generate compression tables */ -size_t HUF_compress4X_wksp (void* dst, size_t dstSize, - const void* src, size_t srcSize, - unsigned maxSymbolValue, unsigned huffLog, - void* workSpace, size_t wkspSize) -{ - return HUF_compress_internal(dst, dstSize, src, srcSize, - maxSymbolValue, huffLog, HUF_fourStreams, - workSpace, wkspSize, - NULL, NULL, 0, 0 /*bmi2*/, 0); + repeat, flags); } /* HUF_compress4X_repeat(): @@ -1325,11 +1423,11 @@ size_t HUF_compress4X_repeat (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize, - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible) + HUF_CElt* hufTable, HUF_repeat* repeat, int flags) { + DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize); return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, HUF_fourStreams, workSpace, wkspSize, - hufTable, repeat, preferRepeat, bmi2, suspectUncompressible); + hufTable, repeat, flags); } - diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c index f620cafca633ba..c1c316e9e289f7 100644 --- a/lib/zstd/compress/zstd_compress.c +++ b/lib/zstd/compress/zstd_compress.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -11,12 +12,12 @@ /*-************************************* * Dependencies ***************************************/ +#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ #include "../common/mem.h" #include "hist.h" /* HIST_countFast_wksp */ #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ #include "../common/fse.h" -#define HUF_STATIC_LINKING_ONLY #include "../common/huf.h" #include "zstd_compress_internal.h" #include "zstd_compress_sequences.h" @@ -27,6 +28,7 @@ #include "zstd_opt.h" #include "zstd_ldm.h" #include "zstd_compress_superblock.h" +#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */ /* *************************************************************** * Tuning parameters @@ -55,14 +57,17 @@ * Helper functions ***************************************/ /* ZSTD_compressBound() - * Note that the result from this function is only compatible with the "normal" - * full-block strategy. - * When there are a lot of small blocks due to frequent flush in streaming mode - * the overhead of headers can make the compressed data to be larger than the - * return value of ZSTD_compressBound(). 
+ * Note that the result from this function is only valid for + * the one-pass compression functions. + * When employing the streaming mode, + * if flushes are frequently altering the size of blocks, + * the overhead from block headers can make the compressed data larger + * than the return value of ZSTD_compressBound(). */ size_t ZSTD_compressBound(size_t srcSize) { - return ZSTD_COMPRESSBOUND(srcSize); + size_t const r = ZSTD_COMPRESSBOUND(srcSize); + if (r==0) return ERROR(srcSize_wrong); + return r; } @@ -171,12 +176,9 @@ size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) if (cctx==NULL) return 0; /* support free on NULL */ RETURN_ERROR_IF(cctx->staticSize, memory_allocation, "not compatible with static CCtx"); - { - int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); + { int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); ZSTD_freeCCtxContent(cctx); - if (!cctxInWorkspace) { - ZSTD_customFree(cctx, cctx->customMem); - } + if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem); } return 0; } @@ -257,9 +259,9 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); } -/* Returns 1 if compression parameters are such that we should +/* Returns ZSTD_ps_enable if compression parameters are such that we should * enable long distance matching (wlog >= 27, strategy >= btopt). - * Returns 0 otherwise. + * Returns ZSTD_ps_disable otherwise. */ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, const ZSTD_compressionParameters* const cParams) { @@ -267,6 +269,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable; } +static int ZSTD_resolveExternalSequenceValidation(int mode) { + return mode; +} + +/* Resolves maxBlockSize to the default if no value is present. */ +static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) { + if (maxBlockSize == 0) { + return ZSTD_BLOCKSIZE_MAX; + } else { + return maxBlockSize; + } +} + +static ZSTD_paramSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_paramSwitch_e value, int cLevel) { + if (value != ZSTD_ps_auto) return value; + if (cLevel < 10) { + return ZSTD_ps_disable; + } else { + return ZSTD_ps_enable; + } +} + +/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged. + * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. 
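+ * For illustration, assuming the short-cache layout described in zstd_compress_internal.h, + * an entry packs the match index and a small hash tag together: + *   packed = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag; + *   index  = packed >> ZSTD_SHORT_CACHE_TAG_BITS; + * so the raw index is only usable again once the tag bits are shifted out.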
*/ +static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) { + return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast; +} + static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( ZSTD_compressionParameters cParams) { @@ -284,6 +314,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( } cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); + cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences); + cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize); + cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes, + cctxParams.compressionLevel); assert(!ZSTD_checkCParams(cParams)); return cctxParams; } @@ -329,10 +363,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) #define ZSTD_NO_CLEVEL 0 /* - * Initializes the cctxParams from params and compressionLevel. + * Initializes `cctxParams` from `params` and `compressionLevel`. * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL. */ -static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel) +static void +ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, + const ZSTD_parameters* params, + int compressionLevel) { assert(!ZSTD_checkCParams(params->cParams)); ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); @@ -345,6 +382,9 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, &params->cParams); cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, &params->cParams); cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, &params->cParams); + cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences); + cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize); + cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel); DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm); } @@ -359,7 +399,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete /* * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone. - * @param param Validated zstd parameters. + * @param params Validated zstd parameters.
*/ static void ZSTD_CCtxParams_setZstdParams( ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) @@ -455,8 +495,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) return bounds; case ZSTD_c_enableLongDistanceMatching: - bounds.lowerBound = 0; - bounds.upperBound = 1; + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; return bounds; case ZSTD_c_ldmHashLog: @@ -549,6 +589,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) bounds.upperBound = 1; return bounds; + case ZSTD_c_prefetchCDictTables: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_enableSeqProducerFallback: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_maxBlockSize: + bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; + bounds.upperBound = ZSTD_BLOCKSIZE_MAX; + return bounds; + + case ZSTD_c_searchForExternalRepcodes: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + default: bounds.error = ERROR(parameter_unsupported); return bounds; @@ -613,6 +673,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) case ZSTD_c_useBlockSplitter: case ZSTD_c_useRowMatchFinder: case ZSTD_c_deterministicRefPrefix: + case ZSTD_c_prefetchCDictTables: + case ZSTD_c_enableSeqProducerFallback: + case ZSTD_c_maxBlockSize: + case ZSTD_c_searchForExternalRepcodes: default: return 0; } @@ -625,7 +689,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) if (ZSTD_isUpdateAuthorized(param)) { cctx->cParamsChanged = 1; } else { - RETURN_ERROR(stage_wrong, "can only set params in ctx init stage"); + RETURN_ERROR(stage_wrong, "can only set params in cctx init stage"); } } switch(param) @@ -668,6 +732,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) case ZSTD_c_useBlockSplitter: case ZSTD_c_useRowMatchFinder: case ZSTD_c_deterministicRefPrefix: + case ZSTD_c_prefetchCDictTables: + case ZSTD_c_enableSeqProducerFallback: + case ZSTD_c_maxBlockSize: + case ZSTD_c_searchForExternalRepcodes: break; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); @@ -723,12 +791,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, case ZSTD_c_minMatch : if (value!=0) /* 0 => use default */ BOUNDCHECK(ZSTD_c_minMatch, value); - CCtxParams->cParams.minMatch = value; + CCtxParams->cParams.minMatch = (U32)value; return CCtxParams->cParams.minMatch; case ZSTD_c_targetLength : BOUNDCHECK(ZSTD_c_targetLength, value); - CCtxParams->cParams.targetLength = value; + CCtxParams->cParams.targetLength = (U32)value; return CCtxParams->cParams.targetLength; case ZSTD_c_strategy : @@ -741,12 +809,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, /* Content size written in frame header _when known_ (default:1) */ DEBUGLOG(4, "set content size flag = %u", (value!=0)); CCtxParams->fParams.contentSizeFlag = value != 0; - return CCtxParams->fParams.contentSizeFlag; + return (size_t)CCtxParams->fParams.contentSizeFlag; case ZSTD_c_checksumFlag : /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ CCtxParams->fParams.checksumFlag = value != 0; - return CCtxParams->fParams.checksumFlag; + return (size_t)CCtxParams->fParams.checksumFlag; case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); @@ -755,18 +823,18 @@ 
size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, case ZSTD_c_forceMaxWindow : CCtxParams->forceWindow = (value != 0); - return CCtxParams->forceWindow; + return (size_t)CCtxParams->forceWindow; case ZSTD_c_forceAttachDict : { const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; - BOUNDCHECK(ZSTD_c_forceAttachDict, pref); + BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref); CCtxParams->attachDictPref = pref; return CCtxParams->attachDictPref; } case ZSTD_c_literalCompressionMode : { const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value; - BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); + BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm); CCtxParams->literalCompressionMode = lcm; return CCtxParams->literalCompressionMode; } @@ -789,47 +857,48 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, case ZSTD_c_enableDedicatedDictSearch : CCtxParams->enableDedicatedDictSearch = (value!=0); - return CCtxParams->enableDedicatedDictSearch; + return (size_t)CCtxParams->enableDedicatedDictSearch; case ZSTD_c_enableLongDistanceMatching : + BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value); CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value; return CCtxParams->ldmParams.enableLdm; case ZSTD_c_ldmHashLog : if (value!=0) /* 0 ==> auto */ BOUNDCHECK(ZSTD_c_ldmHashLog, value); - CCtxParams->ldmParams.hashLog = value; + CCtxParams->ldmParams.hashLog = (U32)value; return CCtxParams->ldmParams.hashLog; case ZSTD_c_ldmMinMatch : if (value!=0) /* 0 ==> default */ BOUNDCHECK(ZSTD_c_ldmMinMatch, value); - CCtxParams->ldmParams.minMatchLength = value; + CCtxParams->ldmParams.minMatchLength = (U32)value; return CCtxParams->ldmParams.minMatchLength; case ZSTD_c_ldmBucketSizeLog : if (value!=0) /* 0 ==> default */ BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); - CCtxParams->ldmParams.bucketSizeLog = value; + CCtxParams->ldmParams.bucketSizeLog = (U32)value; return CCtxParams->ldmParams.bucketSizeLog; case ZSTD_c_ldmHashRateLog : if (value!=0) /* 0 ==> default */ BOUNDCHECK(ZSTD_c_ldmHashRateLog, value); - CCtxParams->ldmParams.hashRateLog = value; + CCtxParams->ldmParams.hashRateLog = (U32)value; return CCtxParams->ldmParams.hashRateLog; case ZSTD_c_targetCBlockSize : if (value!=0) /* 0 ==> default */ BOUNDCHECK(ZSTD_c_targetCBlockSize, value); - CCtxParams->targetCBlockSize = value; + CCtxParams->targetCBlockSize = (U32)value; return CCtxParams->targetCBlockSize; case ZSTD_c_srcSizeHint : if (value!=0) /* 0 ==> default */ BOUNDCHECK(ZSTD_c_srcSizeHint, value); CCtxParams->srcSizeHint = value; - return CCtxParams->srcSizeHint; + return (size_t)CCtxParams->srcSizeHint; case ZSTD_c_stableInBuffer: BOUNDCHECK(ZSTD_c_stableInBuffer, value); @@ -866,6 +935,27 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, CCtxParams->deterministicRefPrefix = !!value; return CCtxParams->deterministicRefPrefix; + case ZSTD_c_prefetchCDictTables: + BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); + CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value; + return CCtxParams->prefetchCDictTables; + + case ZSTD_c_enableSeqProducerFallback: + BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value); + CCtxParams->enableMatchFinderFallback = value; + return CCtxParams->enableMatchFinderFallback; + + case ZSTD_c_maxBlockSize: + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_maxBlockSize, value); + CCtxParams->maxBlockSize = value; + return CCtxParams->maxBlockSize; + + case ZSTD_c_searchForExternalRepcodes: + BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value); + 
CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value; + return CCtxParams->searchForExternalRepcodes; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } } @@ -980,6 +1070,18 @@ size_t ZSTD_CCtxParams_getParameter( case ZSTD_c_deterministicRefPrefix: *value = (int)CCtxParams->deterministicRefPrefix; break; + case ZSTD_c_prefetchCDictTables: + *value = (int)CCtxParams->prefetchCDictTables; + break; + case ZSTD_c_enableSeqProducerFallback: + *value = CCtxParams->enableMatchFinderFallback; + break; + case ZSTD_c_maxBlockSize: + *value = (int)CCtxParams->maxBlockSize; + break; + case ZSTD_c_searchForExternalRepcodes: + *value = (int)CCtxParams->searchForExternalRepcodes; + break; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } return 0; @@ -1006,9 +1108,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams( return 0; } +size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) +{ + ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */); + DEBUGLOG(4, "ZSTD_CCtx_setCParams"); + /* only update if all parameters are valid */ + FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, cparams.windowLog), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, cparams.chainLog), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, cparams.hashLog), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, cparams.searchLog), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, cparams.minMatch), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, cparams.targetLength), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, cparams.strategy), ""); + return 0; +} + +size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams) +{ + ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */); + DEBUGLOG(4, "ZSTD_CCtx_setFParams"); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), ""); + return 0; +} + +size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params) +{ + DEBUGLOG(4, "ZSTD_CCtx_setParams"); + /* First check cParams, because we want to update all or none. */ + FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); + /* Next set fParams, because this could fail if the cctx isn't in init stage. */ + FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), ""); + /* Finally set cParams, which should succeed. */ + FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), ""); + return 0; +} + size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) { - DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); + DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize); RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, "Can't set pledgedSrcSize when not in init stage."); cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; @@ -1024,9 +1164,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams( ZSTD_compressionParameters* cParams); /* - * Initializes the local dict using the requested parameters. 
- * NOTE: This does not use the pledged src size, because it may be used for more - * than one compression. + * Initializes the local dictionary using requested parameters. + * NOTE: Initialization does not employ the pledged src size, + * because the dictionary may be used for multiple compressions. */ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) { @@ -1039,8 +1179,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) return 0; } if (dl->cdict != NULL) { - assert(cctx->cdict == dl->cdict); /* Local dictionary already initialized. */ + assert(cctx->cdict == dl->cdict); return 0; } assert(dl->dictSize > 0); @@ -1060,26 +1200,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) } size_t ZSTD_CCtx_loadDictionary_advanced( - ZSTD_CCtx* cctx, const void* dict, size_t dictSize, - ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) + ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType) { - RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, - "Can't load a dictionary when ctx is not in init stage."); DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize); - ZSTD_clearAllDicts(cctx); /* in case one already exists */ - if (dict == NULL || dictSize == 0) /* no dictionary mode */ + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't load a dictionary when cctx is not in init stage."); + ZSTD_clearAllDicts(cctx); /* erase any previously set dictionary */ + if (dict == NULL || dictSize == 0) /* no dictionary */ return 0; if (dictLoadMethod == ZSTD_dlm_byRef) { cctx->localDict.dict = dict; } else { + /* copy dictionary content inside CCtx to own its lifetime */ void* dictBuffer; RETURN_ERROR_IF(cctx->staticSize, memory_allocation, - "no malloc for static CCtx"); + "static CCtx can't allocate for an internal copy of dictionary"); dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem); - RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!"); + RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation, + "allocation failed for dictionary content"); ZSTD_memcpy(dictBuffer, dict, dictSize); - cctx->localDict.dictBuffer = dictBuffer; - cctx->localDict.dict = dictBuffer; + cctx->localDict.dictBuffer = dictBuffer; /* owned ptr to free */ + cctx->localDict.dict = dictBuffer; /* read-only reference */ } cctx->localDict.dictSize = dictSize; cctx->localDict.dictContentType = dictContentType; @@ -1149,8 +1293,9 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) if ( (reset == ZSTD_reset_parameters) || (reset == ZSTD_reset_session_and_parameters) ) { RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, - "Can't reset parameters only when not in init stage."); + "Reset parameters is only possible during init stage."); ZSTD_clearAllDicts(cctx); + ZSTD_memset(&cctx->externalMatchCtx, 0, sizeof(cctx->externalMatchCtx)); return ZSTD_CCtxParams_reset(&cctx->requestedParams); } return 0; @@ -1247,7 +1392,8 @@ static ZSTD_compressionParameters ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize, - ZSTD_cParamMode_e mode) + ZSTD_cParamMode_e mode, + ZSTD_paramSwitch_e useRowMatchFinder) { const U64 minSrcSize = 513; /* (1<<9) + 1 */ const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); @@ -1281,8 +1427,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, } /* resize windowLog if input is small enough, to use less memory */ - if ( (srcSize < maxWindowResize) - 
&& (dictSize < maxWindowResize) ) { + if ( (srcSize <= maxWindowResize) + && (dictSize <= maxWindowResize) ) { U32 const tSize = (U32)(srcSize + dictSize); static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN : @@ -1300,6 +1446,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ + /* We can't use more than 32 bits of hash in total, so that means that we require: + * (hashLog + 8) <= 32 && (chainLog + 8) <= 32 + */ + if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) { + U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS; + if (cPar.hashLog > maxShortCacheHashLog) { + cPar.hashLog = maxShortCacheHashLog; + } + if (cPar.chainLog > maxShortCacheHashLog) { + cPar.chainLog = maxShortCacheHashLog; + } + } + + + /* At this point, we aren't 100% sure if we are using the row match finder. + * Unless it is explicitly disabled, conservatively assume that it is enabled. + * In this case it will only be disabled for small sources, so shrinking the + * hash log a little bit shouldn't result in any ratio loss. + */ + if (useRowMatchFinder == ZSTD_ps_auto) + useRowMatchFinder = ZSTD_ps_enable; + + /* We can't hash more than 32-bits in total. So that means that we require: + * (hashLog - rowLog + 8) <= 32 + */ + if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) { + /* Switch to 32-entry rows if searchLog is 5 (or more) */ + U32 const rowLog = BOUNDED(4, cPar.searchLog, 6); + U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS; + U32 const maxHashLog = maxRowHashLog + rowLog; + assert(cPar.hashLog >= rowLog); + if (cPar.hashLog > maxHashLog) { + cPar.hashLog = maxHashLog; + } + } + return cPar; } @@ -1310,7 +1492,7 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, { cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; - return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown); + return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto); } static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); @@ -1341,7 +1523,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); assert(!ZSTD_checkCParams(cParams)); /* srcSizeHint == 0 means 0 */ - return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode); + return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder); } static size_t @@ -1370,7 +1552,7 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)) + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder) - ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16)) + ? ZSTD_cwksp_aligned_alloc_size(hSize) : 0; size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) ? 
optPotentialSpace @@ -1386,6 +1568,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; } +/* Helper function for calculating memory requirements. + * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */ +static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) { + U32 const divider = (minMatch==3 || useSequenceProducer) ? 3 : 4; + return blockSize / divider; +} + static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( const ZSTD_compressionParameters* cParams, const ldmParams_t* ldmParams, @@ -1393,12 +1582,13 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( const ZSTD_paramSwitch_e useRowMatchFinder, const size_t buffInSize, const size_t buffOutSize, - const U64 pledgedSrcSize) + const U64 pledgedSrcSize, + int useSequenceProducer, + size_t maxBlockSize) { size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); - size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); - U32 const divider = (cParams->minMatch==3) ? 3 : 4; - size_t const maxNbSeq = blockSize / divider; + size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); + size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); @@ -1417,6 +1607,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); + size_t const externalSeqSpace = useSequenceProducer + ? ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) + : 0; + size_t const neededSpace = cctxSpace + entropySpace + @@ -1425,7 +1620,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( ldmSeqSpace + matchStateSize + tokenSpace + - bufferSpace; + bufferSpace + + externalSeqSpace; DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); return neededSpace; @@ -1443,7 +1639,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) * be needed. However, we still allocate two 0-sized buffers, which can * take space under ASAN. */ return ZSTD_estimateCCtxSize_usingCCtxParams_internal( - &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); + &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize); } size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) @@ -1493,7 +1689,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); { ZSTD_compressionParameters const cParams = ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); - size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); + size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog); size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered) ?
((size_t)1 << cParams.windowLog) + blockSize : 0; @@ -1504,7 +1700,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) return ZSTD_estimateCCtxSize_usingCCtxParams_internal( &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, - ZSTD_CONTENTSIZE_UNKNOWN); + ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize); } } @@ -1637,6 +1833,19 @@ typedef enum { ZSTD_resetTarget_CCtx } ZSTD_resetTarget_e; +/* Mixes the bits of a 64-bit value, based on XXH3_rrmxmx */ +static U64 ZSTD_bitmix(U64 val, U64 len) { + val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24); + val *= 0x9FB21C651E98DF25ULL; + val ^= (val >> 35) + len; + val *= 0x9FB21C651E98DF25ULL; + return val ^ (val >> 28); +} + +/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */ +static void ZSTD_advanceHashSalt(ZSTD_matchState_t* ms) { + ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4); +} static size_t ZSTD_reset_matchState(ZSTD_matchState_t* ms, @@ -1664,6 +1873,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, } ms->hashLog3 = hashLog3; + ms->lazySkipping = 0; ZSTD_invalidateMatchState(ms); @@ -1685,6 +1895,27 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, ZSTD_cwksp_clean_tables(ws); } + if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { + /* Row match finder needs an additional table of hashes ("tags") */ + size_t const tagTableSize = hSize; + /* We want to generate a new salt in case we reset a CCtx, but we always want to use + * 0 when we reset a CDict */ + if (forWho == ZSTD_resetTarget_CCtx) { + ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); + ZSTD_advanceHashSalt(ms); + } else { + /* When we are not salting we want to always memset the memory */ + ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned(ws, tagTableSize); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ms->hashSalt = 0; + } + { /* Switch to 32-entry rows if searchLog is 5 (or more) */ + U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); + assert(cParams->hashLog >= rowLog); + ms->rowHashLog = cParams->hashLog - rowLog; + } + } + /* opt parser space */ if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { DEBUGLOG(4, "reserving optimal parser space"); @@ -1696,19 +1927,6 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); } - if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { - { /* Row match finder needs an additional table of hashes ("tags") */ - size_t const tagTableSize = hSize*sizeof(U16); - ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize); - if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize); - } - { /* Switch to 32-entry rows if searchLog is 5 (or more) */ - U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); - assert(cParams->hashLog >= rowLog); - ms->rowHashLog = cParams->hashLog - rowLog; - } - } - ms->cParams = *cParams; RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, @@ -1768,6 +1986,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, assert(params->useRowMatchFinder != ZSTD_ps_auto); assert(params->useBlockSplitter != ZSTD_ps_auto); assert(params->ldmParams.enableLdm != ZSTD_ps_auto); + assert(params->maxBlockSize != 0); if (params->ldmParams.enableLdm == ZSTD_ps_enable) { /* Adjust long distance matching parameters */
ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, &params->cParams); @@ -1776,9 +1995,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, } { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); - size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); - U32 const divider = (params->cParams.minMatch==3) ? 3 : 4; - size_t const maxNbSeq = blockSize / divider; + size_t const blockSize = MIN(params->maxBlockSize, windowSize); + size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, params->useSequenceProducer); size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) ? ZSTD_compressBound(blockSize) + 1 : 0; @@ -1795,7 +2013,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, size_t const neededSpace = ZSTD_estimateCCtxSize_usingCCtxParams_internal( &params->cParams, &params->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, - buffInSize, buffOutSize, pledgedSrcSize); + buffInSize, buffOutSize, pledgedSrcSize, params->useSequenceProducer, params->maxBlockSize); int resizeWorkspace; FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); @@ -1838,6 +2056,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, /* init params */ zc->blockState.matchState.cParams = params->cParams; + zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable; zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; zc->consumedSrcSize = 0; zc->producedCSize = 0; @@ -1854,13 +2073,46 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); + FORWARD_IF_ERROR(ZSTD_reset_matchState( + &zc->blockState.matchState, + ws, + &params->cParams, + params->useRowMatchFinder, + crp, + needsIndexReset, + ZSTD_resetTarget_CCtx), ""); + + zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); + + /* ldm hash table */ + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* TODO: avoid memset? */ + size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; + zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); + ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); + zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); + zc->maxNbLdmSequences = maxNbLdmSeq; + + ZSTD_window_init(&zc->ldmState.window); + zc->ldmState.loadedDictEnd = 0; + } + + /* reserve space for block-level external sequences */ + if (params->useSequenceProducer) { + size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); + zc->externalMatchCtx.seqBufferCapacity = maxNbExternalSeq; + zc->externalMatchCtx.seqBuffer = + (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); + } + + /* buffers */ + + /* ZSTD_wildcopy() is used to copy into the literals buffer, * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes.
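 * (ZSTD_wildcopy() copies in fixed-size stripes for speed and may write a few * bytes past the requested length; the extra WILDCOPY_OVERLENGTH slack keeps * those overshoots inside the allocation.)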
*/ zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); zc->seqStore.maxNbLit = blockSize; - /* buffers */ zc->bufferedPolicy = zbuff; zc->inBuffSize = buffInSize; zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); @@ -1883,32 +2135,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); - zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); - - FORWARD_IF_ERROR(ZSTD_reset_matchState( - &zc->blockState.matchState, - ws, - &params->cParams, - params->useRowMatchFinder, - crp, - needsIndexReset, - ZSTD_resetTarget_CCtx), ""); - - /* ldm hash table */ - if (params->ldmParams.enableLdm == ZSTD_ps_enable) { - /* TODO: avoid memset? */ - size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; - zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); - ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); - zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); - zc->maxNbLdmSequences = maxNbLdmSeq; - - ZSTD_window_init(&zc->ldmState.window); - zc->ldmState.loadedDictEnd = 0; - } DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); - assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); + assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace)); zc->initialized = 1; @@ -1980,7 +2209,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, } params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, - cdict->dictContentSize, ZSTD_cpm_attachDict); + cdict->dictContentSize, ZSTD_cpm_attachDict, + params.useRowMatchFinder); params.cParams.windowLog = windowLog; params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, &params, pledgedSrcSize, @@ -2019,6 +2249,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, return 0; } +static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize, + ZSTD_compressionParameters const* cParams) { + if (ZSTD_CDictIndicesAreTagged(cParams)) { + /* Remove tags from the CDict table if they are present. + * See docs on "short cache" in zstd_compress_internal.h for context.
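+ * The CCtx consumes these tables as plain indices, so each entry is unpacked + * below by shifting the tag bits back out, the reverse of how the CDict + * packed them.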
*/ + size_t i; + for (i = 0; i < tableSize; i++) { + U32 const taggedIndex = src[i]; + U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS; + dst[i] = index; + } + } else { + ZSTD_memcpy(dst, src, tableSize * sizeof(U32)); + } +} + static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict, ZSTD_CCtx_params params, @@ -2054,21 +2300,23 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, : 0; size_t const hSize = (size_t)1 << cdict_cParams->hashLog; - ZSTD_memcpy(cctx->blockState.matchState.hashTable, - cdict->matchState.hashTable, - hSize * sizeof(U32)); + ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable, + cdict->matchState.hashTable, + hSize, cdict_cParams); + /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { - ZSTD_memcpy(cctx->blockState.matchState.chainTable, - cdict->matchState.chainTable, - chainSize * sizeof(U32)); + ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable, + cdict->matchState.chainTable, + chainSize, cdict_cParams); } /* copy tag table */ if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { - size_t const tagTableSize = hSize*sizeof(U16); + size_t const tagTableSize = hSize; ZSTD_memcpy(cctx->blockState.matchState.tagTable, - cdict->matchState.tagTable, - tagTableSize); + cdict->matchState.tagTable, + tagTableSize); + cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt; } } @@ -2147,6 +2395,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; params.ldmParams = srcCCtx->appliedParams.ldmParams; params.fParams = fParams; + params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize; ZSTD_resetCCtx_internal(dstCCtx, &params, pledgedSrcSize, /* loadedDictSize */ 0, ZSTDcrp_leaveDirty, zbuff); @@ -2294,7 +2543,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par /* See doc/zstd_compression_format.md for detailed format description */ -void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) +int ZSTD_seqToCodes(const seqStore_t* seqStorePtr) { const seqDef* const sequences = seqStorePtr->sequencesStart; BYTE* const llCodeTable = seqStorePtr->llCode; @@ -2302,18 +2551,24 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) BYTE* const mlCodeTable = seqStorePtr->mlCode; U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); U32 u; + int longOffsets = 0; assert(nbSeq <= seqStorePtr->maxNbSeq); for (u=0; u<nbSeq; u++) { U32 const llv = sequences[u].litLength; + U32 const ofCode = ZSTD_highbit32(sequences[u].offBase); U32 const mlv = sequences[u].mlBase; llCodeTable[u] = (BYTE)ZSTD_LLcode(llv); - ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offBase); + ofCodeTable[u] = (BYTE)ofCode; mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv); + assert(!(MEM_64bits() && ofCode >= STREAM_ACCUMULATOR_MIN)); + if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN) + longOffsets = 1; } if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) llCodeTable[seqStorePtr->longLengthPos] = MaxLL; if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) mlCodeTable[seqStorePtr->longLengthPos] = MaxML; + return longOffsets; } /* ZSTD_useTargetCBlockSize(): @@ -2347,6 +2602,7 @@ typedef struct { U32 MLtype; size_t size; size_t lastCountSize; /* Accounts for bug in 1.3.4.
More detail in ZSTD_entropyCompressSeqStore_internal() */ + int longOffsets; } ZSTD_symbolEncodingTypeStats_t; /* ZSTD_buildSequencesStatistics(): @@ -2357,11 +2613,13 @@ typedef struct { * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) */ static ZSTD_symbolEncodingTypeStats_t -ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, - const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, - BYTE* dst, const BYTE* const dstEnd, - ZSTD_strategy strategy, unsigned* countWorkspace, - void* entropyWorkspace, size_t entropyWkspSize) { +ZSTD_buildSequencesStatistics( + const seqStore_t* seqStorePtr, size_t nbSeq, + const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, + BYTE* dst, const BYTE* const dstEnd, + ZSTD_strategy strategy, unsigned* countWorkspace, + void* entropyWorkspace, size_t entropyWkspSize) +{ BYTE* const ostart = dst; const BYTE* const oend = dstEnd; BYTE* op = ostart; @@ -2375,7 +2633,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, stats.lastCountSize = 0; /* convert length/distances into codes */ - ZSTD_seqToCodes(seqStorePtr); + stats.longOffsets = ZSTD_seqToCodes(seqStorePtr); assert(op <= oend); assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ /* build CTable for Literal Lengths */ @@ -2480,22 +2738,22 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, */ #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 MEM_STATIC size_t -ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, - const ZSTD_entropyCTables_t* prevEntropy, - ZSTD_entropyCTables_t* nextEntropy, - const ZSTD_CCtx_params* cctxParams, - void* dst, size_t dstCapacity, - void* entropyWorkspace, size_t entropyWkspSize, - const int bmi2) +ZSTD_entropyCompressSeqStore_internal( + const seqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + void* dst, size_t dstCapacity, + void* entropyWorkspace, size_t entropyWkspSize, + const int bmi2) { - const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; ZSTD_strategy const strategy = cctxParams->cParams.strategy; unsigned* count = (unsigned*)entropyWorkspace; FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; const seqDef* const sequences = seqStorePtr->sequencesStart; - const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; + const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); const BYTE* const ofCodeTable = seqStorePtr->ofCode; const BYTE* const llCodeTable = seqStorePtr->llCode; const BYTE* const mlCodeTable = seqStorePtr->mlCode; @@ -2503,29 +2761,31 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, BYTE* const oend = ostart + dstCapacity; BYTE* op = ostart; size_t lastCountSize; + int longOffsets = 0; entropyWorkspace = count + (MaxSeq + 1); entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); - DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq); + DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity); ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog))); assert(entropyWkspSize >= HUF_WORKSPACE_SIZE); /* Compress literals */ { const BYTE* const literals = seqStorePtr->litStart; - size_t const numSequences = seqStorePtr->sequences -
seqStorePtr->sequencesStart; - size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart; + size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + size_t const numLiterals = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); /* Base suspicion of uncompressibility on ratio of literals to sequences */ unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); size_t const litSize = (size_t)(seqStorePtr->lit - literals); + size_t const cSize = ZSTD_compressLiterals( - &prevEntropy->huf, &nextEntropy->huf, - cctxParams->cParams.strategy, - ZSTD_literalsCompressionIsDisabled(cctxParams), op, dstCapacity, literals, litSize, entropyWorkspace, entropyWkspSize, - bmi2, suspectUncompressible); + &prevEntropy->huf, &nextEntropy->huf, + cctxParams->cParams.strategy, + ZSTD_literalsCompressionIsDisabled(cctxParams), + suspectUncompressible, bmi2); FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); assert(cSize <= dstCapacity); op += cSize; @@ -2551,11 +2811,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); return (size_t)(op - ostart); } - { - ZSTD_symbolEncodingTypeStats_t stats; - BYTE* seqHead = op++; + { BYTE* const seqHead = op++; /* build stats for sequences */ - stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + const ZSTD_symbolEncodingTypeStats_t stats = + ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, &prevEntropy->fse, &nextEntropy->fse, op, oend, strategy, count, @@ -2564,6 +2823,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); lastCountSize = stats.lastCountSize; op += stats.size; + longOffsets = stats.longOffsets; } { size_t const bitstreamSize = ZSTD_encodeSequences( @@ -2598,14 +2858,15 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, } MEM_STATIC size_t -ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, - const ZSTD_entropyCTables_t* prevEntropy, - ZSTD_entropyCTables_t* nextEntropy, - const ZSTD_CCtx_params* cctxParams, - void* dst, size_t dstCapacity, - size_t srcSize, - void* entropyWorkspace, size_t entropyWkspSize, - int bmi2) +ZSTD_entropyCompressSeqStore( + const seqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + void* dst, size_t dstCapacity, + size_t srcSize, + void* entropyWorkspace, size_t entropyWkspSize, + int bmi2) { size_t const cSize = ZSTD_entropyCompressSeqStore_internal( seqStorePtr, prevEntropy, nextEntropy, cctxParams, @@ -2615,15 +2876,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. 
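 * (Returning 0 below is the "block not compressed" signal; the caller then * stores the block raw, which the srcSize <= dstCapacity condition guarantees * will fit.)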
*/ - if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) + if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) { + DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity); return 0; /* block not compressed */ + } FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); /* Check compressibility */ { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); if (cSize >= maxCSize) return 0; /* block not compressed */ } - DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); + DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); + /* libzstd decoder before v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly. + * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above. + */ + assert(cSize < ZSTD_BLOCKSIZE_MAX); return cSize; } @@ -2718,6 +2985,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr) ssPtr->longLengthType = ZSTD_llt_none; } +/* ZSTD_postProcessSequenceProducerResult() : + * Validates and post-processes sequences obtained through the external matchfinder API: + * - Checks whether nbExternalSeqs represents an error condition. + * - Appends a block delimiter to outSeqs if one is not already present. + * See zstd.h for context regarding block delimiters. + * Returns the number of sequences after post-processing, or an error code. */ +static size_t ZSTD_postProcessSequenceProducerResult( + ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize +) { + RETURN_ERROR_IF( + nbExternalSeqs > outSeqsCapacity, + sequenceProducer_failed, + "External sequence producer returned error code %lu", + (unsigned long)nbExternalSeqs + ); + + RETURN_ERROR_IF( + nbExternalSeqs == 0 && srcSize > 0, + sequenceProducer_failed, + "Got zero sequences from external sequence producer for a non-empty src buffer!" + ); + + if (srcSize == 0) { + ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence)); + return 1; + } + + { + ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1]; + + /* We can return early if lastSeq is already a block delimiter. */ + if (lastSeq.offset == 0 && lastSeq.matchLength == 0) { + return nbExternalSeqs; + } + + /* This error condition is only possible if the external matchfinder + * produced an invalid parse, by definition of ZSTD_sequenceBound(). */ + RETURN_ERROR_IF( + nbExternalSeqs == outSeqsCapacity, + sequenceProducer_failed, + "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!" + ); + + /* lastSeq is not a block delimiter, so we need to append one. */ + ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence)); + return nbExternalSeqs + 1; + } +} + +/* ZSTD_fastSequenceLengthSum() : + * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*. + * Similar to another function in zstd_compress.c (determine_blockSize), + * except it doesn't check for a block delimiter to end summation. + * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P). + * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456.
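+ * Note: a trailing block delimiter (matchLength == 0, offset == 0) contributes + * only its litLength to this sum; that is exactly the lastLits term above.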
*/ +static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) { + size_t matchLenSum, litLenSum, i; + matchLenSum = 0; + litLenSum = 0; + for (i = 0; i < seqBufSize; i++) { + litLenSum += seqBuf[i].litLength; + matchLenSum += seqBuf[i].matchLength; + } + return litLenSum + matchLenSum; +} + typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) @@ -2727,7 +3060,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) assert(srcSize <= ZSTD_BLOCKSIZE_MAX); /* Assert that we have correctly flushed the ctx params into the ms's copy */ ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); - if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { + /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding + * additional 1. We need to revisit and change this logic to be more consistent */ + if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); } else { @@ -2763,6 +3098,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) } if (zc->externSeqStore.pos < zc->externSeqStore.size) { assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); + + /* External matchfinder + LDM is technically possible, just not implemented yet. + * We need to revisit soon and implement it. */ + RETURN_ERROR_IF( + zc->appliedParams.useSequenceProducer, + parameter_combination_unsupported, + "Long-distance matching with external sequence producer enabled is not currently supported." + ); + /* Updates ldmSeqStore.pos */ lastLLSize = ZSTD_ldm_blockCompress(&zc->externSeqStore, @@ -2774,6 +3118,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { rawSeqStore_t ldmSeqStore = kNullRawSeqStore; + /* External matchfinder + LDM is technically possible, just not implemented yet. + * We need to revisit soon and implement it. */ + RETURN_ERROR_IF( + zc->appliedParams.useSequenceProducer, + parameter_combination_unsupported, + "Long-distance matching with external sequence producer enabled is not currently supported." 
+ ); + ldmSeqStore.seq = zc->ldmSequences; ldmSeqStore.capacity = zc->maxNbLdmSequences; /* Updates ldmSeqStore.size */ @@ -2788,7 +3140,68 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) zc->appliedParams.useRowMatchFinder, src, srcSize); assert(ldmSeqStore.pos == ldmSeqStore.size); - } else { /* not long range mode */ + } else if (zc->appliedParams.useSequenceProducer) { + assert( + zc->externalMatchCtx.seqBufferCapacity >= ZSTD_sequenceBound(srcSize) + ); + assert(zc->externalMatchCtx.mFinder != NULL); + + { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog; + + size_t const nbExternalSeqs = (zc->externalMatchCtx.mFinder)( + zc->externalMatchCtx.mState, + zc->externalMatchCtx.seqBuffer, + zc->externalMatchCtx.seqBufferCapacity, + src, srcSize, + NULL, 0, /* dict and dictSize, currently not supported */ + zc->appliedParams.compressionLevel, + windowSize + ); + + size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult( + zc->externalMatchCtx.seqBuffer, + nbExternalSeqs, + zc->externalMatchCtx.seqBufferCapacity, + srcSize + ); + + /* Return early if there is no error, since we don't need to worry about last literals */ + if (!ZSTD_isError(nbPostProcessedSeqs)) { + ZSTD_sequencePosition seqPos = {0,0,0}; + size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs); + RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!"); + FORWARD_IF_ERROR( + ZSTD_copySequencesToSeqStoreExplicitBlockDelim( + zc, &seqPos, + zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs, + src, srcSize, + zc->appliedParams.searchForExternalRepcodes + ), + "Failed to copy external sequences to seqStore!" + ); + ms->ldmSeqStore = NULL; + DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs); + return ZSTDbss_compress; + } + + /* Propagate the error if fallback is disabled */ + if (!zc->appliedParams.enableMatchFinderFallback) { + return nbPostProcessedSeqs; + } + + /* Fallback to software matchfinder */ + { ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, + zc->appliedParams.useRowMatchFinder, + dictMode); + ms->ldmSeqStore = NULL; + DEBUGLOG( + 5, + "External sequence producer returned error code %lu. 
Falling back to internal parser.", + (unsigned long)nbExternalSeqs + ); + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); + } } + } else { /* not long range mode and no external matchfinder */ ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, zc->appliedParams.useRowMatchFinder, dictMode); @@ -2849,7 +3262,7 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode so we provide seqStoreSeqs[i].offset - 1 */ ZSTD_updateRep(updatedRepcodes.rep, - seqStoreSeqs[i].offBase - 1, + seqStoreSeqs[i].offBase, seqStoreSeqs[i].litLength == 0); literalsRead += outSeqs[i].litLength; } @@ -2865,6 +3278,10 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) zc->seqCollector.seqIndex += seqStoreSeqSize; } +size_t ZSTD_sequenceBound(size_t srcSize) { + return (srcSize / ZSTD_MINMATCH_MIN) + 1; +} + size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, size_t outSeqsSize, const void* src, size_t srcSize) { @@ -2910,19 +3327,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { const size_t unrollMask = unrollSize - 1; const size_t prefixLength = length & unrollMask; size_t i; - size_t u; if (length == 1) return 1; /* Check if prefix is RLE first before using unrolled loop */ if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) { return 0; } for (i = prefixLength; i != length; i += unrollSize) { + size_t u; for (u = 0; u < unrollSize; u += sizeof(size_t)) { if (MEM_readST(ip + i + u) != valueST) { return 0; - } - } - } + } } } return 1; } @@ -2938,7 +3353,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore) return nbSeqs < 4 && nbLits < 10; } -static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) +static void +ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) { ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; bs->prevCBlock = bs->nextCBlock; @@ -2946,7 +3362,9 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c } /* Writes the block header */ -static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { +static void +writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) +{ U32 const cBlockHeader = cSize == 1 ? lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); @@ -2959,13 +3377,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB * Stores literals block type (raw, rle, compressed, repeat) and * huffman description table to hufMetadata. 
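 *
 * For orientation, the mode triage boils down to this shape (editor's
 * illustrative sketch, not part of this patch; thresholds mirror the
 * heuristics in the function body, and the set_repeat path that reuses
 * the previous table is omitted):
 *
 *     #include <stddef.h>
 *
 *     typedef enum { LIT_RAW, LIT_RLE, LIT_COMPRESSED } LitMode;
 *
 *     static LitMode pickLitMode(size_t srcSize, size_t largestCount, size_t minLitSize)
 *     {
 *         if (srcSize <= minLitSize) return LIT_RAW;              // header cost would dominate
 *         if (largestCount == srcSize) return LIT_RLE;            // single repeated symbol
 *         if (largestCount <= (srcSize >> 7) + 4) return LIT_RAW; // near-flat histogram: no gain
 *         return LIT_COMPRESSED;                                  // worth building a Huffman table
 *     }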
* Requires ENTROPY_WORKSPACE_SIZE workspace - * @return : size of huffman description table or error code */ -static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, - const ZSTD_hufCTables_t* prevHuf, - ZSTD_hufCTables_t* nextHuf, - ZSTD_hufCTablesMetadata_t* hufMetadata, - const int literalsCompressionIsDisabled, - void* workspace, size_t wkspSize) + * @return : size of huffman description table, or an error code + */ +static size_t +ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, + const ZSTD_hufCTables_t* prevHuf, + ZSTD_hufCTables_t* nextHuf, + ZSTD_hufCTablesMetadata_t* hufMetadata, + const int literalsCompressionIsDisabled, + void* workspace, size_t wkspSize, + int hufFlags) { BYTE* const wkspStart = (BYTE*)workspace; BYTE* const wkspEnd = wkspStart + wkspSize; @@ -2973,9 +3394,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi unsigned* const countWksp = (unsigned*)workspace; const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); BYTE* const nodeWksp = countWkspStart + countWkspSize; - const size_t nodeWkspSize = wkspEnd-nodeWksp; + const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp); unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; - unsigned huffLog = HUF_TABLELOG_DEFAULT; + unsigned huffLog = LitHufLog; HUF_repeat repeat = prevHuf->repeatMode; DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); @@ -2990,73 +3411,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi /* small ? don't even attempt compression (speed opt) */ #ifndef COMPRESS_LITERALS_SIZE_MIN -#define COMPRESS_LITERALS_SIZE_MIN 63 +# define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */ #endif { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 
6 : COMPRESS_LITERALS_SIZE_MIN; if (srcSize <= minLitSize) { DEBUGLOG(5, "set_basic - too small"); hufMetadata->hType = set_basic; return 0; - } - } + } } /* Scan input and build symbol stats */ - { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); + { size_t const largest = + HIST_count_wksp (countWksp, &maxSymbolValue, + (const BYTE*)src, srcSize, + workspace, wkspSize); FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); if (largest == srcSize) { + /* only one literal symbol */ DEBUGLOG(5, "set_rle"); hufMetadata->hType = set_rle; return 0; } if (largest <= (srcSize >> 7)+4) { + /* heuristic: likely not compressible */ DEBUGLOG(5, "set_basic - no gain"); hufMetadata->hType = set_basic; return 0; - } - } + } } /* Validate the previous Huffman table */ - if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + if (repeat == HUF_repeat_check + && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { repeat = HUF_repeat_none; } /* Build Huffman Tree */ ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); - huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); + huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags); + assert(huffLog <= LitHufLog); { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue, huffLog, nodeWksp, nodeWkspSize); FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); huffLog = (U32)maxBits; - { /* Build and write the CTable */ - size_t const newCSize = HUF_estimateCompressedSize( - (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); - size_t const hSize = HUF_writeCTable_wksp( - hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), - (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, - nodeWksp, nodeWkspSize); - /* Check against repeating the previous CTable */ - if (repeat != HUF_repeat_none) { - size_t const oldCSize = HUF_estimateCompressedSize( - (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); - if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { - DEBUGLOG(5, "set_repeat - smaller"); - ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); - hufMetadata->hType = set_repeat; - return 0; - } - } - if (newCSize + hSize >= srcSize) { - DEBUGLOG(5, "set_basic - no gains"); + } + { /* Build and write the CTable */ + size_t const newCSize = HUF_estimateCompressedSize( + (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); + size_t const hSize = HUF_writeCTable_wksp( + hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), + (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + /* Check against repeating the previous CTable */ + if (repeat != HUF_repeat_none) { + size_t const oldCSize = HUF_estimateCompressedSize( + (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); + if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { + DEBUGLOG(5, "set_repeat - smaller"); ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); - hufMetadata->hType = set_basic; + hufMetadata->hType = set_repeat; return 0; - } - DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); - hufMetadata->hType = set_compressed; - nextHuf->repeatMode = HUF_repeat_check; - return hSize; + } } + if (newCSize + hSize >= srcSize) { + DEBUGLOG(5, "set_basic - no gains"); + ZSTD_memcpy(nextHuf, prevHuf, 
sizeof(*prevHuf)); + hufMetadata->hType = set_basic; + return 0; } + DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); + hufMetadata->hType = set_compressed; + nextHuf->repeatMode = HUF_repeat_check; + return hSize; } } @@ -3066,8 +3491,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi * and updates nextEntropy to the appropriate repeatMode. */ static ZSTD_symbolEncodingTypeStats_t -ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { - ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0}; +ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) +{ + ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0}; nextEntropy->litlength_repeatMode = FSE_repeat_none; nextEntropy->offcode_repeatMode = FSE_repeat_none; nextEntropy->matchlength_repeatMode = FSE_repeat_none; @@ -3078,16 +3504,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { * Builds entropy for the sequences. * Stores symbol compression modes and fse table to fseMetadata. * Requires ENTROPY_WORKSPACE_SIZE wksp. - * @return : size of fse tables or error code */ -static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, - const ZSTD_fseCTables_t* prevEntropy, - ZSTD_fseCTables_t* nextEntropy, - const ZSTD_CCtx_params* cctxParams, - ZSTD_fseCTablesMetadata_t* fseMetadata, - void* workspace, size_t wkspSize) + * @return : size of fse tables or error code */ +static size_t +ZSTD_buildBlockEntropyStats_sequences( + const seqStore_t* seqStorePtr, + const ZSTD_fseCTables_t* prevEntropy, + ZSTD_fseCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_fseCTablesMetadata_t* fseMetadata, + void* workspace, size_t wkspSize) { ZSTD_strategy const strategy = cctxParams->cParams.strategy; - size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; + size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); BYTE* const ostart = fseMetadata->fseTablesBuffer; BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); BYTE* op = ostart; @@ -3114,23 +3542,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, /* ZSTD_buildBlockEntropyStats() : * Builds entropy for the block. * Requires workspace size ENTROPY_WORKSPACE_SIZE - * - * @return : 0 on success or error code + * @return : 0 on success, or an error code + * Note : also employed in superblock */ -size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, - const ZSTD_entropyCTables_t* prevEntropy, - ZSTD_entropyCTables_t* nextEntropy, - const ZSTD_CCtx_params* cctxParams, - ZSTD_entropyCTablesMetadata_t* entropyMetadata, - void* workspace, size_t wkspSize) -{ - size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; +size_t ZSTD_buildBlockEntropyStats( + const seqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize) +{ + size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); + int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD); + int const hufFlags = huf_useOptDepth ? 
HUF_flags_optimalDepth : 0; + entropyMetadata->hufMetadata.hufDesSize = ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, &prevEntropy->huf, &nextEntropy->huf, &entropyMetadata->hufMetadata, ZSTD_literalsCompressionIsDisabled(cctxParams), - workspace, wkspSize); + workspace, wkspSize, hufFlags); + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); entropyMetadata->fseMetadata.fseTablesSize = ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, @@ -3143,11 +3576,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, } /* Returns the size estimate for the literals section (header + content) of a block */ -static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, - const ZSTD_hufCTables_t* huf, - const ZSTD_hufCTablesMetadata_t* hufMetadata, - void* workspace, size_t wkspSize, - int writeEntropy) +static size_t +ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, + const ZSTD_hufCTables_t* huf, + const ZSTD_hufCTablesMetadata_t* hufMetadata, + void* workspace, size_t wkspSize, + int writeEntropy) { unsigned* const countWksp = (unsigned*)workspace; unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; @@ -3169,12 +3603,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz } /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ -static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, - const BYTE* codeTable, size_t nbSeq, unsigned maxCode, - const FSE_CTable* fseCTable, - const U8* additionalBits, - short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, - void* workspace, size_t wkspSize) +static size_t +ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, + const BYTE* codeTable, size_t nbSeq, unsigned maxCode, + const FSE_CTable* fseCTable, + const U8* additionalBits, + short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, + void* workspace, size_t wkspSize) { unsigned* const countWksp = (unsigned*)workspace; const BYTE* ctp = codeTable; @@ -3206,99 +3641,107 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, } /* Returns the size estimate for the sequences section (header + content) of a block */ -static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, - const BYTE* llCodeTable, - const BYTE* mlCodeTable, - size_t nbSeq, - const ZSTD_fseCTables_t* fseTables, - const ZSTD_fseCTablesMetadata_t* fseMetadata, - void* workspace, size_t wkspSize, - int writeEntropy) +static size_t +ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_fseCTables_t* fseTables, + const ZSTD_fseCTablesMetadata_t* fseMetadata, + void* workspace, size_t wkspSize, + int writeEntropy) { size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); size_t cSeqSizeEstimate = 0; cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, - fseTables->offcodeCTable, NULL, - OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, - workspace, wkspSize); + fseTables->offcodeCTable, NULL, + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + workspace, wkspSize); cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, - fseTables->litlengthCTable, LL_bits, - LL_defaultNorm, LL_defaultNormLog, MaxLL, - workspace, wkspSize); + 
fseTables->litlengthCTable, LL_bits, + LL_defaultNorm, LL_defaultNormLog, MaxLL, + workspace, wkspSize); cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, - fseTables->matchlengthCTable, ML_bits, - ML_defaultNorm, ML_defaultNormLog, MaxML, - workspace, wkspSize); + fseTables->matchlengthCTable, ML_bits, + ML_defaultNorm, ML_defaultNormLog, MaxML, + workspace, wkspSize); if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; return cSeqSizeEstimate + sequencesSectionHeaderSize; } /* Returns the size estimate for a given stream of literals, of, ll, ml */ -static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, - const BYTE* ofCodeTable, - const BYTE* llCodeTable, - const BYTE* mlCodeTable, - size_t nbSeq, - const ZSTD_entropyCTables_t* entropy, - const ZSTD_entropyCTablesMetadata_t* entropyMetadata, - void* workspace, size_t wkspSize, - int writeLitEntropy, int writeSeqEntropy) { +static size_t +ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, + const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize, + int writeLitEntropy, int writeSeqEntropy) +{ size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, - &entropy->huf, &entropyMetadata->hufMetadata, - workspace, wkspSize, writeLitEntropy); + &entropy->huf, &entropyMetadata->hufMetadata, + workspace, wkspSize, writeLitEntropy); size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, - nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, - workspace, wkspSize, writeSeqEntropy); + nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, + workspace, wkspSize, writeSeqEntropy); return seqSize + literalsSize + ZSTD_blockHeaderSize; } /* Builds entropy statistics and uses them for blocksize estimation. * - * Returns the estimated compressed size of the seqStore, or a zstd error. + * @return: estimated compressed size of the seqStore, or a zstd error. 
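 *
 * The composition is simple (editor's illustrative sketch, not part of this
 * patch; the per-section work happens in the ZSTD_estimateBlockSize_* helpers):
 *
 *     #include <stddef.h>
 *
 *     static size_t estimateBlock(size_t litSectionEst, size_t ofEst,
 *                                 size_t llEst, size_t mlEst, size_t seqHeaderEst)
 *     {
 *         size_t const seqSectionEst = seqHeaderEst + ofEst + llEst + mlEst;
 *         return litSectionEst + seqSectionEst + 3;  // 3 == ZSTD_blockHeaderSize
 *     }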
*/ -static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) { - ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata; +static size_t +ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) +{ + ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata; DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, &zc->appliedParams, entropyMetadata, - zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); - return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), + zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE), ""); + return ZSTD_estimateBlockSize( + seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), seqStore->ofCode, seqStore->llCode, seqStore->mlCode, (size_t)(seqStore->sequences - seqStore->sequencesStart), - &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, + &zc->blockState.nextCBlock->entropy, + entropyMetadata, + zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); } /* Returns literals bytes represented in a seqStore */ -static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { +static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) +{ size_t literalsBytes = 0; - size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; + size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); size_t i; for (i = 0; i < nbSeqs; ++i) { - seqDef seq = seqStore->sequencesStart[i]; + seqDef const seq = seqStore->sequencesStart[i]; literalsBytes += seq.litLength; if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { literalsBytes += 0x10000; - } - } + } } return literalsBytes; } /* Returns match bytes represented in a seqStore */ -static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { +static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) +{ size_t matchBytes = 0; - size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; + size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); size_t i; for (i = 0; i < nbSeqs; ++i) { seqDef seq = seqStore->sequencesStart[i]; matchBytes += seq.mlBase + MINMATCH; if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { matchBytes += 0x10000; - } - } + } } return matchBytes; } @@ -3307,15 +3750,12 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { */ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, const seqStore_t* originalSeqStore, - size_t startIdx, size_t endIdx) { - BYTE* const litEnd = originalSeqStore->lit; - size_t literalsBytes; - size_t literalsBytesPreceding = 0; - + size_t startIdx, size_t endIdx) +{ *resultSeqStore = *originalSeqStore; if (startIdx > 0) { resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; - literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); } /* Move longLengthPos into the correct position if necessary */ @@ -3328,13 +3768,12 @@ static void 
ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, } resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; - literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); - resultSeqStore->litStart += literalsBytesPreceding; if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { /* This accounts for possible last literals if the derived chunk reaches the end of the block */ - resultSeqStore->lit = litEnd; + assert(resultSeqStore->lit == originalSeqStore->lit); } else { - resultSeqStore->lit = resultSeqStore->litStart+literalsBytes; + size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + resultSeqStore->lit = resultSeqStore->litStart + literalsBytes; } resultSeqStore->llCode += startIdx; resultSeqStore->mlCode += startIdx; @@ -3342,20 +3781,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, } /* - * Returns the raw offset represented by the combination of offCode, ll0, and repcode history. - * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq(). + * Returns the raw offset represented by the combination of offBase, ll0, and repcode history. + * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq(). */ static U32 -ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0) -{ - U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */ - assert(STORED_IS_REPCODE(offCode)); - if (adjustedOffCode == ZSTD_REP_NUM) { - /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */ - assert(rep[0] > 0); +ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0) +{ + U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */ + assert(OFFBASE_IS_REPCODE(offBase)); + if (adjustedRepCode == ZSTD_REP_NUM) { + assert(ll0); + /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 + * This is only valid if it results in a valid offset value, aka > 0. + * Note : it may happen that `rep[0]==1` in exceptional circumstances. + * In which case this function will return 0, which is an invalid offset. + * It's not an issue though, since this value will be + * compared and discarded within ZSTD_seqStore_resolveOffCodes(). + */ return rep[0] - 1; } - return rep[adjustedOffCode]; + return rep[adjustedRepCode]; } /* @@ -3371,30 +3816,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c * 1-3 : repcode 1-3 * 4+ : real_offset+3 */ -static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, - seqStore_t* const seqStore, U32 const nbSeq) { +static void +ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, + const seqStore_t* const seqStore, U32 const nbSeq) +{ U32 idx = 0; + U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? 
seqStore->longLengthPos : nbSeq;
     for (; idx < nbSeq; ++idx) {
         seqDef* const seq = seqStore->sequencesStart + idx;
-        U32 const ll0 = (seq->litLength == 0);
-        U32 const offCode = OFFBASE_TO_STORED(seq->offBase);
-        assert(seq->offBase > 0);
-        if (STORED_IS_REPCODE(offCode)) {
-            U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0);
-            U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0);
+        U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx);
+        U32 const offBase = seq->offBase;
+        assert(offBase > 0);
+        if (OFFBASE_IS_REPCODE(offBase)) {
+            U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0);
+            U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0);
             /* Adjust simulated decompression repcode history if we come across a mismatch. Replace
              * the repcode with the offset it actually references, determined by the compression
              * repcode history. */
             if (dRawOffset != cRawOffset) {
-                seq->offBase = cRawOffset + ZSTD_REP_NUM;
+                seq->offBase = OFFSET_TO_OFFBASE(cRawOffset);
             }
         }
         /* Compression repcode history is always updated with values directly from the unmodified seqStore.
          * Decompression repcode history may use modified seq->offset value taken from compression repcode history. */
-        ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0);
-        ZSTD_updateRep(cRepcodes->rep, offCode, ll0);
+        ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0);
+        ZSTD_updateRep(cRepcodes->rep, offBase, ll0);
     }
 }

@@ -3404,10 +3852,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_
  * Returns the total size of that block (including header) or a ZSTD error code.
  */
 static size_t
-ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore,
+ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc,
+                                  const seqStore_t* const seqStore,
                                   repcodes_t* const dRep, repcodes_t* const cRep,
                                   void* dst, size_t dstCapacity,
-                                  const void* src, size_t srcSize,
+                                  const void* src, size_t srcSize,
                                   U32 lastBlock, U32 isPartition)
 {
     const U32 rleMaxLength = 25;
@@ -3481,45 +3930,49 @@ typedef struct {

 /* Helper function to perform the recursive search for block splits.
  * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half.
- * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then
- * we do not recurse.
+ * If advantageous to split, then we recurse down the two sub-blocks.
+ * If not, or if an error occurred in estimation, then we do not recurse.
  *
- * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING.
+ * Note: The recursion depth is capped by a heuristic minimum number of sequences,
+ * defined by MIN_SEQUENCES_BLOCK_SPLITTING.
  * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING).
  * In practice, recursion depth usually doesn't go beyond 4.
  *
- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize
+ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS.
+ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current blockSize
  * maximum of 128 KB, this value is actually impossible to reach.
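 *
 * The shape of the search (editor's illustrative sketch, not part of this
 * patch; cost() stands in for ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize,
 * and the abort-on-estimation-error handling is omitted):
 *
 *     #include <stddef.h>
 *
 *     static void deriveSplits(unsigned* splits, size_t* nbSplits, size_t maxSplits,
 *                              size_t begin, size_t end, size_t minSeqs,
 *                              size_t (*cost)(size_t begin, size_t end))
 *     {
 *         size_t const mid = (begin + end) / 2;
 *         if (end - begin < minSeqs || *nbSplits >= maxSplits) return;
 *         if (cost(begin, mid) + cost(mid, end) < cost(begin, end)) {
 *             deriveSplits(splits, nbSplits, maxSplits, begin, mid, minSeqs, cost);
 *             splits[(*nbSplits)++] = (unsigned)mid;
 *             deriveSplits(splits, nbSplits, maxSplits, mid, end, minSeqs, cost);
 *         }
 *     }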
*/ static void ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, ZSTD_CCtx* zc, const seqStore_t* origSeqStore) { - seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; - seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; - seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; + seqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; + seqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; + seqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; size_t estimatedOriginalSize; size_t estimatedFirstHalfSize; size_t estimatedSecondHalfSize; size_t midIdx = (startIdx + endIdx)/2; + DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); + assert(endIdx >= startIdx); if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { - DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); + DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx); return; } - DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); - DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", + DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { return; } if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { + DEBUGLOG(5, "split decided at seqNb:%zu", midIdx); ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); splits->splitLocations[splits->idx] = (U32)midIdx; splits->idx++; @@ -3527,14 +3980,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end } } -/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio. +/* Base recursive function. + * Populates a table with intra-block partition indices that can improve compression ratio. * - * Returns the number of splits made (which equals the size of the partition table - 1). + * @return: number of splits made (which equals the size of the partition table - 1). 
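 *
 * Consumers then walk the table like this (editor's illustrative sketch, not
 * part of this patch; compressChunk() is a hypothetical stand-in for the
 * per-partition path of ZSTD_compressBlock_splitBlock_internal):
 *
 *     #include <stddef.h>
 *
 *     static void walkPartitions(const unsigned* partitions, size_t nbSplits, size_t nbSeq,
 *                                void (*compressChunk)(size_t begin, size_t end))
 *     {
 *         size_t begin = 0, k;
 *         for (k = 0; k <= nbSplits; ++k) {
 *             size_t const end = (k < nbSplits) ? partitions[k] : nbSeq;
 *             compressChunk(begin, end);  // sequences [begin, end) form one sub-block
 *             begin = end;
 *         }
 *     }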
*/ -static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) { - seqStoreSplits splits = {partitions, 0}; +static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) +{ + seqStoreSplits splits; + splits.splitLocations = partitions; + splits.idx = 0; if (nbSeq <= 4) { - DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split"); + DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq); /* Refuse to try and split anything with less than 4 sequences */ return 0; } @@ -3550,18 +4007,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) * Returns combined size of all blocks (which includes headers), or a ZSTD error code. */ static size_t -ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, - const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) +ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t blockSize, + U32 lastBlock, U32 nbSeq) { size_t cSize = 0; const BYTE* ip = (const BYTE*)src; BYTE* op = (BYTE*)dst; size_t i = 0; size_t srcBytesTotal = 0; - U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ - seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore; - seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore; - size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ + seqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore; + seqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore; + size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history * may become invalid. 
In order to reconcile potentially invalid repcodes, we keep track of two @@ -3583,30 +4042,31 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); - DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, (unsigned)zc->blockState.matchState.nextToUpdate); if (numSplits == 0) { - size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, - &dRep, &cRep, - op, dstCapacity, - ip, blockSize, - lastBlock, 0 /* isPartition */); + size_t cSizeSingleBlock = + ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, + &dRep, &cRep, + op, dstCapacity, + ip, blockSize, + lastBlock, 0 /* isPartition */); FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); - assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); + assert(zc->blockSize <= ZSTD_BLOCKSIZE_MAX); + assert(cSizeSingleBlock <= zc->blockSize + ZSTD_blockHeaderSize); return cSizeSingleBlock; } ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); for (i = 0; i <= numSplits; ++i) { - size_t srcBytes; size_t cSizeChunk; U32 const lastPartition = (i == numSplits); U32 lastBlockEntireSrc = 0; - srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); + size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); srcBytesTotal += srcBytes; if (lastPartition) { /* This is the final partition, need to account for possible last literals */ @@ -3621,7 +4081,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac op, dstCapacity, ip, srcBytes, lastBlockEntireSrc, 1 /* isPartition */); - DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); + DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size", + ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); ip += srcBytes; @@ -3629,10 +4090,10 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac dstCapacity -= cSizeChunk; cSize += cSizeChunk; *currSeqStore = *nextSeqStore; - assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); + assert(cSizeChunk <= zc->blockSize + ZSTD_blockHeaderSize); } - /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes - * for the next block. + /* cRep and dRep may have diverged during the compression. + * If so, we use the dRep repcodes for the next block. 
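 *
 * The divergence check that feeds this decision is, in miniature (editor's
 * illustrative sketch, not part of this patch; resolve() abstracts
 * ZSTD_resolveRepcodeToRawOffset, and 3 == ZSTD_REP_NUM):
 *
 *     typedef struct { unsigned rep[3]; } Reps;
 *
 *     // Returns 0 to keep the repcode, or the raw offset to emit instead.
 *     static unsigned reconcile(const Reps* dRep, const Reps* cRep,
 *                               unsigned repIdx, unsigned ll0,
 *                               unsigned (*resolve)(const unsigned*, unsigned, unsigned))
 *     {
 *         unsigned const dRaw = resolve(dRep->rep, repIdx, ll0);  // decoder's view
 *         unsigned const cRaw = resolve(cRep->rep, repIdx, ll0);  // compressor's intent
 *         return (dRaw != cRaw) ? cRaw : 0;
 *     }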
*/ ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); return cSize; @@ -3643,8 +4104,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) { - const BYTE* ip = (const BYTE*)src; - BYTE* op = (BYTE*)dst; U32 nbSeq; size_t cSize; DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); @@ -3655,7 +4114,7 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, if (bss == ZSTDbss_noCompress) { if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; - cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); + cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); return cSize; @@ -3673,9 +4132,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 frame) { - /* This the upper bound for the length of an rle block. - * This isn't the actual upper bound. Finding the real threshold - * needs further investigation. + /* This is an estimated upper bound for the length of an rle block. + * This isn't the actual upper bound. + * Finding the real threshold needs further investigation. */ const U32 rleMaxLength = 25; size_t cSize; @@ -3767,10 +4226,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, * * cSize >= blockBound(srcSize): We have expanded the block too much so * emit an uncompressed block. */ - { - size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); + { size_t const cSize = + ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); if (cSize != ERROR(dstSize_tooSmall)) { - size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); + size_t const maxCSize = + srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); @@ -3778,7 +4238,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, } } } - } + } /* if (bss == ZSTDbss_compress)*/ DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); /* Superblock compression failed, attempt to emit a single no compress block. @@ -3836,7 +4296,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, * All blocks will be terminated, all input will be consumed. * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. * Frame is supposed already started (header already produced) -* @return : compressed size, or an error code +* @return : compressed size, or an error code */ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, @@ -3860,7 +4320,9 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, ZSTD_matchState_t* const ms = &cctx->blockState.matchState; U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); - RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE, + /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding + * additional 1. 
We need to revisit and change this logic to be more consistent. */
+        RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1,
                         dstSize_tooSmall,
                         "not enough space to store compressed block");
         if (remaining < blockSize) blockSize = remaining;
@@ -3899,7 +4361,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
                 MEM_writeLE24(op, cBlockHeader);
                 cSize += ZSTD_blockHeaderSize;
             }
-        }
+        }  /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/

         ip += blockSize;
@@ -4078,31 +4540,51 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
         }
 }

-size_t ZSTD_compressContinue (ZSTD_CCtx* cctx,
-                              void* dst, size_t dstCapacity,
-                              const void* src, size_t srcSize)
+size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx,
+                                    void* dst, size_t dstCapacity,
+                                    const void* src, size_t srcSize)
 {
     DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize);
     return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */);
 }

+/* NOTE: Must just wrap ZSTD_compressContinue_public() */
+size_t ZSTD_compressContinue(ZSTD_CCtx* cctx,
+                             void* dst, size_t dstCapacity,
+                             const void* src, size_t srcSize)
+{
+    return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize);
+}

-size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx)
+static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx)
 {
     ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams;
     assert(!ZSTD_checkCParams(cParams));
-    return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog);
+    return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog);
 }

-size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+/* NOTE: Must just wrap ZSTD_getBlockSize_deprecated() */
+size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx)
+{
+    return ZSTD_getBlockSize_deprecated(cctx);
+}
+
+size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
 {
     DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize);
-    { size_t const blockSizeMax = ZSTD_getBlockSize(cctx);
+    { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx);
       RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); }

     return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */);
 }

+/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */
+size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize);
+}
+
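/* Editor's illustrative aside (not part of this patch): the "short cache"
 * scheme accounted for below packs a small tag into the low bits of each
 * hash-table entry, which is why CDict indices must stay small enough to fit
 * the remaining high bits. A sketch with a hypothetical tagBits parameter
 * (zstd's real constant is ZSTD_SHORT_CACHE_TAG_BITS):
 *
 *     static unsigned packIndexAndTag(unsigned index, unsigned tag, unsigned tagBits)
 *     {
 *         // index must fit in (32 - tagBits) bits for this to be lossless
 *         return (index << tagBits) | (tag & ((1u << tagBits) - 1u));
 *     }
 *
 *     static unsigned entryToIndex(unsigned entry, unsigned tagBits)
 *     {
 *         return entry >> tagBits;
 *     }
 */

/*! 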
ZSTD_loadDictionaryContent() : * @return : 0, or an error code */ @@ -4111,25 +4593,36 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, ZSTD_cwksp* ws, ZSTD_CCtx_params const* params, const void* src, size_t srcSize, - ZSTD_dictTableLoadMethod_e dtlm) + ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp) { const BYTE* ip = (const BYTE*) src; const BYTE* const iend = ip + srcSize; int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; - /* Assert that we the ms params match the params we're being given */ + /* Assert that the ms params match the params we're being given */ ZSTD_assertEqualCParams(params->cParams, ms->cParams); - if (srcSize > ZSTD_CHUNKSIZE_MAX) { + { /* Ensure large dictionaries can't cause index overflow */ + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. * Dictionaries right at the edge will immediately trigger overflow * correction, but I don't want to insert extra constraints here. */ - U32 const maxDictSize = ZSTD_CURRENT_MAX - 1; - /* We must have cleared our windows when our source is this large. */ - assert(ZSTD_window_isEmpty(ms->window)); - if (loadLdmDict) - assert(ZSTD_window_isEmpty(ls->window)); + U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX; + + int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(¶ms->cParams); + if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) { + /* Some dictionary matchfinders in zstd use "short cache", + * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each + * CDict hashtable entry as a tag rather than as part of an index. + * When short cache is used, we need to truncate the dictionary + * so that its indices don't overlap with the tag. */ + U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX; + maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize); + assert(!loadLdmDict); + } + /* If the dictionary is too large, only load the suffix of the dictionary. */ if (srcSize > maxDictSize) { ip = iend - maxDictSize; @@ -4138,30 +4631,46 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, } } - DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); + if (srcSize > ZSTD_CHUNKSIZE_MAX) { + /* We must have cleared our windows when our source is this large. */ + assert(ZSTD_window_isEmpty(ms->window)); + if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); + } ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); - ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); - ms->forceNonContiguous = params->deterministicRefPrefix; - if (loadLdmDict) { + DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); + + if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */ ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base); + ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); + } + + /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */ + if (params->cParams.strategy < ZSTD_btultra) { + U32 maxDictSize = 8U << MIN(MAX(params->cParams.hashLog, params->cParams.chainLog), 28); + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; + src = ip; + srcSize = maxDictSize; + } } + ms->nextToUpdate = (U32)(ip - ms->window.base); + ms->loadedDictEnd = params->forceWindow ? 
0 : (U32)(iend - ms->window.base); + ms->forceNonContiguous = params->deterministicRefPrefix; + if (srcSize <= HASH_READ_SIZE) return 0; ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); - if (loadLdmDict) - ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); - switch(params->cParams.strategy) { case ZSTD_fast: - ZSTD_fillHashTable(ms, iend, dtlm); + ZSTD_fillHashTable(ms, iend, dtlm, tfp); break; case ZSTD_dfast: - ZSTD_fillDoubleHashTable(ms, iend, dtlm); + ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp); break; case ZSTD_greedy: @@ -4174,7 +4683,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, } else { assert(params->useRowMatchFinder != ZSTD_ps_auto); if (params->useRowMatchFinder == ZSTD_ps_enable) { - size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16); + size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog); ZSTD_memset(ms->tagTable, 0, tagTableSize); ZSTD_row_update(ms, iend-HASH_READ_SIZE); DEBUGLOG(4, "Using row-based hash table for lazy dict"); @@ -4327,6 +4836,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, ZSTD_CCtx_params const* params, const void* dict, size_t dictSize, ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp, void* workspace) { const BYTE* dictPtr = (const BYTE*)dict; @@ -4345,7 +4855,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, { size_t const dictContentSize = (size_t)(dictEnd - dictPtr); FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( - ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), ""); + ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), ""); } return dictID; } @@ -4361,6 +4871,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType, ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp, void* workspace) { DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); @@ -4373,13 +4884,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, /* dict restricted modes */ if (dictContentType == ZSTD_dct_rawContent) - return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm); + return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp); if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { if (dictContentType == ZSTD_dct_auto) { DEBUGLOG(4, "raw content dictionary detected"); return ZSTD_loadDictionaryContent( - ms, ls, ws, params, dict, dictSize, dtlm); + ms, ls, ws, params, dict, dictSize, dtlm, tfp); } RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); assert(0); /* impossible */ @@ -4387,13 +4898,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, /* dict as full zstd dictionary */ return ZSTD_loadZstdDictionary( - bs, ms, ws, params, dict, dictSize, dtlm, workspace); + bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace); } #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB) #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL) /*! 
ZSTD_compressBegin_internal() : + * Assumption : either @dict OR @cdict (or none) is non-NULL, never both * @return : 0, or an error code */ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, @@ -4426,11 +4938,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, cctx->blockState.prevCBlock, &cctx->blockState.matchState, &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, cdict->dictContentSize, cdict->dictContentType, dtlm, - cctx->entropyWorkspace) + ZSTD_tfp_forCCtx, cctx->entropyWorkspace) : ZSTD_compress_insertDictionary( cctx->blockState.prevCBlock, &cctx->blockState.matchState, &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, - dictContentType, dtlm, cctx->entropyWorkspace); + dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace); FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); assert(dictID <= UINT_MAX); cctx->dictID = (U32)dictID; @@ -4471,11 +4983,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, &cctxParams, pledgedSrcSize); } -size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) +static size_t +ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) { ZSTD_CCtx_params cctxParams; - { - ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); + { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel); } DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); @@ -4483,9 +4995,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); } +size_t +ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) +{ + return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel); +} + size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) { - return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); + return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel); } @@ -4537,9 +5055,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) (void)extraCSize; } -size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize) +size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) { size_t endResult; size_t const cSize = ZSTD_compressContinue_internal(cctx, @@ -4563,6 +5081,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, return cSize + endResult; } +/* NOTE: Must just wrap ZSTD_compressEnd_public() */ +size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); +} + size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, @@ -4591,7 +5117,7 @@ size_t ZSTD_compress_advanced_internal( FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, params, srcSize, ZSTDb_not_buffered) , ""); - return ZSTD_compressEnd(cctx, dst, dstCapacity, 
src, srcSize); + return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); } size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, @@ -4709,7 +5235,7 @@ static size_t ZSTD_initCDict_internal( { size_t const dictID = ZSTD_compress_insertDictionary( &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, ¶ms, cdict->dictContent, cdict->dictContentSize, - dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace); + dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace); FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); assert(dictID <= (size_t)(U32)-1); cdict->dictID = (U32)dictID; @@ -4906,6 +5432,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( params.cParams = cParams; params.useRowMatchFinder = useRowMatchFinder; cdict->useRowMatchFinder = useRowMatchFinder; + cdict->compressionLevel = ZSTD_NO_CLEVEL; if (ZSTD_isError( ZSTD_initCDict_internal(cdict, dict, dictSize, @@ -4985,12 +5512,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced( /* ZSTD_compressBegin_usingCDict() : * cdict must be != NULL */ -size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) +size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) { ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); } +size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) +{ + return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict); +} + /*! ZSTD_compress_usingCDict_internal(): * Implementation of various ZSTD_compress_usingCDict* functions. */ @@ -5000,7 +5532,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) { FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ - return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); + return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); } /*! ZSTD_compress_usingCDict_advanced(): @@ -5197,30 +5729,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) { - size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; - if (hintInSize==0) hintInSize = cctx->blockSize; - return hintInSize; + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + return cctx->blockSize - cctx->stableIn_notConsumed; + } + assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered); + { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; + if (hintInSize==0) hintInSize = cctx->blockSize; + return hintInSize; + } } /* ZSTD_compressStream_generic(): * internal function for all *compressStream*() variants - * non-static, because can be called from zstdmt_compress.c - * @return : hint size for next input */ + * @return : hint size for next input to complete ongoing block */ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input, ZSTD_EndDirective const flushMode) { - const char* const istart = (const char*)input->src; - const char* const iend = input->size != 0 ? istart + input->size : istart; - const char* ip = input->pos != 0 ? istart + input->pos : istart; - char* const ostart = (char*)output->dst; - char* const oend = output->size != 0 ? ostart + output->size : ostart; - char* op = output->pos != 0 ? 
ostart + output->pos : ostart; + const char* const istart = (assert(input != NULL), (const char*)input->src); + const char* const iend = (istart != NULL) ? istart + input->size : istart; + const char* ip = (istart != NULL) ? istart + input->pos : istart; + char* const ostart = (assert(output != NULL), (char*)output->dst); + char* const oend = (ostart != NULL) ? ostart + output->size : ostart; + char* op = (ostart != NULL) ? ostart + output->pos : ostart; U32 someMoreWork = 1; /* check expectations */ - DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode); + DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos); + assert(zcs != NULL); + if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) { + assert(input->pos >= zcs->stableIn_notConsumed); + input->pos -= zcs->stableIn_notConsumed; + ip -= zcs->stableIn_notConsumed; + zcs->stableIn_notConsumed = 0; + } if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { assert(zcs->inBuff != NULL); assert(zcs->inBuffSize > 0); @@ -5229,8 +5772,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, assert(zcs->outBuff != NULL); assert(zcs->outBuffSize > 0); } - assert(output->pos <= output->size); + if (input->src == NULL) assert(input->size == 0); assert(input->pos <= input->size); + if (output->dst == NULL) assert(output->size == 0); + assert(output->pos <= output->size); assert((U32)flushMode <= (U32)ZSTD_e_end); while (someMoreWork) { @@ -5245,7 +5790,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ && (zcs->inBuffPos == 0) ) { /* shortcut to compression pass directly into output buffer */ - size_t const cSize = ZSTD_compressEnd(zcs, + size_t const cSize = ZSTD_compressEnd_public(zcs, op, oend-op, ip, iend-ip); DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); @@ -5262,8 +5807,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, zcs->inBuff + zcs->inBuffPos, toLoad, ip, iend-ip); zcs->inBuffPos += loaded; - if (loaded != 0) - ip += loaded; + if (ip) ip += loaded; if ( (flushMode == ZSTD_e_continue) && (zcs->inBuffPos < zcs->inBuffTarget) ) { /* not enough input to fill full block : stop here */ @@ -5274,6 +5818,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, /* empty */ someMoreWork = 0; break; } + } else { + assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable); + if ( (flushMode == ZSTD_e_continue) + && ( (size_t)(iend - ip) < zcs->blockSize) ) { + /* can't compress a full block : stop here */ + zcs->stableIn_notConsumed = (size_t)(iend - ip); + ip = iend; /* pretend to have consumed input */ + someMoreWork = 0; break; + } + if ( (flushMode == ZSTD_e_flush) + && (ip == iend) ) { + /* empty */ + someMoreWork = 0; break; + } } /* compress current block (note : this stage cannot be stopped in the middle) */ DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); @@ -5281,9 +5839,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, void* cDst; size_t cSize; size_t oSize = oend-op; - size_t const iSize = inputBuffered - ? zcs->inBuffPos - zcs->inToCompress - : MIN((size_t)(iend - ip), zcs->blockSize); + size_t const iSize = inputBuffered ? 
zcs->inBuffPos - zcs->inToCompress + : MIN((size_t)(iend - ip), zcs->blockSize); if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) cDst = op; /* compress into output buffer, to skip flush stage */ else @@ -5291,9 +5848,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, if (inputBuffered) { unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); cSize = lastBlock ? - ZSTD_compressEnd(zcs, cDst, oSize, + ZSTD_compressEnd_public(zcs, cDst, oSize, zcs->inBuff + zcs->inToCompress, iSize) : - ZSTD_compressContinue(zcs, cDst, oSize, + ZSTD_compressContinue_public(zcs, cDst, oSize, zcs->inBuff + zcs->inToCompress, iSize); FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); zcs->frameEnded = lastBlock; @@ -5306,19 +5863,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, if (!lastBlock) assert(zcs->inBuffTarget <= zcs->inBuffSize); zcs->inToCompress = zcs->inBuffPos; - } else { - unsigned const lastBlock = (ip + iSize == iend); - assert(flushMode == ZSTD_e_end /* Already validated */); + } else { /* !inputBuffered, hence ZSTD_bm_stable */ + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); cSize = lastBlock ? - ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : - ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize); + ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) : + ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize); /* Consume the input prior to error checking to mirror buffered mode. */ - if (iSize > 0) - ip += iSize; + if (ip) ip += iSize; FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); zcs->frameEnded = lastBlock; - if (lastBlock) - assert(ip == iend); + if (lastBlock) assert(ip == iend); } if (cDst == op) { /* no need to flush */ op += cSize; @@ -5388,8 +5942,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf /* After a compression call set the expected input/output buffer. * This is validated at the start of the next compression call. 
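The ZSTD_bm_stable paths added in this hunk let ZSTD_compressStream_generic resume directly from a caller-owned buffer, tracking promised-but-unconsumed bytes in stableIn_notConsumed instead of rejecting ZSTD_e_continue/ZSTD_e_flush outright. A minimal caller sketch follows; it assumes the experimental ZSTD_c_stableInBuffer parameter from upstream zstd.h, and the error-handling convention is illustrative only:

    #define ZSTD_STATIC_LINKING_ONLY   /* stable-buffer params are experimental */
    #include <zstd.h>

    /* One-shot compression promising zstd that `src` stays valid and
     * unmodified until the frame ends; zstd may then read from it directly
     * instead of copying into its internal window. Sketch only. */
    static size_t compress_stable(ZSTD_CCtx *cctx,
                                  void *dst, size_t dstCap,
                                  const void *src, size_t srcSize)
    {
        ZSTD_inBuffer  in  = { src, srcSize, 0 };
        ZSTD_outBuffer out = { dst, dstCap, 0 };
        size_t r;

        ZSTD_CCtx_setParameter(cctx, ZSTD_c_stableInBuffer, 1);
        r = ZSTD_compressStream2(cctx, &out, &in, ZSTD_e_end);
        if (ZSTD_isError(r))
            return r;              /* e.g. stabilityCondition_notRespected */
        if (r != 0)
            return (size_t)-1;     /* dst too small for the full frame */
        return out.pos;
    }

Moving the input buffer, or externally rewinding its pos, between calls is what the relaxed stability checks below now report as stabilityCondition_notRespected.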
*/ -static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input) +static void +ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input) { + DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)"); if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { cctx->expectedInBuffer = *input; } @@ -5408,22 +5964,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, { if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { ZSTD_inBuffer const expect = cctx->expectedInBuffer; - if (expect.src != input->src || expect.pos != input->pos || expect.size != input->size) - RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer enabled but input differs!"); - if (endOp != ZSTD_e_end) - RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer can only be used with ZSTD_e_end!"); + if (expect.src != input->src || expect.pos != input->pos) + RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!"); } + (void)endOp; if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { size_t const outBufferSize = output->size - output->pos; if (cctx->expectedOutBufferSize != outBufferSize) - RETURN_ERROR(dstBuffer_wrong, "ZSTD_c_stableOutBuffer enabled but output size differs!"); + RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!"); } return 0; } static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, ZSTD_EndDirective endOp, - size_t inSize) { + size_t inSize) +{ ZSTD_CCtx_params params = cctx->requestedParams; ZSTD_prefixDict const prefixDict = cctx->prefixDict; FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */ @@ -5437,9 +5993,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, params.compressionLevel = cctx->cdict->compressionLevel; } DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); - if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-fix pledgedSrcSize */ - { - size_t const dictSize = prefixDict.dict + if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */ + + { size_t const dictSize = prefixDict.dict ? prefixDict.dictSize : (cctx->cdict ? 
cctx->cdict->dictContentSize : 0); ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); @@ -5451,6 +6007,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); + params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences); + params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize); + params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel); { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); @@ -5477,6 +6036,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, return 0; } +/* @return provides a minimum amount of data remaining to be flushed from internal buffers + */ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input, @@ -5491,8 +6052,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, /* transparent initialization stage */ if (cctx->streamStage == zcss_init) { - FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, input->size), "CompressStream2 initialization failed"); - ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ + size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */ + size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed; + if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */ + && (endOp == ZSTD_e_continue) /* no flush requested, more input to come */ + && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) { /* not even reached one block yet */ + if (cctx->stableIn_notConsumed) { /* not the first time */ + /* check stable source guarantees */ + RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer"); + RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos"); + } + /* pretend input was consumed, to give a sense forward progress */ + input->pos = input->size; + /* save stable inBuffer, for later control, and flush/end */ + cctx->expectedInBuffer = *input; + /* but actually input wasn't consumed, so keep track of position from where compression shall resume */ + cctx->stableIn_notConsumed += inputSize; + /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */ + return ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */ + } + FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed"); + ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ } /* end of transparent initialization stage */ @@ -5510,13 +6090,20 @@ size_t ZSTD_compressStream2_simpleArgs ( const void* src, size_t srcSize, size_t* srcPos, ZSTD_EndDirective endOp) { - ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; - ZSTD_inBuffer input = { src, srcSize, *srcPos }; + 
ZSTD_outBuffer output; + ZSTD_inBuffer input; + output.dst = dst; + output.size = dstCapacity; + output.pos = *dstPos; + input.src = src; + input.size = srcSize; + input.pos = *srcPos; /* ZSTD_compressStream2() will check validity of dstPos and srcPos */ - size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); - *dstPos = output.pos; - *srcPos = input.pos; - return cErr; + { size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); + *dstPos = output.pos; + *srcPos = input.pos; + return cErr; + } } size_t ZSTD_compress2(ZSTD_CCtx* cctx, @@ -5539,6 +6126,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, /* Reset to the original values. */ cctx->requestedParams.inBufferMode = originalInBufferMode; cctx->requestedParams.outBufferMode = originalOutBufferMode; + FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); if (result != 0) { /* compression not completed, due to lack of output space */ assert(oPos == dstCapacity); @@ -5549,64 +6137,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, } } -typedef struct { - U32 idx; /* Index in array of ZSTD_Sequence */ - U32 posInSequence; /* Position within sequence at idx */ - size_t posInSrc; /* Number of bytes given by sequences provided so far */ -} ZSTD_sequencePosition; - /* ZSTD_validateSequence() : * @offCode : is presumed to follow format required by ZSTD_storeSeq() * @returns a ZSTD error code if sequence is not valid */ static size_t -ZSTD_validateSequence(U32 offCode, U32 matchLength, - size_t posInSrc, U32 windowLog, size_t dictSize) +ZSTD_validateSequence(U32 offCode, U32 matchLength, U32 minMatch, + size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer) { - U32 const windowSize = 1 << windowLog; + U32 const windowSize = 1u << windowLog; /* posInSrc represents the amount of data the decoder would decode up to this point. * As long as the amount of data decoded is less than or equal to window size, offsets may be * larger than the total length of output decoded in order to reference the dict, even larger than * window size. After output surpasses windowSize, we're limited to windowSize offsets again. */ size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; - RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!"); - RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small"); + size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 
3 : 4; + RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!"); + /* Validate maxNbSeq is large enough for the given matchLength and minMatch */ + RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch"); return 0; } /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ -static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) +static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) { - U32 offCode = STORE_OFFSET(rawOffset); + U32 offBase = OFFSET_TO_OFFBASE(rawOffset); if (!ll0 && rawOffset == rep[0]) { - offCode = STORE_REPCODE_1; + offBase = REPCODE1_TO_OFFBASE; } else if (rawOffset == rep[1]) { - offCode = STORE_REPCODE(2 - ll0); + offBase = REPCODE_TO_OFFBASE(2 - ll0); } else if (rawOffset == rep[2]) { - offCode = STORE_REPCODE(3 - ll0); + offBase = REPCODE_TO_OFFBASE(3 - ll0); } else if (ll0 && rawOffset == rep[0] - 1) { - offCode = STORE_REPCODE_3; + offBase = REPCODE3_TO_OFFBASE; } - return offCode; + return offBase; } -/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of - * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. - */ -static size_t +size_t ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, - const void* src, size_t blockSize) + const void* src, size_t blockSize, + ZSTD_paramSwitch_e externalRepSearch) { U32 idx = seqPos->idx; + U32 const startIdx = idx; BYTE const* ip = (BYTE const*)(src); const BYTE* const iend = ip + blockSize; repcodes_t updatedRepcodes; U32 dictSize; + DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreExplicitBlockDelim (blockSize = %zu)", blockSize); + if (cctx->cdict) { dictSize = (U32)cctx->cdict->dictContentSize; } else if (cctx->prefixDict.dict) { @@ -5615,25 +6200,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, dictSize = 0; } ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); - for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) { + for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) { U32 const litLength = inSeqs[idx].litLength; - U32 const ll0 = (litLength == 0); U32 const matchLength = inSeqs[idx].matchLength; - U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); - ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); + U32 offBase; - DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); + if (externalRepSearch == ZSTD_ps_disable) { + offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset); + } else { + U32 const ll0 = (litLength == 0); + offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0); + ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } + + DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); if (cctx->appliedParams.validateSequences) { seqPos->posInSrc += litLength + matchLength; - FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, - cctx->appliedParams.cParams.windowLog, dictSize), + FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, + cctx->appliedParams.cParams.windowLog, dictSize, 
cctx->appliedParams.useSequenceProducer), "Sequence validation failed"); } - RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, + RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); - ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); + ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); ip += matchLength + litLength; } + + /* If we skipped repcode search while parsing, we need to update repcodes now */ + assert(externalRepSearch != ZSTD_ps_auto); + assert(idx >= startIdx); + if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) { + U32* const rep = updatedRepcodes.rep; + U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */ + + if (lastSeqIdx >= startIdx + 2) { + rep[2] = inSeqs[lastSeqIdx - 2].offset; + rep[1] = inSeqs[lastSeqIdx - 1].offset; + rep[0] = inSeqs[lastSeqIdx].offset; + } else if (lastSeqIdx == startIdx + 1) { + rep[2] = rep[0]; + rep[1] = inSeqs[lastSeqIdx - 1].offset; + rep[0] = inSeqs[lastSeqIdx].offset; + } else { + assert(lastSeqIdx == startIdx); + rep[2] = rep[1]; + rep[1] = rep[0]; + rep[0] = inSeqs[lastSeqIdx].offset; + } + } + ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); if (inSeqs[idx].litLength) { @@ -5642,26 +6257,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ip += inSeqs[idx].litLength; seqPos->posInSrc += inSeqs[idx].litLength; } - RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!"); + RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!"); seqPos->idx = idx+1; return 0; } -/* Returns the number of bytes to move the current read position back by. Only non-zero - * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something - * went wrong. - * - * This function will attempt to scan through blockSize bytes represented by the sequences - * in inSeqs, storing any (partial) sequences. - * - * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to - * avoid splitting a match, or to avoid splitting a match such that it would produce a match - * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. 
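ZSTD_validateSequence now receives minMatch and the external-producer flag in addition to the window/dictionary geometry. Restated as a standalone predicate (all names are local to this sketch, not kernel symbols):

    #include <stddef.h>

    /* An offset may reference previously decoded output plus the dictionary,
     * but never reach beyond windowSize once that much output exists; the
     * minimum match length is 3 only when minMatch == 3 or the sequences
     * come from an external sequence producer, otherwise 4. */
    static int seq_is_valid(unsigned rawOffset, unsigned matchLength,
                            unsigned minMatch, size_t posInSrc,
                            unsigned windowLog, size_t dictSize,
                            int externalProducer)
    {
        size_t const windowSize  = (size_t)1 << windowLog;
        size_t const offsetBound = (posInSrc > windowSize)
                                 ? windowSize : posInSrc + dictSize;
        unsigned const mlBound   = (minMatch == 3 || externalProducer) ? 3 : 4;
        return (size_t)rawOffset <= offsetBound && matchLength >= mlBound;
    }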
- */ -static size_t +size_t ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, - const void* src, size_t blockSize) + const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch) { U32 idx = seqPos->idx; U32 startPosInSequence = seqPos->posInSequence; @@ -5673,6 +6277,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* U32 bytesAdjustment = 0; U32 finalMatchSplit = 0; + /* TODO(embg) support fast parsing mode in noBlockDelim mode */ + (void)externalRepSearch; + if (cctx->cdict) { dictSize = cctx->cdict->dictContentSize; } else if (cctx->prefixDict.dict) { @@ -5680,7 +6287,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* } else { dictSize = 0; } - DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); + DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreNoBlockDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { @@ -5688,7 +6295,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* U32 litLength = currSeq.litLength; U32 matchLength = currSeq.matchLength; U32 const rawOffset = currSeq.offset; - U32 offCode; + U32 offBase; /* Modify the sequence depending on where endPosInSequence lies */ if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { @@ -5702,7 +6309,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* /* Move to the next sequence */ endPosInSequence -= currSeq.litLength + currSeq.matchLength; startPosInSequence = 0; - idx++; } else { /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence does not reach the end of the match. So, we have to split the sequence */ @@ -5742,21 +6348,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* } /* Check if this offset can be represented with a repcode */ { U32 const ll0 = (litLength == 0); - offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0); - ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); + offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0); + ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); } if (cctx->appliedParams.validateSequences) { seqPos->posInSrc += litLength + matchLength; - FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, - cctx->appliedParams.cParams.windowLog, dictSize), + FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, + cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer), "Sequence validation failed"); } - DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); - RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, + DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, "Not enough memory allocated. 
Try adjusting ZSTD_c_minMatch."); - ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); + ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); ip += matchLength + litLength; + if (!finalMatchSplit) + idx++; /* Next Sequence */ } DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); @@ -5779,7 +6387,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, - const void* src, size_t blockSize); + const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) { ZSTD_sequenceCopier sequenceCopier = NULL; @@ -5793,6 +6401,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) return sequenceCopier; } +/* Discover the size of next block by searching for the delimiter. + * Note that a block delimiter **must** exist in this mode, + * otherwise it's an input error. + * The block size retrieved will be later compared to ensure it remains within bounds */ +static size_t +blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) +{ + int end = 0; + size_t blockSize = 0; + size_t spos = seqPos.idx; + DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize); + assert(spos <= inSeqsSize); + while (spos < inSeqsSize) { + end = (inSeqs[spos].offset == 0); + blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength; + if (end) { + if (inSeqs[spos].matchLength != 0) + RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0"); + break; + } + spos++; + } + if (!end) + RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter"); + return blockSize; +} + +/* More a "target" block size */ +static size_t blockSize_noDelimiter(size_t blockSize, size_t remaining) +{ + int const lastBlock = (remaining <= blockSize); + return lastBlock ? remaining : blockSize; +} + +static size_t determine_blockSize(ZSTD_sequenceFormat_e mode, + size_t blockSize, size_t remaining, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) +{ + DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining); + if (mode == ZSTD_sf_noBlockDelimiters) + return blockSize_noDelimiter(blockSize, remaining); + { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos); + FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters"); + if (explicitBlockSize > blockSize) + RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block"); + if (explicitBlockSize > remaining) + RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source"); + return explicitBlockSize; + } +} + /* Compress, block-by-block, all of the sequences given. 
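The new blockSize_explicitDelimiter/determine_blockSize helpers above formalize the delimiter convention: in ZSTD_sf_explicitBlockDelimiters mode every block's sequence run must end with a delimiter entry, or the input is rejected as externalSequences_invalid. A sketch of emitting that delimiter, using the ZSTD_Sequence layout from zstd's experimental API:

    #define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_Sequence is experimental API */
    #include <zstd.h>

    /* With ZSTD_c_blockDelimiters == ZSTD_sf_explicitBlockDelimiters, a block
     * ends at the first sequence whose offset and matchLength are both 0;
     * its litLength carries the block's trailing literals. */
    static void end_block(ZSTD_Sequence *seqs, size_t i, unsigned lastLiterals)
    {
        seqs[i].offset      = 0;            /* delimiter marker                */
        seqs[i].matchLength = 0;            /* must also be 0, else invalid    */
        seqs[i].litLength   = lastLiterals; /* literals after the last match   */
        seqs[i].rep         = 0;
    }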
* * Returns the cumulative size of all compressed blocks (including their headers), @@ -5805,9 +6464,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, const void* src, size_t srcSize) { size_t cSize = 0; - U32 lastBlock; - size_t blockSize; - size_t compressedSeqsSize; size_t remaining = srcSize; ZSTD_sequencePosition seqPos = {0, 0, 0}; @@ -5827,22 +6483,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, } while (remaining) { + size_t compressedSeqsSize; size_t cBlockSize; size_t additionalByteAdjustment; - lastBlock = remaining <= cctx->blockSize; - blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize; + size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters, + cctx->blockSize, remaining, + inSeqs, inSeqsSize, seqPos); + U32 const lastBlock = (blockSize == remaining); + FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size"); + assert(blockSize <= remaining); ZSTD_resetSeqStore(&cctx->seqStore); - DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize); + DEBUGLOG(5, "Working on new block. Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize); - additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize); + additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes); FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy"); blockSize -= additionalByteAdjustment; /* If blocks are too small, emit as a nocompress block */ - if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { + /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding + * additional 1. We need to revisit and change this logic to be more consistent */ + if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); - DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); + DEBUGLOG(5, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); cSize += cBlockSize; ip += blockSize; op += cBlockSize; @@ -5851,6 +6514,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, continue; } + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, &cctx->appliedParams, @@ -5859,11 +6523,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, cctx->bmi2); FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); - DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize); + DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); if (!cctx->isFirstBlock && ZSTD_maybeRLE(&cctx->seqStore) && - ZSTD_isRLE((BYTE const*)src, srcSize)) { + ZSTD_isRLE(ip, blockSize)) { /* We don't want to emit our first block as a RLE even if it qualifies because * doing so will cause the decoder (cli only) to throw a "should consume all input error." 
* This is only an issue for zstd <= v1.4.3 @@ -5874,12 +6538,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, if (compressedSeqsSize == 0) { /* ZSTD_noCompressBlock writes the block header as well */ cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); - FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); - DEBUGLOG(4, "Writing out nocompress block, size: %zu", cBlockSize); + FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed"); + DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize); } else if (compressedSeqsSize == 1) { cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock); - FORWARD_IF_ERROR(cBlockSize, "RLE compress block failed"); - DEBUGLOG(4, "Writing out RLE block, size: %zu", cBlockSize); + FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed"); + DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize); } else { U32 cBlockHeader; /* Error checking and repcodes update */ @@ -5891,11 +6555,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); MEM_writeLE24(op, cBlockHeader); cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; - DEBUGLOG(4, "Writing out compressed block, size: %zu", cBlockSize); + DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); } cSize += cBlockSize; - DEBUGLOG(4, "cSize running total: %zu", cSize); if (lastBlock) { break; @@ -5906,12 +6569,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, dstCapacity -= cBlockSize; cctx->isFirstBlock = 0; } + DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); } + DEBUGLOG(4, "cSize final total: %zu", cSize); return cSize; } -size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity, +size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, const ZSTD_Sequence* inSeqs, size_t inSeqsSize, const void* src, size_t srcSize) { @@ -5921,7 +6587,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci size_t frameHeaderSize = 0; /* Transparent initialization stage, same as compressStream2() */ - DEBUGLOG(3, "ZSTD_compressSequences()"); + DEBUGLOG(4, "ZSTD_compressSequences (dstCapacity=%zu)", dstCapacity); assert(cctx != NULL); FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); /* Begin writing output, starting with frame header */ @@ -5949,26 +6615,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci cSize += 4; } - DEBUGLOG(3, "Final compressed size: %zu", cSize); + DEBUGLOG(4, "Final compressed size: %zu", cSize); return cSize; } /*====== Finalize ======*/ +static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs) +{ + const ZSTD_inBuffer nullInput = { NULL, 0, 0 }; + const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable); + return stableInput ? zcs->expectedInBuffer : nullInput; +} + /*! 
ZSTD_flushStream() : * @return : amount of data remaining to flush */ size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) { - ZSTD_inBuffer input = { NULL, 0, 0 }; + ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); + input.size = input.pos; /* do not ingest more input during flush */ return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); } size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) { - ZSTD_inBuffer input = { NULL, 0, 0 }; + ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end); - FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed"); + FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed"); if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ /* single thread mode : attempt to calculate remaining to flush more precisely */ { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; @@ -6090,7 +6764,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, cp.targetLength = (unsigned)(-clampedCompressionLevel); } /* refine parameters based on srcSize & dictSize */ - return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode); + return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto); } } @@ -6125,3 +6799,21 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); } + +void ZSTD_registerSequenceProducer( + ZSTD_CCtx* zc, void* mState, + ZSTD_sequenceProducer_F* mFinder +) { + if (mFinder != NULL) { + ZSTD_externalMatchCtx emctx; + emctx.mState = mState; + emctx.mFinder = mFinder; + emctx.seqBuffer = NULL; + emctx.seqBufferCapacity = 0; + zc->externalMatchCtx = emctx; + zc->requestedParams.useSequenceProducer = 1; + } else { + ZSTD_memset(&zc->externalMatchCtx, 0, sizeof(zc->externalMatchCtx)); + zc->requestedParams.useSequenceProducer = 0; + } +} diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h index 71697a11ae3056..899f5e2de8e96c 100644 --- a/lib/zstd/compress/zstd_compress_internal.h +++ b/lib/zstd/compress/zstd_compress_internal.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -20,6 +21,7 @@ ***************************************/ #include "../common/zstd_internal.h" #include "zstd_cwksp.h" +#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */ /*-************************************* @@ -111,12 +113,13 @@ typedef struct { /* ZSTD_buildBlockEntropyStats() : * Builds entropy for the block. 
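ZSTD_registerSequenceProducer, added at the end of zstd_compress.c above, wires a user callback and its state into the cctx. A caller-side sketch, assuming the callback signature, the ZSTD_SEQUENCE_PRODUCER_ERROR sentinel, and the ZSTD_c_enableSeqProducerFallback parameter as published in upstream zstd v1.5.4; verify against the zstd.h this series ships:

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* A producer that always declines, so the built-in matchfinders run. */
    static size_t null_producer(void *state,
                                ZSTD_Sequence *outSeqs, size_t outCap,
                                const void *src, size_t srcSize,
                                const void *dict, size_t dictSize,
                                int level, size_t windowSize)
    {
        (void)state; (void)outSeqs; (void)outCap; (void)src; (void)srcSize;
        (void)dict; (void)dictSize; (void)level; (void)windowSize;
        return ZSTD_SEQUENCE_PRODUCER_ERROR;   /* decline this block */
    }

    static void attach_producer(ZSTD_CCtx *cctx)
    {
        ZSTD_registerSequenceProducer(cctx, NULL /* mState */, null_producer);
        /* Without fallback enabled, a declined block fails the compression: */
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableSeqProducerFallback, 1);
    }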
* @return : 0 on success or error code */ -size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, - const ZSTD_entropyCTables_t* prevEntropy, - ZSTD_entropyCTables_t* nextEntropy, - const ZSTD_CCtx_params* cctxParams, - ZSTD_entropyCTablesMetadata_t* entropyMetadata, - void* workspace, size_t wkspSize); +size_t ZSTD_buildBlockEntropyStats( + const seqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize); /* ******************************* * Compression internals structs * @@ -142,6 +145,12 @@ typedef struct { size_t capacity; /* The capacity starting from `seq` pointer */ } rawSeqStore_t; +typedef struct { + U32 idx; /* Index in array of ZSTD_Sequence */ + U32 posInSequence; /* Position within sequence at idx */ + size_t posInSrc; /* Number of bytes given by sequences provided so far */ +} ZSTD_sequencePosition; + UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; typedef struct { @@ -212,8 +221,10 @@ struct ZSTD_matchState_t { U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ - U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ + BYTE* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ + U64 hashSalt; /* For row-based matchFinder: salts the hash for re-use of tag table */ + U32 hashSaltEntropy; /* For row-based matchFinder: collects entropy for salt generation */ U32* hashTable; U32* hashTable3; @@ -228,6 +239,18 @@ struct ZSTD_matchState_t { const ZSTD_matchState_t* dictMatchState; ZSTD_compressionParameters cParams; const rawSeqStore_t* ldmSeqStore; + + /* Controls prefetching in some dictMatchState matchfinders. + * This behavior is controlled from the cctx ms. + * This parameter has no effect in the cdict ms. */ + int prefetchCDictTables; + + /* When == 0, lazy match finders insert every position. + * When != 0, lazy match finders only insert positions they search. + * This allows them to skip much faster over incompressible data, + * at a small cost to compression ratio. + */ + int lazySkipping; }; typedef struct { @@ -324,6 +347,24 @@ struct ZSTD_CCtx_params_s { /* Internal use, for createCCtxParams() and freeCCtxParams() only */ ZSTD_customMem customMem; + + /* Controls prefetching in some dictMatchState matchfinders */ + ZSTD_paramSwitch_e prefetchCDictTables; + + /* Controls whether zstd will fall back to an internal matchfinder + * if the external matchfinder returns an error code. */ + int enableMatchFinderFallback; + + /* Indicates whether an external matchfinder has been referenced. + * Users can't set this externally. + * It is set internally in ZSTD_registerSequenceProducer(). 
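This hunk grows ZSTD_CCtx_params_s with prefetchCDictTables, enableMatchFinderFallback, and useSequenceProducer, plus the maxBlockSize and searchForExternalRepcodes fields just below. Upstream v1.5.4 exposes these as experimental setParameter knobs; the enum names in this sketch are assumptions to be checked against the shipped zstd.h:

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    static void tune_cctx(ZSTD_CCtx *cctx)
    {
        /* Prefetch CDict tables ahead of use in dictMatchState modes: */
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_prefetchCDictTables, ZSTD_ps_enable);
        /* Cap block size below the format's 128 KB ZSTD_BLOCKSIZE_MAX: */
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_maxBlockSize, 64 << 10);
        /* Trust offsets from an external producer; skip repcode search: */
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchForExternalRepcodes,
                               ZSTD_ps_disable);
    }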
*/ + int useSequenceProducer; + + /* Adjust the max block size*/ + size_t maxBlockSize; + + /* Controls repcode search in external sequence parsing */ + ZSTD_paramSwitch_e searchForExternalRepcodes; }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) @@ -355,6 +396,14 @@ typedef struct { ZSTD_entropyCTablesMetadata_t entropyMetadata; } ZSTD_blockSplitCtx; +/* Context for block-level external matchfinder API */ +typedef struct { + void* mState; + ZSTD_sequenceProducer_F* mFinder; + ZSTD_Sequence* seqBuffer; + size_t seqBufferCapacity; +} ZSTD_externalMatchCtx; + struct ZSTD_CCtx_s { ZSTD_compressionStage_e stage; int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */ @@ -404,6 +453,7 @@ struct ZSTD_CCtx_s { /* Stable in/out buffer verification */ ZSTD_inBuffer expectedInBuffer; + size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */ size_t expectedOutBufferSize; /* Dictionary */ @@ -417,9 +467,13 @@ struct ZSTD_CCtx_s { /* Workspace for block splitter */ ZSTD_blockSplitCtx blockSplitCtx; + + /* Workspace for external matchfinder */ + ZSTD_externalMatchCtx externalMatchCtx; }; typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; +typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e; typedef enum { ZSTD_noDict = 0, @@ -441,7 +495,7 @@ typedef enum { * In this mode we take both the source size and the dictionary size * into account when selecting and adjusting the parameters. */ - ZSTD_cpm_unknown = 3, /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. + ZSTD_cpm_unknown = 3 /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. * We don't know what these parameters are for. We default to the legacy * behavior of taking both the source size and the dict size into account * when selecting and adjusting parameters. @@ -500,9 +554,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) /* ZSTD_noCompressBlock() : * Writes uncompressed block to dst buffer from given src. * Returns the size of the block */ -MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) +MEM_STATIC size_t +ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) { U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3); + DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity); RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, dstSize_tooSmall, "dst buf too small for uncompressed block"); MEM_writeLE24(dst, cBlockHeader24); @@ -510,7 +566,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi return ZSTD_blockHeaderSize + srcSize; } -MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) +MEM_STATIC size_t +ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) { BYTE* const op = (BYTE*)dst; U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); @@ -529,7 +586,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) { U32 const minlog = (strat>=ZSTD_btultra) ? 
(U32)(strat) - 1 : 6; ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); - assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); + assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); return (srcSize >> minlog) + 2; } @@ -565,29 +622,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con while (ip < iend) *op++ = *ip++; } -#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) -#define STORE_REPCODE_1 STORE_REPCODE(1) -#define STORE_REPCODE_2 STORE_REPCODE(2) -#define STORE_REPCODE_3 STORE_REPCODE(3) -#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1) -#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE) -#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE) -#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE) -#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE) -#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* returns ID 1,2,3 */ -#define STORED_TO_OFFBASE(o) ((o)+1) -#define OFFBASE_TO_STORED(o) ((o)-1) + +#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1) +#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2) +#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3) +#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */ +#define OFFSET_TO_OFFBASE(o) (assert((o)>0), o + ZSTD_REP_NUM) +#define OFFBASE_IS_OFFSET(o) ((o) > ZSTD_REP_NUM) +#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM) +#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM) +#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */ /*! ZSTD_storeSeq() : - * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t. - * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET(). + * Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t. + * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). * @matchLength : must be >= MINMATCH - * Allowed to overread literals up to litLimit. + * Allowed to over-read literals up to litLimit. */ HINT_INLINE UNUSED_ATTR void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* literals, const BYTE* litLimit, - U32 offBase_minus1, + U32 offBase, size_t matchLength) { BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; @@ -596,8 +651,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, static const BYTE* g_start = NULL; if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ { U32 const pos = (U32)((const BYTE*)literals - g_start); - DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", - pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1); + DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u", + pos, (U32)litLength, (U32)matchLength, (U32)offBase); } #endif assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); @@ -607,9 +662,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, assert(literals + litLength <= litLimit); if (litEnd <= litLimit_w) { /* Common case we can use wildcopy. - * First copy 16 bytes, because literals are likely short. - */ - assert(WILDCOPY_OVERLENGTH >= 16); + * First copy 16 bytes, because literals are likely short. 
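The macro overhaul above collapses the old two-level offCode/offBase_minus1 encoding into a single offBase domain: repcodes keep their 1-based IDs, real offsets are shifted past ZSTD_REP_NUM, so the two subranges never overlap and the store-time STORED_TO_OFFBASE "+1" disappears. A worked restatement (sketch, with REP_NUM standing in for ZSTD_REP_NUM):

    #include <assert.h>

    #define REP_NUM 3   /* stands in for ZSTD_REP_NUM */

    static unsigned repcode_to_offbase(unsigned r)   /* r in {1,2,3} */
    { assert(1 <= r && r <= REP_NUM); return r; }

    static unsigned offset_to_offbase(unsigned o)    /* o > 0 */
    { assert(o > 0); return o + REP_NUM; }           /* e.g. o=1 -> offBase=4 */

    static int offbase_is_offset(unsigned ob) { return ob > REP_NUM; }

    static unsigned offbase_to_offset(unsigned ob)
    { assert(offbase_is_offset(ob)); return ob - REP_NUM; }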
+ */ + ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16); ZSTD_copy16(seqStorePtr->lit, literals); if (litLength > 16) { ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); @@ -628,7 +683,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, seqStorePtr->sequences[0].litLength = (U16)litLength; /* match offset */ - seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1); + seqStorePtr->sequences[0].offBase = offBase; /* match Length */ assert(matchLength >= MINMATCH); @@ -646,17 +701,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, /* ZSTD_updateRep() : * updates in-place @rep (array of repeat offsets) - * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq() + * @offBase : sum-type, using numeric representation of ZSTD_storeSeq() */ MEM_STATIC void -ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) +ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) { - if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */ + if (OFFBASE_IS_OFFSET(offBase)) { /* full offset */ rep[2] = rep[1]; rep[1] = rep[0]; - rep[0] = STORED_OFFSET(offBase_minus1); + rep[0] = OFFBASE_TO_OFFSET(offBase); } else { /* repcode */ - U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0; + U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; if (repCode > 0) { /* note : if repCode==0, no change */ U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; rep[2] = (repCode >= 2) ? rep[1] : rep[2]; @@ -673,11 +728,11 @@ typedef struct repcodes_s { } repcodes_t; MEM_STATIC repcodes_t -ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) +ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) { repcodes_t newReps; ZSTD_memcpy(&newReps, rep, sizeof(newReps)); - ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); + ZSTD_updateRep(newReps.rep, offBase, ll0); return newReps; } @@ -685,59 +740,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0 /*-************************************* * Match length counter ***************************************/ -static unsigned ZSTD_NbCommonBytes (size_t val) -{ - if (MEM_isLittleEndian()) { - if (MEM_64bits()) { -# if (__GNUC__ >= 4) - return (__builtin_ctzll((U64)val) >> 3); -# else - static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, - 0, 3, 1, 3, 1, 4, 2, 7, - 0, 2, 3, 6, 1, 5, 3, 5, - 1, 3, 4, 4, 2, 5, 6, 7, - 7, 0, 1, 2, 3, 3, 4, 6, - 2, 6, 5, 5, 3, 4, 5, 6, - 7, 1, 2, 4, 6, 4, 4, 5, - 7, 2, 6, 5, 7, 6, 7, 7 }; - return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; -# endif - } else { /* 32 bits */ -# if (__GNUC__ >= 3) - return (__builtin_ctz((U32)val) >> 3); -# else - static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, - 3, 2, 2, 1, 3, 2, 0, 1, - 3, 3, 1, 2, 2, 2, 2, 0, - 3, 1, 2, 0, 1, 0, 1, 1 }; - return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; -# endif - } - } else { /* Big Endian CPU */ - if (MEM_64bits()) { -# if (__GNUC__ >= 4) - return (__builtin_clzll(val) >> 3); -# else - unsigned r; - const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ - if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } - if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } - r += (!val); - return r; -# endif - } else { /* 32 bits */ -# if (__GNUC__ >= 3) - return (__builtin_clz((U32)val) >> 3); -# else - unsigned r; - if (!(val>>16)) { r=2; 
val>>=8; } else { r=0; val>>=24; } - r += (!val); - return r; -# endif - } } -} - - MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) { const BYTE* const pStart = pIn; @@ -783,32 +785,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, * Hashes ***************************************/ static const U32 prime3bytes = 506832829U; -static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; } -MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ +static U32 ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s) >> (32-h) ; } +MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */ +MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); } static const U32 prime4bytes = 2654435761U; -static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } -static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } +static U32 ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; } +static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); } +static size_t ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); } static const U64 prime5bytes = 889523592379ULL; -static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } -static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } +static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; } +static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); } +static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); } static const U64 prime6bytes = 227718039650203ULL; -static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } -static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } +static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; } +static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); } +static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); } static const U64 prime7bytes = 58295818150454627ULL; -static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } -static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } +static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; } +static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); } +static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); } static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; -static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } -static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } +static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h 
<= 64); return (size_t)((((u) * prime8bytes) ^ s) >> (64-h)) ; } +static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); } +static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); } + MEM_STATIC FORCE_INLINE_ATTR size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) { + /* Although some of these hashes do support hBits up to 64, some do not. + * To be on the safe side, always avoid hBits > 32. */ + assert(hBits <= 32); + switch(mls) { default: @@ -820,6 +833,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) } } +MEM_STATIC FORCE_INLINE_ATTR +size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) { + /* Although some of these hashes do support hBits up to 64, some do not. + * To be on the safe side, always avoid hBits > 32. */ + assert(hBits <= 32); + + switch(mls) + { + default: + case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt); + case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt); + case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt); + case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt); + case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt); + } +} + + /* ZSTD_ipow() : * Return base^exponent. */ @@ -1167,10 +1198,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); assert(blockEndIdx >= loadedDictEnd); - if (blockEndIdx > loadedDictEnd + maxDist) { + if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) { /* On reaching window size, dictionaries are invalidated. * For simplification, if window size is reached anywhere within next block, * the dictionary is invalidated for the full block. + * + * We also have to invalidate the dictionary if ZSTD_window_update() has detected + * non-contiguous segments, which means that loadedDictEnd != window->dictLimit. + * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use + * dictMatchState, so setting it to NULL is not a problem. */ DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); *loadedDictEndPtr = 0; @@ -1302,6 +1338,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) #endif +/* Short Cache */ + +/* Normally, zstd matchfinders follow this flow: + * 1. Compute hash at ip + * 2. Load index from hashTable[hash] + * 3. Check if *ip == *(base + index) + * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss. + * + * Short cache is an optimization which allows us to avoid step 3 most of the time + * when the data doesn't actually match. With short cache, the flow becomes: + * 1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip. + * 2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works. + * 3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue. + * + * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to + * dictMatchState matchfinders. + */ +#define ZSTD_SHORT_CACHE_TAG_BITS 8 +#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1) + +/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable. + * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. 
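The short-cache scheme described above stores an 8-bit tag next to each 24-bit index, so a mismatching tag rules out a candidate without touching the likely-cold match position; together with the hash salts introduced earlier it also lets CDict tag tables be probed cheaply. A standalone sketch of the packing, mirroring the ZSTD_writeTaggedIndex / ZSTD_comparePackedTags helpers that follow:

    #include <assert.h>
    #include <stddef.h>

    #define TAG_BITS 8                        /* ZSTD_SHORT_CACHE_TAG_BITS */
    #define TAG_MASK ((1u << TAG_BITS) - 1)   /* ZSTD_SHORT_CACHE_TAG_MASK */

    /* hashAndTag carries the table hash in its high bits and the 8-bit
     * tag in its low bits; index and tag share one 32-bit table slot. */
    static void write_tagged(unsigned *table, size_t hashAndTag, unsigned index)
    {
        size_t const hash  = hashAndTag >> TAG_BITS;
        unsigned const tag = (unsigned)(hashAndTag & TAG_MASK);
        assert((index >> (32 - TAG_BITS)) == 0);  /* index fits in 24 bits */
        table[hash] = (index << TAG_BITS) | tag;
    }

    static int tags_match(size_t packed1, size_t packed2)
    {
        return (packed1 & TAG_MASK) == (packed2 & TAG_MASK);
    }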
*/ +MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) { + size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; + U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK); + assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0); + hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag; +} + +/* Helper function for short cache matchfinders. + * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. */ +MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) { + U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK; + U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK; + return tag1 == tag2; +} /* =============================================================== @@ -1396,4 +1468,51 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); */ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); +/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of + * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. + * Note that the block delimiter must include the last literals of the block. + */ +size_t +ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, + const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); + +/* Returns the number of bytes to move the current read position back by. + * Only non-zero if we ended up splitting a sequence. + * Otherwise, it may return a ZSTD error if something went wrong. + * + * This function will attempt to scan through blockSize bytes + * represented by the sequences in @inSeqs, + * storing any (partial) sequences. + * + * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to + * avoid splitting a match, or to avoid splitting a match such that it would produce a match + * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. + */ +size_t +ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, + const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); + + +/* =============================================================== + * Deprecated definitions that are still used internally to avoid + * deprecation warnings. These functions are exactly equivalent to + * their public variants, but avoid the deprecation warnings. + * =============================================================== */ + +size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); + +size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + +size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + +size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + #endif /* ZSTD_COMPRESS_H */ diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c index 52b0a8059aba95..3e9ea46a670a6d 100644 --- a/lib/zstd/compress/zstd_compress_literals.c +++ b/lib/zstd/compress/zstd_compress_literals.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* - * Copyright (c) Yann Collet, Facebook, Inc. 
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -13,11 +14,36 @@
 ***************************************/
 #include "zstd_compress_literals.h"
 
+
+/* **************************************************************
+*  Debug Traces
+****************************************************************/
+#if DEBUGLEVEL >= 2
+
+static size_t showHexa(const void* src, size_t srcSize)
+{
+    const BYTE* const ip = (const BYTE*)src;
+    size_t u;
+    for (u=0; u<srcSize; u++) {
+        RAWLOG(5, " %02X", ip[u]); (void)ip;
+    }
+    RAWLOG(5, " \n");
+    return srcSize;
+}
+
+#endif
+
+
+/* **************************************************************
+*  Literals compression - special cases
+****************************************************************/
 size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
 {
     BYTE* const ostart = (BYTE*)dst;
     U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);
 
+    DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity);
+
     RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, "");
 
     switch(flSize)
@@ -36,16 +62,30 @@
     }
 
     ZSTD_memcpy(ostart + flSize, src, srcSize);
-    DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize));
+    DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize));
    return srcSize + flSize;
 }
 
+static int allBytesIdentical(const void* src, size_t srcSize)
+{
+    assert(srcSize >= 1);
+    assert(src != NULL);
+    {   const BYTE b = ((const BYTE*)src)[0];
+        size_t p;
+        for (p=1; p<srcSize; p++) {
+            if (((const BYTE*)src)[p] != b) return 0;
+        }
+        return 1;
+    }
+}
+
 size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
 {
     BYTE* const ostart = (BYTE*)dst;
     U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);
 
-    (void)dstCapacity;  /* dstCapacity already guaranteed to be >=4, hence large enough */
+    assert(dstCapacity >= 4); (void)dstCapacity;
+    assert(allBytesIdentical(src, srcSize));
 
     switch(flSize)
     {
@@ -63,28 +103,51 @@
     }
 
     ostart[flSize] = *(const BYTE*)src;
-    DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1);
+    DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1);
     return flSize+1;
 }
 
-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
-                              ZSTD_hufCTables_t* nextHuf,
-                              ZSTD_strategy strategy, int disableLiteralCompression,
-                              void* dst, size_t dstCapacity,
-                              const void* src, size_t srcSize,
-                              void* entropyWorkspace, size_t entropyWorkspaceSize,
-                              const int bmi2,
-                              unsigned suspectUncompressible)
+/* ZSTD_minLiteralsToCompress() :
+ * returns minimal amount of literals
+ * for literal compression to even be attempted.
+ * Minimum is made tighter as compression strategy increases.
+ */
+static size_t
+ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat)
+{
+    assert((int)strategy >= 0);
+    assert((int)strategy <= 9);
+    /* btultra2 : min 8 bytes;
+     * then 2x larger for each successive compression strategy
+     * max threshold 64 bytes */
+    { int const shift = MIN(9-(int)strategy, 3);
+      size_t const mintc = (huf_repeat == HUF_repeat_valid) ?
6 : (size_t)8 << shift; + DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc); + return mintc; + } +} + +size_t ZSTD_compressLiterals ( + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + void* entropyWorkspace, size_t entropyWorkspaceSize, + const ZSTD_hufCTables_t* prevHuf, + ZSTD_hufCTables_t* nextHuf, + ZSTD_strategy strategy, + int disableLiteralCompression, + int suspectUncompressible, + int bmi2) { - size_t const minGain = ZSTD_minGain(srcSize, strategy); size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); BYTE* const ostart = (BYTE*)dst; U32 singleStream = srcSize < 256; symbolEncodingType_e hType = set_compressed; size_t cLitSize; - DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)", - disableLiteralCompression, (U32)srcSize); + DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)", + disableLiteralCompression, (U32)srcSize, dstCapacity); + + DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize)); /* Prepare nextEntropy assuming reusing the existing table */ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); @@ -92,40 +155,51 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, if (disableLiteralCompression) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); - /* small ? don't even attempt compression (speed opt) */ -# define COMPRESS_LITERALS_SIZE_MIN 63 - { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; - if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); - } + /* if too small, don't even attempt compression (speed opt) */ + if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode)) + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression"); { HUF_repeat repeat = prevHuf->repeatMode; - int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0; + int const flags = 0 + | (bmi2 ? HUF_flags_bmi2 : 0) + | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0) + | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0) + | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0); + + typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int); + huf_compress_f huf_compress; if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; - cLitSize = singleStream ? - HUF_compress1X_repeat( - ostart+lhSize, dstCapacity-lhSize, src, srcSize, - HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, - (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) : - HUF_compress4X_repeat( - ostart+lhSize, dstCapacity-lhSize, src, srcSize, - HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, - (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible); + huf_compress = singleStream ? 
HUF_compress1X_repeat : HUF_compress4X_repeat; + cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize, + src, srcSize, + HUF_SYMBOLVALUE_MAX, LitHufLog, + entropyWorkspace, entropyWorkspaceSize, + (HUF_CElt*)nextHuf->CTable, + &repeat, flags); + DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize); if (repeat != HUF_repeat_none) { /* reused the existing table */ - DEBUGLOG(5, "Reusing previous huffman table"); + DEBUGLOG(5, "reusing statistics from previous huffman block"); hType = set_repeat; } } - if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { - ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); - return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); - } + { size_t const minGain = ZSTD_minGain(srcSize, strategy); + if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + } } if (cLitSize==1) { - ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); - return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); - } + /* A return value of 1 signals that the alphabet consists of a single symbol. + * However, in some rare circumstances, it could be the compressed size (a single byte). + * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`. + * (it's also necessary to not generate statistics). + * Therefore, in such a case, actively check that all bytes are identical. */ + if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) { + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); + } } if (hType == set_compressed) { /* using a newly constructed table */ @@ -136,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, switch(lhSize) { case 3: /* 2 - 2 - 10 - 10 */ - { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); + if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); MEM_writeLE24(ostart, lhc); break; } case 4: /* 2 - 2 - 14 - 14 */ + assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18); MEM_writeLE32(ostart, lhc); break; } case 5: /* 2 - 2 - 18 - 18 */ + assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22); MEM_writeLE32(ostart, lhc); ostart[4] = (BYTE)(cLitSize >> 10); diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h index 9775fb97cb7025..a2a85d6b69e537 100644 --- a/lib/zstd/compress/zstd_compress_literals.h +++ b/lib/zstd/compress/zstd_compress_literals.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -16,16 +17,24 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize); +/* ZSTD_compressRleLiteralsBlock() : + * Conditions : + * - All bytes in @src are identical + * - dstCapacity >= 4 */ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); -/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ -size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, - ZSTD_hufCTables_t* nextHuf, - ZSTD_strategy strategy, int disableLiteralCompression, - void* dst, size_t dstCapacity, +/* ZSTD_compressLiterals(): + * @entropyWorkspace: must be aligned on 4-bytes boundaries + * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE + * @suspectUncompressible: sampling checks, to potentially skip huffman coding + */ +size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize, void* entropyWorkspace, size_t entropyWorkspaceSize, - const int bmi2, - unsigned suspectUncompressible); + const ZSTD_hufCTables_t* prevHuf, + ZSTD_hufCTables_t* nextHuf, + ZSTD_strategy strategy, int disableLiteralCompression, + int suspectUncompressible, + int bmi2); #endif /* ZSTD_COMPRESS_LITERALS_H */ diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c index 21ddc1b37acf8e..5c028c78d889be 100644 --- a/lib/zstd/compress/zstd_compress_sequences.c +++ b/lib/zstd/compress/zstd_compress_sequences.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -58,7 +59,7 @@ static unsigned ZSTD_useLowProbCount(size_t const nbSeq) { /* Heuristic: This should cover most blocks <= 16K and * start to fade out after 16K to about 32K depending on - * comprssibility. + * compressibility. */ return nbSeq >= 2048; } @@ -166,7 +167,7 @@ ZSTD_selectEncodingType( if (mostFrequent == nbSeq) { *repeatMode = FSE_repeat_none; if (isDefaultAllowed && nbSeq <= 2) { - /* Prefer set_basic over set_rle when there are 2 or less symbols, + /* Prefer set_basic over set_rle when there are 2 or fewer symbols, * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. * If basic encoding isn't possible, always choose RLE. */ diff --git a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h index 7991364c2f71ff..7fe6f4ff5cf251 100644 --- a/lib/zstd/compress/zstd_compress_sequences.h +++ b/lib/zstd/compress/zstd_compress_sequences.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c index 17d836cc84e8fa..dbacbaf7273388 100644 --- a/lib/zstd/compress/zstd_compress_superblock.c +++ b/lib/zstd/compress/zstd_compress_superblock.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -36,13 +37,14 @@ * If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block * and the following sub-blocks' literals sections will be Treeless_Literals_Block. * @return : compressed size of literals section of a sub-block - * Or 0 if it unable to compress. + * Or 0 if unable to compress. * Or error code */ -static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, - const ZSTD_hufCTablesMetadata_t* hufMetadata, - const BYTE* literals, size_t litSize, - void* dst, size_t dstSize, - const int bmi2, int writeEntropy, int* entropyWritten) +static size_t +ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + const ZSTD_hufCTablesMetadata_t* hufMetadata, + const BYTE* literals, size_t litSize, + void* dst, size_t dstSize, + const int bmi2, int writeEntropy, int* entropyWritten) { size_t const header = writeEntropy ? 200 : 0; size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header)); @@ -53,8 +55,6 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; size_t cLitSize = 0; - (void)bmi2; /* TODO bmi2... */ - DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy); *entropyWritten = 0; @@ -76,9 +76,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize); } - /* TODO bmi2 */ - { const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable) - : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable); + { int const flags = bmi2 ? HUF_flags_bmi2 : 0; + const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable, flags) + : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable, flags); op += cSize; cLitSize += cSize; if (cSize == 0 || ERR_isError(cSize)) { @@ -126,7 +126,11 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, return op-ostart; } -static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) { +static size_t +ZSTD_seqDecompressedSize(seqStore_t const* seqStore, + const seqDef* sequences, size_t nbSeq, + size_t litSize, int lastSequence) +{ const seqDef* const sstart = sequences; const seqDef* const send = sequences + nbSeq; const seqDef* sp = sstart; @@ -156,13 +160,14 @@ static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* * @return : compressed size of sequences section of a sub-block * Or 0 if it is unable to compress * Or error code. 
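 * (Usage sketch, relying only on the contract above:
 *     if (ZSTD_isError(cSize)) return cSize;
 *     else if (cSize == 0) write this sub-block uncompressed instead;
 *     else cSize bytes of compressed sequences were written.)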
*/ -static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables, - const ZSTD_fseCTablesMetadata_t* fseMetadata, - const seqDef* sequences, size_t nbSeq, - const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, - const ZSTD_CCtx_params* cctxParams, - void* dst, size_t dstCapacity, - const int bmi2, int writeEntropy, int* entropyWritten) +static size_t +ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables, + const ZSTD_fseCTablesMetadata_t* fseMetadata, + const seqDef* sequences, size_t nbSeq, + const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, + const ZSTD_CCtx_params* cctxParams, + void* dst, size_t dstCapacity, + const int bmi2, int writeEntropy, int* entropyWritten) { const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; BYTE* const ostart = (BYTE*)dst; @@ -539,7 +544,7 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, repcodes_t rep; ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); for (seq = sstart; seq < sp; ++seq) { - ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); + ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); } ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); } diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h index 224ece79546ebb..826bbc9e029b18 100644 --- a/lib/zstd/compress/zstd_compress_superblock.h +++ b/lib/zstd/compress/zstd_compress_superblock.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h index 349fc923c355a9..65ea53b6284479 100644 --- a/lib/zstd/compress/zstd_cwksp.h +++ b/lib/zstd/compress/zstd_cwksp.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -14,7 +15,9 @@ /*-************************************* * Dependencies ***************************************/ +#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ #include "../common/zstd_internal.h" +#include "../common/portability_macros.h" /*-************************************* @@ -41,8 +44,9 @@ ***************************************/ typedef enum { ZSTD_cwksp_alloc_objects, - ZSTD_cwksp_alloc_buffers, - ZSTD_cwksp_alloc_aligned + ZSTD_cwksp_alloc_aligned_init_once, + ZSTD_cwksp_alloc_aligned, + ZSTD_cwksp_alloc_buffers } ZSTD_cwksp_alloc_phase_e; /* @@ -95,8 +99,8 @@ typedef enum { * * Workspace Layout: * - * [ ... workspace ... ] - * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers] + * [ ... workspace ... ] + * [objects][tables ->] free space [<- buffers][<- aligned][<- init once] * * The various objects that live in the workspace are divided into the * following categories, and are allocated separately: @@ -120,9 +124,18 @@ typedef enum { * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). * Their sizes depend on the cparams. These tables are 64-byte aligned. 
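 *   (For instance, a matchState hashTable of (1 << hashLog) U32 entries is
 *   reserved out of this tables region.)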
 *
- * - Aligned: these buffers are used for various purposes that require 4 byte
- *   alignment, but don't require any initialization before they're used. These
- *   buffers are each aligned to 64 bytes.
+ * - Init once: these buffers must be initialized at least once before
+ *   use. They should be used when we want to skip memory initialization
+ *   while not triggering memory checkers (like Valgrind) when reading from
+ *   this memory without writing to it first.
+ *   These buffers should be used carefully as they might contain data
+ *   from previous compressions.
+ *   Buffers are aligned to 64 bytes.
+ *
+ * - Aligned: these buffers don't require any initialization before they're
+ *   used. The user of the buffer should make sure they write into a buffer
+ *   location before reading from it.
+ *   Buffers are aligned to 64 bytes.
 *
 * - Buffers: these buffers are used for various purposes that don't require
 *   any alignment or initialization before they're used. This means they can
@@ -134,8 +147,9 @@
 * correctly packed into the workspace buffer. That order is:
 *
 * 1. Objects
- * 2. Buffers
- * 3. Aligned/Tables
+ * 2. Init once / Tables
+ * 3. Aligned / Tables
+ * 4. Buffers / Tables
 *
 * Attempts to reserve objects of different types out of order will fail.
 */
@@ -147,6 +161,7 @@ typedef struct {
     void* tableEnd;
     void* tableValidEnd;
     void* allocStart;
+    void* initOnceStart;
 
     BYTE allocFailed;
     int workspaceOversizedDuration;
@@ -159,6 +174,7 @@
***************************************/
 
 MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws);
+MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws);
 
 MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) {
     (void)ws;
@@ -168,6 +184,8 @@
     assert(ws->tableEnd <= ws->allocStart);
     assert(ws->tableValidEnd <= ws->allocStart);
     assert(ws->allocStart <= ws->workspaceEnd);
+    assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws));
+    assert(ws->workspace <= ws->initOnceStart);
 }
 
 /*
@@ -210,14 +228,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) {
 * for internal purposes (currently only alignment).
 */
 MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) {
-    /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes
-     * to align the beginning of tables section, as well as another n_2=[0, 63] bytes
-     * to align the beginning of the aligned section.
-     *
-     * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and
-     * aligneds being sized in multiples of 64 bytes.
+    /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES
+     * bytes to align the beginning of tables section and end of buffers;
     */
-    size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES;
+    size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2;
     return slackSpace;
 }
 
@@ -230,10 +244,18 @@ MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignByt
     size_t const alignBytesMask = alignBytes - 1;
     size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask;
     assert((alignBytes & alignBytesMask) == 0);
-    assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES);
+    assert(bytes < alignBytes);
     return bytes;
 }
 
+/*
+ * Returns the initial value for allocStart which is used to determine the position from
+ * which we can allocate from the end of the workspace.
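+ * (Illustration, with ZSTD_CWKSP_ALIGNMENT_BYTES == 64: a workspaceEnd of
+ * 0x10A7 is masked down to 0x1080, so end-of-workspace allocations start
+ * 64-byte aligned.)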
+ */ +MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) { + return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1)); +} + /* * Internal function. Do not use directly. * Reserves the given number of bytes within the aligned/buffer segment of the wksp, @@ -274,27 +296,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase { assert(phase >= ws->phase); if (phase > ws->phase) { - /* Going from allocating objects to allocating buffers */ - if (ws->phase < ZSTD_cwksp_alloc_buffers && - phase >= ZSTD_cwksp_alloc_buffers) { + /* Going from allocating objects to allocating initOnce / tables */ + if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once && + phase >= ZSTD_cwksp_alloc_aligned_init_once) { ws->tableValidEnd = ws->objectEnd; - } + ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); - /* Going from allocating buffers to allocating aligneds/tables */ - if (ws->phase < ZSTD_cwksp_alloc_aligned && - phase >= ZSTD_cwksp_alloc_aligned) { - { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */ - size_t const bytesToAlign = - ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); - DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); - ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ - RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), - memory_allocation, "aligned phase - alignment initial allocation failed!"); - } { /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */ - void* const alloc = ws->objectEnd; + void *const alloc = ws->objectEnd; size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); - void* const objectEnd = (BYTE*)alloc + bytesToAlign; + void *const objectEnd = (BYTE *) alloc + bytesToAlign; DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, "table phase - alignment initial allocation failed!"); @@ -302,7 +313,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase ws->tableEnd = objectEnd; /* table area starts being empty */ if (ws->tableValidEnd < ws->tableEnd) { ws->tableValidEnd = ws->tableEnd; - } } } + } + } + } ws->phase = phase; ZSTD_cwksp_assert_internal_consistency(ws); } @@ -314,7 +327,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase */ MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) { - return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); + return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd); } /* @@ -343,6 +356,33 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers); } +/* + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). + * This memory has been initialized at least once in the past. + * This doesn't mean it has been initialized this time, and it might contain data from previous + * operations. + * The main usage is for algorithms that might need read access into uninitialized memory. + * The algorithm must maintain safety under these conditions and must make sure it doesn't + * leak any of the past data (directly or in side channels). 
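+ * (As the implementation below shows, a first-time reservation is zeroed up
+ * to the previous initOnceStart, so later reservations of the same region
+ * can skip the memset.)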
+ */ +MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes) +{ + size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES); + void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once); + assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); + if(ptr && ptr < ws->initOnceStart) { + /* We assume the memory following the current allocation is either: + * 1. Not usable as initOnce memory (end of workspace) + * 2. Another initOnce buffer that has been allocated before (and so was previously memset) + * 3. An ASAN redzone, in which case we don't want to write on it + * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart. + * Note that we assume here that MSAN and ASAN cannot run in the same time. */ + ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes)); + ws->initOnceStart = ptr; + } + return ptr; +} + /* * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). */ @@ -361,13 +401,17 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) */ MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) { - const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; + const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once; void* alloc; void* end; void* top; - if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { - return NULL; + /* We can only start allocating tables after we are done reserving space for objects at the + * start of the workspace */ + if(ws->phase < phase) { + if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { + return NULL; + } } alloc = ws->tableEnd; end = (BYTE *)alloc + bytes; @@ -451,7 +495,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { assert(ws->tableValidEnd >= ws->objectEnd); assert(ws->tableValidEnd <= ws->allocStart); if (ws->tableValidEnd < ws->tableEnd) { - ZSTD_memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd); + ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd)); } ZSTD_cwksp_mark_tables_clean(ws); } @@ -478,10 +522,10 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { ws->tableEnd = ws->objectEnd; - ws->allocStart = ws->workspaceEnd; + ws->allocStart = ZSTD_cwksp_initialAllocStart(ws); ws->allocFailed = 0; - if (ws->phase > ZSTD_cwksp_alloc_buffers) { - ws->phase = ZSTD_cwksp_alloc_buffers; + if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) { + ws->phase = ZSTD_cwksp_alloc_aligned_init_once; } ZSTD_cwksp_assert_internal_consistency(ws); } @@ -498,6 +542,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c ws->workspaceEnd = (BYTE*)start + size; ws->objectEnd = ws->workspace; ws->tableValidEnd = ws->objectEnd; + ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); ws->phase = ZSTD_cwksp_alloc_objects; ws->isStatic = isStatic; ZSTD_cwksp_clear(ws); @@ -550,17 +595,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { * Returns if the estimated space needed for a wksp is within an acceptable limit of the * actual amount of space used. 
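 * (Given ZSTD_cwksp_slack_space_required() == 2 * ZSTD_CWKSP_ALIGNMENT_BYTES,
 * the tolerated gap is 128 bytes at the default 64-byte alignment.)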
*/ -MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, - size_t const estimatedSpace, int resizedWorkspace) { - if (resizedWorkspace) { - /* Resized/newly allocated wksp should have exact bounds */ - return ZSTD_cwksp_used(ws) == estimatedSpace; - } else { - /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes - * than estimatedSpace. See the comments in zstd_cwksp.h for details. - */ - return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); - } +MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) { + /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice + * the alignment bytes difference between estimation and actual usage */ + return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) && + ZSTD_cwksp_used(ws) <= estimatedSpace; } diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c index 76933dea2624ea..ab9440a996039d 100644 --- a/lib/zstd/compress/zstd_double_fast.c +++ b/lib/zstd/compress/zstd_double_fast.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -11,8 +12,43 @@ #include "zstd_compress_internal.h" #include "zstd_double_fast.h" +static void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashLarge = ms->hashTable; + U32 const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + U32 const mls = cParams->minMatch; + U32* const hashSmall = ms->chainTable; + U32 const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* const base = ms->window.base; + const BYTE* ip = base + ms->nextToUpdate; + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; -void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, + /* Always insert every fastHashFillStep position into the hash tables. + * Insert the other positions into the large hash table if their entry + * is empty. 
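+ * (With fastHashFillStep == 3 and a full table load, positions 0, 3, 6, ...
+ * get entries in both tables, while the two positions after each of them
+ * only fill large-table slots that are still empty.)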
+ */ + for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { + U32 const curr = (U32)(ip - base); + U32 i; + for (i = 0; i < fastHashFillStep; ++i) { + size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls); + size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8); + if (i == 0) { + ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i); + } + if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { + ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i); + } + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; + } } +} + +static void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms, void const* end, ZSTD_dictTableLoadMethod_e dtlm) { const ZSTD_compressionParameters* const cParams = &ms->cParams; @@ -43,7 +79,19 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, /* Only load extra positions for ZSTD_dtlm_full */ if (dtlm == ZSTD_dtlm_fast) break; - } } + } } +} + +void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp) +{ + if (tfp == ZSTD_tfp_forCDict) { + ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm); + } else { + ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm); + } } @@ -67,7 +115,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( const BYTE* const iend = istart + srcSize; const BYTE* const ilimit = iend - HASH_READ_SIZE; U32 offset_1=rep[0], offset_2=rep[1]; - U32 offsetSaved = 0; + U32 offsetSaved1 = 0, offsetSaved2 = 0; size_t mLength; U32 offset; @@ -100,8 +148,8 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( U32 const current = (U32)(ip - base); U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); U32 const maxRep = current - windowLow; - if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; - if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; + if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; + if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; } /* Outer Loop: one iteration per match found and stored */ @@ -131,7 +179,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); goto _match_stored; } @@ -175,9 +223,13 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( } while (ip1 <= ilimit); _cleanup: + /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), + * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ + offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + /* save reps for next block */ - rep[0] = offset_1 ? offset_1 : offsetSaved; - rep[1] = offset_2 ? offset_2 : offsetSaved; + rep[0] = offset_1 ? offset_1 : offsetSaved1; + rep[1] = offset_2 ? 
offset_2 : offsetSaved2; /* Return the last literals size */ return (size_t)(iend - anchor); @@ -217,7 +269,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( hashLong[hl1] = (U32)(ip1 - base); } - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); _match_stored: /* match found */ @@ -243,7 +295,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); - ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength); + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength); ip += rLength; anchor = ip; continue; /* faster when present ... (?) */ @@ -275,7 +327,6 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( const BYTE* const iend = istart + srcSize; const BYTE* const ilimit = iend - HASH_READ_SIZE; U32 offset_1=rep[0], offset_2=rep[1]; - U32 offsetSaved = 0; const ZSTD_matchState_t* const dms = ms->dictMatchState; const ZSTD_compressionParameters* const dictCParams = &dms->cParams; @@ -286,8 +337,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( const BYTE* const dictStart = dictBase + dictStartIndex; const BYTE* const dictEnd = dms->window.nextSrc; const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); - const U32 dictHBitsL = dictCParams->hashLog; - const U32 dictHBitsS = dictCParams->chainLog; + const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); @@ -295,6 +346,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( /* if a dictionary is attached, it must be within window range */ assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); + if (ms->prefetchCDictTables) { + size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); + size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32); + PREFETCH_AREA(dictHashLong, hashTableBytes) + PREFETCH_AREA(dictHashSmall, chainTableBytes) + } + /* init */ ip += (dictAndPrefixLength == 0); @@ -309,8 +367,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( U32 offset; size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); - size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8); - size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls); + size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8); + size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls); + U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS]; + U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS]; + int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL); + int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS); U32 const curr = (U32)(ip-base); U32 const matchIndexL = hashLong[h2]; U32 matchIndexS = hashSmall[h]; @@ -328,7 +390,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( const BYTE* repMatchEnd = 
repIndex < prefixLowestIndex ? dictEnd : iend; mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); goto _match_stored; } @@ -340,9 +402,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ goto _match_found; } - } else { + } else if (dictTagsMatchL) { /* check dictMatchState long match */ - U32 const dictMatchIndexL = dictHashLong[dictHL]; + U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS; const BYTE* dictMatchL = dictBase + dictMatchIndexL; assert(dictMatchL < dictEnd); @@ -358,9 +420,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( if (MEM_read32(match) == MEM_read32(ip)) { goto _search_next_long; } - } else { + } else if (dictTagsMatchS) { /* check dictMatchState short match */ - U32 const dictMatchIndexS = dictHashSmall[dictHS]; + U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS; match = dictBase + dictMatchIndexS; matchIndexS = dictMatchIndexS + dictIndexDelta; @@ -375,10 +437,11 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( continue; _search_next_long: - { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); - size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8); + size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8); U32 const matchIndexL3 = hashLong[hl3]; + U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS]; + int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3); const BYTE* matchL3 = base + matchIndexL3; hashLong[hl3] = curr + 1; @@ -391,9 +454,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ goto _match_found; } - } else { + } else if (dictTagsMatchL3) { /* check dict long +1 match */ - U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; + U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS; const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; assert(dictMatchL3 < dictEnd); if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { @@ -419,7 +482,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); _match_stored: /* match found */ @@ -448,7 +511,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? 
dictEnd : iend; size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; ip += repLength2; @@ -461,8 +524,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( } /* while (ip < ilimit) */ /* save reps for next block */ - rep[0] = offset_1 ? offset_1 : offsetSaved; - rep[1] = offset_2 ? offset_2 : offsetSaved; + rep[0] = offset_1; + rep[1] = offset_2; /* Return the last literals size */ return (size_t)(iend - anchor); @@ -585,7 +648,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); } else { if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; @@ -596,7 +659,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); @@ -621,7 +684,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( } offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); } else { ip += ((ip-anchor) >> kSearchStrength) + 1; @@ -653,7 +716,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; ip += repLength2; diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h index 6822bde65a1d8d..0204f12e4cf70d 100644 --- a/lib/zstd/compress/zstd_double_fast.h +++ b/lib/zstd/compress/zstd_double_fast.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -16,7 +17,8 @@ #include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, - void const* end, ZSTD_dictTableLoadMethod_e dtlm); + void const* end, ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp); size_t ZSTD_compressBlock_doubleFast( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c index a752e6beab52ed..3399b39c5dbc57 100644 --- a/lib/zstd/compress/zstd_fast.c +++ b/lib/zstd/compress/zstd_fast.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -11,8 +12,42 @@ #include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */ #include "zstd_fast.h" +static void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + U32 const mls = cParams->minMatch; + const BYTE* const base = ms->window.base; + const BYTE* ip = base + ms->nextToUpdate; + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; -void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + /* Currently, we always use ZSTD_dtlm_full for filling CDict tables. + * Feel free to remove this assert if there's a good reason! */ + assert(dtlm == ZSTD_dtlm_full); + + /* Always insert every fastHashFillStep position into the hash table. + * Insert the other positions if their hash entry is empty. + */ + for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { + U32 const curr = (U32)(ip - base); + { size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls); + ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); } + + if (dtlm == ZSTD_dtlm_fast) continue; + /* Only load extra positions for ZSTD_dtlm_full */ + { U32 p; + for (p = 1; p < fastHashFillStep; ++p) { + size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls); + if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */ + ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p); + } } } } +} + +static void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms, const void* const end, ZSTD_dictTableLoadMethod_e dtlm) { @@ -25,6 +60,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; const U32 fastHashFillStep = 3; + /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables. + * Feel free to remove this assert if there's a good reason! */ + assert(dtlm == ZSTD_dtlm_fast); + /* Always insert every fastHashFillStep position into the hash table. * Insert the other positions if their hash entry is empty. 
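 * (Unlike the CDict variant above, entries written here are plain position
 * indices, without the short-cache tag packed into the low bits.)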
*/ @@ -42,6 +81,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, } } } } } +void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp) +{ + if (tfp == ZSTD_tfp_forCDict) { + ZSTD_fillHashTableForCDict(ms, end, dtlm); + } else { + ZSTD_fillHashTableForCCtx(ms, end, dtlm); + } +} + /* * If you squint hard enough (and ignore repcodes), the search operation at any @@ -117,7 +168,7 @@ ZSTD_compressBlock_fast_noDict_generic( U32 rep_offset1 = rep[0]; U32 rep_offset2 = rep[1]; - U32 offsetSaved = 0; + U32 offsetSaved1 = 0, offsetSaved2 = 0; size_t hash0; /* hash for ip0 */ size_t hash1; /* hash for ip1 */ @@ -141,8 +192,8 @@ ZSTD_compressBlock_fast_noDict_generic( { U32 const curr = (U32)(ip0 - base); U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); U32 const maxRep = curr - windowLow; - if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0; - if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0; + if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0; + if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0; } /* start each op */ @@ -180,8 +231,14 @@ ZSTD_compressBlock_fast_noDict_generic( mLength = ip0[-1] == match0[-1]; ip0 -= mLength; match0 -= mLength; - offcode = STORE_REPCODE_1; + offcode = REPCODE1_TO_OFFBASE; mLength += 4; + + /* First write next hash table entry; we've already calculated it. + * This write is known to be safe because the ip1 is before the + * repcode (ip2). */ + hashTable[hash1] = (U32)(ip1 - base); + goto _match; } @@ -195,6 +252,12 @@ ZSTD_compressBlock_fast_noDict_generic( /* check match at ip[0] */ if (MEM_read32(ip0) == mval) { /* found a match! */ + + /* First write next hash table entry; we've already calculated it. + * This write is known to be safe because the ip1 == ip0 + 1, so + * we know we will resume searching after ip1 */ + hashTable[hash1] = (U32)(ip1 - base); + goto _offset; } @@ -224,6 +287,21 @@ ZSTD_compressBlock_fast_noDict_generic( /* check match at ip[0] */ if (MEM_read32(ip0) == mval) { /* found a match! */ + + /* first write next hash table entry; we've already calculated it */ + if (step <= 4) { + /* We need to avoid writing an index into the hash table >= the + * position at which we will pick up our searching after we've + * taken this match. + * + * The minimum possible match has length 4, so the earliest ip0 + * can be after we take this match will be the current ip0 + 4. + * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely + * write this position. + */ + hashTable[hash1] = (U32)(ip1 - base); + } + goto _offset; } @@ -254,9 +332,24 @@ ZSTD_compressBlock_fast_noDict_generic( * However, it seems to be a meaningful performance hit to try to search * them. So let's not. */ + /* When the repcodes are outside of the prefix, we set them to zero before the loop. + * When the offsets are still zero, we need to restore them after the block to have a correct + * repcode history. If only one offset was invalid, it is easy. The tricky case is when both + * offsets were invalid. We need to figure out which offset to refill with. + * - If both offsets are zero they are in the same order. + * - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`. + * - If only one is zero, we need to decide which offset to restore. + * - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1. + * - It is impossible for rep_offset2 to be non-zero. 
+ * + * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then + * set rep[0] = rep_offset1 and rep[1] = offsetSaved1. + */ + offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2; + /* save reps for next block */ - rep[0] = rep_offset1 ? rep_offset1 : offsetSaved; - rep[1] = rep_offset2 ? rep_offset2 : offsetSaved; + rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1; + rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2; /* Return the last literals size */ return (size_t)(iend - anchor); @@ -267,7 +360,7 @@ ZSTD_compressBlock_fast_noDict_generic( match0 = base + idx; rep_offset2 = rep_offset1; rep_offset1 = (U32)(ip0-match0); - offcode = STORE_OFFSET(rep_offset1); + offcode = OFFSET_TO_OFFBASE(rep_offset1); mLength = 4; /* Count the backwards match length. */ @@ -287,11 +380,6 @@ ZSTD_compressBlock_fast_noDict_generic( ip0 += mLength; anchor = ip0; - /* write next hash table entry */ - if (ip1 < ip0) { - hashTable[hash1] = (U32)(ip1 - base); - } - /* Fill table and check for immediate repcode. */ if (ip0 <= ilimit) { /* Fill Table */ @@ -306,7 +394,7 @@ ZSTD_compressBlock_fast_noDict_generic( { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); ip0 += rLength; - ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength); + ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength); anchor = ip0; continue; /* faster when present (confirmed on gcc-8) ... (?) */ } } } @@ -380,14 +468,14 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( U32 const stepSize = cParams->targetLength + !(cParams->targetLength); const BYTE* const base = ms->window.base; const BYTE* const istart = (const BYTE*)src; - const BYTE* ip = istart; + const BYTE* ip0 = istart; + const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */ const BYTE* anchor = istart; const U32 prefixStartIndex = ms->window.dictLimit; const BYTE* const prefixStart = base + prefixStartIndex; const BYTE* const iend = istart + srcSize; const BYTE* const ilimit = iend - HASH_READ_SIZE; U32 offset_1=rep[0], offset_2=rep[1]; - U32 offsetSaved = 0; const ZSTD_matchState_t* const dms = ms->dictMatchState; const ZSTD_compressionParameters* const dictCParams = &dms->cParams ; @@ -397,13 +485,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( const BYTE* const dictStart = dictBase + dictStartIndex; const BYTE* const dictEnd = dms->window.nextSrc; const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); - const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart); - const U32 dictHLog = dictCParams->hashLog; + const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); + const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; /* if a dictionary is still attached, it necessarily means that * it is within window size. So we just check it. 
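 * (Concretely, the assert just below verifies
 * endIndex - prefixStartIndex <= maxDistance.)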
*/ const U32 maxDistance = 1U << cParams->windowLog; - const U32 endIndex = (U32)((size_t)(ip - base) + srcSize); + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); assert(endIndex - prefixStartIndex <= maxDistance); (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ @@ -413,106 +501,155 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( * when translating a dict index into a local index */ assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); + if (ms->prefetchCDictTables) { + size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); + PREFETCH_AREA(dictHashTable, hashTableBytes) + } + /* init */ DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); - ip += (dictAndPrefixLength == 0); + ip0 += (dictAndPrefixLength == 0); /* dictMatchState repCode checks don't currently handle repCode == 0 * disabling. */ assert(offset_1 <= dictAndPrefixLength); assert(offset_2 <= dictAndPrefixLength); - /* Main Search Loop */ - while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ + /* Outer search loop */ + assert(stepSize >= 1); + while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ size_t mLength; - size_t const h = ZSTD_hashPtr(ip, hlog, mls); - U32 const curr = (U32)(ip-base); - U32 const matchIndex = hashTable[h]; - const BYTE* match = base + matchIndex; - const U32 repIndex = curr + 1 - offset_1; - const BYTE* repMatch = (repIndex < prefixStartIndex) ? - dictBase + (repIndex - dictIndexDelta) : - base + repIndex; - hashTable[h] = curr; /* update hash table */ - - if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ - && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { - const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; - mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; - ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); - } else if ( (matchIndex <= prefixStartIndex) ) { - size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); - U32 const dictMatchIndex = dictHashTable[dictHash]; - const BYTE* dictMatch = dictBase + dictMatchIndex; - if (dictMatchIndex <= dictStartIndex || - MEM_read32(dictMatch) != MEM_read32(ip)) { - assert(stepSize >= 1); - ip += ((ip-anchor) >> kSearchStrength) + stepSize; - continue; - } else { - /* found a dict match */ - U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); - mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; - while (((ip>anchor) & (dictMatch>dictStart)) - && (ip[-1] == dictMatch[-1])) { - ip--; dictMatch--; mLength++; + size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); + + size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls); + U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS]; + int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0); + + U32 matchIndex = hashTable[hash0]; + U32 curr = (U32)(ip0 - base); + size_t step = stepSize; + const size_t kStepIncr = 1 << kSearchStrength; + const BYTE* nextStep = ip0 + kStepIncr; + + /* Inner search loop */ + while (1) { + const BYTE* match = base + matchIndex; + const U32 repIndex = curr + 1 - offset_1; + const BYTE* repMatch = (repIndex < prefixStartIndex) ? 
+ dictBase + (repIndex - dictIndexDelta) : + base + repIndex; + const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); + size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls); + hashTable[hash0] = curr; /* update hash table */ + + if (((U32) ((prefixStartIndex - 1) - repIndex) >= + 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ + && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { + const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; + ip0++; + ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + break; + } + + if (dictTagsMatch) { + /* Found a possible dict match */ + const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatch = dictBase + dictMatchIndex; + if (dictMatchIndex > dictStartIndex && + MEM_read32(dictMatch) == MEM_read32(ip0)) { + /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */ + if (matchIndex <= prefixStartIndex) { + U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta); + mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4; + while (((ip0 > anchor) & (dictMatch > dictStart)) + && (ip0[-1] == dictMatch[-1])) { + ip0--; + dictMatch--; + mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + break; + } + } + } + + if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) { + /* found a regular match */ + U32 const offset = (U32) (ip0 - match); + mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4; + while (((ip0 > anchor) & (match > prefixStart)) + && (ip0[-1] == match[-1])) { + ip0--; + match--; + mLength++; } /* catch up */ offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); + ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + break; } - } else if (MEM_read32(match) != MEM_read32(ip)) { - /* it's not a match, and we're not going to check the dictionary */ - assert(stepSize >= 1); - ip += ((ip-anchor) >> kSearchStrength) + stepSize; - continue; - } else { - /* found a regular match */ - U32 const offset = (U32)(ip-match); - mLength = ZSTD_count(ip+4, match+4, iend) + 4; - while (((ip>anchor) & (match>prefixStart)) - && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ - offset_2 = offset_1; - offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); - } + + /* Prepare for next iteration */ + dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS]; + dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1); + matchIndex = hashTable[hash1]; + + if (ip1 >= nextStep) { + step++; + nextStep += kStepIncr; + } + ip0 = ip1; + ip1 = ip1 + step; + if (ip1 > ilimit) goto _cleanup; + + curr = (U32)(ip0 - base); + hash0 = hash1; + } /* end inner search loop */ /* match found */ - ip += mLength; - anchor = ip; + assert(mLength); + ip0 += mLength; + anchor = ip0; - if (ip <= ilimit) { + if (ip0 <= ilimit) { /* Fill Table */ assert(base+curr+2 > istart); /* check base overflow */ hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 
could be > iend-8 */ - hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); + hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); /* check immediate repcode */ - while (ip <= ilimit) { - U32 const current2 = (U32)(ip-base); + while (ip0 <= ilimit) { + U32 const current2 = (U32)(ip0-base); U32 const repIndex2 = current2 - offset_2; const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase - dictIndexDelta + repIndex2 : base + repIndex2; if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) - && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + && (MEM_read32(repMatch2) == MEM_read32(ip0))) { const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; - size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); - hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; - ip += repLength2; - anchor = ip; + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; + ip0 += repLength2; + anchor = ip0; continue; } break; } } + + /* Prepare for next iteration */ + assert(ip0 == anchor); + ip1 = ip0 + stepSize; } +_cleanup: /* save reps for next block */ - rep[0] = offset_1 ? offset_1 : offsetSaved; - rep[1] = offset_2 ? offset_2 : offsetSaved; + rep[0] = offset_1; + rep[1] = offset_2; /* Return the last literals size */ return (size_t)(iend - anchor); @@ -553,11 +690,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( U32* const hashTable = ms->hashTable; U32 const hlog = cParams->hashLog; /* support stepSize of 0 */ - U32 const stepSize = cParams->targetLength + !(cParams->targetLength); + size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; const BYTE* const base = ms->window.base; const BYTE* const dictBase = ms->window.dictBase; const BYTE* const istart = (const BYTE*)src; - const BYTE* ip = istart; const BYTE* anchor = istart; const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); @@ -570,6 +706,28 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( const BYTE* const iend = istart + srcSize; const BYTE* const ilimit = iend - 8; U32 offset_1=rep[0], offset_2=rep[1]; + U32 offsetSaved1 = 0, offsetSaved2 = 0; + + const BYTE* ip0 = istart; + const BYTE* ip1; + const BYTE* ip2; + const BYTE* ip3; + U32 current0; + + + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ + U32 idx; /* match idx for ip0 */ + const BYTE* idxBase; /* base pointer for idx */ + + U32 offcode; + const BYTE* match0; + size_t mLength; + const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */ + + size_t step; + const BYTE* nextStep; + const size_t kStepIncr = (1 << (kSearchStrength - 1)); (void)hasStep; /* not currently specialized on whether it's accelerated */ @@ -579,75 +737,202 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( if (prefixStartIndex == dictStartIndex) return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); - /* Search Loop */ - while (ip < ilimit) { /* < instead of <=, because (ip+1) */ - const size_t h = ZSTD_hashPtr(ip, hlog, mls); - const U32 matchIndex = hashTable[h]; - const 
BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; - const BYTE* match = matchBase + matchIndex; - const U32 curr = (U32)(ip-base); - const U32 repIndex = curr + 1 - offset_1; - const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; - const BYTE* const repMatch = repBase + repIndex; - hashTable[h] = curr; /* update hash table */ - DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr); - - if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ - & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */ - && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { - const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; - size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; - ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength); - ip += rLength; - anchor = ip; - } else { - if ( (matchIndex < dictStartIndex) || - (MEM_read32(match) != MEM_read32(ip)) ) { - assert(stepSize >= 1); - ip += ((ip-anchor) >> kSearchStrength) + stepSize; - continue; + { U32 const curr = (U32)(ip0 - base); + U32 const maxRep = curr - dictStartIndex; + if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0; + if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + + /* start each op */ +_start: /* Requires: ip0 */ + + step = stepSize; + nextStep = ip0 + kStepIncr; + + /* calculate positions, ip0 - anchor == 0, so we skip step calc */ + ip1 = ip0 + 1; + ip2 = ip0 + step; + ip3 = ip2 + 1; + + if (ip3 >= ilimit) { + goto _cleanup; + } + + hash0 = ZSTD_hashPtr(ip0, hlog, mls); + hash1 = ZSTD_hashPtr(ip1, hlog, mls); + + idx = hashTable[hash0]; + idxBase = idx < prefixStartIndex ? dictBase : base; + + do { + { /* load repcode match for ip[2] */ + U32 const current2 = (U32)(ip2 - base); + U32 const repIndex = current2 - offset_1; + const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; + U32 rval; + if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */ + & (offset_1 > 0) ) { + rval = MEM_read32(repBase + repIndex); + } else { + rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */ } - { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; - const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart; - U32 const offset = curr - matchIndex; - size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; - while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ - offset_2 = offset_1; offset_1 = offset; /* update offset history */ - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); - ip += mLength; - anchor = ip; + + /* write back hash table entry */ + current0 = (U32)(ip0 - base); + hashTable[hash0] = current0; + + /* check repcode at ip[2] */ + if (MEM_read32(ip2) == rval) { + ip0 = ip2; + match0 = repBase + repIndex; + matchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; + assert((match0 != prefixStart) & (match0 != dictStart)); + mLength = ip0[-1] == match0[-1]; + ip0 -= mLength; + match0 -= mLength; + offcode = REPCODE1_TO_OFFBASE; + mLength += 4; + goto _match; } } - if (ip <= ilimit) { - /* Fill Table */ - hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; - hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); - /* check immediate repcode */ - while (ip <= ilimit) { - U32 const current2 = (U32)(ip-base); - U32 const repIndex2 = current2 - offset_2; - const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; - if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */ - && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { - const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; - size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; - { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2); - hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; - ip += repLength2; - anchor = ip; - continue; - } - break; - } } } + { /* load match for ip[0] */ + U32 const mval = idx >= dictStartIndex ? + MEM_read32(idxBase + idx) : + MEM_read32(ip0) ^ 1; /* guaranteed not to match */ + + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ + goto _offset; + } } + + /* lookup ip[1] */ + idx = hashTable[hash1]; + idxBase = idx < prefixStartIndex ? dictBase : base; + + /* hash ip[2] */ + hash0 = hash1; + hash1 = ZSTD_hashPtr(ip2, hlog, mls); + + /* advance to next positions */ + ip0 = ip1; + ip1 = ip2; + ip2 = ip3; + + /* write back hash table entry */ + current0 = (U32)(ip0 - base); + hashTable[hash0] = current0; + + { /* load match for ip[0] */ + U32 const mval = idx >= dictStartIndex ? + MEM_read32(idxBase + idx) : + MEM_read32(ip0) ^ 1; /* guaranteed not to match */ + + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ + goto _offset; + } } + + /* lookup ip[1] */ + idx = hashTable[hash1]; + idxBase = idx < prefixStartIndex ? dictBase : base; + + /* hash ip[2] */ + hash0 = hash1; + hash1 = ZSTD_hashPtr(ip2, hlog, mls); + + /* advance to next positions */ + ip0 = ip1; + ip1 = ip2; + ip2 = ip0 + step; + ip3 = ip1 + step; + + /* calculate step */ + if (ip2 >= nextStep) { + step++; + PREFETCH_L1(ip1 + 64); + PREFETCH_L1(ip1 + 128); + nextStep += kStepIncr; + } + } while (ip3 < ilimit); + +_cleanup: + /* Note that there are probably still a couple positions we could search. + * However, it seems to be a meaningful performance hit to try to search + * them. So let's not. */ + + /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), + * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ + offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; /* save reps for next block */ - rep[0] = offset_1; - rep[1] = offset_2; + rep[0] = offset_1 ? offset_1 : offsetSaved1; + rep[1] = offset_2 ? offset_2 : offsetSaved2; /* Return the last literals size */ return (size_t)(iend - anchor); + +_offset: /* Requires: ip0, idx, idxBase */ + + /* Compute the offset code. */ + { U32 const offset = current0 - idx; + const BYTE* const lowMatchPtr = idx < prefixStartIndex ? 
dictStart : prefixStart; + matchEnd = idx < prefixStartIndex ? dictEnd : iend; + match0 = idxBase + idx; + offset_2 = offset_1; + offset_1 = offset; + offcode = OFFSET_TO_OFFBASE(offset); + mLength = 4; + + /* Count the backwards match length. */ + while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) { + ip0--; + match0--; + mLength++; + } } + +_match: /* Requires: ip0, match0, offcode, matchEnd */ + + /* Count the forward length. */ + assert(matchEnd != 0); + mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart); + + ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); + + ip0 += mLength; + anchor = ip0; + + /* write next hash table entry */ + if (ip1 < ip0) { + hashTable[hash1] = (U32)(ip1 - base); + } + + /* Fill table and check for immediate repcode. */ + if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+current0+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + while (ip0 <= ilimit) { + U32 const repIndex2 = (U32)(ip0-base) - offset_2; + const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; + if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0)) /* intentional underflow */ + && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); + ip0 += repLength2; + anchor = ip0; + continue; + } + break; + } } + + goto _start; } ZSTD_GEN_FAST_FN(extDict, 4, 0) @@ -660,6 +945,7 @@ size_t ZSTD_compressBlock_fast_extDict( void const* src, size_t srcSize) { U32 const mls = ms->cParams.minMatch; + assert(ms->dictMatchState == NULL); switch(mls) { default: /* includes case 3 */ diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h index fddc2f532d21d1..e64d9e1b2d393d 100644 --- a/lib/zstd/compress/zstd_fast.h +++ b/lib/zstd/compress/zstd_fast.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -16,7 +17,8 @@ #include "zstd_compress_internal.h" void ZSTD_fillHashTable(ZSTD_matchState_t* ms, - void const* end, ZSTD_dictTableLoadMethod_e dtlm); + void const* end, ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp); size_t ZSTD_compressBlock_fast( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c index 0298a01a7504a5..f6b4978ceba7f1 100644 --- a/lib/zstd/compress/zstd_lazy.c +++ b/lib/zstd/compress/zstd_lazy.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -10,6 +11,9 @@ #include "zstd_compress_internal.h" #include "zstd_lazy.h" +#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ + +#define kLazySkippingStep 8 /*-************************************* @@ -197,8 +201,8 @@ ZSTD_DUBT_findBetterDictMatch ( U32 matchIndex = dictMatchIndex + dictIndexDelta; if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", - curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); - bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); + curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex); + bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); } if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ break; /* drop, to guarantee consistency (miss a little bit of compression) */ @@ -218,7 +222,7 @@ ZSTD_DUBT_findBetterDictMatch ( } if (bestLength >= MINMATCH) { - U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; + U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex; DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", curr, (U32)bestLength, (U32)*offsetPtr, mIndex); } @@ -230,7 +234,7 @@ ZSTD_DUBT_findBetterDictMatch ( static size_t ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iend, - size_t* offsetPtr, + size_t* offBasePtr, U32 const mls, const ZSTD_dictMode_e dictMode) { @@ -327,8 +331,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, if (matchLength > bestLength) { if (matchLength > matchEndIdx - matchIndex) matchEndIdx = matchIndex + (U32)matchLength; - if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) - bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) ) + bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex); if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ if (dictMode == ZSTD_dictMatchState) { nbCompares = 0; /* in addition to avoiding checking any @@ -361,16 +365,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, if (dictMode == ZSTD_dictMatchState && nbCompares) { bestLength = ZSTD_DUBT_findBetterDictMatch( ms, ip, iend, - offsetPtr, bestLength, nbCompares, + offBasePtr, bestLength, nbCompares, mls, dictMode); } assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ if (bestLength >= MINMATCH) { - U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; + U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex; DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", - curr, (U32)bestLength, (U32)*offsetPtr, mIndex); + curr, (U32)bestLength, (U32)*offBasePtr, mIndex); } return bestLength; } @@ -381,14 +385,14 @@ 
ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, FORCE_INLINE_TEMPLATE size_t ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iLimit, - size_t* offsetPtr, + size_t* offBasePtr, const U32 mls /* template */, const ZSTD_dictMode_e dictMode) { DEBUGLOG(7, "ZSTD_BtFindBestMatch"); if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ ZSTD_updateDUBT(ms, ip, iLimit, mls); - return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); + return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode); } /* ********************************* @@ -561,7 +565,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb /* save best solution */ if (currentMl > ml) { ml = currentMl; - *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); + *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); if (ip+currentMl == iLimit) { /* best possible, avoids read overflow on next attempt */ return ml; @@ -598,7 +602,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb /* save best solution */ if (currentMl > ml) { ml = currentMl; - *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); + *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } } @@ -617,7 +621,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( ZSTD_matchState_t* ms, const ZSTD_compressionParameters* const cParams, - const BYTE* ip, U32 const mls) + const BYTE* ip, U32 const mls, U32 const lazySkipping) { U32* const hashTable = ms->hashTable; const U32 hashLog = cParams->hashLog; @@ -632,6 +636,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; hashTable[h] = idx; idx++; + /* Stop inserting every position when in the lazy skipping mode. 
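+         * Concretely: without lazy skipping this loop links every
+         * position in [ms->nextToUpdate, target) into its hash chain;
+         * with it, only the first (i.e. the searched) position is
+         * linked, so each search inserts exactly one entry instead of
+         * roughly `step` of them. Illustrative invariant, not code from
+         * this change:
+         *   insertions_per_search = lazySkipping ? 1 : (target - idx);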
*/ + if (lazySkipping) + break; } ms->nextToUpdate = target; @@ -640,7 +647,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { const ZSTD_compressionParameters* const cParams = &ms->cParams; - return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); + return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0); } /* inlining is important to hardwire a hot branch (template emulation) */ @@ -684,14 +691,15 @@ size_t ZSTD_HcFindBestMatch( } /* HC4 match finder */ - matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); + matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping); for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) { size_t currentMl=0; if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { const BYTE* const match = base + matchIndex; assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ - if (match[ml] == ip[ml]) /* potentially better */ + /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ + if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ currentMl = ZSTD_count(ip, match, iLimit); } else { const BYTE* const match = dictBase + matchIndex; @@ -703,7 +711,7 @@ size_t ZSTD_HcFindBestMatch( /* save best solution */ if (currentMl > ml) { ml = currentMl; - *offsetPtr = STORE_OFFSET(curr - matchIndex); + *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } @@ -739,7 +747,7 @@ size_t ZSTD_HcFindBestMatch( if (currentMl > ml) { ml = currentMl; assert(curr > matchIndex + dmsIndexDelta); - *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); + *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } @@ -756,8 +764,6 @@ size_t ZSTD_HcFindBestMatch( * (SIMD) Row-based matchfinder ***********************************/ /* Constants for row-based hash */ -#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ -#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ @@ -769,64 +775,19 @@ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr * Starting from the LSB, returns the idx of the next non-zero bit. * Basically counting the nb of trailing zeroes. 
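 * For example (illustrative only):
 *   ZSTD_VecMask m = 0x28;    /* 0b101000: bits 3 and 5 set */
 *   ZSTD_VecMask_next(m);     /* == 3 */
 *   m &= m - 1;               /* clear the lowest set bit */
 *   ZSTD_VecMask_next(m);     /* == 5 */
 * which is exactly how the row match loops below walk their masks.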
*/ -static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { - assert(val != 0); -# if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) - if (sizeof(size_t) == 4) { - U32 mostSignificantWord = (U32)(val >> 32); - U32 leastSignificantWord = (U32)val; - if (leastSignificantWord == 0) { - return 32 + (U32)__builtin_ctz(mostSignificantWord); - } else { - return (U32)__builtin_ctz(leastSignificantWord); - } - } else { - return (U32)__builtin_ctzll(val); - } -# else - /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count - * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer - */ - val = ~val & (val - 1ULL); /* Lowest set bit mask */ - val = val - ((val >> 1) & 0x5555555555555555); - val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); - return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); -# endif -} - -/* ZSTD_rotateRight_*(): - * Rotates a bitfield to the right by "count" bits. - * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts - */ -FORCE_INLINE_TEMPLATE -U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { - assert(count < 64); - count &= 0x3F; /* for fickle pattern recognition */ - return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); -} - -FORCE_INLINE_TEMPLATE -U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { - assert(count < 32); - count &= 0x1F; /* for fickle pattern recognition */ - return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); -} - -FORCE_INLINE_TEMPLATE -U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { - assert(count < 16); - count &= 0x0F; /* for fickle pattern recognition */ - return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); +MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { + return ZSTD_countTrailingZeros64(val); } /* ZSTD_row_nextIndex(): * Returns the next index to insert at within a tagTable row, and updates the "head" - * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) + * value to reflect the update. Essentially cycles backwards from [1, {entries per row}) */ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { - U32 const next = (*tagRow - 1) & rowMask; - *tagRow = (BYTE)next; - return next; + U32 next = (*tagRow-1) & rowMask; + next += (next == 0) ? rowMask : 0; /* skip first position */ + *tagRow = (BYTE)next; + return next; } /* ZSTD_isAligned(): @@ -840,7 +801,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { /* ZSTD_row_prefetch(): * Performs prefetching for the hashTable and tagTable at a given row. */ -FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { +FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) { PREFETCH_L1(hashTable + relRow); if (rowLog >= 5) { PREFETCH_L1(hashTable + relRow + 16); @@ -864,13 +825,13 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B U32 idx, const BYTE* const iLimit) { U32 const* const hashTable = ms->hashTable; - U16 const* const tagTable = ms->tagTable; + BYTE const* const tagTable = ms->tagTable; U32 const hashLog = ms->rowHashLog; U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 
0 : (U32)(iLimit - (base + idx) + 1); U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); for (; idx < lim; ++idx) { - U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; @@ -886,11 +847,12 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. */ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, - U16 const* tagTable, BYTE const* base, + BYTE const* tagTable, BYTE const* base, U32 idx, U32 const hashLog, - U32 const rowLog, U32 const mls) + U32 const rowLog, U32 const mls, + U64 const hashSalt) { - U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; @@ -908,22 +870,21 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, U32 const rowMask, U32 const useCache) { U32* const hashTable = ms->hashTable; - U16* const tagTable = ms->tagTable; + BYTE* const tagTable = ms->tagTable; U32 const hashLog = ms->rowHashLog; const BYTE* const base = ms->window.base; DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { - U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) - : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt) + : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; U32* const row = hashTable + relRow; - BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. 
- Explicit cast allows us to get exact desired position within each row */ + BYTE* tagRow = tagTable + relRow; U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); - assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); - ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; + assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); + tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK; row[pos] = updateStartIdx; } } @@ -971,7 +932,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); - ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); + ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */); +} + +/* Returns the mask width of bits group of which will be set to 1. Given not all + * architectures have easy movemask instruction, this helps to iterate over + * groups of bits easier and faster. + */ +FORCE_INLINE_TEMPLATE U32 +ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) +{ + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); + (void)rowEntries; +#if defined(ZSTD_ARCH_ARM_NEON) + /* NEON path only works for little endian */ + if (!MEM_isLittleEndian()) { + return 1; + } + if (rowEntries == 16) { + return 4; + } + if (rowEntries == 32) { + return 2; + } + if (rowEntries == 64) { + return 1; + } +#endif + return 1; } #if defined(ZSTD_ARCH_X86_SSE2) @@ -994,71 +983,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U } #endif -/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches - * the hash at the nth position in a row of the tagTable. - * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield - * to match up with the actual layout of the entries within the hashTable */ +#if defined(ZSTD_ARCH_ARM_NEON) +FORCE_INLINE_TEMPLATE ZSTD_VecMask +ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) +{ + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + if (rowEntries == 16) { + /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits. + * After that groups of 4 bits represent the equalMask. We lower + * all bits except the highest in these groups by doing AND with + * 0x88 = 0b10001000. + */ + const uint8x16_t chunk = vld1q_u8(src); + const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); + const uint8x8_t res = vshrn_n_u16(equalMask, 4); + const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0); + return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull; + } else if (rowEntries == 32) { + /* Same idea as with rowEntries == 16 but doing AND with + * 0x55 = 0b01010101. 
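+         * The narrowed result is a 64-bit mask carrying a 2-bit group
+         * per row entry (group width 2, matching
+         * ZSTD_row_matchMaskGroupWidth()), with one set bit per matching
+         * entry after the AND. Ignoring the head rotation, a match at
+         * entry n sets bit 2*n, e.g. entry 5 sets bit 10, and the caller
+         * recovers 5 as ZSTD_VecMask_next(matches) / groupWidth.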
+ */ + const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src); + const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); + const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); + const uint8x16_t dup = vdupq_n_u8(tag); + const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6); + const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6); + const uint8x8_t res = vsli_n_u8(t0, t1, 4); + const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ; + return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull; + } else { /* rowEntries == 64 */ + const uint8x16x4_t chunk = vld4q_u8(src); + const uint8x16_t dup = vdupq_n_u8(tag); + const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); + const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); + const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); + const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); + + const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); + const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); + const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); + const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); + const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); + const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); + return ZSTD_rotateRight_U64(matches, headGrouped); + } +} +#endif + +/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by + * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag" + * matches the hash at the nth position in a row of the tagTable. + * Each row is a circular buffer beginning at the value of "headGrouped". So we + * must rotate the "matches" bitfield to match up with the actual layout of the + * entries within the hashTable */ FORCE_INLINE_TEMPLATE ZSTD_VecMask -ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) +ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) { - const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; + const BYTE* const src = tagRow; assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); + assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); #if defined(ZSTD_ARCH_X86_SSE2) - return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head); + return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped); #else /* SW or NEON-LE */ # if defined(ZSTD_ARCH_ARM_NEON) /* This NEON path only works for little endian - otherwise use SWAR below */ if (MEM_isLittleEndian()) { - if (rowEntries == 16) { - const uint8x16_t chunk = vld1q_u8(src); - const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); - const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); - const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); - const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); - const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); - const U16 hi = (U16)vgetq_lane_u8(t3, 8); - const U16 lo = (U16)vgetq_lane_u8(t3, 0); - return ZSTD_rotateRight_U16((hi << 8) | lo, head); - } else if (rowEntries == 32) { - const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); - const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); - const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); - const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag)); - const uint8x16_t equalMask1 = vceqq_u8(chunk1, 
vdupq_n_u8(tag)); - const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); - const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); - const uint8x8_t t0 = vreinterpret_u8_s8(pack0); - const uint8x8_t t1 = vreinterpret_u8_s8(pack1); - const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); - const uint8x8x2_t t3 = vuzp_u8(t2, t0); - const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); - const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); - return ZSTD_rotateRight_U32(matches, head); - } else { /* rowEntries == 64 */ - const uint8x16x4_t chunk = vld4q_u8(src); - const uint8x16_t dup = vdupq_n_u8(tag); - const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); - const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); - const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); - const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); - - const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); - const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); - const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); - const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); - const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); - const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); - return ZSTD_rotateRight_U64(matches, head); - } + return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped); } # endif /* ZSTD_ARCH_ARM_NEON */ /* SWAR */ - { const size_t chunkSize = sizeof(size_t); + { const int chunkSize = sizeof(size_t); const size_t shiftAmount = ((chunkSize * 8) - chunkSize); const size_t xFF = ~((size_t)0); const size_t x01 = xFF / 0xFF; @@ -1091,11 +1091,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, } matches = ~matches; if (rowEntries == 16) { - return ZSTD_rotateRight_U16((U16)matches, head); + return ZSTD_rotateRight_U16((U16)matches, headGrouped); } else if (rowEntries == 32) { - return ZSTD_rotateRight_U32((U32)matches, head); + return ZSTD_rotateRight_U32((U32)matches, headGrouped); } else { - return ZSTD_rotateRight_U64((U64)matches, head); + return ZSTD_rotateRight_U64((U64)matches, headGrouped); } } #endif @@ -1125,7 +1125,7 @@ size_t ZSTD_RowFindBestMatch( const U32 rowLog) { U32* const hashTable = ms->hashTable; - U16* const tagTable = ms->tagTable; + BYTE* const tagTable = ms->tagTable; U32* const hashCache = ms->hashCache; const U32 hashLog = ms->rowHashLog; const ZSTD_compressionParameters* const cParams = &ms->cParams; @@ -1143,8 +1143,11 @@ size_t ZSTD_RowFindBestMatch( const U32 rowEntries = (1U << rowLog); const U32 rowMask = rowEntries - 1; const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ + const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); + const U64 hashSalt = ms->hashSalt; U32 nbAttempts = 1U << cappedSearchLog; size_t ml=4-1; + U32 hash; /* DMS/DDS variables that may be referenced laster */ const ZSTD_matchState_t* const dms = ms->dictMatchState; @@ -1168,7 +1171,7 @@ size_t ZSTD_RowFindBestMatch( if (dictMode == ZSTD_dictMatchState) { /* Prefetch DMS rows */ U32* const dmsHashTable = dms->hashTable; - U16* const dmsTagTable = dms->tagTable; + BYTE* const dmsTagTable = dms->tagTable; U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; @@ -1178,23 +1181,34 @@ size_t ZSTD_RowFindBestMatch( } /* Update the hashTable and tagTable up to (but not including) ip */ - ZSTD_row_update_internal(ms, ip, mls, rowLog, 
rowMask, 1 /* useCache */); + if (!ms->lazySkipping) { + ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); + hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); + } else { + /* Stop inserting every position when in the lazy skipping mode. + * The hash cache is also not kept up to date in this mode. + */ + hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); + ms->nextToUpdate = curr; + } + ms->hashSaltEntropy += hash; /* collect salt entropy */ + { /* Get the hash for ip, compute the appropriate row */ - U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; U32* const row = hashTable + relRow; BYTE* tagRow = (BYTE*)(tagTable + relRow); - U32 const head = *tagRow & rowMask; + U32 const headGrouped = (*tagRow & rowMask) * groupWidth; U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; size_t numMatches = 0; size_t currMatch = 0; - ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); + ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); /* Cycle through the matches and prefetch */ - for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { - U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; + for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { + U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; U32 const matchIndex = row[matchPos]; + if(matchPos == 0) continue; assert(numMatches < rowEntries); if (matchIndex < lowLimit) break; @@ -1204,13 +1218,14 @@ size_t ZSTD_RowFindBestMatch( PREFETCH_L1(dictBase + matchIndex); } matchBuffer[numMatches++] = matchIndex; + --nbAttempts; } /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop in ZSTD_row_update_internal() at the next search. 
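         * Note that in the one-byte tag layout, byte 0 of each tag row
         * doubles as the row's head counter (see headGrouped above), so
         * entry 0 never holds a real tag: ZSTD_row_nextIndex()
         * deliberately wraps past it, and the match loop above skips
         * candidates with matchPos == 0 for the same reason.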
*/ { U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); - tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag; + tagRow[pos] = (BYTE)tag; row[pos] = ms->nextToUpdate++; } @@ -1224,7 +1239,8 @@ size_t ZSTD_RowFindBestMatch( if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { const BYTE* const match = base + matchIndex; assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ - if (match[ml] == ip[ml]) /* potentially better */ + /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ + if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ currentMl = ZSTD_count(ip, match, iLimit); } else { const BYTE* const match = dictBase + matchIndex; @@ -1236,7 +1252,7 @@ size_t ZSTD_RowFindBestMatch( /* Save best solution */ if (currentMl > ml) { ml = currentMl; - *offsetPtr = STORE_OFFSET(curr - matchIndex); + *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } } @@ -1254,19 +1270,21 @@ size_t ZSTD_RowFindBestMatch( const U32 dmsSize = (U32)(dmsEnd - dmsBase); const U32 dmsIndexDelta = dictLimit - dmsSize; - { U32 const head = *dmsTagRow & rowMask; + { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; size_t numMatches = 0; size_t currMatch = 0; - ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); + ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); - for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { - U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; + for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { + U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; U32 const matchIndex = dmsRow[matchPos]; + if(matchPos == 0) continue; if (matchIndex < dmsLowestIndex) break; PREFETCH_L1(dmsBase + matchIndex); matchBuffer[numMatches++] = matchIndex; + --nbAttempts; } /* Return the longest match */ @@ -1285,7 +1303,7 @@ size_t ZSTD_RowFindBestMatch( if (currentMl > ml) { ml = currentMl; assert(curr > matchIndex + dmsIndexDelta); - *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); + *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); if (ip+currentMl == iLimit) break; } } @@ -1491,7 +1509,8 @@ ZSTD_compressBlock_lazy_generic( const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); - U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; + U32 offset_1 = rep[0], offset_2 = rep[1]; + U32 offsetSaved1 = 0, offsetSaved2 = 0; const int isDMS = dictMode == ZSTD_dictMatchState; const int isDDS = dictMode == ZSTD_dedicatedDictSearch; @@ -1512,8 +1531,8 @@ ZSTD_compressBlock_lazy_generic( U32 const curr = (U32)(ip - base); U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); U32 const maxRep = curr - windowLow; - if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; - if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; + if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; + if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; } if (isDxS) { /* dictMatchState repCode checks don't currently handle repCode == 0 @@ -1522,10 +1541,11 @@ ZSTD_compressBlock_lazy_generic( assert(offset_2 <= dictAndPrefixLength); } + /* Reset the lazy skipping state */ + ms->lazySkipping = 0; + 
if (searchMethod == search_rowHash) { - ZSTD_row_fillHashCache(ms, base, rowLog, - MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), - ms->nextToUpdate, ilimit); + ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); } /* Match Loop */ @@ -1537,7 +1557,7 @@ ZSTD_compressBlock_lazy_generic( #endif while (ip < ilimit) { size_t matchLength=0; - size_t offcode=STORE_REPCODE_1; + size_t offBase = REPCODE1_TO_OFFBASE; const BYTE* start=ip+1; DEBUGLOG(7, "search baseline (depth 0)"); @@ -1562,14 +1582,23 @@ ZSTD_compressBlock_lazy_generic( } /* first search (depth 0) */ - { size_t offsetFound = 999999999; - size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode); + { size_t offbaseFound = 999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); if (ml2 > matchLength) - matchLength = ml2, start = ip, offcode=offsetFound; + matchLength = ml2, start = ip, offBase = offbaseFound; } if (matchLength < 4) { - ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ + size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */; + ip += step; + /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. + * In this mode we stop inserting every position into our tables, and only insert + * positions that we search, which is one in step positions. + * The exact cutoff is flexible, I've just chosen a number that is reasonably high, + * so we minimize the compression ratio loss in "normal" scenarios. This mode gets + * triggered once we've gone 2KB without finding any matches. + */ + ms->lazySkipping = step > kLazySkippingStep; continue; } @@ -1579,12 +1608,12 @@ ZSTD_compressBlock_lazy_generic( DEBUGLOG(7, "search depth 1"); ip ++; if ( (dictMode == ZSTD_noDict) - && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; int const gain2 = (int)(mlRep * 3); - int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); if ((mlRep >= 4) && (gain2 > gain1)) - matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; + matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; } if (isDxS) { const U32 repIndex = (U32)(ip - base) - offset_1; @@ -1596,17 +1625,17 @@ ZSTD_compressBlock_lazy_generic( const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; int const gain2 = (int)(mlRep * 3); - int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); if ((mlRep >= 4) && (gain2 > gain1)) - matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; + matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; } } - { size_t offset2=999999999; - size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); + { size_t ofbCandidate=999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); if ((ml2 >= 4) && (gain2 > gain1)) { - matchLength = ml2, offcode = offset2, start = ip; + matchLength = ml2, offBase = ofbCandidate, start = ip; continue; /* search a better one */ } } @@ -1615,12 +1644,12 @@ ZSTD_compressBlock_lazy_generic( DEBUGLOG(7, "search depth 2"); ip ++; if ( (dictMode == ZSTD_noDict) - && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; int const gain2 = (int)(mlRep * 4); - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); if ((mlRep >= 4) && (gain2 > gain1)) - matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; + matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; } if (isDxS) { const U32 repIndex = (U32)(ip - base) - offset_1; @@ -1632,17 +1661,17 @@ ZSTD_compressBlock_lazy_generic( const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; int const gain2 = (int)(mlRep * 4); - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); if ((mlRep >= 4) && (gain2 > gain1)) - matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; + matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; } } - { size_t offset2=999999999; - size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); + { size_t ofbCandidate=999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); if ((ml2 >= 4) && (gain2 > gain1)) { - matchLength = ml2, offcode = offset2, start = ip; + matchLength = ml2, offBase = ofbCandidate, start = ip; continue; } } } break; /* nothing found : store previous solution */ @@ -1653,26 +1682,33 @@ ZSTD_compressBlock_lazy_generic( * notably if `value` is unsigned, resulting in a large positive `-value`. */ /* catch up */ - if (STORED_IS_OFFSET(offcode)) { + if (OFFBASE_IS_OFFSET(offBase)) { if (dictMode == ZSTD_noDict) { - while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest)) - && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */ + while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) + && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ { start--; matchLength++; } } if (isDxS) { - U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); + U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ } - offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); + offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); } /* store sequence */ _storeSequence: { size_t const litLength = (size_t)(start - anchor); - ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); anchor = ip = start + matchLength; } + if (ms->lazySkipping) { + /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ + if (searchMethod == search_rowHash) { + ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + ms->lazySkipping = 0; + } /* check immediate repcode */ if (isDxS) { @@ -1686,8 +1722,8 @@ ZSTD_compressBlock_lazy_generic( && (MEM_read32(repMatch) == MEM_read32(ip)) ) { const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? 
dictEnd : iend; matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; - offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); + offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); ip += matchLength; anchor = ip; continue; @@ -1701,16 +1737,20 @@ ZSTD_compressBlock_lazy_generic( && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { /* store sequence */ matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; - offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); + offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); ip += matchLength; anchor = ip; continue; /* faster when present ... (?) */ } } } - /* Save reps for next block */ - rep[0] = offset_1 ? offset_1 : savedOffset; - rep[1] = offset_2 ? offset_2 : savedOffset; + /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), + * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ + offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ + rep[0] = offset_1 ? offset_1 : offsetSaved1; + rep[1] = offset_2 ? offset_2 : offsetSaved2; /* Return the last literals size */ return (size_t)(iend - anchor); @@ -1886,12 +1926,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); + /* Reset the lazy skipping state */ + ms->lazySkipping = 0; + /* init */ ip += (ip == prefixStart); if (searchMethod == search_rowHash) { - ZSTD_row_fillHashCache(ms, base, rowLog, - MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), - ms->nextToUpdate, ilimit); + ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); } /* Match Loop */ @@ -1903,7 +1944,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( #endif while (ip < ilimit) { size_t matchLength=0; - size_t offcode=STORE_REPCODE_1; + size_t offBase = REPCODE1_TO_OFFBASE; const BYTE* start=ip+1; U32 curr = (U32)(ip-base); @@ -1922,14 +1963,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( } } /* first search (depth 0) */ - { size_t offsetFound = 999999999; - size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict); + { size_t ofbCandidate = 999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); if (ml2 > matchLength) - matchLength = ml2, start = ip, offcode=offsetFound; + matchLength = ml2, start = ip, offBase = ofbCandidate; } if (matchLength < 4) { - ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ + size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); + ip += step + 1; /* jump faster over incompressible sections */ + /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. + * In this mode we stop inserting every position into our tables, and only insert + * positions that we search, which is one in step positions. 
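+             * Concretely, assuming kSearchStrength == 8 (its value in
+             * zstd_compress_internal.h): step = (ip - anchor) >> 8, so
+             * step exceeds kLazySkippingStep (8) once ip - anchor reaches
+             * 9 << 8 = 2304 bytes, i.e. the ~2KB mentioned below.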
+ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, + * so we minimize the compression ratio loss in "normal" scenarios. This mode gets + * triggered once we've gone 2KB without finding any matches. + */ + ms->lazySkipping = step > kLazySkippingStep; continue; } @@ -1939,7 +1989,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( ip ++; curr++; /* check repCode */ - if (offcode) { + if (offBase) { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); const U32 repIndex = (U32)(curr - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; @@ -1951,18 +2001,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; int const gain2 = (int)(repLength * 3); - int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); if ((repLength >= 4) && (gain2 > gain1)) - matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; + matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; } } /* search match, depth 1 */ - { size_t offset2=999999999; - size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); + { size_t ofbCandidate = 999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); if ((ml2 >= 4) && (gain2 > gain1)) { - matchLength = ml2, offcode = offset2, start = ip; + matchLength = ml2, offBase = ofbCandidate, start = ip; continue; /* search a better one */ } } @@ -1971,7 +2021,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( ip ++; curr++; /* check repCode */ - if (offcode) { + if (offBase) { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); const U32 repIndex = (U32)(curr - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; @@ -1983,38 +2033,45 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; int const gain2 = (int)(repLength * 4); - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); if ((repLength >= 4) && (gain2 > gain1)) - matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; + matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; } } /* search match, depth 2 */ - { size_t offset2=999999999; - size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); + { size_t ofbCandidate = 999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); if ((ml2 >= 4) && (gain2 > gain1)) { - matchLength = ml2, offcode = offset2, start = ip; + matchLength = ml2, offBase = ofbCandidate, start = ip; continue; } } } break; /* nothing found : store previous solution */ } /* catch up */ - if (STORED_IS_OFFSET(offcode)) { - U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); + if (OFFBASE_IS_OFFSET(offBase)) { + U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ - offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); + offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); } /* store sequence */ _storeSequence: { size_t const litLength = (size_t)(start - anchor); - ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); anchor = ip = start + matchLength; } + if (ms->lazySkipping) { + /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ + if (searchMethod == search_rowHash) { + ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + ms->lazySkipping = 0; + } /* check immediate repcode */ while (ip <= ilimit) { @@ -2029,8 +2086,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( /* repcode detected we should take it */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; - offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); + offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); ip += matchLength; anchor = ip; continue; /* faster when present ... (?) 
*/ @@ -2096,7 +2153,6 @@ size_t ZSTD_compressBlock_lazy_extDict_row( size_t ZSTD_compressBlock_lazy2_extDict_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) - { return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); } diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h index e5bdf4df8dde0b..9505bed93c0313 100644 --- a/lib/zstd/compress/zstd_lazy.h +++ b/lib/zstd/compress/zstd_lazy.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -22,6 +23,8 @@ */ #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 +#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); @@ -113,7 +116,7 @@ size_t ZSTD_compressBlock_lazy2_extDict_row( size_t ZSTD_compressBlock_btlazy2_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); - + #endif /* ZSTD_LAZY_H */ diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c index dd86fc83e7dde3..b7da76b0db7c44 100644 --- a/lib/zstd/compress/zstd_ldm.c +++ b/lib/zstd/compress/zstd_ldm.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -242,11 +243,11 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, switch(ms->cParams.strategy) { case ZSTD_fast: - ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast); + ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); break; case ZSTD_dfast: - ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast); + ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); break; case ZSTD_greedy: @@ -549,7 +550,7 @@ size_t ZSTD_ldm_generateSequences( * the window through early invalidation. * TODO: * Test the chunk size. * * Try invalidation after the sequence generation and test the - * the offset against maxDist directly. + * offset against maxDist directly. * * NOTE: Because of dictionaries + sequence splitting we MUST make sure * that any offset used is valid at the END of the sequence, since it may @@ -711,7 +712,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, rep[0] = sequence.offset; /* Store the sequence */ ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, - STORE_OFFSET(sequence.offset), + OFFSET_TO_OFFBASE(sequence.offset), sequence.matchLength); ip += sequence.matchLength; } diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h index fbc6a5e88fd7a5..c540731abde725 100644 --- a/lib/zstd/compress/zstd_ldm.h +++ b/lib/zstd/compress/zstd_ldm.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
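A note on the rename running through the zstd_lazy.c hunks above: offBase is a "sumtype" that packs repcodes and real offsets into one integer. A minimal sketch of the convention, mirroring zstd's zstd_compress_internal.h (the exact macro bodies here are assumptions, shown for illustration):

    #define ZSTD_REP_NUM           3
    #define REPCODE_TO_OFFBASE(r)  (r)                   /* repcodes: 1..3 */
    #define REPCODE1_TO_OFFBASE    REPCODE_TO_OFFBASE(1)
    #define OFFSET_TO_OFFBASE(o)   ((o) + ZSTD_REP_NUM)  /* real offsets: o > 0 */
    #define OFFBASE_IS_OFFSET(ob)  ((ob) > ZSTD_REP_NUM)
    #define OFFBASE_TO_OFFSET(ob)  ((ob) - ZSTD_REP_NUM)

With this encoding, ZSTD_highbit32((U32)offBase) in the gain formulas above prices a candidate directly from its packed value, which is why the intermediate STORED_TO_OFFBASE() conversion disappears.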
* * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h index 647f865be29033..cfccfc46f6f7b7 100644 --- a/lib/zstd/compress/zstd_ldm_geartab.h +++ b/lib/zstd/compress/zstd_ldm_geartab.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c index fd82acfda62f6c..1e41cb04f48207 100644 --- a/lib/zstd/compress/zstd_opt.c +++ b/lib/zstd/compress/zstd_opt.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* - * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -16,7 +17,7 @@ #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ #define ZSTD_MAX_PRICE (1<<30) -#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ +#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ /*-************************************* @@ -26,27 +27,35 @@ #if 0 /* approximation at bit level (for tests) */ # define BITCOST_ACCURACY 0 # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) -# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat)) +# define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat)) #elif 0 /* fractional bit accuracy (for tests) */ # define BITCOST_ACCURACY 8 # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) -# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) +# define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat)) #else /* opt==approx, ultra==accurate */ # define BITCOST_ACCURACY 8 # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) -# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) +# define WEIGHT(stat,opt) ((opt) ? 
ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) #endif +/* ZSTD_bitWeight() : + * provide estimated "cost" of a stat in full bits only */ MEM_STATIC U32 ZSTD_bitWeight(U32 stat) { return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER); } +/* ZSTD_fracWeight() : + * provide fractional-bit "cost" of a stat, + * using linear interpolation approximation */ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) { U32 const stat = rawStat + 1; U32 const hb = ZSTD_highbit32(stat); U32 const BWeight = hb * BITCOST_MULTIPLIER; + /* Fweight was meant for "Fractional weight" + * but it's effectively a value between 1 and 2 + * using fixed point arithmetic */ U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb; U32 const weight = BWeight + FWeight; assert(hb + BITCOST_ACCURACY < 31); @@ -57,7 +66,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) /* debugging function, * @return price in bytes as fractional value * for debug messages only */ -MEM_STATIC double ZSTD_fCost(U32 price) +MEM_STATIC double ZSTD_fCost(int price) { return (double)price / (BITCOST_MULTIPLIER*8); } @@ -88,20 +97,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts) return total; } -static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift) +typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e; + +static U32 +ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1) { U32 s, sum=0; - DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift); + DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", + (unsigned)lastEltIndex+1, (unsigned)shift ); assert(shift < 30); for (s=0; s> shift); - sum += table[s]; + unsigned const base = base1 ? 1 : (table[s]>0); + unsigned const newStat = base + (table[s] >> shift); + sum += newStat; + table[s] = newStat; } return sum; } /* ZSTD_scaleStats() : - * reduce all elements in table is sum too large + * reduce all elt frequencies in table if sum too large * return the resulting sum of elements */ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) { @@ -110,7 +125,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget); assert(logTarget < 30); if (factor <= 1) return prevsum; - return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor)); + return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed); } /* ZSTD_rescaleFreqs() : @@ -129,18 +144,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize); optPtr->priceType = zop_dynamic; - if (optPtr->litLengthSum == 0) { /* first block : init */ - if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */ - DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef"); + if (optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */ + + /* heuristic: use pre-defined stats for too small inputs */ + if (srcSize <= ZSTD_PREDEF_THRESHOLD) { + DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD); optPtr->priceType = zop_predef; } assert(optPtr->symbolCosts != NULL); if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { - /* huffman table presumed generated by dictionary */ + + /* huffman stats covering the full value set : table presumed generated by dictionary */ optPtr->priceType = zop_dynamic; if (compressedLiterals) { + /* generate 
literals statistics from huffman table */ unsigned lit; assert(optPtr->litFreq != NULL); optPtr->litSum = 0; @@ -188,13 +207,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, optPtr->offCodeSum += optPtr->offCodeFreq[of]; } } - } else { /* not a dictionary */ + } else { /* first block, no dictionary */ assert(optPtr->litFreq != NULL); if (compressedLiterals) { + /* base initial cost of literals on direct frequency within src */ unsigned lit = MaxLit; HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ - optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8); + optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible); } { unsigned const baseLLfreqs[MaxLL+1] = { @@ -224,10 +244,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1); } - } - } else { /* new block : re-use previous statistics, scaled down */ + } else { /* new block : scale down accumulated statistics */ if (compressedLiterals) optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); @@ -255,11 +274,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */ /* dynamic statistics */ - { U32 price = litLength * optPtr->litSumBasePrice; + { U32 price = optPtr->litSumBasePrice * litLength; + U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER; U32 u; + assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER); for (u=0; u < litLength; u++) { - assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */ - price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel); + U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel); + if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax; + price -= litPrice; } return price; } @@ -272,10 +294,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP assert(litLength <= ZSTD_BLOCKSIZE_MAX); if (optPtr->priceType == zop_predef) return WEIGHT(litLength, optLevel); - /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX - * because it isn't representable in the zstd format. So instead just - * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block - * would be all literals. + + /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX + * because it isn't representable in the zstd format. + * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. + * In such a case, the block would be all literals. */ if (litLength == ZSTD_BLOCKSIZE_MAX) return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel); @@ -289,24 +312,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP } /* ZSTD_getMatchPrice() : - * Provides the cost of the match part (offset + matchLength) of a sequence + * Provides the cost of the match part (offset + matchLength) of a sequence. * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. 
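All of these price functions reduce to the WEIGHT() fixed-point cost defined earlier in the file. A self-contained sketch of the fractional variant annotated above (illustrative; it mirrors ZSTD_fracWeight(), and assumes stat stays small enough that the shift cannot overflow):

    #include <assert.h>

    #define BITCOST_ACCURACY   8
    #define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)

    static unsigned highbit32(unsigned v)  /* index of highest set bit, v != 0 */
    {
        unsigned n = 0;
        assert(v != 0);
        while (v >>= 1) n++;
        return n;
    }

    /* Cost of a stat in 1/256ths of a bit: whole bits from highbit32(),
     * plus a linear-interpolation fraction in [256,512), the "value
     * between 1 and 2" mentioned in the FWeight comment. */
    static unsigned fracWeight(unsigned rawStat)
    {
        unsigned const stat    = rawStat + 1;       /* assumes stat < (1u << 23) */
        unsigned const hb      = highbit32(stat);
        unsigned const BWeight = hb * BITCOST_MULTIPLIER;
        unsigned const FWeight = (stat << BITCOST_ACCURACY) >> hb;
        return BWeight + FWeight;
    }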
- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2 + * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq() * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) */ FORCE_INLINE_TEMPLATE U32 -ZSTD_getMatchPrice(U32 const offcode, +ZSTD_getMatchPrice(U32 const offBase, U32 const matchLength, const optState_t* const optPtr, int const optLevel) { U32 price; - U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode)); + U32 const offCode = ZSTD_highbit32(offBase); U32 const mlBase = matchLength - MINMATCH; assert(matchLength >= MINMATCH); - if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ - return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); + if (optPtr->priceType == zop_predef) /* fixed scheme, does not use statistics */ + return WEIGHT(mlBase, optLevel) + + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */ /* dynamic statistics */ price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); @@ -325,10 +349,10 @@ ZSTD_getMatchPrice(U32 const offcode, } /* ZSTD_updateStats() : - * assumption : literals + litLengtn <= iend */ + * assumption : literals + litLength <= iend */ static void ZSTD_updateStats(optState_t* const optPtr, U32 litLength, const BYTE* literals, - U32 offsetCode, U32 matchLength) + U32 offBase, U32 matchLength) { /* literals */ if (ZSTD_compressedLiterals(optPtr)) { @@ -344,8 +368,8 @@ static void ZSTD_updateStats(optState_t* const optPtr, optPtr->litLengthSum++; } - /* offset code : expected to follow storeSeq() numeric representation */ - { U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode)); + /* offset code : follows storeSeq() numeric representation */ + { U32 const offCode = ZSTD_highbit32(offBase); assert(offCode <= MaxOff); optPtr->offCodeFreq[offCode]++; optPtr->offCodeSum++; @@ -552,16 +576,17 @@ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict); } -FORCE_INLINE_TEMPLATE -U32 ZSTD_insertBtAndGetAllMatches ( - ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ - ZSTD_matchState_t* ms, - U32* nextToUpdate3, - const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode, - const U32 rep[ZSTD_REP_NUM], - U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ - const U32 lengthToBeat, - U32 const mls /* template */) +FORCE_INLINE_TEMPLATE U32 +ZSTD_insertBtAndGetAllMatches ( + ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ + ZSTD_matchState_t* ms, + U32* nextToUpdate3, + const BYTE* const ip, const BYTE* const iLimit, + const ZSTD_dictMode_e dictMode, + const U32 rep[ZSTD_REP_NUM], + const U32 ll0, /* tells if associated literal length is 0 or not. 
This value must be 0 or 1 */ + const U32 lengthToBeat, + const U32 mls /* template */) { const ZSTD_compressionParameters* const cParams = &ms->cParams; U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); @@ -644,7 +669,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", repCode, ll0, repOffset, repLen); bestLength = repLen; - matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */ + matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */ matches[mnum].len = (U32)repLen; mnum++; if ( (repLen > sufficient_len) @@ -673,7 +698,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( bestLength = mlen; assert(curr > matchIndex3); assert(mnum==0); /* no prior solution */ - matches[0].off = STORE_OFFSET(curr - matchIndex3); + matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3); matches[0].len = (U32)mlen; mnum = 1; if ( (mlen > sufficient_len) | @@ -706,13 +731,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( } if (matchLength > bestLength) { - DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", - (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); + DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)", + (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); assert(matchEndIdx > matchIndex); if (matchLength > matchEndIdx - matchIndex) matchEndIdx = matchIndex + (U32)matchLength; bestLength = matchLength; - matches[mnum].off = STORE_OFFSET(curr - matchIndex); + matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); matches[mnum].len = (U32)matchLength; mnum++; if ( (matchLength > ZSTD_OPT_NUM) @@ -754,12 +779,12 @@ U32 ZSTD_insertBtAndGetAllMatches ( if (matchLength > bestLength) { matchIndex = dictMatchIndex + dmsIndexDelta; - DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", - (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); + DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)", + (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); if (matchLength > matchEndIdx - matchIndex) matchEndIdx = matchIndex + (U32)matchLength; bestLength = matchLength; - matches[mnum].off = STORE_OFFSET(curr - matchIndex); + matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); matches[mnum].len = (U32)matchLength; mnum++; if ( (matchLength > ZSTD_OPT_NUM) @@ -960,7 +985,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, const ZSTD_optLdm_t* optLdm, U32 currPosInBlock) { U32 const posDiff = currPosInBlock - optLdm->startPosInBlock; - /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */ + /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */ U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; /* Ensure that current block position is not outside of the match */ @@ -971,11 +996,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, } if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) { - U32 const candidateOffCode = STORE_OFFSET(optLdm->offset); - DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u", - candidateOffCode, candidateMatchLength, currPosInBlock); + U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset); + 
DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u", + candidateOffBase, candidateMatchLength, currPosInBlock); matches[*nbMatches].len = candidateMatchLength; - matches[*nbMatches].off = candidateOffCode; + matches[*nbMatches].off = candidateOffBase; (*nbMatches)++; } } @@ -1062,6 +1087,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, ZSTD_optimal_t lastSequence; ZSTD_optLdm_t optLdm; + ZSTD_memset(&lastSequence, 0, sizeof(ZSTD_optimal_t)); + optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore; optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip)); @@ -1098,14 +1125,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, /* large match -> immediate encoding */ { U32 const maxML = matches[nbMatches-1].len; - U32 const maxOffcode = matches[nbMatches-1].off; - DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series", - nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart)); + U32 const maxOffBase = matches[nbMatches-1].off; + DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series", + nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart)); if (maxML > sufficient_len) { lastSequence.litlen = litlen; lastSequence.mlen = maxML; - lastSequence.off = maxOffcode; + lastSequence.off = maxOffBase; DEBUGLOG(6, "large match (%u>%u), immediate encoding", maxML, sufficient_len); cur = 0; @@ -1122,15 +1149,15 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ } for (matchNb = 0; matchNb < nbMatches; matchNb++) { - U32 const offcode = matches[matchNb].off; + U32 const offBase = matches[matchNb].off; U32 const end = matches[matchNb].len; for ( ; pos <= end ; pos++ ) { - U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel); + U32 const matchPrice = ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel); U32 const sequencePrice = literalsPrice + matchPrice; DEBUGLOG(7, "rPos:%u => set initial price : %.2f", - pos, ZSTD_fCost(sequencePrice)); + pos, ZSTD_fCost((int)sequencePrice)); opt[pos].mlen = pos; - opt[pos].off = offcode; + opt[pos].off = offBase; opt[pos].litlen = litlen; opt[pos].price = (int)sequencePrice; } } @@ -1230,7 +1257,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, U32 const startML = (matchNb>0) ? 
matches[matchNb-1].len+1 : minMatch; U32 mlen; - DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u", + DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u", matchNb, matches[matchNb].off, lastML, litlen); for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ @@ -1296,7 +1323,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, for (storePos=storeStart; storePos <= storeEnd; storePos++) { U32 const llen = opt[storePos].litlen; U32 const mlen = opt[storePos].mlen; - U32 const offCode = opt[storePos].off; + U32 const offBase = opt[storePos].off; U32 const advance = llen + mlen; DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", anchor - istart, (unsigned)llen, (unsigned)mlen); @@ -1308,8 +1335,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, } assert(anchor + llen <= iend); - ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); - ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen); + ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen); + ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen); anchor += advance; ip = anchor; } } @@ -1349,7 +1376,7 @@ size_t ZSTD_compressBlock_btopt( /* ZSTD_initStats_ultra(): * make a first compression pass, just to seed stats with more accurate starting values. * only works on first block, with no dictionary and no ldm. - * this function cannot error, hence its contract must be respected. + * this function cannot error out, its narrow contract must be respected. */ static void ZSTD_initStats_ultra(ZSTD_matchState_t* ms, @@ -1368,7 +1395,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ - /* invalidate first scan from history */ + /* invalidate first scan from history, only keep entropy stats */ ZSTD_resetSeqStore(seqStore); ms->window.base -= srcSize; ms->window.dictLimit += (U32)srcSize; @@ -1392,20 +1419,20 @@ size_t ZSTD_compressBlock_btultra2( U32 const curr = (U32)((const BYTE*)src - ms->window.base); DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); - /* 2-pass strategy: + /* 2-passes strategy: * this strategy makes a first pass over first block to collect statistics - * and seed next round's statistics with it. - * After 1st pass, function forgets everything, and starts a new block. + * in order to seed next round's statistics with it. + * After 1st pass, function forgets history, and starts a new block. * Consequently, this can only work if no data has been previously loaded in tables, * aka, no dictionary, no prefix, no ldm preprocessing. * The compression ratio gain is generally small (~0.5% on first block), - * the cost is 2x cpu time on first block. */ + ** the cost is 2x cpu time on first block. 
*/ assert(srcSize <= ZSTD_BLOCKSIZE_MAX); if ( (ms->opt.litLengthSum==0) /* first block */ && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ - && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ - && (srcSize > ZSTD_PREDEF_THRESHOLD) + && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ + && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */ ) { ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); } diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h index 22b862858ba7a3..faa73ff4b03dcd 100644 --- a/lib/zstd/compress/zstd_opt.h +++ b/lib/zstd/compress/zstd_opt.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c index 60958afebc4150..d172e35fbd9a61 100644 --- a/lib/zstd/decompress/huf_decompress.c +++ b/lib/zstd/decompress/huf_decompress.c @@ -1,7 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* ****************************************************************** * huff0 huffman decoder, * part of Finite State Entropy library - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * * You can contact the author at : * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy @@ -19,10 +20,10 @@ #include "../common/compiler.h" #include "../common/bitstream.h" /* BIT_* */ #include "../common/fse.h" /* to compress headers */ -#define HUF_STATIC_LINKING_ONLY #include "../common/huf.h" #include "../common/error_private.h" #include "../common/zstd_internal.h" +#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */ /* ************************************************************** * Constants @@ -43,27 +44,25 @@ #error "Cannot force the use of the X1 and X2 decoders at the same time!" #endif -#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 -# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE +/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is + * supported at runtime, so we can add the BMI2 target attribute. + * When it is disabled, we will still get BMI2 if it is enabled statically. 
+ */ +#if DYNAMIC_BMI2 +# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE #else -# define HUF_ASM_X86_64_BMI2_ATTRS +# define HUF_FAST_BMI2_ATTRS #endif #define HUF_EXTERN_C #define HUF_ASM_DECL HUF_EXTERN_C -#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +#if DYNAMIC_BMI2 # define HUF_NEED_BMI2_FUNCTION 1 #else # define HUF_NEED_BMI2_FUNCTION 0 #endif -#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) -# define HUF_NEED_DEFAULT_FUNCTION 1 -#else -# define HUF_NEED_DEFAULT_FUNCTION 0 -#endif - /* ************************************************************** * Error Management ****************************************************************/ @@ -80,6 +79,11 @@ /* ************************************************************** * BMI2 Variant Wrappers ****************************************************************/ +typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize, + const void *cSrc, + size_t cSrcSize, + const HUF_DTable *DTable); + #if DYNAMIC_BMI2 #define HUF_DGEN(fn) \ @@ -101,9 +105,9 @@ } \ \ static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ - size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ + size_t cSrcSize, HUF_DTable const* DTable, int flags) \ { \ - if (bmi2) { \ + if (flags & HUF_flags_bmi2) { \ return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ } \ return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ @@ -113,9 +117,9 @@ #define HUF_DGEN(fn) \ static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ - size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ + size_t cSrcSize, HUF_DTable const* DTable, int flags) \ { \ - (void)bmi2; \ + (void)flags; \ return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ } @@ -134,15 +138,28 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) return dtd; } -#if ZSTD_ENABLE_ASM_X86_64_BMI2 - -static size_t HUF_initDStream(BYTE const* ip) { +static size_t HUF_initFastDStream(BYTE const* ip) { BYTE const lastByte = ip[7]; - size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; + size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; size_t const value = MEM_readLEST(ip) | 1; assert(bitsConsumed <= 8); + assert(sizeof(size_t) == 8); return value << bitsConsumed; } + + +/* + * The input/output arguments to the Huffman fast decoding loop: + * + * ip [in/out] - The input pointers, must be updated to reflect what is consumed. + * op [in/out] - The output pointers, must be updated to reflect what is written. + * bits [in/out] - The bitstream containers, must be updated to reflect the current state. + * dt [in] - The decoding table. + * ilimit [in] - The input limit, stop when any input pointer is below ilimit. + * oend [in] - The end of the output stream. op[3] must not cross oend. + * iend [in] - The end of each input stream. ip[i] may cross iend[i], + * as long as it is above ilimit, but that indicates corruption. + */ typedef struct { BYTE const* ip[4]; BYTE* op[4]; @@ -151,15 +168,17 @@ typedef struct { BYTE const* ilimit; BYTE* oend; BYTE const* iend[4]; -} HUF_DecompressAsmArgs; +} HUF_DecompressFastArgs; + +typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*); /* - * Initializes args for the asm decoding loop. - * @returns 0 on success - * 1 if the fallback implementation should be used. + * Initializes args for the fast decoding loop. + * @returns 1 on success + * 0 if the fallback implementation should be used. * Or an error code on failure. 
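The sentinel trick that HUF_initFastDStream() relies on is easiest to see in isolation. A minimal sketch, assuming a 64-bit little-endian target and GCC/Clang builtins (the kernel build would use its own portability helpers):

    #include <stdint.h>
    #include <string.h>

    /* Load the last 8 bytes of a stream into a bit container. The stream's
     * final byte ends in a 1-bit marker; ORing in a low sentinel bit keeps
     * ctz(container) equal to the number of bits consumed so far. */
    static uint64_t init_fast_dstream(const uint8_t *ip)
    {
        uint8_t const lastByte = ip[7];
        unsigned const bitsConsumed =
            lastByte ? 8u - (31u - (unsigned)__builtin_clz(lastByte)) : 0u;
        uint64_t v;
        memcpy(&v, ip, sizeof(v));          /* little-endian 64-bit read */
        return (v | 1) << bitsConsumed;     /* plant the sentinel, drop the marker */
    }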
*/ -static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) +static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) { void const* dt = DTable + 1; U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; @@ -168,9 +187,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, BYTE* const oend = (BYTE*)dst + dstSize; - /* The following condition is false on x32 platform, - * but HUF_asm is not compatible with this ABI */ - if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; + /* The fast decoding loop assumes 64-bit little-endian. + * This condition is false on x32. + */ + if (!MEM_isLittleEndian() || MEM_32bits()) + return 0; /* strict minimum : jump table + 1 byte per stream */ if (srcSize < 10) @@ -181,7 +202,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. */ if (dtLog != HUF_DECODER_FAST_TABLELOG) - return 1; + return 0; /* Read the jump table. */ { @@ -195,13 +216,13 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, args->iend[2] = args->iend[1] + length2; args->iend[3] = args->iend[2] + length3; - /* HUF_initDStream() requires this, and this small of an input + /* HUF_initFastDStream() requires this, and this small of an input * won't benefit from the ASM loop anyways. * length1 must be >= 16 so that ip[0] >= ilimit before the loop * starts. */ if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) - return 1; + return 0; if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ } /* ip[] contains the position that is currently loaded into bits[]. */ @@ -218,7 +239,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, /* No point to call the ASM loop for tiny outputs. */ if (args->op[3] >= oend) - return 1; + return 0; /* bits[] is the bit container. * It is read from the MSB down to the LSB. @@ -227,10 +248,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, * set, so that CountTrailingZeros(bits[]) can be used * to count how many bits we've consumed. */ - args->bits[0] = HUF_initDStream(args->ip[0]); - args->bits[1] = HUF_initDStream(args->ip[1]); - args->bits[2] = HUF_initDStream(args->ip[2]); - args->bits[3] = HUF_initDStream(args->ip[3]); + args->bits[0] = HUF_initFastDStream(args->ip[0]); + args->bits[1] = HUF_initFastDStream(args->ip[1]); + args->bits[2] = HUF_initFastDStream(args->ip[2]); + args->bits[3] = HUF_initFastDStream(args->ip[3]); /* If ip[] >= ilimit, it is guaranteed to be safe to * reload bits[]. It may be beyond its section, but is @@ -241,10 +262,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, args->oend = oend; args->dt = dt; - return 0; + return 1; } -static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) +static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) { /* Validate that we haven't overwritten. */ if (args->op[stream] > segmentEnd) @@ -258,15 +279,15 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs return ERROR(corruption_detected); /* Construct the BIT_DStream_t. 
*/ - bit->bitContainer = MEM_readLE64(args->ip[stream]); - bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]); + assert(sizeof(size_t) == 8); + bit->bitContainer = MEM_readLEST(args->ip[stream]); + bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); bit->start = (const char*)args->iend[0]; bit->limitPtr = bit->start + sizeof(size_t); bit->ptr = (const char*)args->ip[stream]; return 0; } -#endif #ifndef HUF_FORCE_DECOMPRESS_X2 @@ -283,10 +304,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decodi static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { U64 D4; if (MEM_isLittleEndian()) { - D4 = (symbol << 8) + nbBits; + D4 = (U64)((symbol << 8) + nbBits); } else { - D4 = symbol + (nbBits << 8); + D4 = (U64)(symbol + (nbBits << 8)); } + assert(D4 < (1U << 16)); D4 *= 0x0001000100010001ULL; return D4; } @@ -329,13 +351,7 @@ typedef struct { BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; } HUF_ReadDTableX1_Workspace; - -size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) -{ - return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); -} - -size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2) +size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) { U32 tableLog = 0; U32 nbSymbols = 0; @@ -350,7 +366,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ - iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2); + iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags); if (HUF_isError(iSize)) return iSize; @@ -377,9 +393,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr * rankStart[0] is not filled because there are no entries in the table for * weight 0. */ - { - int n; - int nextRankStart = 0; + { int n; + U32 nextRankStart = 0; int const unroll = 4; int const nLimit = (int)nbSymbols - unroll + 1; for (n=0; n<(int)tableLog+1; n++) { @@ -406,10 +421,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr * We can switch based on the length to a different inner loop which is * optimized for that particular case. 
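The HUF_DEltX1_set4() cast changes above sit on top of a compact broadcast trick, sketched here for the little-endian case (illustrative):

    #include <stdint.h>

    /* Pack one {symbol, nbBits} table cell into 16 bits, then replicate it
     * into all four 16-bit lanes of a 64-bit word with a single multiply. */
    static uint64_t delt_x1_set4(uint8_t symbol, uint8_t nbBits)
    {
        uint64_t const d = ((uint64_t)symbol << 8) | nbBits;
        return d * 0x0001000100010001ULL;   /* 0xABCD -> 0xABCDABCDABCDABCD */
    }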
*/ - { - U32 w; - int symbol=wksp->rankVal[0]; - int rankStart=0; + { U32 w; + int symbol = wksp->rankVal[0]; + int rankStart = 0; for (w=1; wrankVal[w]; int const length = (1 << w) >> 1; @@ -519,7 +533,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons while (p < pEnd) HUF_DECODE_SYMBOLX1_0(p, bitDPtr); - return pEnd-pStart; + return (size_t)(pEnd-pStart); } FORCE_INLINE_TEMPLATE size_t @@ -545,6 +559,10 @@ HUF_decompress1X1_usingDTable_internal_body( return dstSize; } +/* HUF_decompress4X1_usingDTable_internal_body(): + * Conditions : + * @dstSize >= 6 + */ FORCE_INLINE_TEMPLATE size_t HUF_decompress4X1_usingDTable_internal_body( void* dst, size_t dstSize, @@ -588,6 +606,7 @@ HUF_decompress4X1_usingDTable_internal_body( if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ + if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); @@ -650,38 +669,142 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo } #endif -#if HUF_NEED_DEFAULT_FUNCTION static size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, size_t cSrcSize, HUF_DTable const* DTable) { return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); } -#endif #if ZSTD_ENABLE_ASM_X86_64_BMI2 -HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; +HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; + +#endif + +static HUF_FAST_BMI2_ATTRS +void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) +{ + U64 bits[4]; + BYTE const* ip[4]; + BYTE* op[4]; + U16 const* const dtable = (U16 const*)args->dt; + BYTE* const oend = args->oend; + BYTE const* const ilimit = args->ilimit; + + /* Copy the arguments to local variables */ + ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); + ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); + ZSTD_memcpy(&op, &args->op, sizeof(op)); + + assert(MEM_isLittleEndian()); + assert(!MEM_32bits()); + + for (;;) { + BYTE* olimit; + int stream; + int symbol; + + /* Assert loop preconditions */ +#ifndef NDEBUG + for (stream = 0; stream < 4; ++stream) { + assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); + assert(ip[stream] >= ilimit); + } +#endif + /* Compute olimit */ + { + /* Each iteration produces 5 output symbols per stream */ + size_t const oiters = (size_t)(oend - op[3]) / 5; + /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes + * per stream. + */ + size_t const iiters = (size_t)(ip[0] - ilimit) / 7; + /* We can safely run iters iterations before running bounds checks */ + size_t const iters = MIN(oiters, iiters); + size_t const symbols = iters * 5; + + /* We can simply check that op[3] < olimit, instead of checking all + * of our bounds, since we can't hit the other bounds until we've run + * iters iterations, which only happens when op[3] == olimit. + */ + olimit = op[3] + symbols; + + /* Exit fast decoding loop once we get close to the end. */ + if (op[3] + 20 > olimit) + break; + + /* Exit the decoding loop if any input pointer has crossed the + * previous one. 
This indicates corruption, and a precondition + * to our loop is that ip[i] >= ip[0]. + */ + for (stream = 1; stream < 4; ++stream) { + if (ip[stream] < ip[stream - 1]) + goto _out; + } + } + +#ifndef NDEBUG + for (stream = 1; stream < 4; ++stream) { + assert(ip[stream] >= ip[stream - 1]); + } +#endif + + do { + /* Decode 5 symbols in each of the 4 streams */ + for (symbol = 0; symbol < 5; ++symbol) { + for (stream = 0; stream < 4; ++stream) { + int const index = (int)(bits[stream] >> 53); + int const entry = (int)dtable[index]; + bits[stream] <<= (entry & 63); + op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF); + } + } + /* Reload the bitstreams */ + for (stream = 0; stream < 4; ++stream) { + int const ctz = ZSTD_countTrailingZeros64(bits[stream]); + int const nbBits = ctz & 7; + int const nbBytes = ctz >> 3; + op[stream] += 5; + ip[stream] -= nbBytes; + bits[stream] = MEM_read64(ip[stream]) | 1; + bits[stream] <<= nbBits; + } + } while (op[3] < olimit); + } + +_out: -static HUF_ASM_X86_64_BMI2_ATTRS + /* Save the final values of each of the state variables back to args. */ + ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); + ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); + ZSTD_memcpy(&args->op, &op, sizeof(op)); +} + +/* + * @returns @p dstSize on success (>= 6) + * 0 if the fallback implementation should be used + * An error if an error occurred + */ +static HUF_FAST_BMI2_ATTRS size_t -HUF_decompress4X1_usingDTable_internal_bmi2_asm( +HUF_decompress4X1_usingDTable_internal_fast( void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) + const HUF_DTable* DTable, + HUF_DecompressFastLoopFn loopFn) { void const* dt = DTable + 1; const BYTE* const iend = (const BYTE*)cSrc + 6; BYTE* const oend = (BYTE*)dst + dstSize; - HUF_DecompressAsmArgs args; - { - size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); - FORWARD_IF_ERROR(ret, "Failed to init asm args"); - if (ret != 0) - return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); + HUF_DecompressFastArgs args; + { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); + if (ret == 0) + return 0; } assert(args.ip[0] >= args.ilimit); - HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); + loopFn(&args); /* Our loop guarantees that ip[] >= ilimit and that we haven't * overwritten any op[]. @@ -694,8 +817,7 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( (void)iend; /* finish bit streams one by one. 
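Pulling the two halves of the X1 fast loop above into one routine makes the flow clearer. A sketch for a single stream (illustrative; assumes the 11-bit HUF_DECODER_FAST_TABLELOG layout and GCC/Clang builtins):

    #include <stdint.h>
    #include <string.h>

    /* Decode 5 symbols, then reload: the top 11 bits (64 - 53) index a
     * uint16_t table whose entries pack (symbol << 8) | nbBits. */
    static void decode5_and_reload(uint64_t *bits, const uint8_t **ip,
                                   uint8_t *op, const uint16_t *dtable)
    {
        int symbol;
        for (symbol = 0; symbol < 5; ++symbol) {
            int const index = (int)(*bits >> 53);
            int const entry = (int)dtable[index];
            *bits <<= (entry & 63);                 /* consume nbBits */
            op[symbol] = (uint8_t)((entry >> 8) & 0xFF);
        }
        /* Branchless reload: thanks to the sentinel bit, ctz(bits) is the
         * total number of bits consumed from the container. */
        {
            int const ctz     = __builtin_ctzll(*bits);
            int const nbBytes = ctz >> 3;   /* whole bytes consumed */
            int const nbBits  = ctz & 7;    /* sub-byte remainder */
            uint64_t v;
            *ip -= nbBytes;                 /* step back over consumed bytes */
            memcpy(&v, *ip, sizeof(v));     /* little-endian 64-bit read */
            *bits = (v | 1) << nbBits;      /* re-arm the sentinel */
        }
    }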
*/ - { - size_t const segmentSize = (dstSize+3) / 4; + { size_t const segmentSize = (dstSize+3) / 4; BYTE* segmentEnd = (BYTE*)dst; int i; for (i = 0; i < 4; ++i) { @@ -712,97 +834,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( } /* decoded size */ + assert(dstSize != 0); return dstSize; } -#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ - -typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, - const void *cSrc, - size_t cSrcSize, - const HUF_DTable *DTable); HUF_DGEN(HUF_decompress1X1_usingDTable_internal) static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, - size_t cSrcSize, HUF_DTable const* DTable, int bmi2) + size_t cSrcSize, HUF_DTable const* DTable, int flags) { + HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default; + HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop; + #if DYNAMIC_BMI2 - if (bmi2) { + if (flags & HUF_flags_bmi2) { + fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; # if ZSTD_ENABLE_ASM_X86_64_BMI2 - return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); -# else - return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); + if (!(flags & HUF_flags_disableAsm)) { + loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; + } # endif + } else { + return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); } -#else - (void)bmi2; #endif #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) - return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); -#else - return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); + if (!(flags & HUF_flags_disableAsm)) { + loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; + } #endif -} - - -size_t HUF_decompress1X1_usingDTable( - void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) -{ - DTableDesc dtd = HUF_getDTableDesc(DTable); - if (dtd.tableType != 0) return ERROR(GENERIC); - return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); -} -size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, - void* workSpace, size_t wkspSize) -{ - const BYTE* ip = (const BYTE*) cSrc; - - size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); - if (HUF_isError(hSize)) return hSize; - if (hSize >= cSrcSize) return ERROR(srcSize_wrong); - ip += hSize; cSrcSize -= hSize; - - return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); -} - - -size_t HUF_decompress4X1_usingDTable( - void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) -{ - DTableDesc dtd = HUF_getDTableDesc(DTable); - if (dtd.tableType != 0) return ERROR(GENERIC); - return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); + if (!(flags & HUF_flags_disableFast)) { + size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); + if (ret != 0) + return ret; + } + return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); } -static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, +static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, - void* workSpace, size_t 
wkspSize, int bmi2) + void* workSpace, size_t wkspSize, int flags) { const BYTE* ip = (const BYTE*) cSrc; - size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); + size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); if (HUF_isError(hSize)) return hSize; if (hSize >= cSrcSize) return ERROR(srcSize_wrong); ip += hSize; cSrcSize -= hSize; - return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); + return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); } -size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, - void* workSpace, size_t wkspSize) -{ - return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); -} - - #endif /* HUF_FORCE_DECOMPRESS_X2 */ @@ -985,7 +1069,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, const sortedSymbol_t* sortedList, - const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight, + const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight, const U32 nbBitsBaseline) { U32* const rankVal = rankValOrigin[0]; @@ -1040,14 +1124,7 @@ typedef struct { size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, - void* workSpace, size_t wkspSize) -{ - return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); -} - -size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, - const void* src, size_t srcSize, - void* workSpace, size_t wkspSize, int bmi2) + void* workSpace, size_t wkspSize, int flags) { U32 tableLog, maxW, nbSymbols; DTableDesc dtd = HUF_getDTableDesc(DTable); @@ -1069,7 +1146,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ - iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2); + iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags); if (HUF_isError(iSize)) return iSize; /* check result */ @@ -1240,6 +1317,11 @@ HUF_decompress1X2_usingDTable_internal_body( /* decoded size */ return dstSize; } + +/* HUF_decompress4X2_usingDTable_internal_body(): + * Conditions: + * @dstSize >= 6 + */ FORCE_INLINE_TEMPLATE size_t HUF_decompress4X2_usingDTable_internal_body( void* dst, size_t dstSize, @@ -1280,8 +1362,9 @@ HUF_decompress4X2_usingDTable_internal_body( DTableDesc const dtd = HUF_getDTableDesc(DTable); U32 const dtLog = dtd.tableLog; - if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ - if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ + if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); @@ -1366,36 +1449,177 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo } #endif -#if HUF_NEED_DEFAULT_FUNCTION static size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, size_t cSrcSize, HUF_DTable const* DTable) { return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); } -#endif #if ZSTD_ENABLE_ASM_X86_64_BMI2 -HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; +HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; -static HUF_ASM_X86_64_BMI2_ATTRS size_t -HUF_decompress4X2_usingDTable_internal_bmi2_asm( +#endif + +static HUF_FAST_BMI2_ATTRS +void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) +{ + U64 bits[4]; + BYTE const* ip[4]; + BYTE* op[4]; + BYTE* oend[4]; + HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; + BYTE const* const ilimit = args->ilimit; + + /* Copy the arguments to local registers. */ + ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); + ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); + ZSTD_memcpy(&op, &args->op, sizeof(op)); + + oend[0] = op[1]; + oend[1] = op[2]; + oend[2] = op[3]; + oend[3] = args->oend; + + assert(MEM_isLittleEndian()); + assert(!MEM_32bits()); + + for (;;) { + BYTE* olimit; + int stream; + int symbol; + + /* Assert loop preconditions */ +#ifndef NDEBUG + for (stream = 0; stream < 4; ++stream) { + assert(op[stream] <= oend[stream]); + assert(ip[stream] >= ilimit); + } +#endif + /* Compute olimit */ + { + /* Each loop does 5 table lookups for each of the 4 streams. + * Each table lookup consumes up to 11 bits of input, and produces + * up to 2 bytes of output. + */ + /* We can consume up to 7 bytes of input per iteration per stream. + * We also know that each input pointer is >= ip[0]. So we can run + * iters loops before running out of input. + */ + size_t iters = (size_t)(ip[0] - ilimit) / 7; + /* Each iteration can produce up to 10 bytes of output per stream. 
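The HUF_DEltX2 entries driving that 10-byte bound carry a pre-rendered two-byte symbol sequence. A minimal sketch of one lookup (illustrative; the struct layout is an assumption taken from the zstd source):

    #include <stdint.h>
    #include <string.h>

    typedef struct { uint16_t sequence; uint8_t nbBits; uint8_t length; } DEltX2;

    /* Always write the 2-byte sequence, but only advance op by entry.length
     * (1 or 2): a 1-symbol entry's spare byte is overwritten on the next step. */
    static uint8_t *decode_one_x2(uint64_t *bits, uint8_t *op, const DEltX2 *dtable)
    {
        DEltX2 const entry = dtable[(size_t)(*bits >> 53)];  /* top 11 bits */
        memcpy(op, &entry.sequence, 2);                      /* MEM_write16 equivalent */
        *bits <<= entry.nbBits;
        return op + entry.length;
    }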
+ * Each output stream may advance at different rates. So take the + * minimum number of safe iterations among all the output streams. + */ + for (stream = 0; stream < 4; ++stream) { + size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10; + iters = MIN(iters, oiters); + } + + /* Each iteration produces at least 5 output symbols. So until + * op[3] crosses olimit, we know we haven't executed iters + * iterations yet. This saves us maintaining an iters counter, + * at the expense of computing the remaining # of iterations + * more frequently. + */ + olimit = op[3] + (iters * 5); + + /* Exit the fast decoding loop if we are too close to the end. */ + if (op[3] + 10 > olimit) + break; + + /* Exit the decoding loop if any input pointer has crossed the + * previous one. This indicates corruption, and a precondition + * to our loop is that ip[i] >= ip[0]. + */ + for (stream = 1; stream < 4; ++stream) { + if (ip[stream] < ip[stream - 1]) + goto _out; + } + } + +#ifndef NDEBUG + for (stream = 1; stream < 4; ++stream) { + assert(ip[stream] >= ip[stream - 1]); + } +#endif + + do { + /* Do 5 table lookups for each of the first 3 streams */ + for (symbol = 0; symbol < 5; ++symbol) { + for (stream = 0; stream < 3; ++stream) { + int const index = (int)(bits[stream] >> 53); + HUF_DEltX2 const entry = dtable[index]; + MEM_write16(op[stream], entry.sequence); + bits[stream] <<= (entry.nbBits); + op[stream] += (entry.length); + } + } + /* Do 1 table lookup from the final stream */ + { + int const index = (int)(bits[3] >> 53); + HUF_DEltX2 const entry = dtable[index]; + MEM_write16(op[3], entry.sequence); + bits[3] <<= (entry.nbBits); + op[3] += (entry.length); + } + /* Do 4 table lookups from the final stream & reload bitstreams */ + for (stream = 0; stream < 4; ++stream) { + /* Do a table lookup from the final stream. + * This is interleaved with the reloading to reduce register + * pressure. This shouldn't be necessary, but compilers can + * struggle with codegen with high register pressure. + */ + { + int const index = (int)(bits[3] >> 53); + HUF_DEltX2 const entry = dtable[index]; + MEM_write16(op[3], entry.sequence); + bits[3] <<= (entry.nbBits); + op[3] += (entry.length); + } + /* Reload the bitstreams. The final bitstream must be reloaded + * after the 5th symbol was decoded. + */ + { + int const ctz = ZSTD_countTrailingZeros64(bits[stream]); + int const nbBits = ctz & 7; + int const nbBytes = ctz >> 3; + ip[stream] -= nbBytes; + bits[stream] = MEM_read64(ip[stream]) | 1; + bits[stream] <<= nbBits; + } + } + } while (op[3] < olimit); + } + +_out: + + /* Save the final values of each of the state variables back to args.
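+ * (The caller is expected to resume from args to validate the final
+ * stream positions and to finish any symbols the fast loop left over.)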
*/ + ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); + ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); + ZSTD_memcpy(&args->op, &op, sizeof(op)); +} + + +static HUF_FAST_BMI2_ATTRS size_t +HUF_decompress4X2_usingDTable_internal_fast( void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) { + const HUF_DTable* DTable, + HUF_DecompressFastLoopFn loopFn) { void const* dt = DTable + 1; const BYTE* const iend = (const BYTE*)cSrc + 6; BYTE* const oend = (BYTE*)dst + dstSize; - HUF_DecompressAsmArgs args; + HUF_DecompressFastArgs args; { - size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); FORWARD_IF_ERROR(ret, "Failed to init asm args"); - if (ret != 0) - return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); + if (ret == 0) + return 0; } assert(args.ip[0] >= args.ilimit); - HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); + loopFn(&args); /* note : op4 already verified within main loop */ assert(args.ip[0] >= iend); @@ -1426,91 +1650,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm( /* decoded size */ return dstSize; } -#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, - size_t cSrcSize, HUF_DTable const* DTable, int bmi2) + size_t cSrcSize, HUF_DTable const* DTable, int flags) { + HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default; + HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop; + #if DYNAMIC_BMI2 - if (bmi2) { + if (flags & HUF_flags_bmi2) { + fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; # if ZSTD_ENABLE_ASM_X86_64_BMI2 - return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); -# else - return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); + if (!(flags & HUF_flags_disableAsm)) { + loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; + } # endif + } else { + return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); } -#else - (void)bmi2; #endif #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) - return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); -#else - return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); + if (!(flags & HUF_flags_disableAsm)) { + loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; + } #endif + + if (!(flags & HUF_flags_disableFast)) { + size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); + if (ret != 0) + return ret; + } + return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); } HUF_DGEN(HUF_decompress1X2_usingDTable_internal) -size_t HUF_decompress1X2_usingDTable( - void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) -{ - DTableDesc dtd = HUF_getDTableDesc(DTable); - if (dtd.tableType != 1) return ERROR(GENERIC); - return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); -} - size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, - void* workSpace, size_t wkspSize) + void* workSpace, size_t wkspSize, int flags) { const BYTE* ip = (const BYTE*) cSrc; size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, 
cSrcSize, - workSpace, wkspSize); + workSpace, wkspSize, flags); if (HUF_isError(hSize)) return hSize; if (hSize >= cSrcSize) return ERROR(srcSize_wrong); ip += hSize; cSrcSize -= hSize; - return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); -} - - -size_t HUF_decompress4X2_usingDTable( - void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) -{ - DTableDesc dtd = HUF_getDTableDesc(DTable); - if (dtd.tableType != 1) return ERROR(GENERIC); - return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); + return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags); } -static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, +static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, - void* workSpace, size_t wkspSize, int bmi2) + void* workSpace, size_t wkspSize, int flags) { const BYTE* ip = (const BYTE*) cSrc; size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, - workSpace, wkspSize); + workSpace, wkspSize, flags); if (HUF_isError(hSize)) return hSize; if (hSize >= cSrcSize) return ERROR(srcSize_wrong); ip += hSize; cSrcSize -= hSize; - return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); + return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); } -size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, - void* workSpace, size_t wkspSize) -{ - return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); -} - - #endif /* HUF_FORCE_DECOMPRESS_X1 */ @@ -1518,44 +1723,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, /* Universal decompression selectors */ /* ***********************************/ -size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, - const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) -{ - DTableDesc const dtd = HUF_getDTableDesc(DTable); -#if defined(HUF_FORCE_DECOMPRESS_X1) - (void)dtd; - assert(dtd.tableType == 0); - return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); -#elif defined(HUF_FORCE_DECOMPRESS_X2) - (void)dtd; - assert(dtd.tableType == 1); - return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); -#else - return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : - HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); -#endif -} - -size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, - const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) -{ - DTableDesc const dtd = HUF_getDTableDesc(DTable); -#if defined(HUF_FORCE_DECOMPRESS_X1) - (void)dtd; - assert(dtd.tableType == 0); - return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); -#elif defined(HUF_FORCE_DECOMPRESS_X2) - (void)dtd; - assert(dtd.tableType == 1); - return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); -#else - return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : - HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); -#endif -} - #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; @@ -1610,36 +1777,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) #endif } - -size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, - size_t dstSize, const void* cSrc, - size_t cSrcSize, void* workSpace, - size_t wkspSize) -{ - /* validation checks */ - if (dstSize == 0) return ERROR(dstSize_tooSmall); - if (cSrcSize == 0) return ERROR(corruption_detected); - - { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); -#if defined(HUF_FORCE_DECOMPRESS_X1) - (void)algoNb; - assert(algoNb == 0); - return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); -#elif defined(HUF_FORCE_DECOMPRESS_X2) - (void)algoNb; - assert(algoNb == 1); - return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); -#else - return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, - cSrcSize, workSpace, wkspSize): - HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); -#endif - } -} - size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, - void* workSpace, size_t wkspSize) + void* workSpace, size_t wkspSize, int flags) { /* validation checks */ if (dstSize == 0) return ERROR(dstSize_tooSmall); @@ -1652,71 +1792,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, (void)algoNb; assert(algoNb == 0); return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, - cSrcSize, workSpace, wkspSize); + cSrcSize, workSpace, wkspSize, flags); #elif defined(HUF_FORCE_DECOMPRESS_X2) (void)algoNb; assert(algoNb == 1); return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, - cSrcSize, workSpace, wkspSize); + cSrcSize, workSpace, wkspSize, flags); #else return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, - cSrcSize, workSpace, wkspSize): + cSrcSize, workSpace, wkspSize, flags): HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, - cSrcSize, workSpace, wkspSize); + cSrcSize, workSpace, wkspSize, flags); #endif } } -size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) +size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) { DTableDesc const dtd = HUF_getDTableDesc(DTable); #if defined(HUF_FORCE_DECOMPRESS_X1) (void)dtd; assert(dtd.tableType == 0); - return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); + return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); #elif defined(HUF_FORCE_DECOMPRESS_X2) (void)dtd; assert(dtd.tableType == 1); - return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); + return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); #else - return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : - HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); + return dtd.tableType ? 
HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : + HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); #endif } #ifndef HUF_FORCE_DECOMPRESS_X2 -size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) +size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) { const BYTE* ip = (const BYTE*) cSrc; - size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); + size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); if (HUF_isError(hSize)) return hSize; if (hSize >= cSrcSize) return ERROR(srcSize_wrong); ip += hSize; cSrcSize -= hSize; - return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); + return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); } #endif -size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) +size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) { DTableDesc const dtd = HUF_getDTableDesc(DTable); #if defined(HUF_FORCE_DECOMPRESS_X1) (void)dtd; assert(dtd.tableType == 0); - return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); + return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); #elif defined(HUF_FORCE_DECOMPRESS_X2) (void)dtd; assert(dtd.tableType == 1); - return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); + return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); #else - return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : - HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); + return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : + HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); #endif } -size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) +size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) { /* validation checks */ if (dstSize == 0) return ERROR(dstSize_tooSmall); @@ -1726,15 +1866,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds #if defined(HUF_FORCE_DECOMPRESS_X1) (void)algoNb; assert(algoNb == 0); - return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); + return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); #elif defined(HUF_FORCE_DECOMPRESS_X2) (void)algoNb; assert(algoNb == 1); - return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); + return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); #else - return algoNb ? 
HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : - HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); + return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) : + HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); #endif } } - diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c index dbbc7919de534e..30ef65e1ab5ca0 100644 --- a/lib/zstd/decompress/zstd_ddict.c +++ b/lib/zstd/decompress/zstd_ddict.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -14,12 +15,12 @@ /*-******************************************************* * Dependencies *********************************************************/ +#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ #include "../common/cpu.h" /* bmi2 */ #include "../common/mem.h" /* low level memory routines */ #define FSE_STATIC_LINKING_ONLY #include "../common/fse.h" -#define HUF_STATIC_LINKING_ONLY #include "../common/huf.h" #include "zstd_decompress_internal.h" #include "zstd_ddict.h" @@ -131,7 +132,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, ZSTD_memcpy(internalBuffer, dict, dictSize); } ddict->dictSize = dictSize; - ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ + ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ /* parse dictionary content */ FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); @@ -237,5 +238,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) { if (ddict==NULL) return 0; - return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); + return ddict->dictID; } diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h index 8c1a79d666f89a..de459a0dacd19b 100644 --- a/lib/zstd/decompress/zstd_ddict.h +++ b/lib/zstd/decompress/zstd_ddict.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c index 6b3177c947114a..03dbdf39109f96 100644 --- a/lib/zstd/decompress/zstd_decompress.c +++ b/lib/zstd/decompress/zstd_decompress.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -52,17 +53,18 @@ /*-******************************************************* * Dependencies *********************************************************/ +#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ #include "../common/mem.h" /* low level memory routines */ #define FSE_STATIC_LINKING_ONLY #include "../common/fse.h" -#define HUF_STATIC_LINKING_ONLY #include "../common/huf.h" #include <linux/xxhash.h> /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */ #include "../common/zstd_internal.h" /* blockProperties_t */ #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ #include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */ +#include "../common/bits.h" /* ZSTD_highbit32 */ @@ -72,11 +74,11 @@ *************************************/ #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 -#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. - * Currently, that means a 0.75 load factor. - * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded - * the load factor of the ddict hash set. - */ +#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. + * Currently, that means a 0.75 load factor. + * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded + * the load factor of the ddict hash set. + */ #define DDICT_HASHSET_TABLE_BASE_SIZE 64 #define DDICT_HASHSET_RESIZE_FACTOR 2 @@ -237,6 +239,7 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) dctx->outBufferMode = ZSTD_bm_buffered; dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; + dctx->disableHufAsm = 0; } static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) @@ -421,16 +424,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless * @return : 0, `zfhPtr` is correctly filled, * >0, `srcSize` is too small, value is wanted `srcSize` amount, - * or an error code, which can be tested using ZSTD_isError() */ +** or an error code, which can be tested using ZSTD_isError() */ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) { const BYTE* ip = (const BYTE*)src; size_t const minInputSize = ZSTD_startingInputLength(format); - ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */ - if (srcSize < minInputSize) return minInputSize; - RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter"); + DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize); + + if (srcSize > 0) { + /* note : technically could be considered an assert(), since it's an invalid entry */ + RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0"); + } + if (srcSize < minInputSize) { + if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) { + /* when receiving less than @minInputSize bytes, + * check that these bytes at least correspond to a supported magic number + * in order to error out early if they don't.
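+ * e.g. the 2-byte input { 0x28, 0xB5 } matches the start of
+ * ZSTD_MAGICNUMBER (0xFD2FB528, stored little-endian as 28 B5 2F FD),
+ * so the caller is asked for more input rather than given an error.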
+ **/ + size_t const toCopy = MIN(4, srcSize); + unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER); + assert(src != NULL); + ZSTD_memcpy(hbuf, src, toCopy); + if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) { + /* not a zstd frame : let's check if it's a skippable frame */ + MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START); + ZSTD_memcpy(hbuf, src, toCopy); + if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) { + RETURN_ERROR(prefix_unknown, + "first bytes don't correspond to any supported magic number"); + } } } + return minInputSize; + } + ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */ if ( (format != ZSTD_f_zstd1_magicless) && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { @@ -540,49 +567,52 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize) sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, frameParameter_unsupported, ""); - { - size_t const skippableSize = skippableHeaderSize + sizeU32; + { size_t const skippableSize = skippableHeaderSize + sizeU32; RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); return skippableSize; } } /*! ZSTD_readSkippableFrame() : - * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. + * Retrieves content of a skippable frame, and writes it to dst buffer. * * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested * in the magicVariant. * - * Returns an error if destination buffer is not large enough, or if the frame is not skippable. + * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame. * * @return : number of bytes written or a ZSTD error. 
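 *
 * A usage sketch (buffer names are illustrative) :
 *   unsigned variant;
 *   size_t const r = ZSTD_readSkippableFrame(out, outCap, &variant, src, srcSize);
 * on success the first r bytes of out hold the payload, and variant
 * (magicNumber - ZSTD_MAGIC_SKIPPABLE_START) lies in [0..15].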
*/ -ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, - const void* src, size_t srcSize) +size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, + unsigned* magicVariant, /* optional, can be NULL */ + const void* src, size_t srcSize) { - U32 const magicNumber = MEM_readLE32(src); - size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); - size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; - - /* check input validity */ - RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); - RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); - RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); + RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); - /* deliver payload */ - if (skippableContentSize > 0 && dst != NULL) - ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); - if (magicVariant != NULL) - *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; - return skippableContentSize; + { U32 const magicNumber = MEM_readLE32(src); + size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); + size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; + + /* check input validity */ + RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); + RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); + RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); + + /* deliver payload */ + if (skippableContentSize > 0 && dst != NULL) + ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); + if (magicVariant != NULL) + *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; + return skippableContentSize; + } } /* ZSTD_findDecompressedSize() : - * compatible with legacy mode * `srcSize` must be the exact length of some number of ZSTD compressed and/or * skippable frames - * @return : decompressed size of the frames contained */ + * note: compatible with legacy mode + * @return : decompressed size of the frames contained */ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) { unsigned long long totalDstSize = 0; @@ -592,9 +622,7 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { size_t const skippableSize = readSkippableFrameSize(src, srcSize); - if (ZSTD_isError(skippableSize)) { - return ZSTD_CONTENTSIZE_ERROR; - } + if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR; assert(skippableSize <= srcSize); src = (const BYTE *)src + skippableSize; @@ -602,17 +630,17 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) continue; } - { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); - if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret; + { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize); + if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs; - /* check for overflow */ - if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR; - totalDstSize += ret; + if (totalDstSize + fcs < totalDstSize) + return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */ + totalDstSize += fcs; } + /* skip to next frame */ { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); - if 
(ZSTD_isError(frameSrcSize)) { - return ZSTD_CONTENTSIZE_ERROR; - } + if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR; + assert(frameSrcSize <= srcSize); src = (const BYTE *)src + frameSrcSize; srcSize -= frameSrcSize; @@ -730,10 +758,11 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize ip += 4; } + frameSizeInfo.nbBlocks = nbBlocks; frameSizeInfo.compressedSize = (size_t)(ip - ipstart); frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) ? zfh.frameContentSize - : nbBlocks * zfh.blockSizeMax; + : (unsigned long long)nbBlocks * zfh.blockSizeMax; return frameSizeInfo; } } @@ -773,6 +802,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) return bound; } +size_t ZSTD_decompressionMargin(void const* src, size_t srcSize) +{ + size_t margin = 0; + unsigned maxBlockSize = 0; + + /* Iterate over each frame */ + while (srcSize > 0) { + ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); + size_t const compressedSize = frameSizeInfo.compressedSize; + unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; + ZSTD_frameHeader zfh; + + FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), ""); + if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) + return ERROR(corruption_detected); + + if (zfh.frameType == ZSTD_frame) { + /* Add the frame header to our margin */ + margin += zfh.headerSize; + /* Add the checksum to our margin */ + margin += zfh.checksumFlag ? 4 : 0; + /* Add 3 bytes per block */ + margin += 3 * frameSizeInfo.nbBlocks; + + /* Compute the max block size */ + maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax); + } else { + assert(zfh.frameType == ZSTD_skippableFrame); + /* Add the entire skippable frame size to our margin. */ + margin += compressedSize; + } + + assert(srcSize >= compressedSize); + src = (const BYTE*)src + compressedSize; + srcSize -= compressedSize; + } + + /* Add the max block size back to the margin. 
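+ * As a worked example (illustrative numbers) : one frame with a 6-byte
+ * header, a checksum, and 8 blocks of up to 128 KB would give
+ * margin = 6 + 4 + 3*8 + 131072 = 131106 bytes.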
*/ + margin += maxBlockSize; + + return margin; +} /*-************************************************************* * Frame decoding @@ -930,6 +1001,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, } ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); /* Allow caller to get size read */ + DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op-ostart, ip - (const BYTE*)*srcPtr); *srcPtr = ip; *srcSizePtr = remainingSrcSize; return (size_t)(op-ostart); @@ -955,17 +1027,18 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, while (srcSize >= ZSTD_startingInputLength(dctx->format)) { - { U32 const magicNumber = MEM_readLE32(src); - DEBUGLOG(4, "reading magic number %08X (expecting %08X)", - (unsigned)magicNumber, ZSTD_MAGICNUMBER); + if (srcSize >= 4) { + U32 const magicNumber = MEM_readLE32(src); + DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber); if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + /* skippable frame detected : skip it */ size_t const skippableSize = readSkippableFrameSize(src, srcSize); - FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed"); + FORWARD_IF_ERROR(skippableSize, "invalid skippable frame"); assert(skippableSize <= srcSize); src = (const BYTE *)src + skippableSize; srcSize -= skippableSize; - continue; + continue; /* check next frame */ } } if (ddict) { @@ -1061,8 +1134,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; } /* - * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, - * we allow taking a partial block as the input. Currently only raw uncompressed blocks can + * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we + * allow taking a partial block as the input. Currently only raw uncompressed blocks can * be streamed. 
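 * (A raw block stores its bytes verbatim, so any prefix of it can be
 * copied out and flushed without waiting for the rest of the block.)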
* * For blocks that can be streamed, this allows us to reduce the latency until we produce @@ -1262,7 +1335,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c default: assert(0); /* impossible */ - RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ + RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ } } @@ -1303,11 +1376,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, /* in minimal huffman, we always use X1 variants */ size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, dictPtr, dictEnd - dictPtr, - workspace, workspaceSize); + workspace, workspaceSize, /* flags */ 0); #else size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, dictPtr, (size_t)(dictEnd - dictPtr), - workspace, workspaceSize); + workspace, workspaceSize, /* flags */ 0); #endif RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); dictPtr += hSize; @@ -1403,7 +1476,7 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) dctx->prefixStart = NULL; dctx->virtualStart = NULL; dctx->dictEnd = NULL; - dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ + dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ dctx->litEntropy = dctx->fseEntropy = 0; dctx->dictID = 0; dctx->bType = bt_reserved; @@ -1465,7 +1538,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) * This could for one of the following reasons : * - The frame does not require a dictionary (most common case). * - The frame was built with dictID intentionally removed. - * Needed dictionary is a hidden information. + * Needed dictionary is a hidden piece of information. * Note : this use case also happens when using a non-conformant dictionary. * - `srcSize` is too small, and as a result, frame header could not be decoded. * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. @@ -1474,7 +1547,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) * ZSTD_getFrameHeader(), which will provide a more precise error code. 
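 *
 * A typical caller pattern (sketch) :
 *   unsigned const id = ZSTD_getDictID_fromFrame(src, srcSize);
 * a result of 0 is ambiguous, so call ZSTD_getFrameHeader() when
 * "no dictID" must be distinguished from "header too small".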
*/ unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) { - ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 }; + ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 }; size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); if (ZSTD_isError(hError)) return 0; return zfp.dictID; @@ -1581,7 +1654,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di size_t ZSTD_initDStream(ZSTD_DStream* zds) { DEBUGLOG(4, "ZSTD_initDStream"); - return ZSTD_initDStream_usingDDict(zds, NULL); + FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), ""); + FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), ""); + return ZSTD_startingInputLength(zds->format); } /* ZSTD_initDStream_usingDDict() : @@ -1589,6 +1664,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds) * this function cannot fail */ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) { + DEBUGLOG(4, "ZSTD_initDStream_usingDDict"); FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); return ZSTD_startingInputLength(dctx->format); @@ -1599,6 +1675,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) * this function cannot fail */ size_t ZSTD_resetDStream(ZSTD_DStream* dctx) { + DEBUGLOG(4, "ZSTD_resetDStream"); FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); return ZSTD_startingInputLength(dctx->format); } @@ -1670,6 +1747,11 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; return bounds; + case ZSTD_d_disableHuffmanAssembly: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + default:; } bounds.error = ERROR(parameter_unsupported); @@ -1710,6 +1792,9 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value case ZSTD_d_refMultipleDDicts: *value = (int)dctx->refMultipleDDicts; return 0; + case ZSTD_d_disableHuffmanAssembly: + *value = (int)dctx->disableHufAsm; + return 0; default:; } RETURN_ERROR(parameter_unsupported, ""); @@ -1743,6 +1828,10 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value } dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; return 0; + case ZSTD_d_disableHuffmanAssembly: + CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value); + dctx->disableHufAsm = value != 0; + return 0; default:; } RETURN_ERROR(parameter_unsupported, ""); @@ -1918,7 +2007,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB if (zds->refMultipleDDicts && zds->ddictSet) { ZSTD_DCtx_selectFrameDDict(zds); } - DEBUGLOG(5, "header size : %u", (U32)hSize); if (ZSTD_isError(hSize)) { return hSize; /* error */ } @@ -1932,6 +2020,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB zds->lhSize += remainingInput; } input->pos = input->size; + /* check first few bytes */ + FORWARD_IF_ERROR( + ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format), + "First few bytes detected incorrect" ); + /* return hint input size */ return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ } assert(ip != NULL); @@ -1949,8 +2042,9 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, 
op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); if (ZSTD_isError(decompressedSize)) return decompressedSize; DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") + assert(istart != NULL); ip = istart + cSize; - op += decompressedSize; + op = op ? op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */ zds->expected = 0; zds->streamStage = zdss_init; someMoreWork = 0; @@ -2034,6 +2128,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB } if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); + assert(ip != NULL); ip += neededInSize; /* Function modifies the stage so we must break */ break; @@ -2048,7 +2143,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB int const isSkipFrame = ZSTD_isSkipFrame(zds); size_t loadedSize; /* At this point we shouldn't be decompressing a block that we can stream. */ - assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip)); + assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip))); if (isSkipFrame) { loadedSize = MIN(toLoad, (size_t)(iend-ip)); } else { @@ -2057,8 +2152,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB "should never happen"); loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); } - ip += loadedSize; - zds->inPos += loadedSize; + if (loadedSize != 0) { + /* ip may be NULL */ + ip += loadedSize; + zds->inPos += loadedSize; + } if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ /* decode loaded input */ @@ -2068,14 +2166,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB break; } case zdss_flush: - { size_t const toFlushSize = zds->outEnd - zds->outStart; + { + size_t const toFlushSize = zds->outEnd - zds->outStart; size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize); - op += flushedSize; + + op = op ? 
op + flushedSize : op; + zds->outStart += flushedSize; if (flushedSize == toFlushSize) { /* flush completed */ zds->streamStage = zdss_read; if ( (zds->outBuffSize < zds->fParams.frameContentSize) - && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { + && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", (int)(zds->outBuffSize - zds->outStart), (U32)zds->fParams.blockSizeMax); @@ -2089,7 +2190,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB default: assert(0); /* impossible */ - RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ + RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ } } /* result */ @@ -2102,8 +2203,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB if ((ip==istart) && (op==ostart)) { /* no forward progress */ zds->noForwardProgress ++; if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { - RETURN_ERROR_IF(op==oend, dstSize_tooSmall, ""); - RETURN_ERROR_IF(ip==iend, srcSize_wrong, ""); + RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, ""); + RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, ""); assert(0); } } else { @@ -2140,11 +2241,17 @@ size_t ZSTD_decompressStream_simpleArgs ( void* dst, size_t dstCapacity, size_t* dstPos, const void* src, size_t srcSize, size_t* srcPos) { - ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; - ZSTD_inBuffer input = { src, srcSize, *srcPos }; - /* ZSTD_compress_generic() will check validity of dstPos and srcPos */ - size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); - *dstPos = output.pos; - *srcPos = input.pos; - return cErr; + ZSTD_outBuffer output; + ZSTD_inBuffer input; + output.dst = dst; + output.size = dstCapacity; + output.pos = *dstPos; + input.src = src; + input.size = srcSize; + input.pos = *srcPos; + { size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); + *dstPos = output.pos; + *srcPos = input.pos; + return cErr; + } } diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c index c1913b8e7c8974..9f5577e5bc19d5 100644 --- a/lib/zstd/decompress/zstd_decompress_block.c +++ b/lib/zstd/decompress/zstd_decompress_block.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -20,12 +21,12 @@ #include "../common/mem.h" /* low level memory routines */ #define FSE_STATIC_LINKING_ONLY #include "../common/fse.h" -#define HUF_STATIC_LINKING_ONLY #include "../common/huf.h" #include "../common/zstd_internal.h" #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ #include "zstd_decompress_block.h" +#include "../common/bits.h" /* ZSTD_highbit32 */ /*_******************************************************* * Macros @@ -89,7 +90,7 @@ static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; } else { - /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */ + /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; } @@ -134,13 +135,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, ZSTD_FALLTHROUGH; case set_compressed: - RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); + RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3"); { size_t lhSize, litSize, litCSize; U32 singleStream=0; U32 const lhlCode = (istart[0] >> 2) & 3; U32 const lhc = MEM_readLE32(istart); size_t hufSuccess; size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); + int const flags = 0 + | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) + | (dctx->disableHufAsm ? 
HUF_flags_disableAsm : 0); switch(lhlCode) { case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ @@ -165,6 +169,10 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, } RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); + if (!singleStream) + RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong, + "Not enough literals (%zu) for the 4-streams mode (min %u)", + litSize, MIN_LITERALS_FOR_4_STREAMS); RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); @@ -176,13 +184,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, if (litEncType==set_repeat) { if (singleStream) { - hufSuccess = HUF_decompress1X_usingDTable_bmi2( + hufSuccess = HUF_decompress1X_usingDTable( dctx->litBuffer, litSize, istart+lhSize, litCSize, - dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); + dctx->HUFptr, flags); } else { - hufSuccess = HUF_decompress4X_usingDTable_bmi2( + assert(litSize >= MIN_LITERALS_FOR_4_STREAMS); + hufSuccess = HUF_decompress4X_usingDTable( dctx->litBuffer, litSize, istart+lhSize, litCSize, - dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); + dctx->HUFptr, flags); } } else { if (singleStream) { @@ -190,18 +199,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, hufSuccess = HUF_decompress1X_DCtx_wksp( dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->workspace, - sizeof(dctx->workspace)); + sizeof(dctx->workspace), flags); #else - hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( + hufSuccess = HUF_decompress1X1_DCtx_wksp( dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->workspace, - sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); + sizeof(dctx->workspace), flags); #endif } else { - hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( + hufSuccess = HUF_decompress4X_hufOnly_wksp( dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->workspace, - sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); + sizeof(dctx->workspace), flags); } } if (dctx->litBufferLocation == ZSTD_split) @@ -237,6 +246,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, break; case 3: lhSize = 3; + RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3"); litSize = MEM_readLE24(istart) >> 4; break; } @@ -279,12 +289,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, break; case 1: lhSize = 2; + RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3"); litSize = MEM_readLE16(istart) >> 4; break; case 3: lhSize = 3; + RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4"); litSize = MEM_readLE24(istart) >> 4; - RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); break; } RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); @@ -506,14 +517,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, for (i = 8; i < n; i += 8) { MEM_write64(spread + pos + i, sv); } - pos += n; + assert(n>=0); + pos += (size_t)n; } } /* Now we spread those positions across the table. 
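 * (Stage one above filled spread[] with each symbol repeated according
 * to its normalized count, written 8 bytes at a time via MEM_write64.)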
- * The benefit of doing it in two stages is that we avoid the the + * The benefit of doing it in two stages is that we avoid the * variable size inner loop, which caused lots of branch misses. * Now we can run through all the positions without any branch misses. - * We unroll the loop twice, since that is what emperically worked best. + * We unroll the loop twice, since that is what empirically worked best. */ { size_t position = 0; @@ -540,7 +552,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, for (i=0; i<n; i++) { tableDecode[position].baseValue = s; position = (position + step) & tableMask; - while (position > highThreshold) position = (position + step) & tableMask; /* lowprob area */ + while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */ } } assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ } @@ -551,7 +563,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, for (u=0; ustateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); + ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); + ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); +#else const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; +#endif seq.matchLength = mlDInfo->baseValue; seq.litLength = llDInfo->baseValue; { U32 const ofBase = ofDInfo->baseValue; @@ -1186,28 +1221,31 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) U32 const llnbBits = llDInfo->nbBits; U32 const mlnbBits = mlDInfo->nbBits; U32 const ofnbBits = ofDInfo->nbBits; + + assert(llBits <= MaxLLBits); + assert(mlBits <= MaxMLBits); + assert(ofBits <= MaxOff); /* * As gcc has better branch and block analyzers, sometimes it is only - * valuable to mark likelyness for clang, it gives around 3-4% of + * valuable to mark likeliness for clang, it gives around 3-4% of * performance. */ /* sequence */ { size_t offset; - #if defined(__clang__) - if (LIKELY(ofBits > 1)) { - #else if (ofBits > 1) { - #endif ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); - assert(ofBits <= MaxOff); + ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32); + ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits); if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { - U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); + /* Always read extra bits, this keeps the logic simple, + * avoids branches, and avoids accidentally reading 0 bits.
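+ * e.g. on a 32-bit build with ofBits == 30 : read 30 - 5 = 25 bits,
+ * reload, then read the remaining LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5
+ * bits unconditionally (the value 5 is statically asserted above).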
+ */ + U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32; offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); BIT_reloadDStream(&seqState->DStream); - if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); - assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ + offset += BIT_readBitsFast(&seqState->DStream, extraBits); } else { offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); @@ -1232,11 +1270,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) seq.offset = offset; } - #if defined(__clang__) - if (UNLIKELY(mlBits > 0)) - #else if (mlBits > 0) - #endif seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) @@ -1246,11 +1280,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); - #if defined(__clang__) - if (UNLIKELY(llBits > 0)) - #else if (llBits > 0) - #endif seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); if (MEM_32bits()) @@ -1552,7 +1582,7 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); - DEBUGLOG(5, "ZSTD_decompressSequences_body"); + DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); (void)frame; /* Regen sequences */ @@ -1945,34 +1975,79 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ +/* + * @returns The total size of the history referenceable by zstd, including + * both the prefix and the extDict. At @p op any offset larger than this + * is invalid. + */ +static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart) +{ + return (size_t)(op - virtualStart); +} + +typedef struct { + unsigned longOffsetShare; + unsigned maxNbAdditionalBits; +} ZSTD_OffsetInfo; -#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ - !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) -/* ZSTD_getLongOffsetsShare() : +/* ZSTD_getOffsetInfo() : * condition : offTable must be valid * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) - * compared to maximum possible of (1<<OffFSELog) */ -static unsigned ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable) +static ZSTD_OffsetInfo ZSTD_getOffsetInfo(const ZSTD_seqSymbol* offTable, int nbSeq) { - const void* ptr = offTable; - U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog; - const ZSTD_seqSymbol* table = offTable + 1; - U32 const max = 1 << tableLog; - U32 u, total = 0; - DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog); - - assert(max <= (1 << OffFSELog)); /* max not too large */ - for (u=0; u<max; u++) { - if (table[u].nbAdditionalBits > 22) total += 1; + ZSTD_OffsetInfo info = {0, 0}; + /* If nbSeq == 0, then the offTable is uninitialized, but we have + * no sequences, so both values should be 0.
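+ * (A zeroed result is still safe for the caller : maxNbAdditionalBits == 0
+ * downgrades isLongOffset to a regular offset, and longOffsetShare == 0
+ * keeps the prefetch decoder disabled.)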
+ */ + if (nbSeq != 0) { + const void* ptr = offTable; + U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog; + const ZSTD_seqSymbol* table = offTable + 1; + U32 const max = 1 << tableLog; + U32 u; + DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog); + + assert(max <= (1 << OffFSELog)); /* max not too large */ + for (u=0; u<max; u++) { + info.maxNbAdditionalBits = MAX(info.maxNbAdditionalBits, table[u].nbAdditionalBits); + if (table[u].nbAdditionalBits > 22) info.longOffsetShare += 1; + } + + assert(tableLog <= OffFSELog); + info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */ } - assert(tableLog <= OffFSELog); - total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + return info; +} - return total; +/* + * @returns The maximum offset we can decode in one read of our bitstream, without + * reloading more bits in the middle of the offset bits read. Any offsets larger + * than this must use the long offset decoder. + */ +static size_t ZSTD_maxShortOffset(void) +{ + if (MEM_64bits()) { + /* We can decode any offset without reloading bits. + * This might change if the max window size grows. + */ + ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); + return (size_t)-1; + } else { + /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1. + * This offBase would require STREAM_ACCUMULATOR_MIN extra bits. + * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset. + */ + size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1; + size_t const maxOffset = maxOffbase - ZSTD_REP_NUM; + assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN); + return maxOffset; + } } -#endif size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, @@ -1980,20 +2055,21 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, const void* src, size_t srcSize, const int frame, const streaming_operation streaming) { /* blockType == blockCompressed */ const BYTE* ip = (const BYTE*)src; - /* isLongOffset must be true if there are long offsets. - * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. - * We don't expect that to be the case in 64-bit mode. - * In block mode, window size is not known, so we have to be conservative. - * (note: but it could be evaluated from current-lowLimit) - */ - ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)))); DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); - RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); + /* Note : the wording of the specification + * allows compressed block to be sized exactly ZSTD_BLOCKSIZE_MAX. + * This generally does not happen, as it makes little sense, + * since an uncompressed block would feature the same size and have no decompression cost. + * Also, note that decoders from reference libzstd before v1.5.4 + * would consider this edge case as an error.
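+ * (ZSTD_BLOCKSIZE_MAX is 128 KB, so the edge case in question is a
+ * compressed block of exactly 128 KB.)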
+ * As a consequence, avoid generating compressed blocks of size ZSTD_BLOCKSIZE_MAX + * for broader compatibility with the deployed ecosystem of zstd decoders */ + RETURN_ERROR_IF(srcSize > ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); /* Decode literals section */ { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); - DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); + DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize); if (ZSTD_isError(litCSize)) return litCSize; ip += litCSize; srcSize -= litCSize; @@ -2001,6 +2077,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, /* Build Decoding Tables */ { + /* Compute the maximum block size, which must also work when !frame and fParams are unset. + * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t. + */ + size_t const blockSizeMax = MIN(dstCapacity, (frame ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX)); + size_t const totalHistorySize = ZSTD_totalHistorySize((BYTE*)dst + blockSizeMax, (BYTE const*)dctx->virtualStart); + /* isLongOffset must be true if there are long offsets. + * Offsets are long if they are larger than ZSTD_maxShortOffset(). + * We don't expect that to be the case in 64-bit mode. + * + * We check here to see if our history is large enough to allow long offsets. + * If it isn't, then we can't possible have (valid) long offsets. If the offset + * is invalid, then it is okay to read it incorrectly. + * + * If isLongOffsets is true, then we will later check our decoding table to see + * if it is even possible to generate long offsets. + */ + ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset())); /* These macros control at build-time which decompressor implementation * we use. If neither is defined, we do some inspection and dispatch at * runtime. @@ -2008,6 +2101,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) int usePrefetchDecoder = dctx->ddictIsCold; +#else + /* Set to 1 to avoid computing offset info if we don't need to. + * Otherwise this value is ignored. + */ + int usePrefetchDecoder = 1; #endif int nbSeq; size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); @@ -2015,28 +2113,42 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, ip += seqHSize; srcSize -= seqHSize; - RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); + RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); + RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall, + "invalid dst"); -#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ - !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) - if ( !usePrefetchDecoder - && (!frame || (dctx->fParams.windowSize > (1<<24))) - && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */ - U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); - U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ - usePrefetchDecoder = (shareLongOffsets >= minShare); + /* If we could potentially have long offsets, or we might want to use the prefetch decoder, + * compute information about the share of long offsets, and the maximum nbAdditionalBits. 
+ * NOTE: could probably use a larger nbSeq limit + */ + if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) { + ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq); + if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) { + /* If isLongOffset, but the maximum number of additional bits that we see in our table is small + * enough, then we know it is impossible to have too long an offset in this block, so we can + * use the regular offset decoder. + */ + isLongOffset = ZSTD_lo_isRegularOffset; + } + if (!usePrefetchDecoder) { + U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ + usePrefetchDecoder = (info.longOffsetShare >= minShare); + } } -#endif dctx->ddictIsCold = 0; #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) - if (usePrefetchDecoder) + if (usePrefetchDecoder) { +#else + (void)usePrefetchDecoder; + { #endif #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); #endif + } #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG /* else */ @@ -2060,9 +2172,9 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) } -size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize) +size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) { size_t dSize; ZSTD_checkContinuity(dctx, dst, dstCapacity); @@ -2070,3 +2182,12 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, dctx->previousDstEnd = (char*)dst + dSize; return dSize; } + + +/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */ +size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); +} diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h index 3d2d57a5d25a78..5888e6cc788b59 100644 --- a/lib/zstd/decompress/zstd_decompress_block.h +++ b/lib/zstd/decompress/zstd_decompress_block.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, unsigned tableLog, void* wksp, size_t wkspSize, int bmi2); +/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. */ +size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + #endif /* ZSTD_DEC_BLOCK_H */ diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h index 98102edb6a8324..32f79fb2873df8 100644 --- a/lib/zstd/decompress/zstd_decompress_internal.h +++ b/lib/zstd/decompress/zstd_decompress_internal.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -75,12 +76,13 @@ static UNUSED_ATTR const U32 ML_base[MaxML+1] = { #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64)) #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32)) +#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12 typedef struct { ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */ ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ - HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ + HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */ U32 rep[ZSTD_REP_NUM]; U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32]; } ZSTD_entropyDTables_t; @@ -164,6 +166,7 @@ struct ZSTD_DCtx_s ZSTD_dictUses_e dictUses; ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */ + int disableHufAsm; /* streaming */ ZSTD_dStreamStage streamStage; diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h index a06ca187aab5f4..8a47eb2a451451 100644 --- a/lib/zstd/decompress_sources.h +++ b/lib/zstd/decompress_sources.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c index 22686e367e6f0f..466828e3575256 100644 --- a/lib/zstd/zstd_common_module.c +++ b/lib/zstd/zstd_common_module.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* - * Copyright (c) Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -24,9 +24,6 @@ EXPORT_SYMBOL_GPL(HUF_readStats_wksp); EXPORT_SYMBOL_GPL(ZSTD_isError); EXPORT_SYMBOL_GPL(ZSTD_getErrorName); EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); -EXPORT_SYMBOL_GPL(ZSTD_customMalloc); -EXPORT_SYMBOL_GPL(ZSTD_customCalloc); -EXPORT_SYMBOL_GPL(ZSTD_customFree); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("Zstd Common"); diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c index 04e1b5c01d9b64..8ecf43226af2f3 100644 --- a/lib/zstd/zstd_compress_module.c +++ b/lib/zstd/zstd_compress_module.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* - * Copyright (c) Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c index f4ed952ed4852a..7d31518e9d5abb 100644 --- a/lib/zstd/zstd_decompress_module.c +++ b/lib/zstd/zstd_decompress_module.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* - * Copyright (c) Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -77,7 +77,7 @@ EXPORT_SYMBOL(zstd_init_dstream); size_t zstd_reset_dstream(zstd_dstream *dstream) { - return ZSTD_resetDStream(dstream); + return ZSTD_DCtx_reset(dstream, ZSTD_reset_session_only); } EXPORT_SYMBOL(zstd_reset_dstream); From 40eb0e915deb4d57f90befa2f4c7781136309da0 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Mon, 20 Nov 2023 12:39:49 -0800 Subject: [PATCH 0008/1406] zstd: Backport Huffman speed improvement from upstream Backport upstream commit c7269ad [0] to improve zstd decoding speed. Updating the kernel to zstd v1.5.5 earlier in this patch series regressed zstd decoding speed. This turned out to be because gcc was not unrolling the inner loops of the Huffman decoder which are executed a constant number of times [1]. This really hurts performance, as we expect this loop to be completely branch-free. This commit fixes the issue by unrolling the loop manually [2]. The commit fixes one more minor issue, which is to mask a variable shift by 0x3F. The shift was guaranteed to be less than 64, but gcc couldn't prove that, and emitted suboptimal code. Finally, the upstream commit added a build macro `HUF_DISABLE_FAST_DECODE` which is not used in the kernel, but is maintained to keep a clean import from upstream. This commit was generated from upstream signed tag v1.5.5-kernel [3] by: export ZSTD=/path/to/repo/zstd/ export LINUX=/path/to/repo/linux/ cd "$ZSTD/contrib/linux-kernel" git checkout v1.5.5-kernel make import LINUX="$LINUX" I ran my benchmark & test suite before and after this commit to measure the overall decompression speed benefit. It benchmarks zstd at several compression levels. These benchmarks measure the total time it takes to read data from the compressed filesystem. Component, Level, Read time delta Btrfs , 1, -7.0% Btrfs , 3, -3.9% Btrfs , 5, -4.7% Btrfs , 7, -5.5% Btrfs , 9, -2.4% Squashfs , 1, -9.1% Link: https://github.com/facebook/zstd/commit/c7269add7eaf028ed828d9af41e732cf01993aad Link: https://gist.github.com/terrelln/2e14ff1fb197102a08d7823d8044978d Link: https://gist.github.com/terrelln/a70bde22a2abc800691fb65c21eabc2a Link: https://github.com/facebook/zstd/tree/v1.5.5-kernel Signed-off-by: Nick Terrell --- lib/zstd/decompress/huf_decompress.c | 171 ++++++++++++++++----------- 1 file changed, 105 insertions(+), 66 deletions(-) diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c index d172e35fbd9a61..db670d71fdab78 100644 --- a/lib/zstd/decompress/huf_decompress.c +++ b/lib/zstd/decompress/huf_decompress.c @@ -35,6 +35,12 @@ * Macros ****************************************************************/ +#ifdef HUF_DISABLE_FAST_DECODE +# define HUF_ENABLE_FAST_DECODE 0 +#else +# define HUF_ENABLE_FAST_DECODE 1 +#endif + /* These two optional macros force the use one way or another of the two * Huffman decompression implementations. You can't force in both directions * at the same time. @@ -289,6 +295,24 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArg return 0; } +/* Calls X(N) for each stream 0, 1, 2, 3. */ +#define HUF_4X_FOR_EACH_STREAM(X) \ + { \ + X(0) \ + X(1) \ + X(2) \ + X(3) \ + } + +/* Calls X(N, var) for each stream 0, 1, 2, 3. 
*/ +#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \ + { \ + X(0, (var)) \ + X(1, (var)) \ + X(2, (var)) \ + X(3, (var)) \ + } + #ifndef HUF_FORCE_DECOMPRESS_X2 @@ -702,7 +726,6 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* for (;;) { BYTE* olimit; int stream; - int symbol; /* Assert loop preconditions */ #ifndef NDEBUG @@ -749,27 +772,42 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* } #endif +#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \ + { \ + int const index = (int)(bits[(_stream)] >> 53); \ + int const entry = (int)dtable[index]; \ + bits[(_stream)] <<= (entry & 0x3F); \ + op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \ + } + +#define HUF_4X1_RELOAD_STREAM(_stream) \ + { \ + int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ + int const nbBits = ctz & 7; \ + int const nbBytes = ctz >> 3; \ + op[(_stream)] += 5; \ + ip[(_stream)] -= nbBytes; \ + bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ + bits[(_stream)] <<= nbBits; \ + } + + /* Manually unroll the loop because compilers don't consistently + * unroll the inner loops, which destroys performance. + */ do { /* Decode 5 symbols in each of the 4 streams */ - for (symbol = 0; symbol < 5; ++symbol) { - for (stream = 0; stream < 4; ++stream) { - int const index = (int)(bits[stream] >> 53); - int const entry = (int)dtable[index]; - bits[stream] <<= (entry & 63); - op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF); - } - } - /* Reload the bitstreams */ - for (stream = 0; stream < 4; ++stream) { - int const ctz = ZSTD_countTrailingZeros64(bits[stream]); - int const nbBits = ctz & 7; - int const nbBytes = ctz >> 3; - op[stream] += 5; - ip[stream] -= nbBytes; - bits[stream] = MEM_read64(ip[stream]) | 1; - bits[stream] <<= nbBits; - } + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0) + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1) + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2) + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3) + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4) + + /* Reload each of the 4 the bitstreams */ + HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM) } while (op[3] < olimit); + +#undef HUF_4X1_DECODE_SYMBOL +#undef HUF_4X1_RELOAD_STREAM } _out: @@ -865,7 +903,7 @@ static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, } #endif - if (!(flags & HUF_flags_disableFast)) { + if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); if (ret != 0) return ret; @@ -1487,7 +1525,6 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* for (;;) { BYTE* olimit; int stream; - int symbol; /* Assert loop preconditions */ #ifndef NDEBUG @@ -1544,54 +1581,56 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* } #endif +#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \ + if ((_decode3) || (_stream) != 3) { \ + int const index = (int)(bits[(_stream)] >> 53); \ + HUF_DEltX2 const entry = dtable[index]; \ + MEM_write16(op[(_stream)], entry.sequence); \ + bits[(_stream)] <<= (entry.nbBits) & 0x3F; \ + op[(_stream)] += (entry.length); \ + } + +#define HUF_4X2_RELOAD_STREAM(_stream) \ + { \ + HUF_4X2_DECODE_SYMBOL(3, 1) \ + { \ + int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ + int const nbBits = ctz & 7; \ + int const nbBytes = ctz >> 3; \ + ip[(_stream)] -= nbBytes; \ + 
bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ + bits[(_stream)] <<= nbBits; \ + } \ + } + + /* Manually unroll the loop because compilers don't consistently + * unroll the inner loops, which destroys performance. + */ do { - /* Do 5 table lookups for each of the first 3 streams */ - for (symbol = 0; symbol < 5; ++symbol) { - for (stream = 0; stream < 3; ++stream) { - int const index = (int)(bits[stream] >> 53); - HUF_DEltX2 const entry = dtable[index]; - MEM_write16(op[stream], entry.sequence); - bits[stream] <<= (entry.nbBits); - op[stream] += (entry.length); - } - } - /* Do 1 table lookup from the final stream */ - { - int const index = (int)(bits[3] >> 53); - HUF_DEltX2 const entry = dtable[index]; - MEM_write16(op[3], entry.sequence); - bits[3] <<= (entry.nbBits); - op[3] += (entry.length); - } - /* Do 4 table lookups from the final stream & reload bitstreams */ - for (stream = 0; stream < 4; ++stream) { - /* Do a table lookup from the final stream. - * This is interleaved with the reloading to reduce register - * pressure. This shouldn't be necessary, but compilers can - * struggle with codegen with high register pressure. - */ - { - int const index = (int)(bits[3] >> 53); - HUF_DEltX2 const entry = dtable[index]; - MEM_write16(op[3], entry.sequence); - bits[3] <<= (entry.nbBits); - op[3] += (entry.length); - } - /* Reload the bistreams. The final bitstream must be reloaded - * after the 5th symbol was decoded. - */ - { - int const ctz = ZSTD_countTrailingZeros64(bits[stream]); - int const nbBits = ctz & 7; - int const nbBytes = ctz >> 3; - ip[stream] -= nbBytes; - bits[stream] = MEM_read64(ip[stream]) | 1; - bits[stream] <<= nbBits; - } - } + /* Decode 5 symbols from each of the first 3 streams. + * The final stream will be decoded during the reload phase + * to reduce register pressure. + */ + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0) + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0) + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0) + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0) + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0) + + /* Decode one symbol from the final stream */ + HUF_4X2_DECODE_SYMBOL(3, 1) + + /* Decode 4 symbols from the final stream & reload bitstreams. + * The final stream is reloaded last, meaning that all 5 symbols + * are decoded from the final stream before it is reloaded. + */ + HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM) } while (op[3] < olimit); } +#undef HUF_4X2_DECODE_SYMBOL +#undef HUF_4X2_RELOAD_STREAM + _out: /* Save the final values of each of the state variables back to args. */ @@ -1676,7 +1715,7 @@ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, } #endif - if (!(flags & HUF_flags_disableFast)) { + if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); if (ret != 0) return ret; From 3f832dfb8a8eafee3cecd479d99651a64a61485a Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 16 Jun 2023 15:44:00 +0100 Subject: [PATCH 0009/1406] zstd: fix g_debuglevel export warning The g_debuglevel variable in debug.c is only used when DEBUGLEVEL is defined to be above 2. This means by default there's no actual definition of this in the headers, so sparse is giving the following warning: lib/zstd/common/debug.c:24:5: warning: symbol 'g_debuglevel' was not declared. Should it be static? 
We can use the same check as in the header to remove this if it isn't going to be used, silencing the warning and removing a small bit of unused data. Signed-off-by: Ben Dooks Signed-off-by: Nick Terrell --- lib/zstd/common/debug.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c index e56ff6464e9187..d77926cbad14b9 100644 --- a/lib/zstd/common/debug.c +++ b/lib/zstd/common/debug.c @@ -22,4 +22,6 @@ #include "debug.h" +#if (DEBUGLEVEL>=2) int g_debuglevel = DEBUGLEVEL; +#endif From 564eac2860bdbe6ac651e6909ac07ecd93d778f3 Mon Sep 17 00:00:00 2001 From: Peter Martincic Date: Mon, 27 Nov 2023 13:35:24 -0800 Subject: [PATCH 0010/1406] hv_utils: Allow implicit ICTIMESYNCFLAG_SYNC Hyper-V hosts can omit the _SYNC flag due to a bug on resume from modern suspend. In such a case, the guest may fail to update its time-of-day to account for the period when it was suspended, and could proceed with a significantly wrong time-of-day. When the guest is significantly behind, fix this by treating a _SAMPLE the same as if _SYNC was received so that the guest time-of-day is updated. This behavior is hidden behind the module param hv_utils.timesync_implicit. Signed-off-by: Peter Martincic Acked-by: Boqun Feng Link: https://lore.kernel.org/r/20231127213524.52783-1-pmartincic@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <20231127213524.52783-1-pmartincic@linux.microsoft.com> --- drivers/hv/hv_util.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index 42aec2c5606af7..9c97c4065fe736 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -296,6 +296,11 @@ static struct { spinlock_t lock; } host_ts; +static bool timesync_implicit; + +module_param(timesync_implicit, bool, 0644); +MODULE_PARM_DESC(timesync_implicit, "If set treat SAMPLE as SYNC when clock is behind"); + static inline u64 reftime_to_ns(u64 reftime) { return (reftime - WLTIMEDELTA) * 100; } @@ -344,6 +349,29 @@ static void hv_set_host_time(struct work_struct *work) do_settimeofday64(&ts); } +/* + * Due to a bug on Hyper-V hosts, the sync flag may not always be sent on resume. + * Force a sync if the guest is behind. + */ +static inline bool hv_implicit_sync(u64 host_time) +{ + struct timespec64 new_ts; + struct timespec64 threshold_ts; + + new_ts = ns_to_timespec64(reftime_to_ns(host_time)); + ktime_get_real_ts64(&threshold_ts); + + threshold_ts.tv_sec += 5; + + /* + * If guest behind the host by 5 or more seconds. + */ + if (timespec64_compare(&new_ts, &threshold_ts) >= 0) + return true; + + return false; +} + /* * Synchronize time with host after reboot, restore, etc. * @@ -384,7 +412,8 @@ static inline void adj_guesttime(u64 hosttime, u64 reftime, u8 adj_flags) spin_unlock_irqrestore(&host_ts.lock, flags); /* Schedule work to do do_settimeofday64() */ - if (adj_flags & ICTIMESYNCFLAG_SYNC) + if ((adj_flags & ICTIMESYNCFLAG_SYNC) || + (timesync_implicit && hv_implicit_sync(host_ts.host_time))) schedule_work(&adj_time_work); } From 96c4f072b2ed4beaed7b001c9eb1a4d997ff3a22 Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Mon, 20 Nov 2023 04:19:56 -0800 Subject: [PATCH 0011/1406] dt-bindings: arm: aspeed: document ASRock SPC621D8HM3 Document ASRock SPC621D8HM3 board compatible.
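For context, the decision the hv_utils patch above adds reduces to a single comparison: convert the host's reference time to nanoseconds and check whether it leads the guest clock by five or more seconds. A minimal userspace sketch of the same check, assuming the host time has already been converted to nanoseconds (the helper name and threshold constant are illustrative, not kernel API):

    #include <stdbool.h>
    #include <stdint.h>
    #include <time.h>

    /* Threshold matching the patch: force a sync when the guest clock
     * trails the host clock by 5 seconds or more. */
    #define IMPLICIT_SYNC_THRESHOLD_NS (5LL * 1000000000LL)

    /* Hypothetical helper: decide whether a SAMPLE should be treated
     * as a SYNC, given the host wall-clock time in nanoseconds. */
    static bool should_force_sync(int64_t host_time_ns)
    {
        struct timespec guest;
        clock_gettime(CLOCK_REALTIME, &guest); /* current guest clock */

        int64_t guest_ns = (int64_t)guest.tv_sec * 1000000000LL
                         + guest.tv_nsec;

        /* Same test as hv_implicit_sync(): host >= guest + 5s */
        return host_time_ns - guest_ns >= IMPLICIT_SYNC_THRESHOLD_NS;
    }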
Signed-off-by: Zev Weiss Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231120121954.19926-5-zev@bewilderbeest.net Signed-off-by: Joel Stanley --- Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml index 749ee54a3ff83a..f8f66821cb5faa 100644 --- a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml +++ b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml @@ -36,6 +36,7 @@ properties: - aspeed,ast2500-evb - asrock,e3c246d4i-bmc - asrock,romed8hm3-bmc + - asrock,spc621d8hm3-bmc - bytedance,g220a-bmc - facebook,cmm-bmc - facebook,minipack-bmc From 2e09eb0615f012fb0d967e864d18b121b8ed2ae4 Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Mon, 20 Nov 2023 04:19:57 -0800 Subject: [PATCH 0012/1406] ARM: dts: aspeed: Add ASRock SPC621D8HM3 BMC This is a Xeon board broadly similar (aside from CPU vendor) to the already-supported romed8hm3 (half-width, single-socket, ast2500). It doesn't require anything terribly special for OpenBMC support, so this device-tree should provide everything necessary for basic functionality with it. Signed-off-by: Zev Weiss Link: https://lore.kernel.org/r/20231120121954.19926-6-zev@bewilderbeest.net Signed-off-by: Joel Stanley --- arch/arm/boot/dts/aspeed/Makefile | 1 + .../aspeed/aspeed-bmc-asrock-spc621d8hm3.dts | 324 ++++++++++++++++++ 2 files changed, 325 insertions(+) create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-spc621d8hm3.dts diff --git a/arch/arm/boot/dts/aspeed/Makefile b/arch/arm/boot/dts/aspeed/Makefile index d3ac20e316d01e..2df0a2e88df712 100644 --- a/arch/arm/boot/dts/aspeed/Makefile +++ b/arch/arm/boot/dts/aspeed/Makefile @@ -10,6 +10,7 @@ dtb-$(CONFIG_ARCH_ASPEED) += \ aspeed-bmc-arm-stardragon4800-rep2.dtb \ aspeed-bmc-asrock-e3c246d4i.dtb \ aspeed-bmc-asrock-romed8hm3.dtb \ + aspeed-bmc-asrock-spc621d8hm3.dtb \ aspeed-bmc-bytedance-g220a.dtb \ aspeed-bmc-delta-ahe50dc.dtb \ aspeed-bmc-facebook-bletchley.dtb \ diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-spc621d8hm3.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-spc621d8hm3.dts new file mode 100644 index 00000000000000..555485871e7a7d --- /dev/null +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-spc621d8hm3.dts @@ -0,0 +1,324 @@ +// SPDX-License-Identifier: GPL-2.0+ +/dts-v1/; + +#include "aspeed-g5.dtsi" +#include <dt-bindings/gpio/aspeed-gpio.h> +#include <dt-bindings/gpio/gpio.h> +#include <dt-bindings/interrupt-controller/irq.h> +#include <dt-bindings/leds/common.h> + +/{ + model = "ASRock SPC621D8HM3 BMC"; + compatible = "asrock,spc621d8hm3-bmc", "aspeed,ast2500"; + + aliases { + serial4 = &uart5; + + i2c20 = &i2c1mux0ch0; + i2c21 = &i2c1mux0ch1; + }; + + chosen { + stdout-path = &uart5; + }; + + memory@80000000 { + reg = <0x80000000 0x20000000>; + }; + + leds { + compatible = "gpio-leds"; + + /* BMC heartbeat */ + led-0 { + gpios = <&gpio ASPEED_GPIO(H, 6) GPIO_ACTIVE_LOW>; + function = LED_FUNCTION_HEARTBEAT; + color = <LED_COLOR_ID_GREEN>; + linux,default-trigger = "timer"; + }; + + /* system fault */ + led-1 { + gpios = <&gpio ASPEED_GPIO(Z, 2) GPIO_ACTIVE_LOW>; + function = LED_FUNCTION_FAULT; + color = <LED_COLOR_ID_RED>; + panic-indicator; + }; + }; + + iio-hwmon { + compatible = "iio-hwmon"; + io-channels = <&adc 0>, <&adc 1>, <&adc 2>, <&adc 3>, + <&adc 4>, <&adc 5>, <&adc 6>, <&adc 7>, + <&adc 8>, <&adc 9>, <&adc 10>, <&adc 11>, + <&adc 12>, <&adc 13>, <&adc 14>, <&adc 15>; + }; +}; + +&fmc { + status = "okay"; + flash@0 { + status = "okay"; + m25p,fast-read; + label = "bmc"; + spi-max-frequency = <50000000>; /* 50 MHz */
+#include "openbmc-flash-layout-64.dtsi" + }; +}; + +&uart5 { + status = "okay"; +}; + +&vuart { + status = "okay"; + aspeed,lpc-io-reg = <0x2f8>; + aspeed,lpc-interrupts = <3 IRQ_TYPE_LEVEL_HIGH>; +}; + +&mac0 { + status = "okay"; + + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_rgmii1_default &pinctrl_mdio1_default>; + + nvmem-cells = <ð0_macaddress>; + nvmem-cell-names = "mac-address"; +}; + +&i2c0 { + status = "okay"; +}; + +&i2c1 { + status = "okay"; + + /* hardware monitor/thermal sensor */ + temperature-sensor@29 { + compatible = "nuvoton,nct7802"; + reg = <0x29>; + }; + + /* motherboard temp sensor (TMP1, near BMC) */ + temperature-sensor@4c { + compatible = "nuvoton,w83773g"; + reg = <0x4c>; + }; + + /* motherboard FRU eeprom */ + eeprom@50 { + compatible = "st,24c128", "atmel,24c128"; + reg = <0x50>; + pagesize = <16>; + #address-cells = <1>; + #size-cells = <1>; + + eth0_macaddress: macaddress@3f80 { + reg = <0x3f80 6>; + }; + }; + + /* M.2 slot smbus mux */ + i2c-mux@71 { + compatible = "nxp,pca9545"; + reg = <0x71>; + #address-cells = <1>; + #size-cells = <0>; + + i2c1mux0ch0: i2c@0 { + #address-cells = <1>; + #size-cells = <0>; + reg = <0>; + }; + + i2c1mux0ch1: i2c@1 { + #address-cells = <1>; + #size-cells = <0>; + reg = <1>; + }; + }; +}; + +&i2c2 { + status = "okay"; +}; + +&i2c3 { + status = "okay"; +}; + +&i2c4 { + status = "okay"; +}; + +&i2c5 { + status = "okay"; +}; + +&i2c6 { + status = "okay"; +}; + +&i2c7 { + status = "okay"; +}; + +&i2c8 { + status = "okay"; +}; + +&i2c9 { + status = "okay"; +}; + +&i2c10 { + status = "okay"; +}; + +&i2c11 { + status = "okay"; +}; + +&i2c12 { + status = "okay"; +}; + +&i2c13 { + status = "okay"; +}; + +&video { + status = "okay"; +}; + +&vhub { + status = "okay"; +}; + +&lpc_ctrl { + status = "okay"; +}; + +&lpc_snoop { + status = "okay"; + snoop-ports = <0x80>; +}; + +&kcs3 { + status = "okay"; + aspeed,lpc-io-reg = <0xca2>; +}; + +&peci0 { + status = "okay"; +}; + +&pwm_tacho { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_pwm0_default + &pinctrl_pwm2_default + &pinctrl_pwm3_default + &pinctrl_pwm4_default>; + + fan@0 { + reg = <0x00>; + aspeed,fan-tach-ch = /bits/ 8 <0x00>; + }; + + fan@2 { + reg = <0x02>; + aspeed,fan-tach-ch = /bits/ 8 <0x02>; + }; + + fan@3 { + reg = <0x03>; + aspeed,fan-tach-ch = /bits/ 8 <0x03>; + }; + + fan@4 { + reg = <0x04>; + aspeed,fan-tach-ch = /bits/ 8 <0x04>; + }; +}; + +&gpio { + status = "okay"; + gpio-line-names = + /* A */ "LOCATORLED_STATUS_N", "LOCATORBTN_N", + "BMC_READY_N", "FM_SPD_DDRCPU_LVLSHFT_EN", + "", "", "", "", + /* B */ "NODE_ID_1", "NODE_ID_2", "PSU_FAN_FAIL_N", "", + "", "", "", "GPIO_RST", + /* C */ "", "", "", "", "", "", "", "", + /* D */ "FP_PWR_BTN_MUX_N", "FM_BMC_PWRBTN_OUT_N", + "FP_RST_BTN_N", "RST_BMC_RSTBTN_OUT_N", + "NMI_BTN_N", "BMC_NMI", + "", "", + /* E */ "", "", "", "FM_ME_RCVR_N", "", "", "", "", + /* F */ "BMC_SMB_SEL_N", "FM_CPU2_DISABLE_COD_N", + "FM_REMOTE_DEBUG_BMC_EN", "FM_CPU_ERR0_LVT3_EN", + "FM_CPU_ERR1_LVT3_EN", "FM_CPU_ERR2_LVT3_EN", + "FM_MEM_THERM_EVENT_CPU1_LVT3_N", "FM_MEM_THERM_EVENT_CPU2_LVT3_N", + /* G */ "HWM_BAT_EN", "", "BMC_PHYRST_N", "FM_BIOS_SPI_BMC_CTRL", + "BMC_ALERT1_N", "BMC_ALERT2_N", "BMC_ALERT3_N", "IRQ_SML0_ALERT_N", + /* H */ "BMC_SMB_PRESENT_1_N", "FM_PCH_CORE_VID_0", "FM_PCH_CORE_VID_1", "", + "FM_MFG_MODE", "BMC_RTCRST", "BMC_HB_LED_N", "BMC_CASEOPEN", + /* I */ "IRQ_PVDDQ_ABCD_CPU1_VRHOT_LVC3_N", "IRQ_PVDDQ_ABCD_CPU2_VRHOT_LVC3_N", + "IRQ_PVDDQ_EFGH_CPU1_VRHOT_LVC3_N", 
"IRQ_PVDDQ_EFGH_CPU2_VRHOT_LVC3_N", + "", "", "", "", + /* J */ "", "", "", "", "", "", "", "", + /* K */ "", "", "", "", "", "", "", "", + /* L */ "", "", "", "", "", "", "", "", + /* M */ "FM_PVCCIN_CPU1_PWR_IN_ALERT_N", "FM_PVCCIN_CPU2_PWR_IN_ALERT_N", + "IRQ_PVCCIN_CPU1_VRHOT_LVC3_N", "IRQ_PVCCIN_CPU2_VRHOT_LVC3_N", + "FM_CPU1_PROCHOT_BMC_LVC3_N", "", + "FM_CPU1_MEMHOT_OUT_N", "FM_CPU2_MEMHOT_OUT_N", + /* N */ "", "", "", "", "", "", "", "", + /* O */ "", "", "", "", "", "", "", "", + /* P */ "", "", "", "", "", "", "", "", + /* Q */ "", "", "", "", "", "", "RST_GLB_RST_WARN_N", "PCIE_WAKE_N", + /* R */ "", "", "FM_BMC_SUSACK_N", "FM_BMC_EUP_LOT6_N", + "", "FM_BMC_PCH_SCI_LPC_N", "", "", + /* S */ "FM_DBP_PRESENT_N", "FM_CPU2_SKTOCC_LCT3_N", + "FM_CPU1_FIVR_FAULT_LVT3", "FM_CPU2_FIVR_FAULT_LVT3", + "", "", "", "", + /* T */ "", "", "", "", "", "", "", "", + /* U */ "", "", "", "", "", "", "", "", + /* V */ "", "", "", "", "", "", "", "", + /* W */ "", "", "", "", "", "", "", "", + /* X */ "", "", "", "", "", "", "", "", + /* Y */ "FM_SLPS3_N", "FM_SLPS4_N", "", "FM_BMC_ONCTL_N_PLD", + "", "", "", "", + /* Z */ "FM_CPU_MSMI_CATERR_LVT3_N", "", "SYSTEM_FAULT_LED_N", "BMC_THROTTLE_N", + "", "", "", "", + /* AA */ "FM_CPU1_THERMTRIP_LATCH_LVT3_N", "FM_CPU2_THERMTRIP_LATCH_LVT3_N", + "FM_BIOS_POST_COMPLT_N", "DBP_BMC_SYSPWROK", + "", "IRQ_SML0_ALERT_MUX_N", + "IRQ_SMI_ACTIVE_N", "IRQ_NMI_EVENT_N", + /* AB */ "FM_PCH_BMC_THERMTRIP_N", "PWRGD_SYS_PWROK", + "ME_OVERRIDE", "IRQ_BMC_PCH_SMI_LPC_N", + "", "", "", "", + /* AC */ "", "", "", "", "", "", "", ""; +}; + +&adc { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_adc0_default /* 3VSB */ + &pinctrl_adc1_default /* 5VSB */ + &pinctrl_adc2_default /* CPU1 */ + &pinctrl_adc3_default /* NC */ + &pinctrl_adc4_default /* VCCMABCD */ + &pinctrl_adc5_default /* VCCMEFGH */ + &pinctrl_adc6_default /* NC */ + &pinctrl_adc7_default /* NC */ + &pinctrl_adc8_default /* PVNN_PCH */ + &pinctrl_adc9_default /* 1P05PCH */ + &pinctrl_adc10_default /* 1P8PCH */ + &pinctrl_adc11_default /* BAT */ + &pinctrl_adc12_default /* 3V */ + &pinctrl_adc13_default /* 5V */ + &pinctrl_adc14_default /* 12V */ + &pinctrl_adc15_default>; /* GND */ +}; From 01bb8d5bf1ab1bd847a277f546e5f9af2c6933e1 Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Mon, 20 Nov 2023 04:19:03 -0800 Subject: [PATCH 0013/1406] dt-bindings: arm: aspeed: document ASRock E3C256D4I Document ASRock E3C256D4I board compatible. 
Signed-off-by: Zev Weiss Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231120121901.19817-5-zev@bewilderbeest.net Signed-off-by: Joel Stanley --- Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml index f8f66821cb5faa..6f7543463d894c 100644 --- a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml +++ b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml @@ -35,6 +35,7 @@ properties: - ampere,mtjade-bmc - aspeed,ast2500-evb - asrock,e3c246d4i-bmc + - asrock,e3c256d4i-bmc - asrock,romed8hm3-bmc - asrock,spc621d8hm3-bmc - bytedance,g220a-bmc From f957714c0f5353b151639654a62680d27cf53e44 Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Mon, 20 Nov 2023 04:19:04 -0800 Subject: [PATCH 0014/1406] ARM: dts: aspeed: Add ASRock E3C256D4I BMC Like the E3C246D4I, this is a reasonably affordable off-the-shelf mini-ITX AST2500/Xeon motherboard with good potential as an OpenBMC development platform. Booting the host requires a modicum of eSPI support that's not yet in the mainline kernel, but most other basic BMC functionality is available with this device-tree. Signed-off-by: Zev Weiss Link: https://lore.kernel.org/r/20231120121901.19817-6-zev@bewilderbeest.net Signed-off-by: Joel Stanley --- arch/arm/boot/dts/aspeed/Makefile | 1 + .../aspeed/aspeed-bmc-asrock-e3c256d4i.dts | 322 ++++++++++++++++++ 2 files changed, 323 insertions(+) create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-e3c256d4i.dts diff --git a/arch/arm/boot/dts/aspeed/Makefile b/arch/arm/boot/dts/aspeed/Makefile index 2df0a2e88df712..3e3e6b96cb799d 100644 --- a/arch/arm/boot/dts/aspeed/Makefile +++ b/arch/arm/boot/dts/aspeed/Makefile @@ -9,6 +9,7 @@ dtb-$(CONFIG_ARCH_ASPEED) += \ aspeed-bmc-ampere-mtmitchell.dtb \ aspeed-bmc-arm-stardragon4800-rep2.dtb \ aspeed-bmc-asrock-e3c246d4i.dtb \ + aspeed-bmc-asrock-e3c256d4i.dtb \ aspeed-bmc-asrock-romed8hm3.dtb \ aspeed-bmc-asrock-spc621d8hm3.dtb \ aspeed-bmc-bytedance-g220a.dtb \ diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-e3c256d4i.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-e3c256d4i.dts new file mode 100644 index 00000000000000..263fcc8106ffaa --- /dev/null +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-e3c256d4i.dts @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0+ +/dts-v1/; + +#include "aspeed-g5.dtsi" +#include <dt-bindings/gpio/aspeed-gpio.h> +#include <dt-bindings/gpio/gpio.h> +#include <dt-bindings/interrupt-controller/irq.h> +#include <dt-bindings/leds/common.h> +#include <dt-bindings/watchdog/aspeed-wdt.h> + +/{ + model = "ASRock E3C256D4I BMC"; + compatible = "asrock,e3c256d4i-bmc", "aspeed,ast2500"; + + aliases { + serial4 = &uart5; + + i2c20 = &i2c2mux0ch0; + i2c21 = &i2c2mux0ch1; + i2c22 = &i2c2mux0ch2; + i2c23 = &i2c2mux0ch3; + }; + + chosen { + stdout-path = &uart5; + }; + + memory@80000000 { + reg = <0x80000000 0x20000000>; + }; + + leds { + compatible = "gpio-leds"; + + /* BMC heartbeat */ + led-0 { + gpios = <&gpio ASPEED_GPIO(H, 6) GPIO_ACTIVE_LOW>; + function = LED_FUNCTION_HEARTBEAT; + color = <LED_COLOR_ID_GREEN>; + linux,default-trigger = "timer"; + }; + + /* system fault */ + led-1 { + gpios = <&gpio ASPEED_GPIO(Z, 2) GPIO_ACTIVE_LOW>; + function = LED_FUNCTION_FAULT; + color = <LED_COLOR_ID_RED>; + panic-indicator; + }; + }; + + iio-hwmon { + compatible = "iio-hwmon"; + io-channels = <&adc 0>, <&adc 1>, <&adc 2>, <&adc 3>, + <&adc 4>, <&adc 5>, <&adc 6>, <&adc 7>, + <&adc 8>, <&adc 9>, <&adc 10>, <&adc 11>, + <&adc 12>, <&adc 13>, <&adc 14>, <&adc 15>; + }; +}; + +&fmc { + status = "okay"; + flash@0 { + status = "okay"; +
m25p,fast-read; + label = "bmc"; + spi-max-frequency = <100000000>; /* 100 MHz */ +#include "openbmc-flash-layout-64.dtsi" + }; +}; + +&uart1 { + status = "okay"; +}; + +&uart2 { + status = "okay"; +}; + +&uart3 { + status = "okay"; +}; + +&uart4 { + status = "okay"; +}; + +&uart5 { + status = "okay"; +}; + +&uart_routing { + status = "okay"; +}; + +&mac0 { + status = "okay"; + + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_rgmii1_default &pinctrl_mdio1_default>; + + nvmem-cells = <ð0_macaddress>; + nvmem-cell-names = "mac-address"; +}; + +&i2c0 { + status = "okay"; +}; + +&i2c1 { + status = "okay"; +}; + +&i2c2 { + status = "okay"; + + i2c-mux@70 { + compatible = "nxp,pca9545"; + reg = <0x70>; + #address-cells = <1>; + #size-cells = <0>; + + i2c2mux0ch0: i2c@0 { + #address-cells = <1>; + #size-cells = <0>; + reg = <0>; + }; + + i2c2mux0ch1: i2c@1 { + #address-cells = <1>; + #size-cells = <0>; + reg = <1>; + }; + + i2c2mux0ch2: i2c@2 { + #address-cells = <1>; + #size-cells = <0>; + reg = <2>; + }; + + i2c2mux0ch3: i2c@3 { + #address-cells = <1>; + #size-cells = <0>; + reg = <3>; + }; + }; +}; + +&i2c3 { + status = "okay"; +}; + +&i2c4 { + status = "okay"; +}; + +&i2c5 { + status = "okay"; +}; + +&i2c6 { + status = "okay"; +}; + +&i2c7 { + status = "okay"; +}; + +&i2c9 { + status = "okay"; +}; + +&i2c10 { + status = "okay"; +}; + +&i2c11 { + status = "okay"; + + vrm@60 { + compatible = "renesas,isl69269", "isl69269"; + reg = <0x60>; + }; +}; + +&i2c12 { + status = "okay"; + + /* FRU eeprom */ + eeprom@57 { + compatible = "st,24c128", "atmel,24c128"; + reg = <0x57>; + pagesize = <16>; + #address-cells = <1>; + #size-cells = <1>; + + eth0_macaddress: macaddress@3f80 { + reg = <0x3f80 6>; + }; + }; +}; + +&video { + status = "okay"; +}; + +&vhub { + status = "okay"; +}; + +&lpc_ctrl { + status = "okay"; +}; + +&lpc_snoop { + status = "okay"; + snoop-ports = <0x80>; +}; + +&kcs3 { + status = "okay"; + aspeed,lpc-io-reg = <0xca2>; +}; + +&peci0 { + status = "okay"; +}; + +&wdt1 { + aspeed,reset-mask = <(AST2500_WDT_RESET_DEFAULT & ~AST2500_WDT_RESET_LPC)>; +}; + +&wdt2 { + aspeed,reset-mask = <(AST2500_WDT_RESET_DEFAULT & ~AST2500_WDT_RESET_LPC)>; +}; + +&pwm_tacho { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_pwm0_default /* CPU */ + &pinctrl_pwm2_default /* rear */ + &pinctrl_pwm4_default>; /* front */ + + /* CPU */ + fan@0 { + reg = <0x00>; + aspeed,fan-tach-ch = /bits/ 8 <0x00>; + }; + + /* rear */ + fan@2 { + reg = <0x02>; + aspeed,fan-tach-ch = /bits/ 8 <0x02>; + }; + + /* front */ + fan@4 { + reg = <0x04>; + aspeed,fan-tach-ch = /bits/ 8 <0x04>; + }; +}; + +&gpio { + status = "okay"; + gpio-line-names = + /* A */ "", "", "NMI_BTN_N", "BMC_NMI", "", "", "", "", + /* B */ "", "", "", "", "", "", "", "", + /* C */ "", "", "", "", "", "", "", "", + /* D */ "BMC_PSIN", "BMC_PSOUT", "BMC_RESETCON", "RESETCON", + "", "", "", "", + /* E */ "", "", "", "", "", "", "", "", + /* F */ "LOCATORLED_STATUS_N", "LOCATORBTN", "", "", + "", "", "BMC_PCH_SCI_LPC", "BMC_NCSI_MUX_CTL", + /* G */ "HWM_BAT_EN", "CHASSIS_ID0", "CHASSIS_ID1", "CHASSIS_ID2", + "", "", "", "", + /* H */ "FM_ME_RCVR_N", "O_PWROK", "", "D4_DIMM_EVENT_3V_N", + "MFG_MODE_N", "BMC_RTCRST", "BMC_HB_LED_N", "BMC_CASEOPEN", + /* I */ "", "", "", "", "", "", "", "", + /* J */ "BMC_READY", "BMC_PCH_BIOS_CS_N", "BMC_SMI", "", "", "", "", "", + /* K */ "", "", "", "", "", "", "", "", + /* L */ "", "", "", "", "", "", "", "", + /* M */ "", "", "", "", "", "", "", "", + /* N */ "", "", "", "", "", "", "", "", + 
/* O */ "", "", "", "", "", "", "", "", + /* P */ "", "", "", "", "", "", "", "", + /* Q */ "", "", "", "", "", "", "", "", + /* R */ "", "", "", "", "", "", "", "", + /* S */ "PCHHOT_BMC_N", "", "RSMRST", "", "", "", "", "", + /* T */ "", "", "", "", "", "", "", "", + /* U */ "", "", "", "", "", "", "", "", + /* V */ "", "", "", "", "", "", "", "", + /* W */ "", "", "", "", "", "", "", "", + /* X */ "", "", "", "", "", "", "", "", + /* Y */ "SLP_S3", "SLP_S5", "", "", "", "", "", "", + /* Z */ "CPU_CATERR_BMC_N", "", "SYSTEM_FAULT_LED_N", "BMC_THROTTLE_N", + "", "", "", "", + /* AA */ "CPU1_THERMTRIP_LATCH_N", "", "CPU1_PROCHOT_N", "", + "", "", "IRQ_SMI_ACTIVE_N", "FM_BIOS_POST_CMPLT_N", + /* AB */ "", "", "ME_OVERRIDE", "BMC_DMI_MODIFY", "", "", "", "", + /* AC */ "", "", "", "", "", "", "", ""; +}; + +&adc { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_adc0_default /* 3VSB */ + &pinctrl_adc1_default /* 5VSB */ + &pinctrl_adc2_default /* CPU1 */ + &pinctrl_adc3_default /* VCCSA */ + &pinctrl_adc4_default /* VCCM */ + &pinctrl_adc5_default /* V10M */ + &pinctrl_adc6_default /* VCCIO */ + &pinctrl_adc7_default /* VCCGT */ + &pinctrl_adc8_default /* VPPM */ + &pinctrl_adc9_default /* BAT */ + &pinctrl_adc10_default /* 3V */ + &pinctrl_adc11_default /* 5V */ + &pinctrl_adc12_default /* 12V */ + &pinctrl_adc13_default /* GND */ + &pinctrl_adc14_default /* GND */ + &pinctrl_adc15_default>; /* GND */ +}; From eadd52a6233d4e50391eb68a7a77c24a8c262313 Mon Sep 17 00:00:00 2001 From: Renze Nicolai Date: Sat, 2 Dec 2023 01:38:44 +0100 Subject: [PATCH 0015/1406] dt-bindings: arm: aspeed: add Asrock X570D4U board Document Asrock X570D4U compatible. Signed-off-by: Renze Nicolai Acked-by: Krzysztof Kozlowski Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231202003908.3635695-2-renze@rnplus.nl Signed-off-by: Joel Stanley --- Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml index 6f7543463d894c..85e2c00a238477 100644 --- a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml +++ b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml @@ -38,6 +38,7 @@ properties: - asrock,e3c256d4i-bmc - asrock,romed8hm3-bmc - asrock,spc621d8hm3-bmc + - asrock,x570d4u-bmc - bytedance,g220a-bmc - facebook,cmm-bmc - facebook,minipack-bmc From ecab6c95f79bb6143090d0d48ee26501f28e0a59 Mon Sep 17 00:00:00 2001 From: Renze Nicolai Date: Sat, 2 Dec 2023 01:38:45 +0100 Subject: [PATCH 0016/1406] ARM: dts: aspeed: asrock: Add ASRock X570D4U BMC This is a relatively low-cost AST2500-based Amd Ryzen 5000 Series micro-ATX board that we hope can provide a decent platform for OpenBMC development. This initial device-tree provides the necessary configuration for basic BMC functionality such as serial console, KVM support and POST code snooping. 
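The POST code snooping mentioned here comes from the lpc_snoop nodes these trees enable (snoop-ports = <0x80>): the aspeed-lpc-snoop driver latches each byte the host writes to I/O port 0x80 and streams it out through a character device. A sketch of a reader, assuming the driver's usual /dev/aspeed-lpc-snoop0 node for the first snoop channel:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        /* Device node created by aspeed-lpc-snoop for snoop channel 0
         * (port 0x80 per the snoop-ports property). */
        int fd = open("/dev/aspeed-lpc-snoop0", O_RDONLY);
        if (fd < 0) {
            perror("open");
            return 1;
        }

        unsigned char code;
        /* Each byte read is one POST code captured from the host. */
        while (read(fd, &code, 1) == 1)
            printf("POST code: 0x%02x\n", code);

        close(fd);
        return 0;
    }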
Signed-off-by: Renze Nicolai Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231202003908.3635695-3-renze@rnplus.nl Signed-off-by: Joel Stanley --- arch/arm/boot/dts/aspeed/Makefile | 1 + .../dts/aspeed/aspeed-bmc-asrock-x570d4u.dts | 377 ++++++++++++++++++ 2 files changed, 378 insertions(+) create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-x570d4u.dts diff --git a/arch/arm/boot/dts/aspeed/Makefile b/arch/arm/boot/dts/aspeed/Makefile index 3e3e6b96cb799d..fb9cc95f1b60f3 100644 --- a/arch/arm/boot/dts/aspeed/Makefile +++ b/arch/arm/boot/dts/aspeed/Makefile @@ -12,6 +12,7 @@ dtb-$(CONFIG_ARCH_ASPEED) += \ aspeed-bmc-asrock-e3c256d4i.dtb \ aspeed-bmc-asrock-romed8hm3.dtb \ aspeed-bmc-asrock-spc621d8hm3.dtb \ + aspeed-bmc-asrock-x570d4u.dtb \ aspeed-bmc-bytedance-g220a.dtb \ aspeed-bmc-delta-ahe50dc.dtb \ aspeed-bmc-facebook-bletchley.dtb \ diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-x570d4u.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-x570d4u.dts new file mode 100644 index 00000000000000..3c975bc41ae7de --- /dev/null +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-x570d4u.dts @@ -0,0 +1,377 @@ +// SPDX-License-Identifier: GPL-2.0+ +/dts-v1/; +#include "aspeed-g5.dtsi" +#include <dt-bindings/gpio/aspeed-gpio.h> +#include <dt-bindings/leds/common.h> + +/ { + model = "Asrock Rack X570D4U BMC"; + compatible = "asrock,x570d4u-bmc", "aspeed,ast2500"; + + aliases { + i2c40 = &i2c4mux0ch0; + i2c41 = &i2c4mux0ch1; + i2c42 = &i2c4mux0ch2; + i2c43 = &i2c4mux0ch3; + }; + + chosen { + stdout-path = &uart5; + }; + + memory@80000000 { + reg = <0x80000000 0x20000000>; + }; + + reserved-memory { + #address-cells = <1>; + #size-cells = <1>; + ranges; + + pci_memory: region@9a000000 { + no-map; + reg = <0x9a000000 0x00010000>; /* 64K */ + }; + + video_engine_memory: jpegbuffer { + size = <0x02800000>; /* 40M */ + alignment = <0x01000000>; + compatible = "shared-dma-pool"; + reusable; + }; + + gfx_memory: framebuffer { + size = <0x01000000>; + alignment = <0x01000000>; + compatible = "shared-dma-pool"; + reusable; + }; + }; + + leds { + compatible = "gpio-leds"; + + led-0 { + /* led-heartbeat-n */ + gpios = <&gpio ASPEED_GPIO(H, 6) GPIO_ACTIVE_LOW>; + color = <LED_COLOR_ID_GREEN>; + function = LED_FUNCTION_HEARTBEAT; + linux,default-trigger = "timer"; + }; + + led-1 { + /* led-fault-n */ + gpios = <&gpio ASPEED_GPIO(Z, 2) GPIO_ACTIVE_LOW>; + color = <LED_COLOR_ID_RED>; + function = LED_FUNCTION_FAULT; + panic-indicator; + }; + }; + + iio-hwmon { + compatible = "iio-hwmon"; + io-channels = <&adc 0>, <&adc 1>, <&adc 2>, <&adc 3>, <&adc 4>, + <&adc 5>, <&adc 6>, <&adc 7>, <&adc 8>, <&adc 9>, + <&adc 10>, <&adc 11>, <&adc 12>; + }; +}; + +&gpio { + status = "okay"; + gpio-line-names = + /*A0-A3*/ "status-locatorled-n", "", "button-nmi-n", "", + /*A4-A7*/ "", "", "", "", + /*B0-B3*/ "input-bios-post-cmplt-n", "", "", "", + /*B4-B7*/ "", "", "", "", + /*C0-C3*/ "", "", "", "", + /*C4-C7*/ "", "", "control-locatorbutton", "", + /*D0-D3*/ "button-power", "control-power", "button-reset", "control-reset", + /*D4-D7*/ "", "", "", "", + /*E0-E3*/ "", "", "", "", + /*E4-E7*/ "", "", "", "", + /*F0-F3*/ "", "", "", "", + /*F4-F7*/ "", "", "", "", + /*G0-G3*/ "output-rtc-battery-voltage-read-enable", "input-id0", "input-id1", "input-id2", + /*G4-G7*/ "input-alert1-n", "input-alert2-n", "input-alert3-n", "", + /*H0-H3*/ "", "", "", "", + /*H4-H7*/ "input-mfg", "", "led-heartbeat-n", "input-caseopen", + /*I0-I3*/ "", "", "", "", + /*I4-I7*/ "", "", "", "", + /*J0-J3*/ "output-bmc-ready", "", "", "", + /*J4-J7*/ "", "", "", "", + /*K0-K3*/ "", "", "", "", + /*K4-K7*/ "", "", "", "", +
/*L0-L3*/ "", "", "", "", + /*L4-L7*/ "", "", "", "", + /*M0-M3*/ "", "", "", "", + /*M4-M7*/ "", "", "", "", + /*N0-N3*/ "", "", "", "", + /*N4-N7*/ "", "", "", "", + /*O0-O3*/ "", "", "", "", + /*O4-O7*/ "", "", "", "", + /*P0-P3*/ "", "", "", "", + /*P4-P7*/ "", "", "", "", + /*Q0-Q3*/ "", "", "", "", + /*Q4-Q7*/ "", "", "", "", + /*R0-R3*/ "", "", "", "", + /*R4-R7*/ "", "", "", "", + /*S0-S3*/ "input-bmc-pchhot-n", "", "", "", + /*S4-S7*/ "", "", "", "", + /*T0-T3*/ "", "", "", "", + /*T4-T7*/ "", "", "", "", + /*U0-U3*/ "", "", "", "", + /*U4-U7*/ "", "", "", "", + /*V0-V3*/ "", "", "", "", + /*V4-V7*/ "", "", "", "", + /*W0-W3*/ "", "", "", "", + /*W4-W7*/ "", "", "", "", + /*X0-X3*/ "", "", "", "", + /*X4-X7*/ "", "", "", "", + /*Y0-Y3*/ "", "", "", "", + /*Y4-Y7*/ "", "", "", "", + /*Z0-Z3*/ "", "", "led-fault-n", "output-bmc-throttle-n", + /*Z4-Z7*/ "", "", "", "", + /*AA0-AA3*/ "input-cpu1-thermtrip-latch-n", "", "input-cpu1-prochot-n", "", + /*AA4-AC7*/ "", "", "", "", + /*AB0-AB3*/ "", "", "", "", + /*AB4-AC7*/ "", "", "", "", + /*AC0-AC3*/ "", "", "", "", + /*AC4-AC7*/ "", "", "", ""; +}; + +&fmc { + status = "okay"; + flash@0 { + status = "okay"; + label = "bmc"; + m25p,fast-read; + spi-max-frequency = <10000000>; +#include "openbmc-flash-layout-64.dtsi" + }; +}; + +&uart5 { + status = "okay"; +}; + +&vuart { + status = "okay"; +}; + +&mac0 { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_rgmii1_default &pinctrl_mdio1_default>; + + nvmem-cells = <ð0_macaddress>; + nvmem-cell-names = "mac-address"; +}; + +&mac1 { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_rmii2_default &pinctrl_mdio2_default>; + use-ncsi; + + nvmem-cells = <ð1_macaddress>; + nvmem-cell-names = "mac-address"; +}; + +&i2c0 { + /* SMBus on auxiliary panel header (AUX_PANEL1) */ + status = "okay"; +}; + +&i2c1 { + status = "okay"; + + w83773g@4c { + compatible = "nuvoton,w83773g"; + reg = <0x4c>; + }; +}; + +&i2c2 { + /* PSU SMBus (PSU_SMB1) */ + status = "okay"; +}; + +&i2c3 { + status = "okay"; +}; + +&i2c4 { + status = "okay"; + + i2c-mux@70 { + compatible = "nxp,pca9545"; + reg = <0x70>; + #address-cells = <1>; + #size-cells = <0>; + + i2c4mux0ch0: i2c@0 { + /* SMBus on PCI express 16x slot */ + #address-cells = <1>; + #size-cells = <0>; + reg = <0>; + }; + + i2c4mux0ch1: i2c@1 { + /* SMBus on PCI express 8x slot */ + #address-cells = <1>; + #size-cells = <0>; + reg = <1>; + }; + + i2c4mux0ch2: i2c@2 { + /* Unknown */ + #address-cells = <1>; + #size-cells = <0>; + reg = <2>; + }; + + i2c4mux0ch3: i2c@3 { + /* SMBus on PCI express 1x slot */ + #address-cells = <1>; + #size-cells = <0>; + reg = <3>; + }; + }; +}; + +&i2c5 { + status = "okay"; +}; + +&i2c7 { + /* FRU and SPD EEPROM SMBus */ + status = "okay"; + + eeprom@57 { + compatible = "st,24c128", "atmel,24c128"; + reg = <0x57>; + pagesize = <16>; + #address-cells = <1>; + #size-cells = <1>; + + eth0_macaddress: macaddress@3f80 { + reg = <0x3f80 6>; + }; + + eth1_macaddress: macaddress@3f88 { + reg = <0x3f88 6>; + }; + }; +}; + +&gfx { + status = "okay"; +}; + +&pinctrl { + aspeed,external-nodes = <&gfx &lhc>; +}; + +&vhub { + status = "okay"; +}; + +&ehci1 { + status = "okay"; +}; + +&uhci { + status = "okay"; +}; + +&kcs3 { + aspeed,lpc-io-reg = <0xca2>; + status = "okay"; +}; + +&lpc_ctrl { + status = "okay"; +}; + +&lpc_snoop { + status = "okay"; + snoop-ports = <0x80>; +}; + +&p2a { + status = "okay"; + memory-region = <&pci_memory>; +}; + +&video { + status = "okay"; + memory-region = 
<&video_engine_memory>; +}; + +&pwm_tacho { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_pwm0_default + &pinctrl_pwm1_default + &pinctrl_pwm2_default + &pinctrl_pwm3_default + &pinctrl_pwm4_default + &pinctrl_pwm5_default>; + + fan@0 { + /* FAN1 (4-pin) */ + reg = <0x00>; + aspeed,fan-tach-ch = /bits/ 8 <0x00>; + }; + + fan@1 { + /* FAN2 (4-pin) */ + reg = <0x01>; + aspeed,fan-tach-ch = /bits/ 8 <0x01>; + }; + + fan@2 { + /* FAN3 (4-pin) */ + reg = <0x02>; + aspeed,fan-tach-ch = /bits/ 8 <0x02>; + }; + + fan@3 { + /* FAN4 (6-pin) */ + reg = <0x03>; + aspeed,fan-tach-ch = /bits/ 8 <0x04 0x0b>; + }; + + fan@4 { + /* FAN6 (6-pin) */ + reg = <0x04>; + aspeed,fan-tach-ch = /bits/ 8 <0x06 0x0d>; + }; + + fan@5 { + /* FAN5 (6-pin) */ + reg = <0x05>; + aspeed,fan-tach-ch = /bits/ 8 <0x05 0x0c>; + }; +}; + +&adc { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_adc0_default + &pinctrl_adc1_default + &pinctrl_adc2_default + &pinctrl_adc3_default + &pinctrl_adc4_default + &pinctrl_adc5_default + &pinctrl_adc6_default + &pinctrl_adc7_default + &pinctrl_adc8_default + &pinctrl_adc9_default + &pinctrl_adc10_default + &pinctrl_adc11_default + &pinctrl_adc12_default + &pinctrl_adc13_default + &pinctrl_adc14_default + &pinctrl_adc15_default>; +}; From 763f0b3f1402cbb9d1ec7a0a37bcc6ebb465b119 Mon Sep 17 00:00:00 2001 From: Peter Yin Date: Tue, 12 Dec 2023 00:26:54 +0800 Subject: [PATCH 0017/1406] dt-bindings: arm: aspeed: add Meta Harma board Document the new compatibles used on Meta Harma. Acked-by: Krzysztof Kozlowski Signed-off-by: Peter Yin Link: https://lore.kernel.org/r/20231211162656.2564267-2-peteryin.openbmc@gmail.com Signed-off-by: Joel Stanley --- Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml index 85e2c00a238477..7dfcdc2d571eb0 100644 --- a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml +++ b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml @@ -82,6 +82,7 @@ properties: - facebook,elbert-bmc - facebook,fuji-bmc - facebook,greatlakes-bmc + - facebook,harma-bmc - facebook,minerva-cmc - facebook,yosemite4-bmc - ibm,everest-bmc From e17770a3388e05aa59d6a8d4fbcd2a4130222db7 Mon Sep 17 00:00:00 2001 From: Peter Yin Date: Tue, 12 Dec 2023 00:26:55 +0800 Subject: [PATCH 0018/1406] ARM: dts: aspeed: Harma: Add Meta Harma (AST2600) BMC Add a Linux device tree entry for the Meta (Facebook) compute-node system that uses an AST2600 BMC. This node is named "Harma".
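The pwm_tacho fan@N nodes used across these boards register with the hwmon subsystem, so each tach channel appears as a standard fanN_input attribute in RPM and needs no board-specific reader. A sketch that polls those attributes over sysfs (the hwmon0 index is illustrative; it varies by board and kernel):

    #include <stdio.h>

    int main(void)
    {
        /* Scan the fan inputs exposed by the aspeed-pwm-tacho hwmon
         * device; channels absent on a given board are skipped. */
        for (int fan = 1; fan <= 6; fan++) {
            char path[64];
            snprintf(path, sizeof(path),
                     "/sys/class/hwmon/hwmon0/fan%d_input", fan);

            FILE *f = fopen(path, "r");
            if (!f)
                continue;

            int rpm;
            if (fscanf(f, "%d", &rpm) == 1)
                printf("fan%d: %d RPM\n", fan, rpm);
            fclose(f);
        }
        return 0;
    }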
Signed-off-by: Peter Yin Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231211162656.2564267-3-peteryin.openbmc@gmail.com Signed-off-by: Joel Stanley --- arch/arm/boot/dts/aspeed/Makefile | 1 + .../dts/aspeed/aspeed-bmc-facebook-harma.dts | 585 ++++++++++++++++++ 2 files changed, 586 insertions(+) create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-harma.dts diff --git a/arch/arm/boot/dts/aspeed/Makefile b/arch/arm/boot/dts/aspeed/Makefile index fb9cc95f1b60f3..6ecc21d04a6299 100644 --- a/arch/arm/boot/dts/aspeed/Makefile +++ b/arch/arm/boot/dts/aspeed/Makefile @@ -22,6 +22,7 @@ dtb-$(CONFIG_ARCH_ASPEED) += \ aspeed-bmc-facebook-fuji.dtb \ aspeed-bmc-facebook-galaxy100.dtb \ aspeed-bmc-facebook-greatlakes.dtb \ + aspeed-bmc-facebook-harma.dtb \ aspeed-bmc-facebook-minerva-cmc.dtb \ aspeed-bmc-facebook-minipack.dtb \ aspeed-bmc-facebook-tiogapass.dtb \ diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-harma.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-harma.dts new file mode 100644 index 00000000000000..7db3f9eb00161a --- /dev/null +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-harma.dts @@ -0,0 +1,585 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright 2023 Facebook Inc. + +/dts-v1/; +#include "aspeed-g6.dtsi" +#include <dt-bindings/gpio/aspeed-gpio.h> +#include <dt-bindings/i2c/i2c.h> + +/ { + model = "Facebook Harma"; + compatible = "facebook,harma-bmc", "aspeed,ast2600"; + + aliases { + serial0 = &uart1; + serial1 = &uart6; + serial2 = &uart2; + serial4 = &uart5; + + i2c20 = &imux20; + i2c21 = &imux21; + i2c22 = &imux22; + i2c23 = &imux23; + i2c24 = &imux24; + i2c25 = &imux25; + i2c26 = &imux26; + i2c27 = &imux27; + i2c28 = &imux28; + i2c29 = &imux29; + i2c30 = &imux30; + i2c31 = &imux31; + }; + + chosen { + stdout-path = &uart5; + }; + + memory@80000000 { + device_type = "memory"; + reg = <0x80000000 0x80000000>; + }; + + iio-hwmon { + compatible = "iio-hwmon"; + io-channels = <&adc0 0>, <&adc0 1>, <&adc0 2>, <&adc0 3>, + <&adc0 4>, <&adc0 5>, <&adc0 6>, <&adc0 7>, + <&adc1 2>; + }; + + leds { + compatible = "gpio-leds"; + + led-0 { + label = "bmc_heartbeat_amber"; + gpios = <&gpio0 ASPEED_GPIO(P, 7) GPIO_ACTIVE_LOW>; + linux,default-trigger = "heartbeat"; + }; + + led-1 { + label = "fp_id_amber"; + default-state = "off"; + gpios = <&gpio0 13 GPIO_ACTIVE_HIGH>; + }; + + led-2 { + label = "power_blue"; + default-state = "off"; + gpios = <&gpio0 124 GPIO_ACTIVE_HIGH>; + }; + }; +}; + +// HOST BIOS Debug +&uart1 { + status = "okay"; +}; + +// SOL Host Console +&uart2 { + status = "okay"; + pinctrl-0 = <>; +}; + +// SOL BMC Console +&uart4 { + status = "okay"; + pinctrl-0 = <>; +}; + +// BMC Debug Console +&uart5 { + status = "okay"; +}; + +// MTIA +&uart6 { + status = "okay"; +}; + +&uart_routing { + status = "okay"; +}; + +&vuart1 { + status = "okay"; +}; + +&wdt1 { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_wdtrst1_default>; + aspeed,reset-type = "soc"; + aspeed,external-signal; + aspeed,ext-push-pull; + aspeed,ext-active-high; + aspeed,ext-pulse-duration = <256>; +}; + +&mac3 { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_rmii4_default>; + use-ncsi; + mlx,multi-host; +}; + +&rtc { + status = "okay"; +}; + +&fmc { + status = "okay"; + + flash@0 { + status = "okay"; + m25p,fast-read; + label = "bmc"; + spi-max-frequency = <50000000>; +#include "openbmc-flash-layout-128.dtsi" + }; + + flash@1 { + status = "okay"; + m25p,fast-read; + label = "alt-bmc"; + spi-max-frequency = <50000000>; + }; +}; + +// BIOS Flash +&spi2 { + status =
"okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_spi2_default>; + + flash@0 { + status = "okay"; + m25p,fast-read; + label = "pnor"; + spi-max-frequency = <12000000>; + spi-tx-bus-width = <2>; + spi-rx-bus-width = <2>; + }; +}; + +&kcs2 { + status = "okay"; + aspeed,lpc-io-reg = <0xca8>; +}; + +&kcs3 { + status = "okay"; + aspeed,lpc-io-reg = <0xca2>; +}; + +&i2c0 { + status = "okay"; + + max31790@30{ + compatible = "max31790"; + reg = <0x30>; + #address-cells = <1>; + #size-cells = <0>; + }; +}; + +&i2c1 { + status = "okay"; + + tmp75@4b { + compatible = "ti,tmp75"; + reg = <0x4b>; + }; +}; + +&i2c2 { + status = "okay"; + + max31790@30{ + compatible = "max31790"; + reg = <0x30>; + #address-cells = <1>; + #size-cells = <0>; + }; +}; + +&i2c3 { + status = "okay"; + + i2c-mux@70 { + compatible = "nxp,pca9543"; + reg = <0x70>; + #address-cells = <1>; + #size-cells = <0>; + + imux20: i2c@0 { + #address-cells = <1>; + #size-cells = <0>; + reg = <0>; + //Retimer Flash + eeprom@50 { + compatible = "atmel,24c2048"; + reg = <0x50>; + pagesize = <128>; + }; + }; + imux21: i2c@1 { + #address-cells = <1>; + #size-cells = <0>; + reg = <1>; + }; + }; +}; + +&i2c4 { + status = "okay"; + // PDB FRU + eeprom@52 { + compatible = "atmel,24c64"; + reg = <0x52>; + }; + + delta_brick@69 { + compatible = "pmbus"; + reg = <0x69>; + }; +}; + +&i2c5 { + status = "okay"; +}; + +&i2c6 { + status = "okay"; + + i2c-mux@70 { + compatible = "nxp,pca9543"; + reg = <0x70>; + #address-cells = <1>; + #size-cells = <0>; + + imux22: i2c@0 { + #address-cells = <1>; + #size-cells = <0>; + reg = <0>; + }; + imux23: i2c@1 { + #address-cells = <1>; + #size-cells = <0>; + reg = <1>; + }; + }; +}; + +&i2c7 { + status = "okay"; +}; + +&i2c8 { + status = "okay"; +}; + +&i2c9 { + status = "okay"; + + gpio@30 { + compatible = "nxp,pca9555"; + reg = <0x30>; + gpio-controller; + #gpio-cells = <2>; + }; + gpio@31 { + compatible = "nxp,pca9555"; + reg = <0x31>; + gpio-controller; + #gpio-cells = <2>; + }; + + i2c-mux@71 { + compatible = "nxp,pca9546"; + reg = <0x71>; + #address-cells = <1>; + #size-cells = <0>; + + imux24: i2c@0 { + #address-cells = <1>; + #size-cells = <0>; + reg = <0>; + }; + imux25: i2c@1 { + #address-cells = <1>; + #size-cells = <0>; + reg = <1>; + }; + imux26: i2c@2 { + #address-cells = <1>; + #size-cells = <0>; + reg = <2>; + }; + imux27: i2c@3 { + #address-cells = <1>; + #size-cells = <0>; + reg = <3>; + }; + }; + // PTTV FRU + eeprom@52 { + compatible = "atmel,24c64"; + reg = <0x52>; + }; +}; + +&i2c11 { + status = "okay"; +}; + +&i2c12 { + status = "okay"; +}; + +&i2c13 { + status = "okay"; + + i2c-mux@70 { + compatible = "nxp,pca9545"; + reg = <0x70>; + #address-cells = <1>; + #size-cells = <0>; + + imux28: i2c@0 { + #address-cells = <1>; + #size-cells = <0>; + reg = <0>; + }; + imux29: i2c@1 { + #address-cells = <1>; + #size-cells = <0>; + reg = <1>; + //MB FRU + eeprom@54 { + compatible = "atmel,24c64"; + reg = <0x54>; + }; + }; + imux30: i2c@2 { + #address-cells = <1>; + #size-cells = <0>; + reg = <2>; + }; + imux31: i2c@3 { + #address-cells = <1>; + #size-cells = <0>; + reg = <3>; + }; + }; +}; + +// To Debug card +&i2c14 { + status = "okay"; + multi-master; + + ipmb@10 { + compatible = "ipmb-dev"; + reg = <(0x10 | I2C_OWN_SLAVE_ADDRESS)>; + i2c-protocol; + }; +}; + +&i2c15 { + status = "okay"; + + // SCM FRU + eeprom@50 { + compatible = "atmel,24c64"; + reg = <0x50>; + }; + + // BSM FRU + eeprom@56 { + compatible = "atmel,24c64"; + reg = <0x56>; + }; +}; + +&adc0 { + aspeed,int-vref-microvolt = 
<2500000>; + status = "okay"; + pinctrl-0 = <&pinctrl_adc0_default &pinctrl_adc1_default + &pinctrl_adc2_default &pinctrl_adc3_default + &pinctrl_adc4_default &pinctrl_adc5_default + &pinctrl_adc6_default &pinctrl_adc7_default>; +}; + +&adc1 { + aspeed,int-vref-microvolt = <2500000>; + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_adc10_default>; +}; + +&ehci0 { + status = "okay"; +}; + +&gpio0 { + pinctrl-names = "default"; + gpio-line-names = + /*A0-A7*/ "","","","","","","","", + /*B0-B7*/ "","","","", + "bmc-spi-mux-select-0","led-identify","","", + /*C0-C7*/ "","","","","","","","", + /*D0-D7*/ "","","sol-uart-select","","","","","", + /*E0-E7*/ "","","","","","","","", + /*F0-F7*/ "","","","","","","","", + /*G0-G7*/ "","","","","","","","", + /*H0-H7*/ "","","","","","","","", + /*I0-I7*/ "","","","","","","","", + /*J0-J7*/ "","","","","","","","", + /*K0-K7*/ "","","","","","","","", + /*L0-L7*/ "","","","","","","","", + /*M0-M7*/ "","","","","","","","", + /*N0-N7*/ "led-postcode-0","led-postcode-1", + "led-postcode-2","led-postcode-3", + "led-postcode-4","led-postcode-5", + "led-postcode-6","led-postcode-7", + /*O0-O7*/ "","","","","","","","", + /*P0-P7*/ "power-button","power-host-control", + "reset-button","","led-power","","","", + /*Q0-Q7*/ "","","","","","","","", + /*R0-R7*/ "","","","","","","","", + /*S0-S7*/ "","","","","","","","", + /*T0-T7*/ "","","","","","","","", + /*U0-U7*/ "","","","","","","led-identify-gate","", + /*V0-V7*/ "","","","", + "rtc-battery-voltage-read-enable","","","", + /*W0-W7*/ "","","","","","","","", + /*X0-X7*/ "","","","","","","","", + /*Y0-Y7*/ "","","","","","","","", + /*Z0-Z7*/ "","","","","","","",""; +}; + +&sgpiom0 { + status = "okay"; + max-ngpios = <128>; + ngpios = <128>; + bus-frequency = <2000000>; + gpio-line-names = + /*in - out - in - out */ + /*A0-A3 line 0-7*/ + "presence-scm-cable","power-config-disable-e1s-0", + "","", + "","power-config-disable-e1s-1", + "","", + /*A4-A7 line 8-15*/ + "","power-config-asic-module-enable", + "","power-config-asic-power-good", + "","power-config-pdb-power-good", + "presence-cpu","smi-control-n", + /*B0-B3 line 16-23*/ + "","nmi-control-n", + "","nmi-control-sync-flood-n", + "","", + "","", + /*B4-B7 line 24-31*/ + "","FM_CPU_SP5R1", + "reset-cause-rsmrst","FM_CPU_SP5R2", + "","FM_CPU_SP5R3", + "","FM_CPU_SP5R4", + /*C0-C3 line 32-39*/ + "","FM_CPU0_SA0", + "","FM_CPU0_SA1", + "","rt-cpu0-p0-enable", + "","rt-cpu0-p1-enable", + /*C4-C7 line 40-47*/ + "","smb-rt-rom-p0-select", + "","smb-rt-rom-p1-select", + "","i3c-cpu-mux0-oe-n", + "","i3c-cpu-mux0-select", + /*D0-D3 line 48-55*/ + "","i3c-cpu-mux1-oe-n", + "","i3c-cpu-mux1-select", + "","reset-control-bmc", + "","reset-control-cpu0-p0-mux", + /*D4-D7 line 56-63*/ + "","reset-control-cpu0-p1-mux", + "","reset-control-e1s-mux", + "power-host-good","reset-control-mb-mux", + "","reset-control-smb-e1s", + /*E0-E3 line 64-71*/ + "","reset-control-smb-e1s", + "host-ready-n","reset-control-srst", + "presence-e1s-0","reset-control-usb-hub", + "","reset-control", + /*E4-E7 line 72-79*/ + "presence-e1s-1","reset-control-cpu-kbrst", + "","reset-control-platrst", + "","bmc-jtag-mux-select-0", + "","bmc-jtag-mux-select-1", + /*F0-F3 line 80-87*/ + "","bmc-jtag-select", + "","bmc-ready-n", + "","bmc-ready-sgpio", + "","rt-cpu0-p0-force-enable", + /*F4-F7 line 88-95*/ + "presence-asic-modules-0","rt-cpu0-p1-force-enable", + "presence-asic-modules-1","bios-debug-msg-disable", + "","uart-control-buffer-select", + 
"","ac-control-n", + /*G0-G3 line 96-103*/ + "FM_CPU_CORETYPE2","", + "FM_CPU_CORETYPE1","", + "FM_CPU_CORETYPE0","", + "FM_BOARD_REV_ID5","", + /*G4-G7 line 104-111*/ + "FM_BOARD_REV_ID4","", + "FM_BOARD_REV_ID3","", + "FM_BOARD_REV_ID2","", + "FM_BOARD_REV_ID1","", + /*H0-H3 line 112-119*/ + "FM_BOARD_REV_ID0","", + "","","","","","", + /*H4-H7 line 120-127*/ + "","", + "reset-control-pcie-expansion-3","", + "reset-control-pcie-expansion-2","", + "reset-control-pcie-expansion-1","", + /*I0-I3 line 128-135*/ + "reset-control-pcie-expansion-0","", + "FM_EXP_SLOT_ID1","", + "FM_EXP_SLOT_ID0","", + "","", + /*I4-I7 line 136-143*/ + "","","","","","","","", + /*J0-J3 line 144-151*/ + "","","","","","","","", + /*J4-J7 line 152-159*/ + "SLOT_ID_BCB_0","", + "SLOT_ID_BCB_1","", + "SLOT_ID_BCB_2","", + "SLOT_ID_BCB_3","", + /*K0-K3 line 160-167*/ + "","","","","","","","", + /*K4-K7 line 168-175*/ + "","","","","","","","", + /*L0-L3 line 176-183*/ + "","","","","","","","", + /*L4-L7 line 184-191*/ + "","","","","","","","", + /*M0-M3 line 192-199*/ + "","","","","","","","", + /*M4-M7 line 200-207*/ + "","","","","","","","", + /*N0-N3 line 208-215*/ + "","","","","","","","", + /*N4-N7 line 216-223*/ + "","","","","","","","", + /*O0-O3 line 224-231*/ + "","","","","","","","", + /*O4-O7 line 232-239*/ + "","","","","","","","", + /*P0-P3 line 240-247*/ + "","","","","","","","", + /*P4-P7 line 248-255*/ + "","","","","","","",""; +}; From 965a8ea59ec55483fb9311036ef928d16770a63a Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:51:50 +0800 Subject: [PATCH 0019/1406] ARM: dts: aspeed: minerva: Revise the name of DTS The project Minerva which is the platform used by Meta has two boards: the Chassis Management Module (Minerva) and the Motherboard (Harma), so change the DTS name to minerva here for CMM use. 
Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-2-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- arch/arm/boot/dts/aspeed/Makefile | 2 +- ...facebook-minerva-cmc.dts => aspeed-bmc-facebook-minerva.dts} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename arch/arm/boot/dts/aspeed/{aspeed-bmc-facebook-minerva-cmc.dts => aspeed-bmc-facebook-minerva.dts} (99%) diff --git a/arch/arm/boot/dts/aspeed/Makefile b/arch/arm/boot/dts/aspeed/Makefile index 6ecc21d04a6299..75fff585675a0b 100644 --- a/arch/arm/boot/dts/aspeed/Makefile +++ b/arch/arm/boot/dts/aspeed/Makefile @@ -23,7 +23,7 @@ dtb-$(CONFIG_ARCH_ASPEED) += \ aspeed-bmc-facebook-galaxy100.dtb \ aspeed-bmc-facebook-greatlakes.dtb \ aspeed-bmc-facebook-harma.dtb \ - aspeed-bmc-facebook-minerva-cmc.dtb \ + aspeed-bmc-facebook-minerva.dtb \ aspeed-bmc-facebook-minipack.dtb \ aspeed-bmc-facebook-tiogapass.dtb \ diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva-cmc.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts similarity index 99% rename from arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva-cmc.dts rename to arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index f04ef906352080..c755fb3258a485 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva-cmc.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -7,7 +7,7 @@ #include / { - model = "Facebook Minerva CMC"; + model = "Facebook Minerva CMM"; compatible = "facebook,minerva-cmc", "aspeed,ast2600"; aliases { From bbdcf72f21fd0e1cc118ad304a867fa863ce2a21 Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:51:51 +0800 Subject: [PATCH 0020/1406] ARM: dts: aspeed: minerva: Modify mac3 setting Remove the unused settings and fix the link speed to 100 Mbps. Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-3-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index c755fb3258a485..9979dba1ef0e4a 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -48,10 +48,13 @@ &mac3 { status = "okay"; + phy-mode = "rmii"; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_rmii4_default>; - use-ncsi; - mlx,multi-host; + fixed-link { + speed = <100>; + full-duplex; + }; }; &fmc { From f15468aa4cdf3fe6568555d0f3e6df477f8b616d Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:51:52 +0800 Subject: [PATCH 0021/1406] ARM: dts: aspeed: minerva: Change sgpio use Correct the SGPIO node in use from sgpiom1 to sgpiom0. Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-4-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index 9979dba1ef0e4a..ad77057f921cd1 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -78,7 +78,7 @@ status = "okay"; }; -&sgpiom1 { +&sgpiom0 { status =
"okay"; ngpios = <128>; bus-frequency = <2000000>; From aca2d2f36bf73b6f129d7626b60ddc1f58e91f6c Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:51:53 +0800 Subject: [PATCH 0022/1406] ARM: dts: aspeed: minerva: Enable power monitor device Enable power monitor device ina230 and ltc2945 on the i2c bus 0 Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-5-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- .../aspeed/aspeed-bmc-facebook-minerva.dts | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index ad77057f921cd1..ee9691647e4a50 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -86,6 +86,28 @@ &i2c0 { status = "okay"; + + power-monitor@40 { + compatible = "ti,ina230"; + reg = <0x40>; + shunt-resistor = <1000>; + }; + + power-monitor@41 { + compatible = "ti,ina230"; + reg = <0x41>; + shunt-resistor = <1000>; + }; + + power-monitor@67 { + compatible = "adi,ltc2945"; + reg = <0x67>; + }; + + power-monitor@68 { + compatible = "adi,ltc2945"; + reg = <0x68>; + }; }; &i2c1 { From 10f776c80b1a5d2834430c850457a1d10f12a9ce Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:51:54 +0800 Subject: [PATCH 0023/1406] ARM: dts: aspeed: minerva: Add temperature sensor Add one temperature sensor on i2c bus 1 Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-6-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index ee9691647e4a50..783d4d5a8f3d7e 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -115,7 +115,12 @@ temperature-sensor@4b { compatible = "ti,tmp75"; - reg = <0x4B>; + reg = <0x4b>; + }; + + temperature-sensor@48 { + compatible = "ti,tmp75"; + reg = <0x48>; }; eeprom@51 { From 96b198848ecd07165f58d93e195ca63244c8dbd4 Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:51:55 +0800 Subject: [PATCH 0024/1406] ARM: dts: aspeed: minerva: correct the address of eeprom Correct the address from 0x51 to 0x54 of eeprom on the i2c bus 1 Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-7-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index 783d4d5a8f3d7e..f2a48033ac5ce7 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -123,9 +123,9 @@ reg = <0x48>; }; - eeprom@51 { + eeprom@54 { compatible = "atmel,24c128"; - reg = <0x51>; + reg = <0x54>; }; }; From 0a40f5979a40e6fe43eaede7d3244dfde86653ee Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:51:56 +0800 Subject: [PATCH 0025/1406] ARM: dts: aspeed: minerva: add bus labels and aliases Add bus labels and aliases for the fan control board. 
Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-8-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- .../aspeed/aspeed-bmc-facebook-minerva.dts | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index f2a48033ac5ce7..f4cb5ef72310f9 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -12,6 +12,16 @@ aliases { serial5 = &uart5; + /* + * PCA9548 (2-0077) provides 8 channels connecting to + * 6 pcs of FCB (Fan Controller Board). + */ + i2c16 = &imux16; + i2c17 = &imux17; + i2c18 = &imux18; + i2c19 = &imux19; + i2c20 = &imux20; + i2c21 = &imux21; }; chosen { @@ -139,7 +149,7 @@ #size-cells = <0>; i2c-mux-idle-disconnect; - i2c@0 { + imux16: i2c@0 { #address-cells = <1>; #size-cells = <0>; reg = <0>; @@ -150,7 +160,7 @@ }; }; - i2c@1 { + imux17: i2c@1 { #address-cells = <1>; #size-cells = <0>; reg = <1>; @@ -161,7 +171,7 @@ }; }; - i2c@2 { + imux18: i2c@2 { #address-cells = <1>; #size-cells = <0>; reg = <2>; @@ -172,7 +182,7 @@ }; }; - i2c@3 { + imux19: i2c@3 { #address-cells = <1>; #size-cells = <0>; reg = <3>; @@ -183,7 +193,7 @@ }; }; - i2c@4 { + imux20: i2c@4 { #address-cells = <1>; #size-cells = <0>; reg = <4>; @@ -194,7 +204,7 @@ }; }; - i2c@5 { + imux21: i2c@5 { #address-cells = <1>; #size-cells = <0>; reg = <5>; From 165a1f2db3dd5cf7e7c6b65b0c0750412be75d5d Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:51:57 +0800 Subject: [PATCH 0026/1406] ARM: dts: aspeed: minerva: add fan rpm controller Add the max31790 fan RPM controller on all FCB buses.
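Once these devices probe, each fan tachometer is exposed through hwmon. A hedged sketch of reading one back (the hwmon index and path are illustrative, since they depend on probe order):

	#include <stdio.h>

	int main(void)
	{
		/* one max31790 instance; the hwmonN index varies per boot */
		FILE *f = fopen("/sys/class/hwmon/hwmon3/fan1_input", "r");
		int rpm = -1;

		if (!f)
			return 1;
		if (fscanf(f, "%d", &rpm) != 1)
			rpm = -1;
		fclose(f);
		printf("FCB fan1: %d RPM\n", rpm);
		return 0;
	}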
Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-9-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- .../aspeed/aspeed-bmc-facebook-minerva.dts | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index f4cb5ef72310f9..c7445c819baf8e 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -158,6 +158,13 @@ compatible = "atmel,24c128"; reg = <0x50>; }; + + pwm@5e{ + compatible = "max31790"; + reg = <0x5e>; + #address-cells = <1>; + #size-cells = <0>; + }; }; imux17: i2c@1 { @@ -169,6 +176,13 @@ compatible = "atmel,24c128"; reg = <0x50>; }; + + pwm@5e{ + compatible = "max31790"; + reg = <0x5e>; + #address-cells = <1>; + #size-cells = <0>; + }; }; imux18: i2c@2 { @@ -180,6 +194,13 @@ compatible = "atmel,24c128"; reg = <0x50>; }; + + pwm@5e{ + compatible = "max31790"; + reg = <0x5e>; + #address-cells = <1>; + #size-cells = <0>; + }; }; imux19: i2c@3 { @@ -191,6 +212,13 @@ compatible = "atmel,24c128"; reg = <0x50>; }; + + pwm@5e{ + compatible = "max31790"; + reg = <0x5e>; + #address-cells = <1>; + #size-cells = <0>; + }; }; imux20: i2c@4 { @@ -202,6 +230,13 @@ compatible = "atmel,24c128"; reg = <0x50>; }; + + pwm@5e{ + compatible = "max31790"; + reg = <0x5e>; + #address-cells = <1>; + #size-cells = <0>; + }; }; imux21: i2c@5 { @@ -213,6 +248,13 @@ compatible = "atmel,24c128"; reg = <0x50>; }; + + pwm@5e{ + compatible = "max31790"; + reg = <0x5e>; + #address-cells = <1>; + #size-cells = <0>; + }; }; }; }; From f5dac195b500520257bc608f22937b928c2d0145 Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:51:58 +0800 Subject: [PATCH 0027/1406] ARM: dts: aspeed: minerva: Add led-fan-fault gpio Add the led-fan-fault GPIO pin on the PCA9555 on i2c bus 0. Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-10-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- .../dts/aspeed/aspeed-bmc-facebook-minerva.dts | 17 +++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index c7445c819baf8e..090fe2f6b1d897 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -39,6 +39,16 @@ <&adc0 4>, <&adc0 5>, <&adc0 6>, <&adc0 7>, <&adc1 2>; }; + + leds { + compatible = "gpio-leds"; + + led-fan-fault { + label = "led-fan-fault"; + gpios = <&leds_gpio 9 GPIO_ACTIVE_HIGH>; + default-state = "off"; + }; + }; }; &uart6 { @@ -118,6 +128,13 @@ compatible = "adi,ltc2945"; reg = <0x68>; }; + + leds_gpio: gpio@19 { + compatible = "nxp,pca9555"; + reg = <0x19>; + gpio-controller; + #gpio-cells = <2>; + }; }; &i2c1 { From 7a7ed4a02a945c3fa8689c67711ed1aa34aa2415 Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:51:59 +0800 Subject: [PATCH 0028/1406] ARM: dts: aspeed: minerva: add gpio line name Add GPIO line names so that the platform's functions can refer to the pins by meaningful names.
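The benefit of naming lines is that consumers can look pins up by function rather than by hard-coded offset. A hedged sketch using the libgpiod v1 userspace API (the chip path is an assumption; BLADE_UART_SEL0 is one of the names the diff below assigns):

	#include <gpiod.h>
	#include <stdio.h>

	int main(void)
	{
		struct gpiod_chip *chip = gpiod_chip_open("/dev/gpiochip0");
		struct gpiod_line *line;

		if (!chip)
			return 1;
		/* resolve the pin by the name this patch assigns */
		line = gpiod_chip_find_line(chip, "BLADE_UART_SEL0");
		if (!line || gpiod_line_request_input(line, "demo") < 0)
			return 1;
		printf("BLADE_UART_SEL0 = %d\n", gpiod_line_get_value(line));
		gpiod_chip_close(chip);
		return 0;
	}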
Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-11-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- .../aspeed/aspeed-bmc-facebook-minerva.dts | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index 090fe2f6b1d897..31197183cc59e5 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -362,3 +362,33 @@ &uhci { status = "okay"; }; + +&gpio0 { + gpio-line-names = + /*A0-A7*/ "","","","","","","","", + /*B0-B7*/ "","","","","","","","", + /*C0-C7*/ "","","","","BLADE_UART_SEL2","","","", + /*D0-D7*/ "","","","","","","","", + /*E0-E7*/ "","","","","","","","", + /*F0-F7*/ "","","","","","","","", + /*G0-G7*/ "","","","","","","","", + /*H0-H7*/ "","","","","","","","", + /*I0-I7*/ "","","","","","","","", + /*J0-J7*/ "","","","","","","","", + /*K0-K7*/ "","","","","","","","", + /*L0-L7*/ "","","","","BLADE_UART_SEL0","","","", + /*M0-M7*/ "","","","","","BLADE_UART_SEL1","","", + /*N0-N7*/ "","","","","","","","", + /*O0-O7*/ "","","","","","","","", + /*P0-P7*/ "","","","","","","","", + /*Q0-Q7*/ "","","","","","","","", + /*R0-R7*/ "","","","","","","","", + /*S0-S7*/ "","","","","","","","", + /*T0-T7*/ "","","","","","","","", + /*U0-U7*/ "","","","","","","","", + /*V0-V7*/ "","","","","BAT_DETECT","","","", + /*W0-W7*/ "","","","","","","","", + /*X0-X7*/ "","","BLADE_UART_SEL3","","","","","", + /*Y0-Y7*/ "","","","","","","","", + /*Z0-Z7*/ "","","","","","","",""; +}; From e60f7a99d3789b5d0b24d3c0571b013309e56815 Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:52:00 +0800 Subject: [PATCH 0029/1406] ARM: dts: aspeed: minerva: add sgpio line name Add SGPIO line names so that the platform's functions can refer to the signals by meaningful names.
Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-12-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- .../aspeed/aspeed-bmc-facebook-minerva.dts | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index 31197183cc59e5..942e53d5c71409 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -392,3 +392,152 @@ /*Y0-Y7*/ "","","","","","","","", /*Z0-Z7*/ "","","","","","","",""; }; + +&sgpiom0 { + gpio-line-names = + /*"input pin","output pin"*/ + /*A0 - A7*/ + "PRSNT_MTIA_BLADE0_N","PWREN_MTIA_BLADE0_EN", + "PRSNT_MTIA_BLADE1_N","PWREN_MTIA_BLADE1_EN", + "PRSNT_MTIA_BLADE2_N","PWREN_MTIA_BLADE2_EN", + "PRSNT_MTIA_BLADE3_N","PWREN_MTIA_BLADE3_EN", + "PRSNT_MTIA_BLADE4_N","PWREN_MTIA_BLADE4_EN", + "PRSNT_MTIA_BLADE5_N","PWREN_MTIA_BLADE5_EN", + "PRSNT_MTIA_BLADE6_N","PWREN_MTIA_BLADE6_EN", + "PRSNT_MTIA_BLADE7_N","PWREN_MTIA_BLADE7_EN", + /*B0 - B7*/ + "PRSNT_MTIA_BLADE8_N","PWREN_MTIA_BLADE8_EN", + "PRSNT_MTIA_BLADE9_N","PWREN_MTIA_BLADE9_EN", + "PRSNT_MTIA_BLADE10_N","PWREN_MTIA_BLADE10_EN", + "PRSNT_MTIA_BLADE11_N","PWREN_MTIA_BLADE11_EN", + "PRSNT_MTIA_BLADE12_N","PWREN_MTIA_BLADE12_EN", + "PRSNT_MTIA_BLADE13_N","PWREN_MTIA_BLADE13_EN", + "PRSNT_MTIA_BLADE14_N","PWREN_MTIA_BLADE14_EN", + "PRSNT_MTIA_BLADE15_N","PWREN_MTIA_BLADE15_EN", + /*C0 - C7*/ + "PRSNT_NW_BLADE0_N","PWREN_NW_BLADE0_EN", + "PRSNT_NW_BLADE1_N","PWREN_NW_BLADE1_EN", + "PRSNT_NW_BLADE2_N","PWREN_NW_BLADE2_EN", + "PRSNT_NW_BLADE3_N","PWREN_NW_BLADE3_EN", + "PRSNT_NW_BLADE4_N","PWREN_NW_BLADE4_EN", + "PRSNT_NW_BLADE5_N","PWREN_NW_BLADE5_EN", + "PRSNT_FCB_TOP_0_N","PWREN_MTIA_BLADE0_HSC_EN", + "PRSNT_FCB_TOP_1_N","PWREN_MTIA_BLADE1_HSC_EN", + /*D0 - D7*/ + "PRSNT_FCB_MIDDLE_0_N","PWREN_MTIA_BLADE2_HSC_EN", + "PRSNT_FCB_MIDDLE_1_N","PWREN_MTIA_BLADE3_HSC_EN", + "PRSNT_FCB_BOTTOM_0_N","PWREN_MTIA_BLADE4_HSC_EN", + "PRSNT_FCB_BOTTOM_1_N","PWREN_MTIA_BLADE5_HSC_EN", + "PWRGD_MTIA_BLADE0_PWROK_L_BUF","PWREN_MTIA_BLADE6_HSC_EN", + "PWRGD_MTIA_BLADE1_PWROK_L_BUF","PWREN_MTIA_BLADE7_HSC_EN", + "PWRGD_MTIA_BLADE2_PWROK_L_BUF","PWREN_MTIA_BLADE8_HSC_EN", + "PWRGD_MTIA_BLADE3_PWROK_L_BUF","PWREN_MTIA_BLADE9_HSC_EN", + /*E0 - E7*/ + "PWRGD_MTIA_BLADE4_PWROK_L_BUF","PWREN_MTIA_BLADE10_HSC_EN", + "PWRGD_MTIA_BLADE5_PWROK_L_BUF","PWREN_MTIA_BLADE11_HSC_EN", + "PWRGD_MTIA_BLADE6_PWROK_L_BUF","PWREN_MTIA_BLADE12_HSC_EN", + "PWRGD_MTIA_BLADE7_PWROK_L_BUF","PWREN_MTIA_BLADE13_HSC_EN", + "PWRGD_MTIA_BLADE8_PWROK_L_BUF","PWREN_MTIA_BLADE14_HSC_EN", + "PWRGD_MTIA_BLADE9_PWROK_L_BUF","PWREN_MTIA_BLADE15_HSC_EN", + "PWRGD_MTIA_BLADE10_PWROK_L_BUF","PWREN_NW_BLADE0_HSC_EN", + "PWRGD_MTIA_BLADE11_PWROK_L_BUF","PWREN_NW_BLADE1_HSC_EN", + /*F0 - F7*/ + "PWRGD_MTIA_BLADE12_PWROK_L_BUF","PWREN_NW_BLADE2_HSC_EN", + "PWRGD_MTIA_BLADE13_PWROK_L_BUF","PWREN_NW_BLADE3_HSC_EN", + "PWRGD_MTIA_BLADE14_PWROK_L_BUF","PWREN_NW_BLADE4_HSC_EN", + "PWRGD_MTIA_BLADE15_PWROK_L_BUF","PWREN_NW_BLADE5_HSC_EN", + "PWRGD_NW_BLADE0_PWROK_L_BUF","PWREN_FCB_TOP_L_EN", + "PWRGD_NW_BLADE1_PWROK_L_BUF","PWREN_FCB_TOP_R_EN", + "PWRGD_NW_BLADE2_PWROK_L_BUF","PWREN_FCB_MIDDLE_L_EN", + "PWRGD_NW_BLADE3_PWROK_L_BUF","PWREN_FCB_MIDDLE_R_EN", + /*G0 - G7*/ + "PWRGD_NW_BLADE4_PWROK_L_BUF","PWREN_FCB_BOTTOM_L_EN", + "PWRGD_NW_BLADE5_PWROK_L_BUF","PWREN_FCB_BOTTOM_R_EN", + 
"PWRGD_FCB_TOP_0_PWROK_L_BUF","FM_CMM_AC_CYCLE_N", + "PWRGD_FCB_TOP_1_PWROK_L_BUF","MGMT_SFP_TX_DIS", + "PWRGD_FCB_MIDDLE_0_PWROK_L_BUF","", + "PWRGD_FCB_MIDDLE_1_PWROK_L_BUF","RST_I2CRST_MTIA_BLADE0_1_N", + "PWRGD_FCB_BOTTOM_0_PWROK_L_BUF","RST_I2CRST_MTIA_BLADE2_3_N", + "PWRGD_FCB_BOTTOM_1_PWROK_L_BUF","RST_I2CRST_MTIA_BLADE4_5_N", + /*H0 - H7*/ + "LEAK_DETECT_MTIA_BLADE0_N_BUF","RST_I2CRST_MTIA_BLADE6_7_N", + "LEAK_DETECT_MTIA_BLADE1_N_BUF","RST_I2CRST_MTIA_BLADE8_9_N", + "LEAK_DETECT_MTIA_BLADE2_N_BUF","RST_I2CRST_MTIA_BLADE10_11_N", + "LEAK_DETECT_MTIA_BLADE3_N_BUF","RST_I2CRST_MTIA_BLADE12_13_N", + "LEAK_DETECT_MTIA_BLADE4_N_BUF","RST_I2CRST_MTIA_BLADE14_15_N", + "LEAK_DETECT_MTIA_BLADE5_N_BUF","RST_I2CRST_NW_BLADE0_1_2_N", + "LEAK_DETECT_MTIA_BLADE6_N_BUF","RST_I2CRST_NW_BLADE3_4_5_N", + "LEAK_DETECT_MTIA_BLADE7_N_BUF","RST_I2CRST_FCB_N", + /*I0 - I7*/ + "LEAK_DETECT_MTIA_BLADE8_N_BUF","RST_I2CRST_FCB_B_L_N", + "LEAK_DETECT_MTIA_BLADE9_N_BUF","RST_I2CRST_FCB_B_R_N", + "LEAK_DETECT_MTIA_BLADE10_N_BUF","RST_I2CRST_FCB_M_L_N", + "LEAK_DETECT_MTIA_BLADE11_N_BUF","RST_I2CRST_FCB_M_R_N", + "LEAK_DETECT_MTIA_BLADE12_N_BUF","RST_I2CRST_FCB_T_L_N", + "LEAK_DETECT_MTIA_BLADE13_N_BUF","RST_I2CRST_FCB_T_R_N", + "LEAK_DETECT_MTIA_BLADE14_N_BUF","BMC_READY", + "LEAK_DETECT_MTIA_BLADE15_N_BUF","wFM_88E6393X_BIN_UPDATE_EN_N", + /*J0 - J7*/ + "LEAK_DETECT_NW_BLADE0_N_BUF","WATER_VALVE_CLOSED_N", + "LEAK_DETECT_NW_BLADE1_N_BUF","", + "LEAK_DETECT_NW_BLADE2_N_BUF","", + "LEAK_DETECT_NW_BLADE3_N_BUF","", + "LEAK_DETECT_NW_BLADE4_N_BUF","", + "LEAK_DETECT_NW_BLADE5_N_BUF","", + "MTIA_BLADE0_STATUS_LED","", + "MTIA_BLADE1_STATUS_LED","", + /*K0 - K7*/ + "MTIA_BLADE2_STATUS_LED","", + "MTIA_BLADE3_STATUS_LED","", + "MTIA_BLADE4_STATUS_LED","", + "MTIA_BLADE5_STATUS_LED","", + "MTIA_BLADE6_STATUS_LED","", + "MTIA_BLADE7_STATUS_LED","", + "MTIA_BLADE8_STATUS_LED","", + "MTIA_BLADE9_STATUS_LED","", + /*L0 - L7*/ + "MTIA_BLADE10_STATUS_LED","", + "MTIA_BLADE11_STATUS_LED","", + "MTIA_BLADE12_STATUS_LED","", + "MTIA_BLADE13_STATUS_LED","", + "MTIA_BLADE14_STATUS_LED","", + "MTIA_BLADE15_STATUS_LED","", + "NW_BLADE0_STATUS_LED","", + "NW_BLADE1_STATUS_LED","", + /*M0 - M7*/ + "NW_BLADE2_STATUS_LED","", + "NW_BLADE3_STATUS_LED","", + "NW_BLADE4_STATUS_LED","", + "NW_BLADE5_STATUS_LED","", + "RPU_READY","", + "IT_GEAR_RPU_LINK_N","", + "IT_GEAR_LEAK","", + "WATER_VALVE_CLOSED_N","", + /*N0 - N7*/ + "VALVE_STS0","", + "VALVE_STS1","", + "VALVE_STS2","", + "VALVE_STS3","", + "CR_TOGGLE_BOOT_BUF_N","", + "CMM_LC_RDY_LED_N","", + "CMM_LC_UNRDY_LED_N","", + "CMM_CABLE_CARTRIDGE_PRSNT_BOT_N","", + /*O0 - O7*/ + "CMM_CABLE_CARTRIDGE_PRSNT_TOP_N","", + "BOT_BCB_CABLE_PRSNT_N","", + "TOP_BCB_CABLE_PRSNT_N","", + "CHASSIS0_LEAK_Q_N","", + "CHASSIS1_LEAK_Q_N","", + "LEAK0_DETECT","", + "LEAK1_DETECT","", + "MGMT_SFP_PRSNT_N","", + /*P0 - P7*/ + "MGMT_SFP_TX_FAULT","", + "MGMT_SFP_RX_LOS","", + "","", + "","", + "","", + "","", + "","", + "",""; +}; From ec084e4ec314df8041882e22975fca2caff3bcce Mon Sep 17 00:00:00 2001 From: Ninad Palsule Date: Tue, 10 Oct 2023 15:43:47 -0500 Subject: [PATCH 0030/1406] fsi: sbefifo: Bump up user write cmd length This commit increases user write limit for command length from 1MB to 4MB. This is required to support images larger than 1MB. As per 'commit 15e2a7218c27 ("fsi: sbefifo: Bump max command length")' the alternate solution is to break image into 1MB pieces by cronous server that means kernel driver needs to provide way to send end of message command once all pieces are transferred. 
This would require restructuring both the kernel driver and the Cronus server (application). Hence, this commit bumps up the command length instead, to reduce the code impact. Testing: loaded a 3 MB image through the Cronus server. Signed-off-by: Ninad Palsule Reviewed-by: Eddie James Link: https://lore.kernel.org/r/20231010204348.2600242-2-ninad@linux.ibm.com Signed-off-by: Joel Stanley --- drivers/fsi/fsi-sbefifo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/fsi/fsi-sbefifo.c b/drivers/fsi/fsi-sbefifo.c index 0a98517f395918..c47426b9bfc6bf 100644 --- a/drivers/fsi/fsi-sbefifo.c +++ b/drivers/fsi/fsi-sbefifo.c @@ -113,7 +113,7 @@ enum sbe_state #define SBEFIFO_TIMEOUT_IN_RSP 1000 /* Other constants */ -#define SBEFIFO_MAX_USER_CMD_LEN (0x100000 + PAGE_SIZE) +#define SBEFIFO_MAX_USER_CMD_LEN (0x400000 + PAGE_SIZE) #define SBEFIFO_RESET_MAGIC 0x52534554 /* "RSET" */ struct sbefifo { From f7236a0c919eca31c5def62bf52aa8aabcc6effb Mon Sep 17 00:00:00 2001 From: Ninad Palsule Date: Tue, 10 Oct 2023 15:43:48 -0500 Subject: [PATCH 0031/1406] fsi: sbefifo: Handle pending write command If a previous write command is still pending, free it first. As per the current kernel driver design, the write operation prepares a buffer for the FSI write; the actual FSI write is performed on the next read operation. There is a possibility of a memory leak if a buggy application sends two back-to-back writes or two parallel writes. Signed-off-by: Ninad Palsule Reviewed-by: Eddie James Link: https://lore.kernel.org/r/20231010204348.2600242-3-ninad@linux.ibm.com Signed-off-by: Joel Stanley --- drivers/fsi/fsi-sbefifo.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/fsi/fsi-sbefifo.c b/drivers/fsi/fsi-sbefifo.c index c47426b9bfc6bf..0385476bfb03ae 100644 --- a/drivers/fsi/fsi-sbefifo.c +++ b/drivers/fsi/fsi-sbefifo.c @@ -882,6 +882,13 @@ static ssize_t sbefifo_user_write(struct file *file, const char __user *buf, mutex_lock(&user->file_lock); + /* If previous write command is still pending then free it. It is safe + * to do that because read cannot be in progress since we hold the + * lock. + */ + if (user->pending_cmd) + sbefifo_release_command(user); + /* Can we use the pre-allocate buffer ? If not, allocate */ if (len <= PAGE_SIZE) user->pending_cmd = user->cmd_page; From c5eeb63edac9497f9a0d46d3b75cf8b293771ecf Mon Sep 17 00:00:00 2001 From: Lakshmi Yadlapati Date: Wed, 13 Dec 2023 18:07:44 -0600 Subject: [PATCH 0032/1406] fsi: Fix panic on scom file read Reading the scom file without the custom open method (i2cr_scom_open) causes a kernel panic. This change replaces simple_open with i2cr_scom_open to properly initialize the private_data field in the file structure, preventing the panic during scom file operations.
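For context, simple_open() only copies inode->i_private into file->private_data, and nothing sets i_private for this character device, so reads dereference a NULL pointer. A hedged sketch of the failing shape (illustrative only; do_scom_read() stands in for the real register access and is not the driver's actual code):

	/* with .open = simple_open, private_data stays NULL for this cdev */
	static ssize_t scom_read_sketch(struct file *filep, char __user *buf,
					size_t len, loff_t *offset)
	{
		struct i2cr_scom *scom = filep->private_data;	/* NULL here */

		return do_scom_read(scom, buf, len, offset);	/* NULL deref -> panic */
	}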
Fixes: c0b34bed0bbf ("fsi: Add I2C Responder SCOM driver") Signed-off-by: Lakshmi Yadlapati Reviewed-by: Ninad Palsule Link: https://lore.kernel.org/r/20231214000744.1281464-1-lakshmiy@us.ibm.com Signed-off-by: Joel Stanley --- drivers/fsi/i2cr-scom.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/fsi/i2cr-scom.c b/drivers/fsi/i2cr-scom.c index cb7e02213032cc..8d65c562b488f5 100644 --- a/drivers/fsi/i2cr-scom.c +++ b/drivers/fsi/i2cr-scom.c @@ -73,9 +73,18 @@ static ssize_t i2cr_scom_write(struct file *filep, const char __user *buf, size_ return len; } +static int i2cr_scom_open(struct inode *inode, struct file *file) +{ + struct i2cr_scom *scom = container_of(inode->i_cdev, struct i2cr_scom, cdev); + + file->private_data = scom; + + return 0; +} + static const struct file_operations i2cr_scom_fops = { .owner = THIS_MODULE, - .open = simple_open, + .open = i2cr_scom_open, .llseek = i2cr_scom_llseek, .read = i2cr_scom_read, .write = i2cr_scom_write, From f7b487648986ecc0510996c5c638c61f5f811ccc Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:15 -0800 Subject: [PATCH 0033/1406] lib/find: add atomic find_bit() primitives Add helpers around test_and_{set,clear}_bit() that allow searching for clear or set bits and flipping them atomically. The target patterns may look like this: for (idx = 0; idx < nbits; idx++) if (test_and_clear_bit(idx, bitmap)) do_something(idx); Or like this: do { bit = find_first_bit(bitmap, nbits); if (bit >= nbits) return nbits; } while (!test_and_clear_bit(bit, bitmap)); return bit; In both cases, the opencoded loop may be converted to a single function or iterator call. Correspondingly: for_each_test_and_clear_bit(idx, bitmap, nbits) do_something(idx); Or: return find_and_clear_bit(bitmap, nbits); Obviously, the less routine code people have to write themselves, the lower the probability of making a mistake. Those are not only handy helpers but also resolve a non-trivial issue of using non-atomic find_bit() together with atomic test_and_{set,clear}_bit(). The trick is that find_bit() implies that the bitmap is a regular non-volatile piece of memory, and the compiler is allowed to use optimization techniques such as re-fetching memory instead of caching it. For example, find_first_bit() is implemented like this: for (idx = 0; idx * BITS_PER_LONG < sz; idx++) { val = addr[idx]; if (val) { sz = min(idx * BITS_PER_LONG + __ffs(val), sz); break; } } On register-memory architectures, like x86, the compiler may decide to access memory twice - first to compare against 0, and second to fetch its value to pass it to __ffs(). When running find_first_bit() on volatile memory, the memory may get changed in between; for instance, this may lead to passing 0 to __ffs(), which is undefined. This makes the call potentially dangerous. find_and_clear_bit(), as a wrapper around test_and_clear_bit(), naturally treats the underlying bitmap as volatile memory and prevents the compiler from such optimizations. KCSAN now catches exactly this type of situation and warns about concurrent memory modifications. We can use it to reveal improper usage of find_bit() and convert it to atomic find_and_*_bit() as appropriate. In some cases concurrent operations with plain find_bit() are acceptable.
For example: - two threads running find_*_bit(): safe wrt ffs(0) and returns correct value, because underlying bitmap is unchanged; - find_next_bit() in parallel with set or clear_bit(), when modifying a bit prior to the start bit to search: safe and correct; - find_first_bit() in parallel with set_bit(): safe, but may return wrong bit number; - find_first_zero_bit() in parallel with clear_bit(): same as above. In last 2 cases find_bit() may not return a correct bit number, but it may be OK if caller requires any (not exactly the first) set or clear bit, correspondingly. In such cases, KCSAN may be safely silenced with data_race(). But in most cases where KCSAN detects concurrency people should carefully review their code and likely protect critical sections or switch to atomic find_and_bit(), as appropriate. The 1st patch of the series adds the following atomic primitives: find_and_set_bit(addr, nbits); find_and_set_next_bit(addr, nbits, start); ... Here find_and_{set,clear} part refers to the corresponding test_and_{set,clear}_bit function. Suffixes like _wrap or _lock derive their semantics from corresponding find() or test() functions. For brevity, the naming omits the fact that we search for zero bit in find_and_set, and correspondingly search for set bit in find_and_clear functions. The patch also adds iterators with atomic semantics, like for_each_test_and_set_bit(). Here, the naming rule is to simply prefix corresponding atomic operation with 'for_each'. CC: Bart Van Assche CC: Sergey Shtylyov Signed-off-by: Yury Norov --- include/linux/find.h | 293 +++++++++++++++++++++++++++++++++++++++++++ lib/find_bit.c | 85 +++++++++++++ 2 files changed, 378 insertions(+) diff --git a/include/linux/find.h b/include/linux/find.h index c69598e383c161..50eeeed5d8a34b 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -32,6 +32,16 @@ extern unsigned long _find_first_and_bit(const unsigned long *addr1, extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); +unsigned long _find_and_set_bit(volatile unsigned long *addr, unsigned long nbits); +unsigned long _find_and_set_next_bit(volatile unsigned long *addr, unsigned long nbits, + unsigned long start); +unsigned long _find_and_set_bit_lock(volatile unsigned long *addr, unsigned long nbits); +unsigned long _find_and_set_next_bit_lock(volatile unsigned long *addr, unsigned long nbits, + unsigned long start); +unsigned long _find_and_clear_bit(volatile unsigned long *addr, unsigned long nbits); +unsigned long _find_and_clear_next_bit(volatile unsigned long *addr, unsigned long nbits, + unsigned long start); + #ifdef __BIG_ENDIAN unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size); unsigned long _find_next_zero_bit_le(const unsigned long *addr, unsigned @@ -460,6 +470,267 @@ unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size, return bit < start ? bit : size; } +/** + * find_and_set_bit - Find a zero bit and set it atomically + * @addr: The address to base the search on + * @nbits: The bitmap size in bits + * + * This function is designed to operate in concurrent access environment. + * + * Because of concurrency and volatile nature of underlying bitmap, it's not + * guaranteed that the found bit is the 1st bit in the bitmap. It's also not + * guaranteed that if @nbits is returned, the bitmap is empty. 
+ * + * The function does guarantee that if returned value is in range [0 .. @nbits), + * the acquired bit belongs to the caller exclusively. + * + * Returns: found and set bit, or @nbits if no bits found + */ +static inline +unsigned long find_and_set_bit(volatile unsigned long *addr, unsigned long nbits) +{ + if (small_const_nbits(nbits)) { + unsigned long val, ret; + + do { + val = *addr | ~GENMASK(nbits - 1, 0); + if (val == ~0UL) + return nbits; + ret = ffz(val); + } while (test_and_set_bit(ret, addr)); + + return ret; + } + + return _find_and_set_bit(addr, nbits); +} + + +/** + * find_and_set_next_bit - Find a zero bit and set it, starting from @offset + * @addr: The address to base the search on + * @nbits: The bitmap nbits in bits + * @offset: The bitnumber to start searching at + * + * This function is designed to operate in concurrent access environment. + * + * Because of concurrency and volatile nature of underlying bitmap, it's not + * guaranteed that the found bit is the 1st bit in the bitmap, starting from @offset. + * It's also not guaranteed that if @nbits is returned, the bitmap is empty. + * + * The function does guarantee that if returned value is in range [@offset .. @nbits), + * the acquired bit belongs to the caller exclusively. + * + * Returns: found and set bit, or @nbits if no bits found + */ +static inline +unsigned long find_and_set_next_bit(volatile unsigned long *addr, + unsigned long nbits, unsigned long offset) +{ + if (small_const_nbits(nbits)) { + unsigned long val, ret; + + do { + val = *addr | ~GENMASK(nbits - 1, offset); + if (val == ~0UL) + return nbits; + ret = ffz(val); + } while (test_and_set_bit(ret, addr)); + + return ret; + } + + return _find_and_set_next_bit(addr, nbits, offset); +} + +/** + * find_and_set_bit_wrap - find and set bit starting at @offset, wrapping around zero + * @addr: The first address to base the search on + * @nbits: The bitmap size in bits + * @offset: The bitnumber to start searching at + * + * Returns: the bit number for the next clear bit, or first clear bit up to @offset, + * while atomically setting it. If no bits are found, returns @nbits. + */ +static inline +unsigned long find_and_set_bit_wrap(volatile unsigned long *addr, + unsigned long nbits, unsigned long offset) +{ + unsigned long bit = find_and_set_next_bit(addr, nbits, offset); + + if (bit < nbits || offset == 0) + return bit; + + bit = find_and_set_bit(addr, offset); + return bit < offset ? bit : nbits; +} + +/** + * find_and_set_bit_lock - find a zero bit, then set it atomically with lock + * @addr: The address to base the search on + * @nbits: The bitmap nbits in bits + * + * This function is designed to operate in concurrent access environment. + * + * Because of concurrency and volatile nature of underlying bitmap, it's not + * guaranteed that the found bit is the 1st bit in the bitmap. It's also not + * guaranteed that if @nbits is returned, the bitmap is empty. + * + * The function does guarantee that if returned value is in range [0 .. @nbits), + * the acquired bit belongs to the caller exclusively. 
+ * + * Returns: found and set bit, or @nbits if no bits found + */ +static inline +unsigned long find_and_set_bit_lock(volatile unsigned long *addr, unsigned long nbits) +{ + if (small_const_nbits(nbits)) { + unsigned long val, ret; + + do { + val = *addr | ~GENMASK(nbits - 1, 0); + if (val == ~0UL) + return nbits; + ret = ffz(val); + } while (test_and_set_bit_lock(ret, addr)); + + return ret; + } + + return _find_and_set_bit_lock(addr, nbits); +} + +/** + * find_and_set_next_bit_lock - find a zero bit and set it atomically with lock + * @addr: The address to base the search on + * @nbits: The bitmap size in bits + * @offset: The bitnumber to start searching at + * + * This function is designed to operate in concurrent access environment. + * + * Because of concurrency and volatile nature of underlying bitmap, it's not + * guaranteed that the found bit is the 1st bit in the range. It's also not + * guaranteed that if @nbits is returned, the bitmap is empty. + * + * The function does guarantee that if returned value is in range [@offset .. @nbits), + * the acquired bit belongs to the caller exclusively. + * + * Returns: found and set bit, or @nbits if no bits found + */ +static inline +unsigned long find_and_set_next_bit_lock(volatile unsigned long *addr, + unsigned long nbits, unsigned long offset) +{ + if (small_const_nbits(nbits)) { + unsigned long val, ret; + + do { + val = *addr | ~GENMASK(nbits - 1, offset); + if (val == ~0UL) + return nbits; + ret = ffz(val); + } while (test_and_set_bit_lock(ret, addr)); + + return ret; + } + + return _find_and_set_next_bit_lock(addr, nbits, offset); +} + +/** + * find_and_set_bit_wrap_lock - find zero bit starting at @ofset and set it + * with lock, and wrap around zero if nothing found + * @addr: The first address to base the search on + * @nbits: The bitmap size in bits + * @offset: The bitnumber to start searching at + * + * Returns: the bit number for the next set bit, or first set bit up to @offset + * If no bits are set, returns @nbits. + */ +static inline +unsigned long find_and_set_bit_wrap_lock(volatile unsigned long *addr, + unsigned long nbits, unsigned long offset) +{ + unsigned long bit = find_and_set_next_bit_lock(addr, nbits, offset); + + if (bit < nbits || offset == 0) + return bit; + + bit = find_and_set_bit_lock(addr, offset); + return bit < offset ? bit : nbits; +} + +/** + * find_and_clear_bit - Find a set bit and clear it atomically + * @addr: The address to base the search on + * @nbits: The bitmap nbits in bits + * + * This function is designed to operate in concurrent access environment. + * + * Because of concurrency and volatile nature of underlying bitmap, it's not + * guaranteed that the found bit is the 1st bit in the bitmap. It's also not + * guaranteed that if @nbits is returned, the bitmap is empty. + * + * The function does guarantee that if returned value is in range [0 .. @nbits), + * the acquired bit belongs to the caller exclusively. 
+ * + * Returns: found and cleared bit, or @nbits if no bits found + */ +static inline unsigned long find_and_clear_bit(volatile unsigned long *addr, unsigned long nbits) +{ + if (small_const_nbits(nbits)) { + unsigned long val, ret; + + do { + val = *addr & GENMASK(nbits - 1, 0); + if (val == 0) + return nbits; + ret = __ffs(val); + } while (!test_and_clear_bit(ret, addr)); + + return ret; + } + + return _find_and_clear_bit(addr, nbits); +} + +/** + * find_and_clear_next_bit - Find a set bit next after @offset, and clear it atomically + * @addr: The address to base the search on + * @nbits: The bitmap nbits in bits + * @offset: bit offset at which to start searching + * + * This function is designed to operate in concurrent access environment. + * + * Because of concurrency and volatile nature of underlying bitmap, it's not + * guaranteed that the found bit is the 1st bit in the range It's also not + * guaranteed that if @nbits is returned, there's no set bits after @offset. + * + * The function does guarantee that if returned value is in range [@offset .. @nbits), + * the acquired bit belongs to the caller exclusively. + * + * Returns: found and cleared bit, or @nbits if no bits found + */ +static inline +unsigned long find_and_clear_next_bit(volatile unsigned long *addr, + unsigned long nbits, unsigned long offset) +{ + if (small_const_nbits(nbits)) { + unsigned long val, ret; + + do { + val = *addr & GENMASK(nbits - 1, offset); + if (val == 0) + return nbits; + ret = __ffs(val); + } while (!test_and_clear_bit(ret, addr)); + + return ret; + } + + return _find_and_clear_next_bit(addr, nbits, offset); +} + /** * find_next_clump8 - find next 8-bit clump with set bits in a memory region * @clump: location to store copy of found clump @@ -577,6 +848,28 @@ unsigned long find_next_bit_le(const void *addr, unsigned #define for_each_set_bit_from(bit, addr, size) \ for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) +/* same as for_each_set_bit() but atomically clears each found bit */ +#define for_each_test_and_clear_bit(bit, addr, size) \ + for ((bit) = 0; \ + (bit) = find_and_clear_next_bit((addr), (size), (bit)), (bit) < (size); \ + (bit)++) + +/* same as for_each_set_bit_from() but atomically clears each found bit */ +#define for_each_test_and_clear_bit_from(bit, addr, size) \ + for (; (bit) = find_and_clear_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) + +/* same as for_each_clear_bit() but atomically sets each found bit */ +#define for_each_test_and_set_bit(bit, addr, size) \ + for ((bit) = 0; \ + (bit) = find_and_set_next_bit((addr), (size), (bit)), (bit) < (size); \ + (bit)++) + +/* same as for_each_clear_bit_from() but atomically clears each found bit */ +#define for_each_test_and_set_bit_from(bit, addr, size) \ + for (; \ + (bit) = find_and_set_next_bit((addr), (size), (bit)), (bit) < (size); \ + (bit)++) + #define for_each_clear_bit(bit, addr, size) \ for ((bit) = 0; \ (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); \ diff --git a/lib/find_bit.c b/lib/find_bit.c index 32f99e9a670e64..c9b6b9f966108f 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -116,6 +116,91 @@ unsigned long _find_first_and_bit(const unsigned long *addr1, EXPORT_SYMBOL(_find_first_and_bit); #endif +unsigned long _find_and_set_bit(volatile unsigned long *addr, unsigned long nbits) +{ + unsigned long bit; + + do { + bit = FIND_FIRST_BIT(~addr[idx], /* nop */, nbits); + if (bit >= nbits) + return nbits; + } while (test_and_set_bit(bit, addr)); + + return bit; 
+} +EXPORT_SYMBOL(_find_and_set_bit); + +unsigned long _find_and_set_next_bit(volatile unsigned long *addr, + unsigned long nbits, unsigned long start) +{ + unsigned long bit; + + do { + bit = FIND_NEXT_BIT(~addr[idx], /* nop */, nbits, start); + if (bit >= nbits) + return nbits; + } while (test_and_set_bit(bit, addr)); + + return bit; +} +EXPORT_SYMBOL(_find_and_set_next_bit); + +unsigned long _find_and_set_bit_lock(volatile unsigned long *addr, unsigned long nbits) +{ + unsigned long bit; + + do { + bit = FIND_FIRST_BIT(~addr[idx], /* nop */, nbits); + if (bit >= nbits) + return nbits; + } while (test_and_set_bit_lock(bit, addr)); + + return bit; +} +EXPORT_SYMBOL(_find_and_set_bit_lock); + +unsigned long _find_and_set_next_bit_lock(volatile unsigned long *addr, + unsigned long nbits, unsigned long start) +{ + unsigned long bit; + + do { + bit = FIND_NEXT_BIT(~addr[idx], /* nop */, nbits, start); + if (bit >= nbits) + return nbits; + } while (test_and_set_bit_lock(bit, addr)); + + return bit; +} +EXPORT_SYMBOL(_find_and_set_next_bit_lock); + +unsigned long _find_and_clear_bit(volatile unsigned long *addr, unsigned long nbits) +{ + unsigned long bit; + + do { + bit = FIND_FIRST_BIT(addr[idx], /* nop */, nbits); + if (bit >= nbits) + return nbits; + } while (!test_and_clear_bit(bit, addr)); + + return bit; +} +EXPORT_SYMBOL(_find_and_clear_bit); + +unsigned long _find_and_clear_next_bit(volatile unsigned long *addr, + unsigned long nbits, unsigned long start) +{ + do { + start = FIND_NEXT_BIT(addr[idx], /* nop */, nbits, start); + if (start >= nbits) + return nbits; + } while (!test_and_clear_bit(start, addr)); + + return start; +} +EXPORT_SYMBOL(_find_and_clear_next_bit); + #ifndef find_first_zero_bit /* * Find the first cleared bit in a memory region. From 9297e20670743257f6c3fe7ebd6be5802b1dc8c7 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:16 -0800 Subject: [PATCH 0034/1406] lib/find: add test for atomic find_bit() ops Add basic functionality test for new API. 
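Beyond the selftest, the intended consumer pattern is compact. A hedged usage sketch of the new iterator on a shared pending-work mask (pending and process_unit() are illustrative names, not kernel API):

	static DECLARE_BITMAP(pending, 64);

	/* producers mark work with set_bit(idx, pending) from any context */

	static void drain_pending(void)
	{
		unsigned long idx;

		/*
		 * Each set bit is claimed atomically, so a bit set concurrently
		 * is either handled in this pass or left for the next one,
		 * but never handled twice.
		 */
		for_each_test_and_clear_bit(idx, pending, 64)
			process_unit(idx);
	}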
Signed-off-by: Yury Norov --- lib/test_bitmap.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index 65f22c2578b066..277e1ca9fd2847 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -221,6 +221,65 @@ static void __init test_zero_clear(void) expect_eq_pbl("", bmap, 1024); } +static void __init test_find_and_bit(void) +{ + unsigned long w, w_part, bit, cnt = 0; + DECLARE_BITMAP(bmap, EXP1_IN_BITS); + + /* + * Test find_and_clear{_next}_bit() and corresponding + * iterators + */ + bitmap_copy(bmap, exp1, EXP1_IN_BITS); + w = bitmap_weight(bmap, EXP1_IN_BITS); + + for_each_test_and_clear_bit(bit, bmap, EXP1_IN_BITS) + cnt++; + + expect_eq_uint(w, cnt); + expect_eq_uint(0, bitmap_weight(bmap, EXP1_IN_BITS)); + + bitmap_copy(bmap, exp1, EXP1_IN_BITS); + w = bitmap_weight(bmap, EXP1_IN_BITS); + w_part = bitmap_weight(bmap, EXP1_IN_BITS / 3); + + cnt = 0; + bit = EXP1_IN_BITS / 3; + for_each_test_and_clear_bit_from(bit, bmap, EXP1_IN_BITS) + cnt++; + + expect_eq_uint(bitmap_weight(bmap, EXP1_IN_BITS), bitmap_weight(bmap, EXP1_IN_BITS / 3)); + expect_eq_uint(w_part, bitmap_weight(bmap, EXP1_IN_BITS)); + expect_eq_uint(w - w_part, cnt); + + /* + * Test find_and_set{_next}_bit() and corresponding + * iterators + */ + bitmap_copy(bmap, exp1, EXP1_IN_BITS); + w = bitmap_weight(bmap, EXP1_IN_BITS); + cnt = 0; + + for_each_test_and_set_bit(bit, bmap, EXP1_IN_BITS) + cnt++; + + expect_eq_uint(EXP1_IN_BITS - w, cnt); + expect_eq_uint(EXP1_IN_BITS, bitmap_weight(bmap, EXP1_IN_BITS)); + + bitmap_copy(bmap, exp1, EXP1_IN_BITS); + w = bitmap_weight(bmap, EXP1_IN_BITS); + w_part = bitmap_weight(bmap, EXP1_IN_BITS / 3); + cnt = 0; + + bit = EXP1_IN_BITS / 3; + for_each_test_and_set_bit_from(bit, bmap, EXP1_IN_BITS) + cnt++; + + expect_eq_uint(EXP1_IN_BITS - bitmap_weight(bmap, EXP1_IN_BITS), + EXP1_IN_BITS / 3 - bitmap_weight(bmap, EXP1_IN_BITS / 3)); + expect_eq_uint(EXP1_IN_BITS * 2 / 3 - (w - w_part), cnt); +} + static void __init test_find_nth_bit(void) { unsigned long b, bit, cnt = 0; @@ -1273,6 +1332,8 @@ static void __init selftest(void) test_for_each_clear_bitrange_from(); test_for_each_set_clump8(); test_for_each_set_bit_wrap(); + + test_find_and_bit(); } KSTM_MODULE_LOADERS(test_bitmap); From 0af7b0df61f906c57568b8730df70058820a7613 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:17 -0800 Subject: [PATCH 0035/1406] lib/sbitmap; optimize __sbitmap_get_word() by using find_and_set_bit() __sbitmap_get_word() opencodes either find_and_set_bit_wrap(), or find_and_set_next_bit() depending on wrap parameter. Simplify it by using atomic find_bit() API. While here, simplify sbitmap_find_bit_in_word(), which calls it. 
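The wrap semantics being relied on can be checked against the helper added earlier in this series; a hedged standalone sketch (the values are illustrative):

	static void check_wrap_semantics(void)
	{
		DECLARE_BITMAP(map, 8);

		bitmap_zero(map, 8);
		bitmap_set(map, 0, 3);		/* bits 0-2 busy */
		/* scans [5..8) first and claims the free bit 5 */
		WARN_ON(find_and_set_bit_wrap_lock(map, 8, 5) != 5);
		bitmap_set(map, 3, 5);		/* map is now full */
		/* wraps through [0..5) too; nothing free, so nbits (8) is returned */
		WARN_ON(find_and_set_bit_wrap_lock(map, 8, 5) != 8);
	}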
CC: Jens Axboe Signed-off-by: Yury Norov Reviewed-by: Jan Kara --- lib/sbitmap.c | 46 +++++++++------------------------------------- 1 file changed, 9 insertions(+), 37 deletions(-) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index d0a5081dfd122e..8ecd830ba9e896 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -133,38 +133,13 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth) } EXPORT_SYMBOL_GPL(sbitmap_resize); -static int __sbitmap_get_word(unsigned long *word, unsigned long depth, +static inline int __sbitmap_get_word(unsigned long *word, unsigned long depth, unsigned int hint, bool wrap) { - int nr; - - /* don't wrap if starting from 0 */ - wrap = wrap && hint; - - while (1) { - nr = find_next_zero_bit(word, depth, hint); - if (unlikely(nr >= depth)) { - /* - * We started with an offset, and we didn't reset the - * offset to 0 in a failure case, so start from 0 to - * exhaust the map. - */ - if (hint && wrap) { - hint = 0; - continue; - } - return -1; - } + if (wrap) + return find_and_set_bit_wrap_lock(word, depth, hint); - if (!test_and_set_bit_lock(nr, word)) - break; - - hint = nr + 1; - if (hint >= depth - 1) - hint = 0; - } - - return nr; + return find_and_set_next_bit_lock(word, depth, hint); } static int sbitmap_find_bit_in_word(struct sbitmap_word *map, @@ -175,15 +150,12 @@ static int sbitmap_find_bit_in_word(struct sbitmap_word *map, int nr; do { - nr = __sbitmap_get_word(&map->word, depth, - alloc_hint, wrap); - if (nr != -1) - break; - if (!sbitmap_deferred_clear(map)) - break; - } while (1); + nr = __sbitmap_get_word(&map->word, depth, alloc_hint, wrap); + if (nr < depth) + return nr; + } while (sbitmap_deferred_clear(map)); - return nr; + return -1; } static int sbitmap_find_bit(struct sbitmap *sb, From fc3bdc592a724edeb6546f39f168067571007d7a Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:18 -0800 Subject: [PATCH 0036/1406] watch_queue: optimize post_one_notification() by using find_and_clear_bit() post_one_notification() searches for a set bit in wqueue->notes_bitmap, and after some housekeeping work clears it, firing a BUG() if someone else cleared the bit in-between. We can allocate the bit atomically with an atomic find_and_clear_bit(), and remove the BUG() possibility entirely. Signed-off-by: Yury Norov --- kernel/watch_queue.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 778b4056700ff5..07edd4a2b4636b 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -112,7 +112,7 @@ static bool post_one_notification(struct watch_queue *wqueue, if (pipe_full(head, tail, pipe->ring_size)) goto lost; - note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes); + note = find_and_clear_bit(wqueue->notes_bitmap, wqueue->nr_notes); if (note >= wqueue->nr_notes) goto lost; @@ -133,10 +133,6 @@ static bool post_one_notification(struct watch_queue *wqueue, buf->flags = PIPE_BUF_FLAG_WHOLE; smp_store_release(&pipe->head, head + 1); /* vs pipe_read() */ - if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { - spin_unlock_irq(&pipe->rd_wait.lock); - BUG(); - } wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); done = true; From cd6c08c6647d524d71f51428b78fe72590d42c16 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:19 -0800 Subject: [PATCH 0037/1406] sched: add cpumask_find_and_set() and use it in __mm_cid_get() __mm_cid_get() uses __mm_cid_try_get() helper to atomically acquire a bit in mm cid mask. 
Now that we have atomic find_and_set_bit(), we can easily extend it to cpumasks and use it in the scheduler code. cpumask_find_and_set() considers the cid mask as a volatile region of memory, as it actually is in this case. So, if it's changed while the search is in progress, KCSAN won't fire a warning on it. CC: Peter Zijlstra Signed-off-by: Yury Norov Reviewed-by: Mathieu Desnoyers --- include/linux/cpumask.h | 12 ++++++++++++ kernel/sched/sched.h | 14 +++++--------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index cfb545841a2c74..c2acced8be4ece 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -271,6 +271,18 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p, small_cpumask_bits, n + 1); } +/** + * cpumask_find_and_set - find the first unset cpu in a cpumask and + * set it atomically + * @srcp: the cpumask pointer + * + * Return: >= nr_cpu_ids if nothing is found. + */ +static inline unsigned int cpumask_find_and_set(volatile struct cpumask *srcp) +{ + return find_and_set_bit(cpumask_bits(srcp), small_cpumask_bits); +} + /** * for_each_cpu - iterate over every cpu in a mask * @cpu: the (optionally unsigned) integer iterator diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2e5a95486a4222..2ce9112de89be1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3347,23 +3347,19 @@ static inline void mm_cid_put(struct mm_struct *mm) static inline int __mm_cid_try_get(struct mm_struct *mm) { - struct cpumask *cpumask; - int cid; + struct cpumask *cpumask = mm_cidmask(mm); + int cid = nr_cpu_ids; - cpumask = mm_cidmask(mm); /* * Retry finding first zero bit if the mask is temporarily * filled. This only happens during concurrent remote-clear * which owns a cid without holding a rq lock. */ - for (;;) { - cid = cpumask_first_zero(cpumask); - if (cid < nr_cpu_ids) - break; + while (cid >= nr_cpu_ids) { + cid = cpumask_find_and_set(cpumask); cpu_relax(); } - if (cpumask_test_and_set_cpu(cid, cpumask)) - return -1; + return cid; } From 991411e2febc2f472d7ace069bb371d1bf2f6df4 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:20 -0800 Subject: [PATCH 0038/1406] mips: sgi-ip30: optimize heart_alloc_int() by using find_and_set_bit() heart_alloc_int() opencodes find_and_set_bit(). Simplify it by using the dedicated function, and make a nice one-liner. Signed-off-by: Yury Norov --- arch/mips/sgi-ip30/ip30-irq.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/mips/sgi-ip30/ip30-irq.c b/arch/mips/sgi-ip30/ip30-irq.c index 423c32cb66ed52..3c4d4e947817fb 100644 --- a/arch/mips/sgi-ip30/ip30-irq.c +++ b/arch/mips/sgi-ip30/ip30-irq.c @@ -28,17 +28,9 @@ static DEFINE_PER_CPU(unsigned long, irq_enable_mask); static inline int heart_alloc_int(void) { - int bit; + int bit = find_and_set_bit(heart_irq_map, HEART_NUM_IRQS); -again: - bit = find_first_zero_bit(heart_irq_map, HEART_NUM_IRQS); - if (bit >= HEART_NUM_IRQS) - return -ENOSPC; - - if (test_and_set_bit(bit, heart_irq_map)) - goto again; - - return bit; + return bit < HEART_NUM_IRQS ? bit : -ENOSPC; } static void ip30_error_irq(struct irq_desc *desc) From 448a89c116ca108606670621012cc5424016410f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:21 -0800 Subject: [PATCH 0039/1406] sparc: optimize alloc_msi() by using find_and_set_bit() alloc_msi() opencodes find_and_set_bit(). Simplify it by using the dedicated function, and make a nice one-liner.
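For reference, the before/after shape shared by these conversions, as a sketch with placeholder 'map' and 'nbits' (not lifted from any one driver):

	/* open-coded pattern being removed: find, set atomically, retry on race */
	do {
		bit = find_first_zero_bit(map, nbits);
		if (bit >= nbits)
			return -ENOSPC;
	} while (test_and_set_bit(bit, map));
	return bit;

	/* equivalent with the new atomic search */
	bit = find_and_set_bit(map, nbits);
	return bit < nbits ? bit : -ENOSPC;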
Signed-off-by: Yury Norov --- arch/sparc/kernel/pci_msi.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/sparc/kernel/pci_msi.c b/arch/sparc/kernel/pci_msi.c index fc7402948b7bc0..91105c788d1d9d 100644 --- a/arch/sparc/kernel/pci_msi.c +++ b/arch/sparc/kernel/pci_msi.c @@ -96,14 +96,9 @@ static u32 pick_msiq(struct pci_pbm_info *pbm) static int alloc_msi(struct pci_pbm_info *pbm) { - int i; - - for (i = 0; i < pbm->msi_num; i++) { - if (!test_and_set_bit(i, pbm->msi_bitmap)) - return i + pbm->msi_first; - } + int i = find_and_set_bit(pbm->msi_bitmap, pbm->msi_num); - return -ENOENT; + return i < pbm->msi_num ? i + pbm->msi_first : -ENOENT; } static void free_msi(struct pci_pbm_info *pbm, int msi_num) From e905d8a7d76b2b45f70ff0c8584d2ce99ee4c387 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:22 -0800 Subject: [PATCH 0040/1406] perf/arm: use atomic find_bit() API Simplify the subsystem by using the atomic find_bit() or iterator API where applicable. CC: Will Deacon Signed-off-by: Yury Norov --- drivers/perf/arm-cci.c | 24 ++++++------------------ drivers/perf/arm-ccn.c | 10 ++-------- drivers/perf/arm_dmc620_pmu.c | 9 ++------- drivers/perf/arm_pmuv3.c | 8 ++------ 4 files changed, 12 insertions(+), 39 deletions(-) diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c index 61de861eaf91e3..cb15b4cee5f75e 100644 --- a/drivers/perf/arm-cci.c +++ b/drivers/perf/arm-cci.c @@ -320,12 +320,9 @@ static int cci400_get_event_idx(struct cci_pmu *cci_pmu, return CCI400_PMU_CYCLE_CNTR_IDX; } - for (idx = CCI400_PMU_CNTR0_IDX; idx <= CCI_PMU_CNTR_LAST(cci_pmu); ++idx) if (!test_and_set_bit(idx, hw->used_mask)) return idx; - - /* No counters available */ - return -EAGAIN; + idx = find_and_set_next_bit(hw->used_mask, CCI_PMU_CNTR_LAST(cci_pmu) + 1, + CCI400_PMU_CNTR0_IDX); + return idx < CCI_PMU_CNTR_LAST(cci_pmu) + 1 ? idx : -EAGAIN; } static int cci400_validate_hw_event(struct cci_pmu *cci_pmu, unsigned long hw_event) @@ -802,13 +799,8 @@ static int pmu_get_event_idx(struct cci_pmu_hw_events *hw, struct perf_event *ev if (cci_pmu->model->get_event_idx) return cci_pmu->model->get_event_idx(cci_pmu, hw, cci_event); - /* Generic code to find an unused idx from the mask */ - for (idx = 0; idx <= CCI_PMU_CNTR_LAST(cci_pmu); idx++) if (!test_and_set_bit(idx, hw->used_mask)) return idx; - - /* No counters available */ - return -EAGAIN; + idx = find_and_set_bit(hw->used_mask, CCI_PMU_CNTR_LAST(cci_pmu) + 1); + return idx < CCI_PMU_CNTR_LAST(cci_pmu) + 1 ? idx : -EAGAIN; } static int pmu_map_event(struct perf_event *event) @@ -861,12 +853,8 @@ static void pmu_free_irq(struct cci_pmu *cci_pmu) { int i; - for (i = 0; i < cci_pmu->nr_irqs; i++) { - if (!test_and_clear_bit(i, &cci_pmu->active_irqs)) - continue; - + for_each_test_and_clear_bit(i, &cci_pmu->active_irqs, cci_pmu->nr_irqs) free_irq(cci_pmu->irqs[i], cci_pmu); - } } static u32 pmu_read_counter(struct perf_event *event) diff --git a/drivers/perf/arm-ccn.c b/drivers/perf/arm-ccn.c index 728d13d8e98ac9..d657701b1f236c 100644 --- a/drivers/perf/arm-ccn.c +++ b/drivers/perf/arm-ccn.c @@ -589,15 +589,9 @@ static const struct attribute_group *arm_ccn_pmu_attr_groups[] = { static int arm_ccn_pmu_alloc_bit(unsigned long *bitmap, unsigned long size) { - int bit; - - do { - bit = find_first_zero_bit(bitmap, size); - if (bit >= size) - return -EAGAIN; - } while (test_and_set_bit(bit, bitmap)); + int bit = find_and_set_bit(bitmap, size); - return bit; + return bit < size ?
bit : -EAGAIN; } /* All RN-I and RN-D nodes have identical PMUs */ diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c index 30cea685957470..e41c84dabc3ebe 100644 --- a/drivers/perf/arm_dmc620_pmu.c +++ b/drivers/perf/arm_dmc620_pmu.c @@ -303,13 +303,8 @@ static int dmc620_get_event_idx(struct perf_event *event) end_idx = DMC620_PMU_MAX_COUNTERS; } - for (idx = start_idx; idx < end_idx; ++idx) { - if (!test_and_set_bit(idx, dmc620_pmu->used_mask)) - return idx; - } - - /* The counters are all in use. */ - return -EAGAIN; + idx = find_and_set_next_bit(dmc620_pmu->used_mask, end_idx, start_idx); + return idx < end_idx ? idx : -EAGAIN; } static inline diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 6ca7be05229c10..f046ad9e71f1aa 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -825,13 +825,9 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu) static int armv8pmu_get_single_idx(struct pmu_hw_events *cpuc, struct arm_pmu *cpu_pmu) { - int idx; + int idx = find_and_set_next_bit(cpuc->used_mask, cpu_pmu->num_events, ARMV8_IDX_COUNTER0); - for (idx = ARMV8_IDX_COUNTER0; idx < cpu_pmu->num_events; idx++) { - if (!test_and_set_bit(idx, cpuc->used_mask)) - return idx; - } - return -EAGAIN; + return idx < cpu_pmu->num_events ? idx : -EAGAIN; } static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc, From f5f61c6f9f2771aeabdb796302a378e50080d6bf Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:23 -0800 Subject: [PATCH 0041/1406] drivers/perf: optimize ali_drw_get_counter_idx() by using find_and_set_bit() The function searches used_mask for a free bit in a for-loop, bit by bit. Simplify it by using atomic find_and_set_bit(). Signed-off-by: Yury Norov Acked-by: Will Deacon --- drivers/perf/alibaba_uncore_drw_pmu.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/perf/alibaba_uncore_drw_pmu.c b/drivers/perf/alibaba_uncore_drw_pmu.c index 19d459a36be55c..2a3b7701d568bd 100644 --- a/drivers/perf/alibaba_uncore_drw_pmu.c +++ b/drivers/perf/alibaba_uncore_drw_pmu.c @@ -274,15 +274,9 @@ static const struct attribute_group *ali_drw_pmu_attr_groups[] = { static int ali_drw_get_counter_idx(struct perf_event *event) { struct ali_drw_pmu *drw_pmu = to_ali_drw_pmu(event->pmu); - int idx; + int idx = find_and_set_bit(drw_pmu->used_mask, ALI_DRW_PMU_COMMON_MAX_COUNTERS); - for (idx = 0; idx < ALI_DRW_PMU_COMMON_MAX_COUNTERS; ++idx) { - if (!test_and_set_bit(idx, drw_pmu->used_mask)) - return idx; - } - - /* The counters are all in use. */ - return -EBUSY; + return idx < ALI_DRW_PMU_COMMON_MAX_COUNTERS ? idx : -EBUSY; } static u64 ali_drw_pmu_read_counter(struct perf_event *event) From 37cd1b38270a3eb1555b533ad597252f5bfd9ecc Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:24 -0800 Subject: [PATCH 0042/1406] dmaengine: idxd: optimize perfmon_assign_event() The function searches used_mask for a free bit in a for-loop, bit by bit. Simplify it by using atomic find_and_set_bit(), and make a nice one-liner.
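The _next variants used above take a start offset, which replaces loops beginning at a non-zero counter index. A sketch with placeholder names:

	/* search [start, max) atomically; returns >= max if nothing is free */
	idx = find_and_set_next_bit(used_mask, max, start);
	if (idx >= max)
		return -EAGAIN;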
Signed-off-by: Yury Norov Reviewed-by: Dave Jiang Acked-by: Vinod Koul Reviewed-by: Fenghua Yu --- drivers/dma/idxd/perfmon.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/dma/idxd/perfmon.c b/drivers/dma/idxd/perfmon.c index fdda6d60426295..4dd9c0d979c388 100644 --- a/drivers/dma/idxd/perfmon.c +++ b/drivers/dma/idxd/perfmon.c @@ -134,13 +134,9 @@ static void perfmon_assign_hw_event(struct idxd_pmu *idxd_pmu, static int perfmon_assign_event(struct idxd_pmu *idxd_pmu, struct perf_event *event) { - int i; - - for (i = 0; i < IDXD_PMU_EVENT_MAX; i++) - if (!test_and_set_bit(i, idxd_pmu->used_mask)) - return i; + int i = find_and_set_bit(idxd_pmu->used_mask, IDXD_PMU_EVENT_MAX); - return -EINVAL; + return i < IDXD_PMU_EVENT_MAX ? i : -EINVAL; } /* From 10922d08df496bc8e0963ce810936f30ad56c81c Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:25 -0800 Subject: [PATCH 0043/1406] ath10k: optimize ath10k_snoc_napi_poll() ath10k_snoc_napi_poll() traverses pending_ce_irqs bitmap bit by bit. Simplify it by using for_each_test_and_clear_bit() iterator. Signed-off-by: Yury Norov --- drivers/net/wireless/ath/ath10k/snoc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/snoc.c b/drivers/net/wireless/ath/ath10k/snoc.c index 2c39bad7ebfb9a..a1db5a973780c4 100644 --- a/drivers/net/wireless/ath/ath10k/snoc.c +++ b/drivers/net/wireless/ath/ath10k/snoc.c @@ -1237,11 +1237,10 @@ static int ath10k_snoc_napi_poll(struct napi_struct *ctx, int budget) return done; } - for (ce_id = 0; ce_id < CE_COUNT; ce_id++) - if (test_and_clear_bit(ce_id, ar_snoc->pending_ce_irqs)) { - ath10k_ce_per_engine_service(ar, ce_id); - ath10k_ce_enable_interrupt(ar, ce_id); - } + for_each_test_and_clear_bit(ce_id, ar_snoc->pending_ce_irqs, CE_COUNT) { + ath10k_ce_per_engine_service(ar, ce_id); + ath10k_ce_enable_interrupt(ar, ce_id); + } done = ath10k_htt_txrx_compl_task(ar, budget); From f55f49707defc7939df686487535ac5702de9a67 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:26 -0800 Subject: [PATCH 0044/1406] wifi: rtw88: optimize the driver by using atomic iterator rtw_pci_tx_kick_off() and rtw89_pci_tx_kick_off_pending() traverse bitmaps bit by bit. Simplify it by using atomic for_each_test_and_clear_bit() iterator. 
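The iterator reads as follows (sketch, placeholder names): each pass atomically tests and clears one bit, and the body runs only for bits that were actually set, so already-clear bits cost nothing.

	unsigned long pending;		/* bitmap of queues needing a kick */
	unsigned int q;

	for_each_test_and_clear_bit(q, &pending, MAX_QUEUES)
		kick_queue(q);		/* placeholder handler */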
Signed-off-by: Yury Norov --- drivers/net/wireless/realtek/rtw88/pci.c | 5 ++--- drivers/net/wireless/realtek/rtw89/pci.c | 5 +---- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/pci.c b/drivers/net/wireless/realtek/rtw88/pci.c index 2bfc0e822b8d0b..a0d69c75a38185 100644 --- a/drivers/net/wireless/realtek/rtw88/pci.c +++ b/drivers/net/wireless/realtek/rtw88/pci.c @@ -789,9 +789,8 @@ static void rtw_pci_tx_kick_off(struct rtw_dev *rtwdev) struct rtw_pci *rtwpci = (struct rtw_pci *)rtwdev->priv; enum rtw_tx_queue_type queue; - for (queue = 0; queue < RTK_MAX_TX_QUEUE_NUM; queue++) - if (test_and_clear_bit(queue, rtwpci->tx_queued)) - rtw_pci_tx_kick_off_queue(rtwdev, queue); + for_each_test_and_clear_bit(queue, rtwpci->tx_queued, RTK_MAX_TX_QUEUE_NUM) + rtw_pci_tx_kick_off_queue(rtwdev, queue); } static int rtw_pci_tx_write_data(struct rtw_dev *rtwdev, diff --git a/drivers/net/wireless/realtek/rtw89/pci.c b/drivers/net/wireless/realtek/rtw89/pci.c index 14ddb0d39e6374..184d41b774d7c3 100644 --- a/drivers/net/wireless/realtek/rtw89/pci.c +++ b/drivers/net/wireless/realtek/rtw89/pci.c @@ -1077,10 +1077,7 @@ static void rtw89_pci_tx_kick_off_pending(struct rtw89_dev *rtwdev) struct rtw89_pci_tx_ring *tx_ring; int txch; - for (txch = 0; txch < RTW89_TXCH_NUM; txch++) { - if (!test_and_clear_bit(txch, rtwpci->kick_map)) - continue; - + for_each_test_and_clear_bit(txch, rtwpci->kick_map, RTW89_TXCH_NUM) { tx_ring = &rtwpci->tx_rings[txch]; __rtw89_pci_tx_kick_off(rtwdev, tx_ring); } From 252479be16f72ecd6a0cf0ab88d79cac70eee826 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:27 -0800 Subject: [PATCH 0045/1406] KVM: x86: hyper-v: optimize and cleanup kvm_hv_process_stimers() The function traverses stimer_pending_bitmap in a for-loop bit by bit. Simplify it by using atomic for_each_test_and_clear_bit(). Because there are only 4 bits, using for_each_test_and_clear_bit() will still generate inline code, so no excessive bloating with the new API. While here, refactor the logic by decreasing indentation level. 
CC: Sean Christopherson Signed-off-by: Yury Norov Reviewed-by: Vitaly Kuznetsov Acked-by: Sean Christopherson --- arch/x86/kvm/hyperv.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 238afd7335e46d..d541524ca49f74 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -870,27 +870,27 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) if (!hv_vcpu) return; - for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) - if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) { - stimer = &hv_vcpu->stimer[i]; - if (stimer->config.enable) { - exp_time = stimer->exp_time; - - if (exp_time) { - time_now = - get_time_ref_counter(vcpu->kvm); - if (time_now >= exp_time) - stimer_expiration(stimer); - } - - if ((stimer->config.enable) && - stimer->count) { - if (!stimer->msg_pending) - stimer_start(stimer); - } else - stimer_cleanup(stimer); - } + for_each_test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap, + ARRAY_SIZE(hv_vcpu->stimer)) { + stimer = &hv_vcpu->stimer[i]; + if (!stimer->config.enable) + continue; + + exp_time = stimer->exp_time; + + if (exp_time) { + time_now = get_time_ref_counter(vcpu->kvm); + if (time_now >= exp_time) + stimer_expiration(stimer); } + + if (stimer->config.enable && stimer->count) { + if (!stimer->msg_pending) + stimer_start(stimer); + } else { + stimer_cleanup(stimer); + } + } } void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) From 020c02d58ef080a486c05f0adf8dd146e2549a74 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:28 -0800 Subject: [PATCH 0046/1406] PCI: hv: Optimize hv_get_dom_num() by using find_and_set_bit() The function traverses bitmap with for_each_clear_bit() just to allocate a bit atomically. Simplify it by using dedicated find_and_set_bit(). Signed-off-by: Yury Norov Reviewed-by: Michael Kelley Acked-by: Wei Liu Acked-by: Bjorn Helgaas --- drivers/pci/controller/pci-hyperv.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 30c7dfeccb16f5..033b1fb7f4eb44 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -3605,12 +3605,9 @@ static u16 hv_get_dom_num(u16 dom) if (test_and_set_bit(dom, hvpci_dom_map) == 0) return dom; - for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) { - if (test_and_set_bit(i, hvpci_dom_map) == 0) - return i; - } + i = find_and_set_bit(hvpci_dom_map, HVPCI_DOM_MAP_SIZE); - return HVPCI_DOM_INVALID; + return i < HVPCI_DOM_MAP_SIZE ? i : HVPCI_DOM_INVALID; } /** From 57dd83bdbe3c0b3a0e83b339cd777568a179e329 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:29 -0800 Subject: [PATCH 0047/1406] scsi: core: optimize scsi_evt_emit() by using an atomic iterator A plain loop in scsi_evt_thread() opencodes optimized atomic bit traversing macro. Simplify it by using the dedicated iterator. 
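The _from variant used below starts scanning at the iterator's current value rather than at bit 0. A sketch with placeholder names:

	unsigned int evt = FIRST_EVT;	/* bits below FIRST_EVT are never touched */

	for_each_test_and_clear_bit_from(evt, pending, LAST_EVT + 1)
		emit_event(evt);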
CC: Bart Van Assche Signed-off-by: Yury Norov --- drivers/scsi/scsi_lib.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index cf3864f7209309..a4c5c9b4bfc94e 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2494,14 +2494,13 @@ static void scsi_evt_emit(struct scsi_device *sdev, struct scsi_event *evt) void scsi_evt_thread(struct work_struct *work) { struct scsi_device *sdev; - enum scsi_device_event evt_type; + enum scsi_device_event evt_type = SDEV_EVT_FIRST; LIST_HEAD(event_list); sdev = container_of(work, struct scsi_device, event_work); - for (evt_type = SDEV_EVT_FIRST; evt_type <= SDEV_EVT_LAST; evt_type++) - if (test_and_clear_bit(evt_type, sdev->pending_events)) - sdev_evt_send_simple(sdev, evt_type, GFP_KERNEL); + for_each_test_and_clear_bit_from(evt_type, sdev->pending_events, SDEV_EVT_LAST + 1) + sdev_evt_send_simple(sdev, evt_type, GFP_KERNEL); while (1) { struct scsi_event *evt; From b0bfc29429fd4d989b0d7e6901c60e453d888e45 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:30 -0800 Subject: [PATCH 0048/1406] scsi: mpi3mr: optimize the driver by using find_and_set_bit() mpi3mr_dev_rmhs_send_tm() and mpi3mr_send_event_ack() opencode find_and_set_bit(). Simplify them by using the dedicated function. CC: Bart Van Assche Signed-off-by: Yury Norov --- drivers/scsi/mpi3mr/mpi3mr_os.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c index 040031eb0c12d4..11139a2008fdac 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_os.c +++ b/drivers/scsi/mpi3mr/mpi3mr_os.c @@ -2276,13 +2276,9 @@ static void mpi3mr_dev_rmhs_send_tm(struct mpi3mr_ioc *mrioc, u16 handle, if (drv_cmd) goto issue_cmd; do { - cmd_idx = find_first_zero_bit(mrioc->devrem_bitmap, - MPI3MR_NUM_DEVRMCMD); - if (cmd_idx < MPI3MR_NUM_DEVRMCMD) { - if (!test_and_set_bit(cmd_idx, mrioc->devrem_bitmap)) - break; - cmd_idx = MPI3MR_NUM_DEVRMCMD; - } + cmd_idx = find_and_set_bit(mrioc->devrem_bitmap, MPI3MR_NUM_DEVRMCMD); + if (cmd_idx < MPI3MR_NUM_DEVRMCMD) + break; } while (retrycount--); if (cmd_idx >= MPI3MR_NUM_DEVRMCMD) { @@ -2417,14 +2413,9 @@ static void mpi3mr_send_event_ack(struct mpi3mr_ioc *mrioc, u8 event, "sending event ack in the top half for event(0x%02x), event_ctx(0x%08x)\n", event, event_ctx); do { - cmd_idx = find_first_zero_bit(mrioc->evtack_cmds_bitmap, - MPI3MR_NUM_EVTACKCMD); - if (cmd_idx < MPI3MR_NUM_EVTACKCMD) { - if (!test_and_set_bit(cmd_idx, - mrioc->evtack_cmds_bitmap)) - break; - cmd_idx = MPI3MR_NUM_EVTACKCMD; - } + cmd_idx = find_and_set_bit(mrioc->evtack_cmds_bitmap, MPI3MR_NUM_EVTACKCMD); + if (cmd_idx < MPI3MR_NUM_EVTACKCMD) + break; } while (retrycount--); if (cmd_idx >= MPI3MR_NUM_EVTACKCMD) { From 2cedc5c4cbed533323b2de3147dcae85d1b656f6 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:31 -0800 Subject: [PATCH 0049/1406] scsi: qedi: optimize qedi_get_task_idx() by using find_and_set_bit() qedi_get_task_idx() opencodes find_and_set_bit(). Simplify it and make the whole function a simple, almost one-liner.
CC: Bart Van Assche Signed-off-by: Yury Norov --- drivers/scsi/qedi/qedi_main.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c index cd0180b1f5b9da..2f940c6898ef3d 100644 --- a/drivers/scsi/qedi/qedi_main.c +++ b/drivers/scsi/qedi/qedi_main.c @@ -1824,20 +1824,13 @@ int qedi_get_task_idx(struct qedi_ctx *qedi) { s16 tmp_idx; -again: - tmp_idx = find_first_zero_bit(qedi->task_idx_map, - MAX_ISCSI_TASK_ENTRIES); + tmp_idx = find_and_set_bit(qedi->task_idx_map, MAX_ISCSI_TASK_ENTRIES); if (tmp_idx >= MAX_ISCSI_TASK_ENTRIES) { QEDI_ERR(&qedi->dbg_ctx, "FW task context pool is full.\n"); tmp_idx = -1; - goto err_idx; } - if (test_and_set_bit(tmp_idx, qedi->task_idx_map)) - goto again; - -err_idx: return tmp_idx; } From 1e9e099525e5fea93c761c289a8f30542b306da4 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:32 -0800 Subject: [PATCH 0050/1406] powerpc: optimize arch code by using atomic find_bit() API Use find_and_{set,clear}_bit() where appropriate and simplify the logic. Signed-off-by: Yury Norov --- arch/powerpc/mm/book3s32/mmu_context.c | 10 ++--- arch/powerpc/platforms/pasemi/dma_lib.c | 45 +++++----------------- arch/powerpc/platforms/powernv/pci-sriov.c | 12 ++---- 3 files changed, 17 insertions(+), 50 deletions(-) diff --git a/arch/powerpc/mm/book3s32/mmu_context.c b/arch/powerpc/mm/book3s32/mmu_context.c index 1922f9a6b05850..7db19f173c2ed6 100644 --- a/arch/powerpc/mm/book3s32/mmu_context.c +++ b/arch/powerpc/mm/book3s32/mmu_context.c @@ -50,13 +50,11 @@ static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1]; unsigned long __init_new_context(void) { - unsigned long ctx = next_mmu_context; + unsigned long ctx; - while (test_and_set_bit(ctx, context_map)) { - ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx); - if (ctx > LAST_CONTEXT) - ctx = 0; - } + ctx = find_and_set_next_bit(context_map, LAST_CONTEXT + 1, next_mmu_context); + if (ctx > LAST_CONTEXT) + ctx = 0; next_mmu_context = (ctx + 1) & LAST_CONTEXT; return ctx; diff --git a/arch/powerpc/platforms/pasemi/dma_lib.c b/arch/powerpc/platforms/pasemi/dma_lib.c index 1be1f18f6f0982..906dabee013249 100644 --- a/arch/powerpc/platforms/pasemi/dma_lib.c +++ b/arch/powerpc/platforms/pasemi/dma_lib.c @@ -118,14 +118,9 @@ static int pasemi_alloc_tx_chan(enum pasemi_dmachan_type type) limit = MAX_TXCH; break; } -retry: - bit = find_next_bit(txch_free, MAX_TXCH, start); - if (bit >= limit) - return -ENOSPC; - if (!test_and_clear_bit(bit, txch_free)) - goto retry; - - return bit; + + bit = find_and_clear_next_bit(txch_free, MAX_TXCH, start); + return bit < limit ? bit : -ENOSPC; } static void pasemi_free_tx_chan(int chan) @@ -136,15 +131,9 @@ static void pasemi_free_tx_chan(int chan) static int pasemi_alloc_rx_chan(void) { - int bit; -retry: - bit = find_first_bit(rxch_free, MAX_RXCH); - if (bit >= MAX_TXCH) - return -ENOSPC; - if (!test_and_clear_bit(bit, rxch_free)) - goto retry; - - return bit; + int bit = find_and_clear_bit(rxch_free, MAX_RXCH); + + return bit < MAX_TXCH ? bit : -ENOSPC; } static void pasemi_free_rx_chan(int chan) @@ -374,16 +363,9 @@ EXPORT_SYMBOL(pasemi_dma_free_buf); */ int pasemi_dma_alloc_flag(void) { - int bit; + int bit = find_and_clear_bit(flags_free, MAX_FLAGS); -retry: - bit = find_first_bit(flags_free, MAX_FLAGS); - if (bit >= MAX_FLAGS) - return -ENOSPC; - if (!test_and_clear_bit(bit, flags_free)) - goto retry; - - return bit; + return bit < MAX_FLAGS ? 
bit : -ENOSPC; } EXPORT_SYMBOL(pasemi_dma_alloc_flag); @@ -439,16 +421,9 @@ EXPORT_SYMBOL(pasemi_dma_clear_flag); */ int pasemi_dma_alloc_fun(void) { - int bit; - -retry: - bit = find_first_bit(fun_free, MAX_FLAGS); - if (bit >= MAX_FLAGS) - return -ENOSPC; - if (!test_and_clear_bit(bit, fun_free)) - goto retry; + int bit = find_and_clear_bit(fun_free, MAX_FLAGS); - return bit; + return bit < MAX_FLAGS ? bit : -ENOSPC; } EXPORT_SYMBOL(pasemi_dma_alloc_fun); diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c index 59882da3e74253..640e387e6d839c 100644 --- a/arch/powerpc/platforms/powernv/pci-sriov.c +++ b/arch/powerpc/platforms/powernv/pci-sriov.c @@ -397,18 +397,12 @@ static int64_t pnv_ioda_map_m64_single(struct pnv_phb *phb, static int pnv_pci_alloc_m64_bar(struct pnv_phb *phb, struct pnv_iov_data *iov) { - int win; + int win = find_and_set_bit(&phb->ioda.m64_bar_alloc, phb->ioda.m64_bar_idx + 1); - do { - win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, - phb->ioda.m64_bar_idx + 1, 0); - - if (win >= phb->ioda.m64_bar_idx + 1) - return -1; - } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc)); + if (win >= phb->ioda.m64_bar_idx + 1) + return -1; set_bit(win, iov->used_m64_bar_mask); - return win; } From 2ebbcd7bcb17f6695329416e82256ce87c362eaa Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:33 -0800 Subject: [PATCH 0051/1406] iommu: optimize subsystem by using atomic find_bit() API Simplify __arm_smmu_alloc_bitmap() and msm_iommu_alloc_ctx() by using a dedicated API, and make them nice one-liner wrappers. While here, refactor msm_iommu_attach_dev() and msm_iommu_alloc_ctx() so that error codes don't mismatch. Signed-off-by: Yury Norov --- drivers/iommu/arm/arm-smmu/arm-smmu.h | 10 ++-------- drivers/iommu/msm_iommu.c | 18 ++++-------------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index 703fd5817ec11f..004a4704ebf15c 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h @@ -453,15 +453,9 @@ struct arm_smmu_impl { static inline int __arm_smmu_alloc_bitmap(unsigned long *map, int start, int end) { - int idx; + int idx = find_and_set_next_bit(map, end, start); - do { - idx = find_next_zero_bit(map, end, start); - if (idx == end) - return -ENOSPC; - } while (test_and_set_bit(idx, map)); - - return idx; + return idx < end ? 
idx : -ENOSPC; } static inline void __iomem *arm_smmu_page(struct arm_smmu_device *smmu, int n) diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index f86af9815d6f98..67124f4228b1f0 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -185,17 +185,9 @@ static const struct iommu_flush_ops msm_iommu_flush_ops = { .tlb_add_page = __flush_iotlb_page, }; -static int msm_iommu_alloc_ctx(unsigned long *map, int start, int end) +static int msm_iommu_alloc_ctx(struct msm_iommu_dev *iommu) { - int idx; - - do { - idx = find_next_zero_bit(map, end, start); - if (idx == end) - return -ENOSPC; - } while (test_and_set_bit(idx, map)); - - return idx; + return find_and_set_bit(iommu->context_map, iommu->ncb); } static void msm_iommu_free_ctx(unsigned long *map, int idx) @@ -418,10 +410,8 @@ static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) ret = -EEXIST; goto fail; } - master->num = - msm_iommu_alloc_ctx(iommu->context_map, - 0, iommu->ncb); - if (IS_ERR_VALUE(master->num)) { + master->num = msm_iommu_alloc_ctx(iommu); + if (master->num >= iommu->ncb) { ret = -ENODEV; goto fail; } From 4678bace092ce861e09a6ab4dd3f0e043bdb49a5 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:34 -0800 Subject: [PATCH 0052/1406] media: radio-shark: optimize the driver by using atomic find_bit() API Even though these are only 2- or 3-bit maps, convert the for-loop followed by test_and_clear_bit() to for_each_test_and_clear_bit(), as it makes the code cleaner. Signed-off-by: Yury Norov Acked-by: Hans Verkuil --- drivers/media/radio/radio-shark.c | 5 +---- drivers/media/radio/radio-shark2.c | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/media/radio/radio-shark.c b/drivers/media/radio/radio-shark.c index 127a3be0e0f070..0c50b3a9623e88 100644 --- a/drivers/media/radio/radio-shark.c +++ b/drivers/media/radio/radio-shark.c @@ -158,10 +158,7 @@ static void shark_led_work(struct work_struct *work) container_of(work, struct shark_device, led_work); int i, res, brightness, actual_len; - for (i = 0; i < 3; i++) { - if (!test_and_clear_bit(i, &shark->brightness_new)) - continue; - + for_each_test_and_clear_bit(i, &shark->brightness_new, 3) { brightness = atomic_read(&shark->brightness[i]); memset(shark->transfer_buffer, 0, TB_LEN); if (i != RED_LED) { diff --git a/drivers/media/radio/radio-shark2.c b/drivers/media/radio/radio-shark2.c index f1c5c0a6a335cb..d9ef241e177806 100644 --- a/drivers/media/radio/radio-shark2.c +++ b/drivers/media/radio/radio-shark2.c @@ -145,10 +145,7 @@ static void shark_led_work(struct work_struct *work) container_of(work, struct shark_device, led_work); int i, res, brightness, actual_len; - for (i = 0; i < 2; i++) { - if (!test_and_clear_bit(i, &shark->brightness_new)) - continue; - + for_each_test_and_clear_bit(i, &shark->brightness_new, 2) { brightness = atomic_read(&shark->brightness[i]); memset(shark->transfer_buffer, 0, TB_LEN); shark->transfer_buffer[0] = 0x83 + i; From 7b39dbf951db07db3001f6113db0765c9598c00a Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:35 -0800 Subject: [PATCH 0053/1406] sfc: optimize the driver by using atomic find_bit() API SFC code traverses rps_slot_map and rxq_retry_mask bit by bit. Simplify it by using dedicated atomic find_bit() functions, as they skip already clear bits.
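Note that rps_slot_map here appears to be a single unsigned long rather than a bitmap array (it is passed by address below); the find_bit() helpers work on any such word. A sketch with placeholder values:

	unsigned long slot_map = 0;	/* up to BITS_PER_LONG slots */
	int slot = find_and_set_bit(&slot_map, EFX_RPS_MAX_IN_FLIGHT);

	if (slot >= EFX_RPS_MAX_IN_FLIGHT)
		return -EBUSY;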
Signed-off-by: Yury Norov Reviewed-by: Edward Cree --- drivers/net/ethernet/sfc/rx_common.c | 4 +--- drivers/net/ethernet/sfc/siena/rx_common.c | 4 +--- drivers/net/ethernet/sfc/siena/siena_sriov.c | 14 ++++++-------- 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/sfc/rx_common.c b/drivers/net/ethernet/sfc/rx_common.c index d2f35ee15effeb..0112968b3fe7c6 100644 --- a/drivers/net/ethernet/sfc/rx_common.c +++ b/drivers/net/ethernet/sfc/rx_common.c @@ -950,9 +950,7 @@ int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb, int rc; /* find a free slot */ - for (slot_idx = 0; slot_idx < EFX_RPS_MAX_IN_FLIGHT; slot_idx++) - if (!test_and_set_bit(slot_idx, &efx->rps_slot_map)) - break; + slot_idx = find_and_set_bit(&efx->rps_slot_map, EFX_RPS_MAX_IN_FLIGHT); if (slot_idx >= EFX_RPS_MAX_IN_FLIGHT) return -EBUSY; diff --git a/drivers/net/ethernet/sfc/siena/rx_common.c b/drivers/net/ethernet/sfc/siena/rx_common.c index 4579f43484c367..160b16aa74862b 100644 --- a/drivers/net/ethernet/sfc/siena/rx_common.c +++ b/drivers/net/ethernet/sfc/siena/rx_common.c @@ -958,9 +958,7 @@ int efx_siena_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb, int rc; /* find a free slot */ - for (slot_idx = 0; slot_idx < EFX_RPS_MAX_IN_FLIGHT; slot_idx++) - if (!test_and_set_bit(slot_idx, &efx->rps_slot_map)) - break; + slot_idx = find_and_set_bit(&efx->rps_slot_map, EFX_RPS_MAX_IN_FLIGHT); if (slot_idx >= EFX_RPS_MAX_IN_FLIGHT) return -EBUSY; diff --git a/drivers/net/ethernet/sfc/siena/siena_sriov.c b/drivers/net/ethernet/sfc/siena/siena_sriov.c index 8353c15dc23336..554b799288b8e2 100644 --- a/drivers/net/ethernet/sfc/siena/siena_sriov.c +++ b/drivers/net/ethernet/sfc/siena/siena_sriov.c @@ -722,14 +722,12 @@ static int efx_vfdi_fini_all_queues(struct siena_vf *vf) efx_vfdi_flush_wake(vf), timeout); rxqs_count = 0; - for (index = 0; index < count; ++index) { - if (test_and_clear_bit(index, vf->rxq_retry_mask)) { - atomic_dec(&vf->rxq_retry_count); - MCDI_SET_ARRAY_DWORD( - inbuf, FLUSH_RX_QUEUES_IN_QID_OFST, - rxqs_count, vf_offset + index); - rxqs_count++; - } + for_each_test_and_clear_bit(index, vf->rxq_retry_mask, count) { + atomic_dec(&vf->rxq_retry_count); + MCDI_SET_ARRAY_DWORD( + inbuf, FLUSH_RX_QUEUES_IN_QID_OFST, + rxqs_count, vf_offset + index); + rxqs_count++; } } From 468cf2a9e8267d38c3756ea6576db439d9e219c1 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:36 -0800 Subject: [PATCH 0054/1406] tty: nozomi: optimize interrupt_handler() In the exit path of interrupt_handler(), dc->flip map is traversed bit by bit to find and clear set bits and call tty_flip_buffer_push() for corresponding ports. Simplify it by using for_each_test_and_clear_bit(), as it skips already clear bits. 
Signed-off-by: Yury Norov --- drivers/tty/nozomi.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/tty/nozomi.c b/drivers/tty/nozomi.c index 02cd40147b3a80..de0503247391a5 100644 --- a/drivers/tty/nozomi.c +++ b/drivers/tty/nozomi.c @@ -1220,9 +1220,8 @@ static irqreturn_t interrupt_handler(int irq, void *dev_id) exit_handler: spin_unlock(&dc->spin_mutex); - for (a = 0; a < NOZOMI_MAX_PORTS; a++) - if (test_and_clear_bit(a, &dc->flip)) - tty_flip_buffer_push(&dc->port[a].port); + for_each_test_and_clear_bit(a, &dc->flip, NOZOMI_MAX_PORTS) + tty_flip_buffer_push(&dc->port[a].port); return IRQ_HANDLED; none: From d744113d7dacab078a5272378ee6a2c478d50cc5 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:37 -0800 Subject: [PATCH 0055/1406] usb: cdc-acm: optimize acm_softint() acm_softint() uses for-loop to traverse urbs_in_error_delay bitmap bit by bit to find and clear set bits. Simplify it by using for_each_test_and_clear_bit(), because it doesn't test already clear bits. Signed-off-by: Yury Norov Acked-by: Oliver Neukum --- drivers/usb/class/cdc-acm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index a1f4e1ead97ff4..8664b63050b0c7 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -613,9 +613,8 @@ static void acm_softint(struct work_struct *work) } if (test_and_clear_bit(ACM_ERROR_DELAY, &acm->flags)) { - for (i = 0; i < acm->rx_buflimit; i++) - if (test_and_clear_bit(i, &acm->urbs_in_error_delay)) - acm_submit_read_urb(acm, i, GFP_KERNEL); + for_each_test_and_clear_bit(i, &acm->urbs_in_error_delay, acm->rx_buflimit) + acm_submit_read_urb(acm, i, GFP_KERNEL); } if (test_and_clear_bit(EVENT_TTY_WAKEUP, &acm->flags)) From f3687f2f7db4f0f464ab5cb43595c0964655eded Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:38 -0800 Subject: [PATCH 0056/1406] block: null_blk: replace get_tag() with a generic find_and_set_bit_lock() get_tag() opencodes find_and_set_bit(). Simplify the code by getting rid of it. 
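The _lock variant gives the allocation acquire semantics; the matching free path would typically use clear_bit_unlock() for release semantics. Roughly, with placeholder names:

	tag = find_and_set_bit_lock(tag_map, depth);	/* acquire */
	if (tag >= depth)
		return NULL;
	/* ... use the tag ... */
	clear_bit_unlock(tag, tag_map);			/* release on free */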
Signed-off-by: Yury Norov Reviewed-by: Chengming Zhou Reviewed-by: Jan Kara --- drivers/block/null_blk/main.c | 41 +++++++++++------------------------ 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 3021d58ca51c1f..671dbb9ab928af 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -760,19 +760,6 @@ static void put_tag(struct nullb_queue *nq, unsigned int tag) wake_up(&nq->wait); } -static unsigned int get_tag(struct nullb_queue *nq) -{ - unsigned int tag; - - do { - tag = find_first_zero_bit(nq->tag_map, nq->queue_depth); - if (tag >= nq->queue_depth) - return -1U; - } while (test_and_set_bit_lock(tag, nq->tag_map)); - - return tag; -} - static void free_cmd(struct nullb_cmd *cmd) { put_tag(cmd->nq, cmd->tag); @@ -782,24 +769,22 @@ static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer); static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) { + unsigned int tag = find_and_set_bit_lock(nq->tag_map, nq->queue_depth); struct nullb_cmd *cmd; - unsigned int tag; - - tag = get_tag(nq); - if (tag != -1U) { - cmd = &nq->cmds[tag]; - cmd->tag = tag; - cmd->error = BLK_STS_OK; - cmd->nq = nq; - if (nq->dev->irqmode == NULL_IRQ_TIMER) { - hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - cmd->timer.function = null_cmd_timer_expired; - } - return cmd; + + if (tag >= nq->queue_depth) + return NULL; + + cmd = &nq->cmds[tag]; + cmd->tag = tag; + cmd->error = BLK_STS_OK; + cmd->nq = nq; + if (nq->dev->irqmode == NULL_IRQ_TIMER) { + hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cmd->timer.function = null_cmd_timer_expired; } - return NULL; + return cmd; } static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, struct bio *bio) From fe80b801ee439640f9a1b9df80b7ae0bc4d4bfb8 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:39 -0800 Subject: [PATCH 0057/1406] RDMA/rtrs: optimize __rtrs_get_permit() by using find_and_set_bit_lock() The function opencodes find_and_set_bit_lock() with a while-loop polling on test_and_set_bit_lock(). Use the dedicated function instead. Signed-off-by: Yury Norov --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 07261523c55473..2f3b0ad42e8aa7 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -72,18 +72,9 @@ __rtrs_get_permit(struct rtrs_clt_sess *clt, enum rtrs_clt_con_type con_type) struct rtrs_permit *permit; int bit; - /* - * Adapted from null_blk get_tag(). Callers from different cpus may - * grab the same bit, since find_first_zero_bit is not atomic. - * But then the test_and_set_bit_lock will fail for all the - * callers but one, so that they will loop again. - * This way an explicit spinlock is not required. 
- */ - do { - bit = find_first_zero_bit(clt->permits_map, max_depth); - if (bit >= max_depth) - return NULL; - } while (test_and_set_bit_lock(bit, clt->permits_map)); + bit = find_and_set_bit_lock(clt->permits_map, max_depth); + if (bit >= max_depth) + return NULL; permit = get_permit(clt, bit); WARN_ON(permit->mem_id != bit); From fea0ea785cef13d881cdc3c5136d98851f14335f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:40 -0800 Subject: [PATCH 0058/1406] mISDN: optimize get_free_devid() get_free_devid() traverses each bit in device_ids in an open-coded loop. Simplify it by using the dedicated find_and_set_bit(). It makes the whole function a nice one-liner, and because MAX_DEVICE_ID is a small compile-time constant (63), on 64-bit platforms the find_and_set_bit() call will be optimized to: ffs(); test_and_set_bit(). Signed-off-by: Yury Norov --- drivers/isdn/mISDN/core.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/isdn/mISDN/core.c b/drivers/isdn/mISDN/core.c index ab8513a7acd52d..c829c4eac0e23e 100644 --- a/drivers/isdn/mISDN/core.c +++ b/drivers/isdn/mISDN/core.c @@ -197,14 +197,9 @@ get_mdevice_count(void) static int get_free_devid(void) { - u_int i; + int i = find_and_set_bit((u_long *)&device_ids, MAX_DEVICE_ID + 1); - for (i = 0; i <= MAX_DEVICE_ID; i++) - if (!test_and_set_bit(i, (u_long *)&device_ids)) - break; - if (i > MAX_DEVICE_ID) - return -EBUSY; - return i; + return i <= MAX_DEVICE_ID ? i : -EBUSY; } int From fbde99eaa647bc93ca3fbf78f4cec929beabf0f7 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:41 -0800 Subject: [PATCH 0059/1406] media: em28xx: cx231xx: optimize drivers by using find_and_set_bit() Functions in the media/usb drivers opencode find_and_set_bit(). Simplify them by using the function.
Signed-off-by: Yury Norov Acked-by: Hans Verkuil --- drivers/media/usb/cx231xx/cx231xx-cards.c | 16 ++++------ drivers/media/usb/em28xx/em28xx-cards.c | 37 +++++++++-------------- 2 files changed, 21 insertions(+), 32 deletions(-) diff --git a/drivers/media/usb/cx231xx/cx231xx-cards.c b/drivers/media/usb/cx231xx/cx231xx-cards.c index 92efe6c1f47bae..b314603932d772 100644 --- a/drivers/media/usb/cx231xx/cx231xx-cards.c +++ b/drivers/media/usb/cx231xx/cx231xx-cards.c @@ -1708,16 +1708,12 @@ static int cx231xx_usb_probe(struct usb_interface *interface, return -ENODEV; /* Check to see next free device and mark as used */ - do { - nr = find_first_zero_bit(&cx231xx_devused, CX231XX_MAXBOARDS); - if (nr >= CX231XX_MAXBOARDS) { - /* No free device slots */ - dev_err(d, - "Supports only %i devices.\n", - CX231XX_MAXBOARDS); - return -ENOMEM; - } - } while (test_and_set_bit(nr, &cx231xx_devused)); + nr = find_and_set_bit(&cx231xx_devused, CX231XX_MAXBOARDS); + if (nr >= CX231XX_MAXBOARDS) { + /* No free device slots */ + dev_err(d, "Supports only %i devices.\n", CX231XX_MAXBOARDS); + return -ENOMEM; + } udev = usb_get_dev(interface_to_usbdev(interface)); diff --git a/drivers/media/usb/em28xx/em28xx-cards.c b/drivers/media/usb/em28xx/em28xx-cards.c index 4d037c92af7c58..af4809fe74a857 100644 --- a/drivers/media/usb/em28xx/em28xx-cards.c +++ b/drivers/media/usb/em28xx/em28xx-cards.c @@ -3684,17 +3684,14 @@ static int em28xx_duplicate_dev(struct em28xx *dev) return -ENOMEM; } /* Check to see next free device and mark as used */ - do { - nr = find_first_zero_bit(em28xx_devused, EM28XX_MAXBOARDS); - if (nr >= EM28XX_MAXBOARDS) { - /* No free device slots */ - dev_warn(&dev->intf->dev, ": Supports only %i em28xx boards.\n", - EM28XX_MAXBOARDS); - kfree(sec_dev); - dev->dev_next = NULL; - return -ENOMEM; - } - } while (test_and_set_bit(nr, em28xx_devused)); + nr = find_and_set_bit(em28xx_devused, EM28XX_MAXBOARDS); + if (nr >= EM28XX_MAXBOARDS) { + /* No free device slots */ + dev_warn(&dev->intf->dev, ": Supports only %i em28xx boards.\n", EM28XX_MAXBOARDS); + kfree(sec_dev); + dev->dev_next = NULL; + return -ENOMEM; + } sec_dev->devno = nr; snprintf(sec_dev->name, 28, "em28xx #%d", nr); sec_dev->dev_next = NULL; @@ -3827,17 +3824,13 @@ static int em28xx_usb_probe(struct usb_interface *intf, udev = usb_get_dev(interface_to_usbdev(intf)); /* Check to see next free device and mark as used */ - do { - nr = find_first_zero_bit(em28xx_devused, EM28XX_MAXBOARDS); - if (nr >= EM28XX_MAXBOARDS) { - /* No free device slots */ - dev_err(&intf->dev, - "Driver supports up to %i em28xx boards.\n", - EM28XX_MAXBOARDS); - retval = -ENOMEM; - goto err_no_slot; - } - } while (test_and_set_bit(nr, em28xx_devused)); + nr = find_and_set_bit(em28xx_devused, EM28XX_MAXBOARDS); + if (nr >= EM28XX_MAXBOARDS) { + /* No free device slots */ + dev_err(&intf->dev, "Driver supports up to %i em28xx boards.\n", EM28XX_MAXBOARDS); + retval = -ENOMEM; + goto err_no_slot; + } /* Don't register audio interfaces */ if (intf->altsetting[0].desc.bInterfaceClass == USB_CLASS_AUDIO) { From 35a11cd220c710b8c42203cf07aea6f6f873f6a5 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:42 -0800 Subject: [PATCH 0060/1406] ethernet: rocker: optimize ofdpa_port_internal_vlan_id_get() Optimize ofdpa_port_internal_vlan_id_get() by using find_and_set_bit(), instead of polling every bit from bitmap in a for-loop. 
Signed-off-by: Yury Norov --- drivers/net/ethernet/rocker/rocker_ofdpa.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c index 826990459fa443..449be8af7ffce6 100644 --- a/drivers/net/ethernet/rocker/rocker_ofdpa.c +++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c @@ -2249,14 +2249,11 @@ static __be16 ofdpa_port_internal_vlan_id_get(struct ofdpa_port *ofdpa_port, found = entry; hash_add(ofdpa->internal_vlan_tbl, &found->entry, found->ifindex); - for (i = 0; i < OFDPA_N_INTERNAL_VLANS; i++) { - if (test_and_set_bit(i, ofdpa->internal_vlan_bitmap)) - continue; + i = find_and_set_bit(ofdpa->internal_vlan_bitmap, OFDPA_N_INTERNAL_VLANS); + if (i < OFDPA_N_INTERNAL_VLANS) found->vlan_id = htons(OFDPA_INTERNAL_VLAN_ID_BASE + i); - goto found; - } - - netdev_err(ofdpa_port->dev, "Out of internal VLAN IDs\n"); + else + netdev_err(ofdpa_port->dev, "Out of internal VLAN IDs\n"); found: found->ref_count++; From e63a961be48f2855a76d034e23471995ad6b9972 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:43 -0800 Subject: [PATCH 0061/1406] serial: sc16is7xx: optimize sc16is7xx_alloc_line() Instead of polling every bit in sc16is7xx_lines, use a dedicated find_and_set_bit(), and make the function a simple one-liner. Signed-off-by: Yury Norov --- drivers/tty/serial/sc16is7xx.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c index db2bb1c0d36c26..6a463988d5e002 100644 --- a/drivers/tty/serial/sc16is7xx.c +++ b/drivers/tty/serial/sc16is7xx.c @@ -427,15 +427,9 @@ static void sc16is7xx_port_update(struct uart_port *port, u8 reg, static int sc16is7xx_alloc_line(void) { - int i; - BUILD_BUG_ON(SC16IS7XX_MAX_DEVS > BITS_PER_LONG); - for (i = 0; i < SC16IS7XX_MAX_DEVS; i++) - if (!test_and_set_bit(i, &sc16is7xx_lines)) - break; - - return i; + return find_and_set_bit(&sc16is7xx_lines, SC16IS7XX_MAX_DEVS); } static void sc16is7xx_power(struct uart_port *port, int on) From 137ce860bc80dd529ab6b04dde2a6e761b32b3ec Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:44 -0800 Subject: [PATCH 0062/1406] bluetooth: optimize cmtp_alloc_block_id() Instead of polling every bit in blockids, use a dedicated find_and_set_bit(), and make the function a simple one-liner. Signed-off-by: Yury Norov --- net/bluetooth/cmtp/core.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index 90d130588a3e51..b1330acbbff366 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -88,15 +88,9 @@ static void __cmtp_copy_session(struct cmtp_session *session, struct cmtp_connin static inline int cmtp_alloc_block_id(struct cmtp_session *session) { - int i, id = -1; + int id = find_and_set_bit(&session->blockids, 16); - for (i = 0; i < 16; i++) - if (!test_and_set_bit(i, &session->blockids)) { - id = i; - break; - } - - return id; + return id < 16 ? id : -1; } static inline void cmtp_free_block_id(struct cmtp_session *session, int id) From 668284d460b55af73b9a001aad57b25eb1f674b7 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:45 -0800 Subject: [PATCH 0063/1406] net: smc: optimize smc_wr_tx_get_free_slot_index() Simplify the function by using find_and_set_bit() and make it a simple, almost one-liner.
While here, drop explicit initialization of *idx, because it's already initialized by the caller in case of ENOLINK, or set properly with ->wr_tx_mask, if nothing is found, in case of EBUSY. CC: Tony Lu Signed-off-by: Yury Norov Reviewed-by: Alexandra Winter Reviewed-by: Wen Gu --- net/smc/smc_wr.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 0021065a600a03..b6f0cfc527882f 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -170,15 +170,11 @@ void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) { - *idx = link->wr_tx_cnt; if (!smc_link_sendable(link)) return -ENOLINK; - for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) { - if (!test_and_set_bit(*idx, link->wr_tx_mask)) - return 0; - } - *idx = link->wr_tx_cnt; - return -EBUSY; + + *idx = find_and_set_bit(link->wr_tx_mask, link->wr_tx_cnt); + return *idx < link->wr_tx_cnt ? 0 : -EBUSY; } /** From 78cdf2d0f4565c5866eb1ec2105397835003f15d Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:46 -0800 Subject: [PATCH 0064/1406] ALSA: use atomic find_bit() functions where applicable ALSA code tests each bit in bitmaps in a for() loop. Switch it to using dedicated atomic find_bit() API. Signed-off-by: Yury Norov Acked-by: Takashi Iwai --- sound/pci/hda/hda_codec.c | 7 +++---- sound/usb/caiaq/audio.c | 13 +++++-------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c index 01718b1fc9a7f8..29254005f3941a 100644 --- a/sound/pci/hda/hda_codec.c +++ b/sound/pci/hda/hda_codec.c @@ -3275,10 +3275,9 @@ static int get_empty_pcm_device(struct hda_bus *bus, unsigned int type) #ifdef CONFIG_SND_DYNAMIC_MINORS /* non-fixed slots starting from 10 */ - for (i = 10; i < 32; i++) { - if (!test_and_set_bit(i, bus->pcm_dev_bits)) - return i; - } + i = find_and_set_next_bit(bus->pcm_dev_bits, 32, 10); + if (i < 32) + return i; #endif dev_warn(bus->card->dev, "Too many %s devices\n", diff --git a/sound/usb/caiaq/audio.c b/sound/usb/caiaq/audio.c index 4981753652a7fe..74dfcf32b439d2 100644 --- a/sound/usb/caiaq/audio.c +++ b/sound/usb/caiaq/audio.c @@ -610,7 +610,7 @@ static void read_completed(struct urb *urb) struct snd_usb_caiaq_cb_info *info = urb->context; struct snd_usb_caiaqdev *cdev; struct device *dev; - struct urb *out = NULL; + struct urb *out; int i, frame, len, send_it = 0, outframe = 0; unsigned long flags; size_t offset = 0; @@ -625,17 +625,14 @@ static void read_completed(struct urb *urb) return; /* find an unused output urb that is unused */ - for (i = 0; i < N_URBS; i++) - if (test_and_set_bit(i, &cdev->outurb_active_mask) == 0) { - out = cdev->data_urbs_out[i]; - break; - } - - if (!out) { + i = find_and_set_bit(&cdev->outurb_active_mask, N_URBS); + if (i >= N_URBS) { dev_err(dev, "Unable to find an output urb to use\n"); goto requeue; } + out = cdev->data_urbs_out[i]; + /* read the recently received packet and send back one which has * the same layout */ for (frame = 0; frame < FRAMES_PER_URB; frame++) { From 4d56bf2e0c2321894cdb654b96944710fbd62314 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:47 -0800 Subject: [PATCH 0065/1406] m68k: optimize get_mmu_context() get_mmu_context() opencodes atomic find_and_set_bit_wrap(). Simplify it by using find_and_set_bit_wrap(). 
CC: Geert Uytterhoeven Signed-off-by: Yury Norov Acked-by: Greg Ungerer --- arch/m68k/include/asm/mmu_context.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/m68k/include/asm/mmu_context.h b/arch/m68k/include/asm/mmu_context.h index 141bbdfad96019..0419ad87a1c122 100644 --- a/arch/m68k/include/asm/mmu_context.h +++ b/arch/m68k/include/asm/mmu_context.h @@ -35,12 +35,11 @@ static inline void get_mmu_context(struct mm_struct *mm) atomic_inc(&nr_free_contexts); steal_context(); } - ctx = next_mmu_context; - while (test_and_set_bit(ctx, context_map)) { - ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx); - if (ctx > LAST_CONTEXT) - ctx = 0; - } + + do { + ctx = find_and_set_bit_wrap(context_map, LAST_CONTEXT + 1, next_mmu_context); + } while (ctx > LAST_CONTEXT); + next_mmu_context = (ctx + 1) & LAST_CONTEXT; mm->context = ctx; context_mm[ctx] = mm; From 18eda5a178066852986b0380f201295cefa582e7 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:48 -0800 Subject: [PATCH 0066/1406] microblaze: optimize get_mmu_context() Simplify get_mmu_context() by using find_and_set_bit_wrap(). Signed-off-by: Yury Norov --- arch/microblaze/include/asm/mmu_context_mm.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/microblaze/include/asm/mmu_context_mm.h b/arch/microblaze/include/asm/mmu_context_mm.h index c2c77f70845562..209c3a62353a99 100644 --- a/arch/microblaze/include/asm/mmu_context_mm.h +++ b/arch/microblaze/include/asm/mmu_context_mm.h @@ -82,12 +82,11 @@ static inline void get_mmu_context(struct mm_struct *mm) return; while (atomic_dec_if_positive(&nr_free_contexts) < 0) steal_context(); - ctx = next_mmu_context; - while (test_and_set_bit(ctx, context_map)) { - ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx); - if (ctx > LAST_CONTEXT) - ctx = 0; - } + + do { + ctx = find_and_set_bit_wrap(context_map, LAST_CONTEXT + 1, next_mmu_context); + } while (ctx > LAST_CONTEXT); + next_mmu_context = (ctx + 1) & LAST_CONTEXT; mm->context = ctx; context_mm[ctx] = mm; From 5e95ee6fd52b06432da9636032ac6986112feec1 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:49 -0800 Subject: [PATCH 0067/1406] sh: mach-x3proto: optimize ilsel_enable() Simplify ilsel_enable() by using find_and_set_bit(). CC: John Paul Adrian Glaubitz Signed-off-by: Yury Norov --- arch/sh/boards/mach-x3proto/ilsel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sh/boards/mach-x3proto/ilsel.c b/arch/sh/boards/mach-x3proto/ilsel.c index f0d5eb41521a49..7fadc479a80bf7 100644 --- a/arch/sh/boards/mach-x3proto/ilsel.c +++ b/arch/sh/boards/mach-x3proto/ilsel.c @@ -99,8 +99,8 @@ int ilsel_enable(ilsel_source_t set) } do { - bit = find_first_zero_bit(&ilsel_level_map, ILSEL_LEVELS); - } while (test_and_set_bit(bit, &ilsel_level_map)); + bit = find_and_set_bit(&ilsel_level_map, ILSEL_LEVELS); + } while (bit >= ILSEL_LEVELS); __ilsel_enable(set, bit); From 071ad962baf5e857fd965595421cf6fb588610ed Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 22 Dec 2023 16:04:02 +0200 Subject: [PATCH 0068/1406] bitmap: Step down as a reviewer Too many things are going on, and reviewing BITMAP related code seems not the best I can do, hence step down as a reviewer of the BITMAP library. 
Signed-off-by: Andy Shevchenko Signed-off-by: Yury Norov --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 788be9ab5b733a..51983ed2d4e483 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3553,7 +3553,6 @@ F: include/uapi/linux/bfs_fs.h BITMAP API M: Yury Norov -R: Andy Shevchenko R: Rasmus Villemoes S: Maintained F: include/linux/bitfield.h From 7f29d67809293992a721edeab3903ad498e0e9cd Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 22 Dec 2023 22:37:03 -0500 Subject: [PATCH 0069/1406] ovl: Reject mounting case-insensitive filesystems overlayfs relies on the filesystem setting DCACHE_OP_HASH or DCACHE_OP_COMPARE to reject mounting over case-insensitive directories. Since commit bb9cd9106b22 ("fscrypt: Have filesystems handle their d_ops"), we set ->d_op through a hook in ->d_lookup, which means the root dentry won't have them, causing the mount to accidentally succeed. In v6.7-rc7, the following sequence will succeed to mount, but any dentry other than the root dentry will be a "weird" dentry to ovl and fail with EREMOTE. mkfs.ext4 -O casefold lower.img mount -O loop lower.img lower mount -t overlay -o lowerdir=lower,upperdir=upper,workdir=work ovl /mnt Mounting on a subdirectory fails, as expected, because DCACHE_OP_HASH and DCACHE_OP_COMPARE are properly set by ->lookup. Fix by explicitly rejecting superblocks that allow case-insensitive dentries. While there, re-sort the entries to have more descriptive error messages first. Fixes: bb9cd9106b22 ("fscrypt: Have filesystems handle their d_ops") Signed-off-by: Gabriel Krisman Bertazi Acked-by: Amir Goldstein --- fs/overlayfs/params.c | 13 ++++++++++--- include/linux/fs.h | 9 +++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 112b4b12f8252a..a3edd9e1bdc464 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -280,12 +280,19 @@ static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path, { struct ovl_fs_context *ctx = fc->fs_private; - if (ovl_dentry_weird(path->dentry)) - return invalfc(fc, "filesystem on %s not supported", name); - if (!d_is_dir(path->dentry)) return invalfc(fc, "%s is not a directory", name); + /* + * Root dentries of case-insensitive filesystems might not have + * the dentry operations set, but still be incompatible with + * overlayfs. Check explicitly to prevent post-mount failures. 
+ */ + if (sb_has_encoding(path->mnt->mnt_sb)) + return invalfc(fc, "case-insensitive filesystem on %s not supported", name); + + if (ovl_dentry_weird(path->dentry)) + return invalfc(fc, "filesystem on %s not supported", name); /* * Check whether upper path is read-only here to report failures diff --git a/include/linux/fs.h b/include/linux/fs.h index e6ba0cc6f2eeea..a0eb8b5759a6fe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3282,6 +3282,15 @@ extern int generic_check_addressable(unsigned, u64); extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); +static inline bool sb_has_encoding(const struct super_block *sb) +{ +#if IS_ENABLED(CONFIG_UNICODE) + return !!sb->s_encoding; +#else + return false; +#endif +} + int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid); int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *); From 213b755e42e2e7127777f74d2174bb4843a9b03a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20J=C3=BCcker?= Date: Fri, 22 Dec 2023 00:02:58 +0100 Subject: [PATCH 0070/1406] ARM: defconfig: enable STMicroelectronics accelerometer and gyro for Exynos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable STMicroelectronics accelerometer and gyro drivers for the Samsung P4note device family in exynos and multi_v7 defconfigs. Signed-off-by: Martin Jücker Link: https://lore.kernel.org/r/20231221230258.56272-2-martin.juecker@gmail.com Signed-off-by: Krzysztof Kozlowski --- arch/arm/configs/exynos_defconfig | 3 +++ arch/arm/configs/multi_v7_defconfig | 3 +++ 2 files changed, 6 insertions(+) diff --git a/arch/arm/configs/exynos_defconfig b/arch/arm/configs/exynos_defconfig index c98d5ff8a1ed08..7ad48fdda1dac6 100644 --- a/arch/arm/configs/exynos_defconfig +++ b/arch/arm/configs/exynos_defconfig @@ -318,8 +318,11 @@ CONFIG_EXTCON_MAX77693=y CONFIG_EXTCON_MAX8997=y CONFIG_EXYNOS5422_DMC=y CONFIG_IIO=y +CONFIG_IIO_ST_ACCEL_3AXIS=m +# CONFIG_IIO_ST_ACCEL_SPI_3AXIS is not set CONFIG_EXYNOS_ADC=y CONFIG_STMPE_ADC=y +CONFIG_IIO_ST_GYRO_3AXIS=m CONFIG_CM36651=y CONFIG_AK8975=y CONFIG_SENSORS_ISL29018=y diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig index ecb3e286107a4c..0d885cb6120679 100644 --- a/arch/arm/configs/multi_v7_defconfig +++ b/arch/arm/configs/multi_v7_defconfig @@ -1150,6 +1150,8 @@ CONFIG_STM32_FMC2_EBI=y CONFIG_EXYNOS5422_DMC=m CONFIG_IIO=y CONFIG_IIO_SW_TRIGGER=y +CONFIG_IIO_ST_ACCEL_3AXIS=m +# CONFIG_IIO_ST_ACCEL_SPI_3AXIS is not set CONFIG_ASPEED_ADC=m CONFIG_AT91_ADC=m CONFIG_AT91_SAMA5D2_ADC=m @@ -1169,6 +1171,7 @@ CONFIG_IIO_CROS_EC_SENSORS_CORE=m CONFIG_IIO_CROS_EC_SENSORS=m CONFIG_STM32_DAC=m CONFIG_MPU3050_I2C=y +CONFIG_IIO_ST_GYRO_3AXIS=m CONFIG_CM36651=m CONFIG_IIO_CROS_EC_LIGHT_PROX=m CONFIG_SENSORS_ISL29018=y From f55fcdb06f529c5031ada7edd5ede63dfdcc4a54 Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 9 Jan 2024 15:29:27 +0800 Subject: [PATCH 0071/1406] fs: fix a typo in attr.c The word "filesytem" should be "filesystem" Signed-off-by: Jay Link: https://lore.kernel.org/r/20240109072927.29626-1-merqqcury@gmail.com Signed-off-by: Christian Brauner --- fs/attr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/attr.c b/fs/attr.c index 5a13f0c8495fde..49d23b5dbab4b9 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -352,7 +352,7 @@ int may_setattr(struct mnt_idmap *idmap, struct inode *inode, EXPORT_SYMBOL(may_setattr); /** - * notify_change - modify attributes of a filesytem object + * 
notify_change - modify attributes of a filesystem object * @idmap: idmap of the mount the inode was found from * @dentry: object affected * @attr: new attributes From a121e297aac51c3ddb3f1f9aea58d4289aea8bc8 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 8 Jan 2024 18:20:40 +0100 Subject: [PATCH 0072/1406] fs: Wrong function name in comment This comment refers to function mark_buffer_inode_dirty(), but the function is actually called mark_buffer_dirty_inode(), so fix the comment. Signed-off-by: Andreas Gruenbacher Link: https://lore.kernel.org/r/20240108172040.178173-1-agruenba@redhat.com Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index d3bcf601d3e5a5..dcafee512089a2 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -464,7 +464,7 @@ EXPORT_SYMBOL(mark_buffer_async_write); * a successful fsync(). For example, ext2 indirect blocks need to be * written back and waited upon before fsync() returns. * - * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), + * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(), * inode_has_buffers() and invalidate_inode_buffers() are provided for the * management of a list of dependent buffers at ->i_mapping->i_private_list. * From 30c45816e23523cd4527a8d20bf20c7d6bfd5a16 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Thu, 11 Jan 2024 17:22:40 +1100 Subject: [PATCH 0073/1406] initramfs: remove duplicate built-in __initramfs_start unpacking If initrd_start cpio extraction fails, CONFIG_BLK_DEV_RAM triggers fallback to initrd.image handling via populate_initrd_image(). The populate_initrd_image() call follows successful extraction of any built-in cpio archive at __initramfs_start, but currently performs built-in archive extraction a second time. Prior to commit b2a74d5f9d446 ("initramfs: remove clean_rootfs"), the second built-in initramfs unpack call was used to repopulate entries removed by clean_rootfs(), but it's no longer necessary now the contents of the previous extraction are retained. Signed-off-by: David Disseldorp Link: https://lore.kernel.org/r/20240111062240.9362-1-ddiss@suse.de Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- init/initramfs.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/init/initramfs.c b/init/initramfs.c index 76deb48c38cb16..d3c623dde01a88 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -679,8 +679,6 @@ static void __init populate_initrd_image(char *err) struct file *file; loff_t pos = 0; - unpack_to_rootfs(__initramfs_start, __initramfs_size); - printk(KERN_INFO "rootfs image is not initramfs (%s); looks like an initrd\n", err); file = filp_open("/initrd.image", O_WRONLY | O_CREAT, 0700); From ce2128e96b51c78732010fc79763d7a7141259e2 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Wed, 10 Jan 2024 23:47:40 +0800 Subject: [PATCH 0074/1406] eventfd: add a BUILD_BUG_ON() to ensure consistency between EFD_SEMAPHORE and the uapi introduce a BUILD_BUG_ON to check that the EFD_SEMAPHORE is equal to its definition in the uapi file, just like EFD_CLOEXEC and EFD_NONBLOCK. 
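For illustration, a minimal sketch of the pattern (with invented FOO_* constants, not the eventfd code itself): BUILD_BUG_ON() turns any future divergence between an internal flag and its uapi twin into a compile-time failure.

	#include <linux/build_bug.h>

	#define FOO_UAPI_SEMAPHORE (1 << 0)	/* value exported to userspace */
	#define FOO_SEMAPHORE 0x1		/* internal definition */

	static int do_foo(unsigned int flags)
	{
		/* compilation fails if the two definitions ever diverge */
		BUILD_BUG_ON(FOO_SEMAPHORE != FOO_UAPI_SEMAPHORE);

		return flags & FOO_SEMAPHORE;
	}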
Signed-off-by: Wen Yang Link: https://lore.kernel.org/r/tencent_0BAA2DEAF9208D49987457E6583F9BE79507@qq.com Cc: Alexander Viro Cc: Christian Brauner Cc: Jens Axboe Cc: Jan Kara Cc: Cc: Signed-off-by: Christian Brauner --- fs/eventfd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/eventfd.c b/fs/eventfd.c index ad8186d47ba760..0252b71099fbca 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -383,6 +383,7 @@ static int do_eventfd(unsigned int count, int flags) /* Check the EFD_* constants for consistency. */ BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); + BUILD_BUG_ON(EFD_SEMAPHORE != (1 << 0)); if (flags & ~EFD_FLAGS_SET) return -EINVAL; From 4148bf4c5e6dc0932e3d4649047b203e9d554d3c Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Tue, 16 Jan 2024 17:11:37 +0800 Subject: [PATCH 0075/1406] buffer: Use KMEM_CACHE instead of kmem_cache_create() Use the new KMEM_CACHE() macro instead of direct kmem_cache_create to simplify the creation of SLAB caches. Signed-off-by: Kunwu Chan Link: https://lore.kernel.org/r/20240116091137.92375-1-chentao@kylinos.cn Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/buffer.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index dcafee512089a2..b55dea034a5d83 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3121,12 +3121,8 @@ void __init buffer_init(void) unsigned long nrpages; int ret; - bh_cachep = kmem_cache_create("buffer_head", - sizeof(struct buffer_head), 0, - (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| - SLAB_MEM_SPREAD), - NULL); - + bh_cachep = KMEM_CACHE(buffer_head, + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); /* * Limit the bh occupancy to 10% of ZONE_NORMAL */ From c2f1af4e033e17f9332e47e8fc3266afd95ad548 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 16 Jan 2024 15:53:35 +0800 Subject: [PATCH 0076/1406] fs: improve dump_mapping() robustness We met a kernel crash issue when running stress-ng testing, and the system crashes when printing the dentry name in dump_mapping(). Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 pc : dentry_name+0xd8/0x224 lr : pointer+0x22c/0x370 sp : ffff800025f134c0 ...... Call trace: dentry_name+0xd8/0x224 pointer+0x22c/0x370 vsnprintf+0x1ec/0x730 vscnprintf+0x2c/0x60 vprintk_store+0x70/0x234 vprintk_emit+0xe0/0x24c vprintk_default+0x3c/0x44 vprintk_func+0x84/0x2d0 printk+0x64/0x88 __dump_page+0x52c/0x530 dump_page+0x14/0x20 set_migratetype_isolate+0x110/0x224 start_isolate_page_range+0xc4/0x20c offline_pages+0x124/0x474 memory_block_offline+0x44/0xf4 memory_subsys_offline+0x3c/0x70 device_offline+0xf0/0x120 ...... The root cause is that, one thread is doing page migration, and we will use the target page's ->mapping field to save 'anon_vma' pointer between page unmap and page move, and now the target page is locked and refcount is 1. Currently, there is another stress-ng thread performing memory hotplug, attempting to offline the target page that is being migrated. It discovers that the refcount of this target page is 1, preventing the offline operation, thus proceeding to dump the page. However, page_mapping() of the target page may return an incorrect file mapping to crash the system in dump_mapping(), since the target page->mapping only saves 'anon_vma' pointer without setting PAGE_MAPPING_ANON flag. The page migration issue has been fixed by commit d1adb25df711 ("mm: migrate: fix getting incorrect page mapping during page migration"). 
In addition, Matthew suggested we should also improve dump_mapping()'s robustness to be resilient against this kind of kernel crash [1]. By checking the 'dentry.d_parent' and 'dentry.d_name.name' pointers used by dentry_name(), dump_mapping() will output the invalid dentry instead of crashing the system when this issue is reproduced again. [12211.189128] page:fffff7de047741c0 refcount:1 mapcount:0 mapping:ffff989117f55ea0 index:0x1 pfn:0x211dd07 [12211.189144] aops:0x0 ino:1 invalid dentry:74786574206e6870 [12211.189148] flags: 0x57ffffc0000001(locked|node=1|zone=2|lastcpupid=0x1fffff) [12211.189150] page_type: 0xffffffff() [12211.189153] raw: 0057ffffc0000001 0000000000000000 dead000000000122 ffff989117f55ea0 [12211.189154] raw: 0000000000000001 0000000000000001 00000001ffffffff 0000000000000000 [12211.189155] page dumped because: unmovable page [1] https://lore.kernel.org/all/ZXxn%2F0oixJxxAnpF@casper.infradead.org/ Suggested-by: Matthew Wilcox Signed-off-by: Baolin Wang Link: https://lore.kernel.org/r/937ab1f87328516821d39be672b6bc18861d9d3e.1705391420.git.baolin.wang@linux.alibaba.com Signed-off-by: Christian Brauner --- fs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index 91048c4c9c9e7d..6d0d5423036380 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -588,7 +588,8 @@ void dump_mapping(const struct address_space *mapping) } dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); - if (get_kernel_nofault(dentry, dentry_ptr)) { + if (get_kernel_nofault(dentry, dentry_ptr) || + !dentry.d_parent || !dentry.d_name.name) { pr_warn("aops:%ps ino:%lx invalid dentry:%px\n", a_ops, ino, dentry_ptr); return; From 352f0ba021364519f95aa378dcca18cf7600fba1 Mon Sep 17 00:00:00 2001 From: Hu Yadi Date: Fri, 12 Jan 2024 15:40:59 +0800 Subject: [PATCH 0077/1406] selftests/filesystems: fix build error in overlayfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit One build issue comes up because two different mount.h headers are both included by dev_in_maps.c In file included from dev_in_maps.c:10: /usr/include/sys/mount.h:35:3: error: expected identifier before numeric constant 35 | MS_RDONLY = 1, /* Mount read-only.
*/ | ^~~~~~~~~ In file included from dev_in_maps.c:13: Remove one of them to solve conflict, another error comes up: dev_in_maps.c:170:6: error: implicit declaration of function ‘mount’ [-Werror=implicit-function-declaration] 170 | if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) { | ^~~~~ cc1: all warnings being treated as errors and then , add sys_mount definition to solve it After both above, dev_in_maps.c can be built correctly on my mache(gcc 10.2,glibc-2.32,kernel-5.10) Signed-off-by: Hu Yadi Link: https://lore.kernel.org/r/20240112074059.29673-1-hu.yadi@h3c.com Acked-by: Andrei Vagin Signed-off-by: Christian Brauner --- .../selftests/filesystems/overlayfs/dev_in_maps.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c index e19ab0e8570913..759f86e7d263e4 100644 --- a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c +++ b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -32,7 +31,11 @@ static int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags) { return syscall(__NR_fsmount, fd, flags, attr_flags); } - +static int sys_mount(const char *src, const char *tgt, const char *fst, + unsigned long flags, const void *data) +{ + return syscall(__NR_mount, src, tgt, fst, flags, data); +} static int sys_move_mount(int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, unsigned int flags) @@ -166,8 +169,7 @@ int main(int argc, char **argv) ksft_test_result_skip("unable to create a new mount namespace\n"); return 1; } - - if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) { + if (sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) { pr_perror("mount"); return 1; } From b872e2a5ac3456dd1a3229657e1ea46aff06d2f1 Mon Sep 17 00:00:00 2001 From: "Hu.Yadi" Date: Thu, 11 Jan 2024 19:32:29 +0800 Subject: [PATCH 0078/1406] selftests/move_mount_set_group:Make tests build with old libc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace SYS_ with __NR_. Using the __NR_ notation, provided by UAPI, is useful to build tests on systems without the SYS_ definitions. 
Replace SYS_move_mount with __NR_move_mount Similar changes: commit 87129ef13603 ("selftests/landlock: Make tests build with old libc") Acked-by: Mickaël Salaün Signed-off-by: Hu.Yadi Link: https://lore.kernel.org/r/20240111113229.10820-1-hu.yadi@h3c.com Reviewed-by: Berlin Suggested-by: Jiao Signed-off-by: Christian Brauner --- .../move_mount_set_group/move_mount_set_group_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c b/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c index 50ed5d475dd131..bcf51d785a3712 100644 --- a/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c +++ b/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c @@ -218,7 +218,7 @@ static bool move_mount_set_group_supported(void) if (mount(NULL, SET_GROUP_FROM, NULL, MS_SHARED, 0)) return -1; - ret = syscall(SYS_move_mount, AT_FDCWD, SET_GROUP_FROM, + ret = syscall(__NR_move_mount, AT_FDCWD, SET_GROUP_FROM, AT_FDCWD, SET_GROUP_TO, MOVE_MOUNT_SET_GROUP); umount2("/tmp", MNT_DETACH); @@ -363,7 +363,7 @@ TEST_F(move_mount_set_group, complex_sharing_copying) CLONE_VM | CLONE_FILES); ASSERT_GT(pid, 0); ASSERT_EQ(wait_for_pid(pid), 0); - ASSERT_EQ(syscall(SYS_move_mount, ca_from.mntfd, "", + ASSERT_EQ(syscall(__NR_move_mount, ca_from.mntfd, "", ca_to.mntfd, "", MOVE_MOUNT_SET_GROUP | MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); From 74ad68a64b60029c9e5dc78f63ae3fffc9a5569b Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Mon, 31 Aug 2020 11:32:08 -0400 Subject: [PATCH 0079/1406] vfs: add RWF_NOAPPEND flag for pwritev2 The pwrite function, originally defined by POSIX (thus the "p"), is defined to ignore O_APPEND and write at the offset passed as its argument. However, historically Linux honored O_APPEND if set and ignored the offset. This cannot be changed due to stability policy, but is documented in the man page as a bug. Now that there's a pwritev2 syscall providing a superset of the pwrite functionality that has a flags argument, the conforming behavior can be offered to userspace via a new flag. Since pwritev2 checks flag validity (in kiocb_set_rw_flags) and reports unknown ones with EOPNOTSUPP, callers will not get wrong behavior on old kernels that don't support the new flag; the error is reported and the caller can decide how to handle it. 
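To make the intended use concrete, a minimal userspace sketch (assuming a libc that exposes pwritev2() and a uapi header defining RWF_NOAPPEND; write_at() is a made-up helper):

	#define _GNU_SOURCE
	#include <sys/uio.h>

	/* Write at 'off' even if 'fd' was opened with O_APPEND. */
	static ssize_t write_at(int fd, const void *buf, size_t len, off_t off)
	{
		struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };

		/* EOPNOTSUPP means the running kernel predates RWF_NOAPPEND */
		return pwritev2(fd, &iov, 1, off, RWF_NOAPPEND);
	}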
Signed-off-by: Rich Felker Link: https://lore.kernel.org/r/20200831153207.GO3265@brightrain.aerifal.cx Reviewed-by: Jann Horn Signed-off-by: Christian Brauner --- include/linux/fs.h | 8 ++++++++ include/uapi/linux/fs.h | 5 ++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a7049512..4f7cfda29143e2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3335,6 +3335,8 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) return 0; if (unlikely(flags & ~RWF_SUPPORTED)) return -EOPNOTSUPP; + if (unlikely((flags & RWF_APPEND) && (flags & RWF_NOAPPEND))) + return -EINVAL; if (flags & RWF_NOWAIT) { if (!(ki->ki_filp->f_mode & FMODE_NOWAIT)) @@ -3345,6 +3347,12 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; + if ((flags & RWF_NOAPPEND) && (ki->ki_flags & IOCB_APPEND)) { + if (IS_APPEND(file_inode(ki->ki_filp))) + return -EPERM; + ki->ki_flags &= ~IOCB_APPEND; + } + ki->ki_flags |= kiocb_flags; return 0; } diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 48ad69f7722e1a..2203d3194b91a7 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -301,9 +301,12 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO O_APPEND */ #define RWF_APPEND ((__force __kernel_rwf_t)0x00000010) +/* per-IO negation of O_APPEND */ +#define RWF_NOAPPEND ((__force __kernel_rwf_t)0x00000020) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND) + RWF_APPEND | RWF_NOAPPEND) /* Pagemap ioctl */ #define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) From 172827cc44e92f3416392a43e4d219cbcf4d012c Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 19 Jan 2024 04:33:39 +0800 Subject: [PATCH 0080/1406] writeback: move wb_wakeup_delayed definition to fs-writeback.c wb_wakeup_delayed() is only used in fs-writeback.c. Move it to fs-writeback.c after the definition of wb_wakeup() and make it static. Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20240118203339.764093-1-shikemeng@huaweicloud.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 25 +++++++++++++++++++++++++ include/linux/backing-dev.h | 1 - mm/backing-dev.c | 25 ------------------------- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3d84fcc471c600..e4f17c53ddfcf3 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -141,6 +141,31 @@ static void wb_wakeup(struct bdi_writeback *wb) spin_unlock_irq(&wb->work_lock); } +/* + * This function is used when the first inode for this wb is marked dirty. It + * wakes-up the corresponding bdi thread which should then take care of the + * periodic background write-out of dirty inodes. Since the write-out would + * starts only 'dirty_writeback_interval' centisecs from now anyway, we just + * set up a timer which wakes the bdi thread up later. + * + * Note, we wouldn't bother setting up the timer, but this function is on the + * fast-path (used by '__mark_inode_dirty()'), so we save few context switches + * by delaying the wake-up. + * + * We have to be careful not to postpone flush work if it is scheduled for + * earlier. Thus we use queue_delayed_work().
+ */ +static void wb_wakeup_delayed(struct bdi_writeback *wb) +{ + unsigned long timeout; + + timeout = msecs_to_jiffies(dirty_writeback_interval * 10); + spin_lock_irq(&wb->work_lock); + if (test_bit(WB_registered, &wb->state)) + queue_delayed_work(bdi_wq, &wb->dwork, timeout); + spin_unlock_irq(&wb->work_lock); +} + static void finish_writeback_work(struct bdi_writeback *wb, struct wb_writeback_work *work) { diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 1a97277f99b1b8..8e7af9a03b41dd 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -38,7 +38,6 @@ struct backing_dev_info *bdi_alloc(int node_id); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); -void wb_wakeup_delayed(struct bdi_writeback *wb); void wb_wait_for_completion(struct wb_completion *done); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1e3447bccdb14d..039dc74b505a85 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -372,31 +372,6 @@ static int __init default_bdi_init(void) } subsys_initcall(default_bdi_init); -/* - * This function is used when the first inode for this wb is marked dirty. It - * wakes-up the corresponding bdi thread which should then take care of the - * periodic background write-out of dirty inodes. Since the write-out would - * starts only 'dirty_writeback_interval' centisecs from now anyway, we just - * set up a timer which wakes the bdi thread up later. - * - * Note, we wouldn't bother setting up the timer, but this function is on the - * fast-path (used by '__mark_inode_dirty()'), so we save few context switches - * by delaying the wake-up. - * - * We have to be careful not to postpone flush work if it is scheduled for - * earlier. Thus we use queue_delayed_work(). - */ -void wb_wakeup_delayed(struct bdi_writeback *wb) -{ - unsigned long timeout; - - timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - spin_lock_irq(&wb->work_lock); - if (test_bit(WB_registered, &wb->state)) - queue_delayed_work(bdi_wq, &wb->dwork, timeout); - spin_unlock_irq(&wb->work_lock); -} - static void wb_update_bandwidth_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), From 19e062e48b33f9f52dbb6f87def79b709f407260 Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Fri, 19 Jan 2024 07:39:06 -0800 Subject: [PATCH 0081/1406] do_sys_name_to_handle(): use kzalloc() to fix kernel-infoleak syzbot identified a kernel information leak vulnerability in do_sys_name_to_handle() and issued the following report [1]. [1] "BUG: KMSAN: kernel-infoleak in instrument_copy_to_user include/linux/instrumented.h:114 [inline] BUG: KMSAN: kernel-infoleak in _copy_to_user+0xbc/0x100 lib/usercopy.c:40 instrument_copy_to_user include/linux/instrumented.h:114 [inline] _copy_to_user+0xbc/0x100 lib/usercopy.c:40 copy_to_user include/linux/uaccess.h:191 [inline] do_sys_name_to_handle fs/fhandle.c:73 [inline] __do_sys_name_to_handle_at fs/fhandle.c:112 [inline] __se_sys_name_to_handle_at+0x949/0xb10 fs/fhandle.c:94 __x64_sys_name_to_handle_at+0xe4/0x140 fs/fhandle.c:94 ... 
Uninit was created at: slab_post_alloc_hook+0x129/0xa70 mm/slab.h:768 slab_alloc_node mm/slub.c:3478 [inline] __kmem_cache_alloc_node+0x5c9/0x970 mm/slub.c:3517 __do_kmalloc_node mm/slab_common.c:1006 [inline] __kmalloc+0x121/0x3c0 mm/slab_common.c:1020 kmalloc include/linux/slab.h:604 [inline] do_sys_name_to_handle fs/fhandle.c:39 [inline] __do_sys_name_to_handle_at fs/fhandle.c:112 [inline] __se_sys_name_to_handle_at+0x441/0xb10 fs/fhandle.c:94 __x64_sys_name_to_handle_at+0xe4/0x140 fs/fhandle.c:94 ... Bytes 18-19 of 20 are uninitialized Memory access of size 20 starts at ffff888128a46380 Data copied to user address 0000000020000240" Per Chuck Lever's suggestion, use kzalloc() instead of kmalloc() to solve the problem. Fixes: 990d6c2d7aee ("vfs: Add name to file handle conversion support") Suggested-by: Chuck Lever III Reported-and-tested-by: Signed-off-by: Nikita Zhandarovich Link: https://lore.kernel.org/r/20240119153906.4367-1-n.zhandarovich@fintech.ru Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fhandle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fhandle.c b/fs/fhandle.c index 18b3ba8dc8ead7..57a12614addfd4 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -36,7 +36,7 @@ static long do_sys_name_to_handle(const struct path *path, if (f_handle.handle_bytes > MAX_HANDLE_SZ) return -EINVAL; - handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + handle = kzalloc(sizeof(struct file_handle) + f_handle.handle_bytes, GFP_KERNEL); if (!handle) return -ENOMEM; From 04b945e4cf81a12365f8207a4d34dbc81ba17413 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 21 Dec 2023 07:16:04 -0800 Subject: [PATCH 0082/1406] slimbus: qcom-ngd-ctrl: Make QMI message rules const Commit ff6d365898d4 ("soc: qcom: qmi: use const for struct qmi_elem_info") allows QMI message encoding/decoding rules to be const, so do that for qcom-ngd-ctrl.c. 
Signed-off-by: Jeff Johnson Signed-off-by: Srinivas Kandagatla --- drivers/slimbus/qcom-ngd-ctrl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/slimbus/qcom-ngd-ctrl.c b/drivers/slimbus/qcom-ngd-ctrl.c index 77aa6d26476cd2..efeba8275a6691 100644 --- a/drivers/slimbus/qcom-ngd-ctrl.c +++ b/drivers/slimbus/qcom-ngd-ctrl.c @@ -220,7 +220,7 @@ struct slimbus_power_resp_msg_v01 { struct qmi_response_type_v01 resp; }; -static struct qmi_elem_info slimbus_select_inst_req_msg_v01_ei[] = { +static const struct qmi_elem_info slimbus_select_inst_req_msg_v01_ei[] = { { .data_type = QMI_UNSIGNED_4_BYTE, .elem_len = 1, @@ -262,7 +262,7 @@ static struct qmi_elem_info slimbus_select_inst_req_msg_v01_ei[] = { }, }; -static struct qmi_elem_info slimbus_select_inst_resp_msg_v01_ei[] = { +static const struct qmi_elem_info slimbus_select_inst_resp_msg_v01_ei[] = { { .data_type = QMI_STRUCT, .elem_len = 1, @@ -284,7 +284,7 @@ static struct qmi_elem_info slimbus_select_inst_resp_msg_v01_ei[] = { }, }; -static struct qmi_elem_info slimbus_power_req_msg_v01_ei[] = { +static const struct qmi_elem_info slimbus_power_req_msg_v01_ei[] = { { .data_type = QMI_UNSIGNED_4_BYTE, .elem_len = 1, @@ -324,7 +324,7 @@ static struct qmi_elem_info slimbus_power_req_msg_v01_ei[] = { }, }; -static struct qmi_elem_info slimbus_power_resp_msg_v01_ei[] = { +static const struct qmi_elem_info slimbus_power_resp_msg_v01_ei[] = { { .data_type = QMI_STRUCT, .elem_len = 1, From ace4b31b297dfd7b8c969ff5046c8128c3e025be Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 18 Jan 2024 16:19:13 +0530 Subject: [PATCH 0083/1406] cpufreq: Move dev_pm_opp_{init|free}_cpufreq_table() to pm_opp.h Move the declaration of functions defined in the OPP core to pm_opp.h. These were added to cpufreq.h as it was the only user of the APIs, but that was a mistake perhaps. Fix it. Signed-off-by: Viresh Kumar --- include/linux/cpufreq.h | 20 -------------------- include/linux/pm_opp.h | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index afda5f24d3ddc6..8ff3e79727d80c 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -694,26 +694,6 @@ struct cpufreq_frequency_table { * order */ }; -#if defined(CONFIG_CPU_FREQ) && defined(CONFIG_PM_OPP) -int dev_pm_opp_init_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table **table); -void dev_pm_opp_free_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table **table); -#else -static inline int dev_pm_opp_init_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table - **table) -{ - return -EINVAL; -} - -static inline void dev_pm_opp_free_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table - **table) -{ -} -#endif - /* * cpufreq_for_each_entry - iterate over a cpufreq_frequency_table * @pos: the cpufreq_frequency_table * to use as a loop cursor. 
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 76dcb7f37bcdff..f1ac8bde09cb56 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -16,6 +16,7 @@ #include struct clk; +struct cpufreq_frequency_table; struct regulator; struct dev_pm_opp; struct device; @@ -444,6 +445,21 @@ static inline int dev_pm_opp_sync_regulators(struct device *dev) #endif /* CONFIG_PM_OPP */ +#if defined(CONFIG_CPU_FREQ) && defined(CONFIG_PM_OPP) +int dev_pm_opp_init_cpufreq_table(struct device *dev, struct cpufreq_frequency_table **table); +void dev_pm_opp_free_cpufreq_table(struct device *dev, struct cpufreq_frequency_table **table); +#else +static inline int dev_pm_opp_init_cpufreq_table(struct device *dev, struct cpufreq_frequency_table **table) +{ + return -EINVAL; +} + +static inline void dev_pm_opp_free_cpufreq_table(struct device *dev, struct cpufreq_frequency_table **table) +{ +} +#endif + + #if defined(CONFIG_PM_OPP) && defined(CONFIG_OF) int dev_pm_opp_of_add_table(struct device *dev); int dev_pm_opp_of_add_table_indexed(struct device *dev, int index); From 52501486483e1646852f78f3f5af89ab573d2caf Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Mon, 15 Jan 2024 23:27:00 +0800 Subject: [PATCH 0084/1406] eventfd: move 'eventfd-count' printing out of spinlock When printing eventfd->count, interrupts will be disabled and a spinlock will be obtained, competing with eventfd_write(). By moving the "eventfd-count" print out of the spinlock and merging multiple seq_printf() into one, it could improve a bit, just like timerfd_show(). Signed-off-by: Wen Yang Link: https://lore.kernel.org/r/tencent_B0B3D2BD9861FD009E03AB18A81783322709@qq.com Cc: Alexander Viro Cc: Jens Axboe Cc: Christian Brauner Cc: Christoph Hellwig Cc: Dylan Yudaken Cc: David Woodhouse Cc: Matthew Wilcox Cc: Eric Biggers Cc: Cc: Signed-off-by: Christian Brauner --- fs/eventfd.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/eventfd.c b/fs/eventfd.c index 0252b71099fbca..fc4d8109076392 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -283,13 +283,18 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c static void eventfd_show_fdinfo(struct seq_file *m, struct file *f) { struct eventfd_ctx *ctx = f->private_data; + __u64 cnt; spin_lock_irq(&ctx->wqh.lock); - seq_printf(m, "eventfd-count: %16llx\n", - (unsigned long long)ctx->count); + cnt = ctx->count; spin_unlock_irq(&ctx->wqh.lock); - seq_printf(m, "eventfd-id: %d\n", ctx->id); - seq_printf(m, "eventfd-semaphore: %d\n", + + seq_printf(m, + "eventfd-count: %16llx\n" + "eventfd-id: %d\n" + "eventfd-semaphore: %d\n", + cnt, + ctx->id, !!(ctx->flags & EFD_SEMAPHORE)); } #endif From 5b3d743da951aeaedda401304d88b7b6ec1969d7 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 23 Jan 2024 09:34:50 +0100 Subject: [PATCH 0085/1406] dt-bindings: sram: narrow regex for unit address to hex numbers Regular expression used to match the unit address part should not allow non-hex numbers. 
Signed-off-by: Krzysztof Kozlowski Acked-by: Jernej Skrabec Link: https://lore.kernel.org/r/20240123083450.20996-1-krzysztof.kozlowski@linaro.org Signed-off-by: Jernej Skrabec --- .../bindings/sram/allwinner,sun4i-a10-system-control.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/sram/allwinner,sun4i-a10-system-control.yaml b/Documentation/devicetree/bindings/sram/allwinner,sun4i-a10-system-control.yaml index a1c96985951ff2..cf07b8f787a6ed 100644 --- a/Documentation/devicetree/bindings/sram/allwinner,sun4i-a10-system-control.yaml +++ b/Documentation/devicetree/bindings/sram/allwinner,sun4i-a10-system-control.yaml @@ -56,7 +56,7 @@ properties: ranges: true patternProperties: - "^sram@[a-z0-9]+": + "^sram@[a-f0-9]+": $ref: /schemas/sram/sram.yaml# unevaluatedProperties: false From 26ca757780d1ba9a982f8b79aef8e4cf5d171182 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 20 Jan 2024 21:18:57 -0800 Subject: [PATCH 0086/1406] clk: sunxi: usb: fix kernel-doc warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the function description comment to immediately above the function implementation, the add function parameter descriptions to prevent kernel-doc warnings: clk-usb.c:80: warning: expecting prototype for sunxi_usb_clk_setup(). Prototype was for SUNXI_USB_MAX_SIZE() instead clk-usb.c:91: warning: Function parameter or struct member 'node' not described in 'sunxi_usb_clk_setup' clk-usb.c:91: warning: Function parameter or struct member 'data' not described in 'sunxi_usb_clk_setup' clk-usb.c:91: warning: Function parameter or struct member 'lock' not described in 'sunxi_usb_clk_setup' Signed-off-by: Randy Dunlap Cc: Emilio López Cc: Michael Turquette Cc: Stephen Boyd Cc: Cc: Chen-Yu Tsai Cc: Jernej Skrabec Cc: Samuel Holland Cc: Cc: Reviewed-by: Andre Przywara Acked-by: Jernej Skrabec Link: https://lore.kernel.org/r/20240121051858.17647-1-rdunlap@infradead.org Signed-off-by: Jernej Skrabec --- drivers/clk/sunxi/clk-usb.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/clk/sunxi/clk-usb.c b/drivers/clk/sunxi/clk-usb.c index 5460218f3467ab..3c53f65002a285 100644 --- a/drivers/clk/sunxi/clk-usb.c +++ b/drivers/clk/sunxi/clk-usb.c @@ -73,9 +73,6 @@ static const struct reset_control_ops sunxi_usb_reset_ops = { .deassert = sunxi_usb_reset_deassert, }; -/** - * sunxi_usb_clk_setup() - Setup function for usb gate clocks - */ #define SUNXI_USB_MAX_SIZE 32 @@ -85,6 +82,12 @@ struct usb_clk_data { bool reset_needs_clk; }; +/** + * sunxi_usb_clk_setup() - Setup function for usb gate clocks + * @node: &struct device_node for the clock + * @data: &struct usb_clk_data for the clock + * @lock: spinlock for the clock + */ static void __init sunxi_usb_clk_setup(struct device_node *node, const struct usb_clk_data *data, spinlock_t *lock) From 367122c529f35b4655acbe33c0cc4d6d3b32ba71 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 24 Jan 2024 15:13:40 -0300 Subject: [PATCH 0087/1406] libfs: Attempt exact-match comparison first during casefolded lookup Casefolded comparisons are (obviously) way more costly than a simple memcmp. Try the case-sensitive comparison first, falling-back to the case-insensitive lookup only when needed. This allows any exact-match lookup to complete without having to walk the utf8 trie. 
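The shape of the optimization, as a standalone sketch with generic names (the real change to generic_ci_d_compare() is in the diff below; casefold_cmp() stands in for the costly utf8 trie walk):

	#include <string.h>

	extern int casefold_cmp(const char *a, size_t alen,
				const char *b, size_t blen);

	static int ci_compare(const char *a, size_t alen,
			      const char *b, size_t blen)
	{
		/* cheap byte-exact match covers most lookups */
		if (alen == blen && !memcmp(a, b, alen))
			return 0;

		return casefold_cmp(a, alen, b, blen);	/* slow path */
	}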
Note that, for strict mode, generic_ci_d_compare used to reject an invalid UTF-8 string, which would now be considered valid if it exact-matches the disk-name. But, if that is the case, the filesystem is corrupt. More than that, it really doesn't matter in practice, because the name-under-lookup will have already been rejected by generic_ci_d_hash and we won't even get here. The memcmp is safe under RCU because we are operating on str/len instead of dentry->d_name directly, and the caller guarantees their consistency between each other in __d_lookup_rcu_op_compare. Link: https://lore.kernel.org/r/87ttn2sip7.fsf_-_@mailhost.krisman.be Suggested-by: Linus Torvalds Signed-off-by: Gabriel Krisman Bertazi --- fs/libfs.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index eec6031b015544..306a0510b7dc25 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1704,16 +1704,28 @@ bool is_empty_dir_inode(struct inode *inode) static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - const struct dentry *parent = READ_ONCE(dentry->d_parent); - const struct inode *dir = READ_ONCE(parent->d_inode); - const struct super_block *sb = dentry->d_sb; - const struct unicode_map *um = sb->s_encoding; - struct qstr qstr = QSTR_INIT(str, len); + const struct dentry *parent; + const struct inode *dir; char strbuf[DNAME_INLINE_LEN]; - int ret; + struct qstr qstr; + + /* + * Attempt a case-sensitive match first. It is cheaper and + * should cover most lookups, including all the sane + * applications that expect a case-sensitive filesystem. + * + * This comparison is safe under RCU because the caller + * guarantees the consistency between str and len. See + * __d_lookup_rcu_op_compare() for details. + */ + if (len == name->len && !memcmp(str, name->name, len)) + return 0; + parent = READ_ONCE(dentry->d_parent); + dir = READ_ONCE(parent->d_inode); if (!dir || !IS_CASEFOLDED(dir)) - goto fallback; + return 1; + /* * If the dentry name is stored in-line, then it may be concurrently * modified by a rename. 
If this happens, the VFS will eventually retry @@ -1724,20 +1736,14 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, if (len <= DNAME_INLINE_LEN - 1) { memcpy(strbuf, str, len); strbuf[len] = 0; - qstr.name = strbuf; + str = strbuf; /* prevent compiler from optimizing out the temporary buffer */ barrier(); } - ret = utf8_strncasecmp(um, name, &qstr); - if (ret >= 0) - return ret; + qstr.len = len; + qstr.name = str; - if (sb_has_strict_encoding(sb)) - return -EINVAL; -fallback: - if (len != name->len) - return 1; - return !!memcmp(str, name->name, len); + return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr); } /** From 0b3bbd8f9baf245ec77d86f6f5bc902105b4bfa9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 22 Dec 2023 21:05:11 -0800 Subject: [PATCH 0088/1406] counter: linux/counter.h: fix Excess kernel-doc description warning Remove the @priv: line to prevent the kernel-doc warning: include/linux/counter.h:400: warning: Excess struct member 'priv' description in 'counter_device' Signed-off-by: Randy Dunlap Fixes: f2ee4759fb70 ("counter: remove old and now unused registration API") Link: https://lore.kernel.org/r/20231223050511.13849-1-rdunlap@infradead.org Signed-off-by: William Breathitt Gray --- include/linux/counter.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/counter.h b/include/linux/counter.h index 702e9108bbb44e..b767b5c821f58e 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -359,7 +359,6 @@ struct counter_ops { * @num_counts: number of Counts specified in @counts * @ext: optional array of Counter device extensions * @num_ext: number of Counter device extensions specified in @ext - * @priv: optional private data supplied by driver * @dev: internal device structure * @chrdev: internal character device structure * @events_list: list of current watching Counter events From 3b75d271e161e22aff8171940a77510d2fb2ad6f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Sun, 14 Jan 2024 16:39:21 +0200 Subject: [PATCH 0089/1406] backlight: hx8357: Fix potential NULL pointer dereference The "im" pins are optional. Add missing check in the hx8357_probe(). 
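The API contract behind the fix, reduced to a sketch (hypothetical driver code, with NUM_PINS standing in for HX8357_NUM_IM_PINS): devm_gpiod_get_array_optional() returns NULL when the pins are simply not described, and an error pointer only on real failures, so the NULL case must be handled before any dereference.

	#include <linux/err.h>
	#include <linux/gpio/consumer.h>

	#define NUM_PINS 3	/* stand-in for the driver's pin count */

	static int request_optional_pins(struct device *dev,
					 struct gpio_descs **out)
	{
		struct gpio_descs *pins;

		pins = devm_gpiod_get_array_optional(dev, "im", GPIOD_OUT_LOW);
		if (IS_ERR(pins))	/* a real lookup failure */
			return PTR_ERR(pins);
		if (pins && pins->ndescs < NUM_PINS)
			return -EINVAL;	/* present but incomplete */

		*out = pins;	/* may still be NULL: the pins are optional */
		return 0;
	}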
Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/642e1230-3358-4006-a17f-3f297897ae74@moroto.mountain Fixes: 7d84a63a39b7 ("backlight: hx8357: Convert to agnostic GPIO API") Signed-off-by: Andy Shevchenko Reviewed-by: Daniel Thompson Link: https://lore.kernel.org/r/20240114143921.550736-1-andriy.shevchenko@linux.intel.com Signed-off-by: Lee Jones --- drivers/video/backlight/hx8357.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/video/backlight/hx8357.c b/drivers/video/backlight/hx8357.c index d7298376cf74dd..bf18337ff0c2c0 100644 --- a/drivers/video/backlight/hx8357.c +++ b/drivers/video/backlight/hx8357.c @@ -609,11 +609,13 @@ static int hx8357_probe(struct spi_device *spi) lcd->im_pins = devm_gpiod_get_array_optional(dev, "im", GPIOD_OUT_LOW); if (IS_ERR(lcd->im_pins)) return dev_err_probe(dev, PTR_ERR(lcd->im_pins), "failed to request im GPIOs\n"); - if (lcd->im_pins->ndescs < HX8357_NUM_IM_PINS) - return dev_err_probe(dev, -EINVAL, "not enough im GPIOs\n"); + if (lcd->im_pins) { + if (lcd->im_pins->ndescs < HX8357_NUM_IM_PINS) + return dev_err_probe(dev, -EINVAL, "not enough im GPIOs\n"); - for (i = 0; i < HX8357_NUM_IM_PINS; i++) - gpiod_set_consumer_name(lcd->im_pins->desc[i], "im_pins"); + for (i = 0; i < HX8357_NUM_IM_PINS; i++) + gpiod_set_consumer_name(lcd->im_pins->desc[i], "im_pins"); + } lcdev = devm_lcd_device_register(&spi->dev, "mxsfb", &spi->dev, lcd, &hx8357_ops); From 6b7704ff03d397788e75b8db78479222f0e80d3f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 23 Jan 2024 15:24:46 -0700 Subject: [PATCH 0090/1406] iov_iter: streamline iovec/bvec alignment iteration Rewrite the alignment checking iterators for iovec and bvec to be easier to read, and also significantly more compact in terms of generated code. This saves 270 bytes of text on x86-64 for me (with clang-18) and 224 bytes on arm64 (with gcc-13). In profiles, it also saves a bit of time for the same workload: 0.81% -0.18% [kernel.vmlinux] [k] iov_iter_aligned_bvec 0.48% -0.09% [kernel.vmlinux] [k] iov_iter_is_aligned which is a nice side benefit. Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/544b31f7-6d4b-42f5-a544-1420501f081f@kernel.dk Reviewed-by: Keith Busch Signed-off-by: Christian Brauner v2: do the other half of the iterators too, as suggested by Keith. This further saves some text.
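The essence of the rewrite, as a standalone sketch with a simplified segment type (the real code is in the diff below): the indexed for-loop over nr_segs becomes a pointer walk that stops when the byte budget runs out, dropping the per-iteration index arithmetic and the separate exit test.

	#include <stddef.h>

	struct seg { size_t len; };

	/* count the segments spanned by the first 'size' bytes;
	 * the caller guarantees size > 0 and that the segments cover it */
	static size_t segs_for_bytes(const struct seg *seg, size_t size)
	{
		size_t n = 0;

		do {
			size_t len = seg->len < size ? seg->len : size;

			seg++;
			n++;
			size -= len;
		} while (size);

		return n;
	}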
--- lib/iov_iter.c | 55 +++++++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index e0aa6b440ca5f4..15f5040709c36e 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -714,12 +714,11 @@ EXPORT_SYMBOL(iov_iter_discard); static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { + const struct iovec *iov = iter_iov(i); size_t size = i->count; size_t skip = i->iov_offset; - unsigned k; - for (k = 0; k < i->nr_segs; k++, skip = 0) { - const struct iovec *iov = iter_iov(i) + k; + do { size_t len = iov->iov_len - skip; if (len > size) @@ -729,34 +728,36 @@ static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, if ((unsigned long)(iov->iov_base + skip) & addr_mask) return false; + iov++; size -= len; - if (!size) - break; - } + skip = 0; + } while (size); + return true; } static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { - size_t size = i->count; + const struct bio_vec *bvec = i->bvec; unsigned skip = i->iov_offset; - unsigned k; + size_t size = i->count; - for (k = 0; k < i->nr_segs; k++, skip = 0) { - size_t len = i->bvec[k].bv_len - skip; + do { + size_t len = bvec->bv_len; if (len > size) len = size; if (len & len_mask) return false; - if ((unsigned long)(i->bvec[k].bv_offset + skip) & addr_mask) + if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) return false; + bvec++; size -= len; - if (!size) - break; - } + skip = 0; + } while (size); + return true; } @@ -800,13 +801,12 @@ EXPORT_SYMBOL_GPL(iov_iter_is_aligned); static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) { + const struct iovec *iov = iter_iov(i); unsigned long res = 0; size_t size = i->count; size_t skip = i->iov_offset; - unsigned k; - for (k = 0; k < i->nr_segs; k++, skip = 0) { - const struct iovec *iov = iter_iov(i) + k; + do { size_t len = iov->iov_len - skip; if (len) { res |= (unsigned long)iov->iov_base + skip; @@ -814,30 +814,31 @@ static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) len = size; res |= len; size -= len; - if (!size) - break; } - } + iov++; + skip = 0; + } while (size); return res; } static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i) { + const struct bio_vec *bvec = i->bvec; unsigned res = 0; size_t size = i->count; unsigned skip = i->iov_offset; - unsigned k; - for (k = 0; k < i->nr_segs; k++, skip = 0) { - size_t len = i->bvec[k].bv_len - skip; - res |= (unsigned long)i->bvec[k].bv_offset + skip; + do { + size_t len = bvec->bv_len - skip; + res |= (unsigned long)bvec->bv_offset + skip; if (len > size) len = size; res |= len; + bvec++; size -= len; - if (!size) - break; - } + skip = 0; + } while (size); + return res; } From 38c5f831b7aed53416db6c3b0297ea4cfac41294 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 24 Jan 2024 22:28:55 +0800 Subject: [PATCH 0091/1406] fs: make the i_size_read/write helpers be smp_load_acquire/store_release() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In [Link] Linus mentions that acquire/release makes it clear which _particular_ memory accesses are the ordered ones, and it's unlikely to make any performance difference, so it's much better to pair up the release->acquire ordering than have a "wmb->rmb" ordering. 
=========================================================
update pagecache
    folio_mark_uptodate(folio)
        smp_wmb()
        set_bit PG_uptodate
=== ↑↑↑ STLR ↑↑↑ ===
smp_store_release(&inode->i_size, i_size)

folio_test_uptodate(folio)
    test_bit PG_uptodate
    smp_rmb()
=== ↓↓↓ LDAR ↓↓↓ ===
smp_load_acquire(&inode->i_size)
copy_page_to_iter()
=========================================================
Calling smp_store_release() in i_size_write() ensures that the data in the page and the PG_uptodate bit are updated before the isize is updated, and calling smp_load_acquire() in i_size_read ensures that it will not read a newer isize than the data in the page. Therefore, this avoids buffered read-write inconsistencies caused by Load-Load reordering. Link: https://lore.kernel.org/r/CAHk-=wifOnmeJq+sn+2s-P46zw0SFEbw9BSCGgp2c5fYPtRPGw@mail.gmail.com/ Suggested-by: Linus Torvalds Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240124142857.4146716-2-libaokun1@huawei.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6669147b9e8d..ebce4763b4bb9a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -907,7 +907,8 @@ static inline loff_t i_size_read(const struct inode *inode) preempt_enable(); return i_size; #else - return inode->i_size; + /* Pairs with smp_store_release() in i_size_write() */ + return smp_load_acquire(&inode->i_size); #endif } @@ -929,7 +930,12 @@ static inline void i_size_write(struct inode *inode, loff_t i_size) inode->i_size = i_size; preempt_enable(); #else - inode->i_size = i_size; + /* + * Pairs with smp_load_acquire() in i_size_read() to ensure + * changes related to inode size (such as page contents) are + * visible before we see the changed inode size. + */ + smp_store_release(&inode->i_size, i_size); #endif } From a17ab4403eaf06f54de8bd2f2217b4b69089ba93 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 24 Jan 2024 22:28:56 +0800 Subject: [PATCH 0092/1406] Revert "mm/filemap: avoid buffered read/write race to read inconsistent data" This reverts commit e2c27b803bb6 ("mm/filemap: avoid buffered read/write race to read inconsistent data"). After making the i_size_read/write helpers be smp_load_acquire/store_release(), it is already guaranteed that changes to page contents are visible before we see increased inode size, so the extra smp_rmb() in filemap_read() can be removed. Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240124142857.4146716-3-libaokun1@huawei.com Signed-off-by: Christian Brauner --- mm/filemap.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 750e779c23db74..a72dd2eafd5ace 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2608,15 +2608,6 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, goto put_folios; end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); - /* - * Pairs with a barrier in - * block_write_end()->mark_buffer_dirty() or other page - * dirtying routines like iomap_write_end() to ensure - * changes to page contents are visible before we see - * increased inode size.
- */ - smp_rmb(); - /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: From cf6e3cf145eb352e28812a741fde5065f1652ee8 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 24 Jan 2024 22:28:57 +0800 Subject: [PATCH 0093/1406] asm-generic: remove extra type checking in acquire/release for non-SMP case If CONFIG_SMP is not enabled, smp_load_acquire/smp_store_release are implemented as READ_ONCE/WRITE_ONCE plus barrier() and type checking. READ_ONCE/WRITE_ONCE already check the pointer type, and the additional, more stringent compiletime_assert_atomic_type() check simply isn't relevant in the non-SMP case, so remove it to avoid compilation errors. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401230837.TXro0PHi-lkp@intel.com/ Suggested-by: Linus Torvalds Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240124142857.4146716-4-libaokun1@huawei.com Signed-off-by: Christian Brauner --- include/asm-generic/barrier.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 961f4d88f9ef78..0c0695763bea39 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -193,7 +193,6 @@ do { \ #ifndef smp_store_release #define smp_store_release(p, v) \ do { \ - compiletime_assert_atomic_type(*p); \ barrier(); \ WRITE_ONCE(*p, v); \ } while (0) @@ -203,7 +202,6 @@ do { \ #define smp_load_acquire(p) \ ({ \ __unqual_scalar_typeof(*p) ___p1 = READ_ONCE(*p); \ - compiletime_assert_atomic_type(*p); \ barrier(); \ (typeof(*p))___p1; \ }) From 34b2321cc648a246d08cc51e423532eac690ccf1 Mon Sep 17 00:00:00 2001 From: Andreas Larsson Date: Mon, 15 Jan 2024 16:02:00 +0100 Subject: [PATCH 0094/1406] MAINTAINERS: Add Andreas Larsson as co-maintainer for arch/sparc Dave has not been very active on arch/sparc for the past two years. I have been contributing to the SPARC32 port as well as maintaining out-of-tree SPARC32 patches for LEON3/4/5 (SPARCv8 with CAS support) since 2012. I am willing to step up as an arch/sparc (co-)maintainer. For recent discussions on the matter, see [1] and [2]. [1] https://lore.kernel.org/r/20230713075235.2164609-1-u.kleine-koenig@pengutronix.de [2] https://lore.kernel.org/r/20231209105816.GA1085691@ravnborg.org/ Signed-off-by: Andreas Larsson Suggested-by: Arnd Bergmann Acked-by: Sam Ravnborg Acked-by: John Paul Adrian Glaubitz Acked-by: Arnd Bergmann Acked-by: Jose E. Marchesi Signed-off-by: Arnd Bergmann --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8d1052fa6a6924..542ab762be7de4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20549,6 +20549,7 @@ F: Documentation/translations/sp_SP/ SPARC + UltraSPARC (sparc/sparc64) M: "David S. Miller" +M: Andreas Larsson L: sparclinux@vger.kernel.org S: Maintained Q: http://patchwork.ozlabs.org/project/sparclinux/list/ From e14aeb6cda036f78bd6a1eba2eb04b6d8beb2814 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 18 Oct 2022 16:23:53 -0700 Subject: [PATCH 0095/1406] ARM: brcmstb: Add debug UART entry for 74165 BCM74165 uses the same address map as the 7278 family (v7 memory map), therefore re-use that constant and shift down the other labels to keep numerical ordering.
Signed-off-by: Florian Fainelli --- arch/arm/include/debug/brcmstb.S | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm/include/debug/brcmstb.S b/arch/arm/include/debug/brcmstb.S index f6175e6e28cd22..3f7d68740ed4c9 100644 --- a/arch/arm/include/debug/brcmstb.S +++ b/arch/arm/include/debug/brcmstb.S @@ -27,6 +27,7 @@ #define UARTA_72165 UARTA_7278 #define UARTA_7364 REG_PHYS_ADDR(0x40b000) #define UARTA_7366 UARTA_7364 +#define UARTA_74165 UARTA_7278 #define UARTA_74371 REG_PHYS_ADDR(0x406b00) #define UARTA_7439 REG_PHYS_ADDR(0x40a900) #define UARTA_7445 REG_PHYS_ADDR(0x40ab00) @@ -88,9 +89,10 @@ ARM_BE8( rev \rv, \rv ) 30: checkuart(\rp, \rv, 0x72780000, 7278) 31: checkuart(\rp, \rv, 0x73640000, 7364) 32: checkuart(\rp, \rv, 0x73660000, 7366) -33: checkuart(\rp, \rv, 0x07437100, 74371) -34: checkuart(\rp, \rv, 0x74390000, 7439) -35: checkuart(\rp, \rv, 0x74450000, 7445) +33: checkuart(\rp, \rv, 0x07416500, 74165) +34: checkuart(\rp, \rv, 0x07437100, 74371) +35: checkuart(\rp, \rv, 0x74390000, 7439) +36: checkuart(\rp, \rv, 0x74450000, 7445) /* No valid UART found */ 90: mov \rp, #0 From 6ae2b145edd725c2234c7fde36ebcc5e1a4d4e7d Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Sun, 28 Jan 2024 00:32:45 +0800 Subject: [PATCH 0096/1406] arm64: dts: allwinner: h6: Add RX DMA channel for SPDIF The SPDIF hardware found on the H6 supports both transmit and receive functions. However it is missing the RX DMA channel. Add the SPDIF hardware block's RX DMA channel. Also remove the by-default pinmux, since the end device can choose to implement either or both functionalities. Fixes: f95b598df419 ("arm64: dts: allwinner: Add SPDIF node for Allwinner H6") Signed-off-by: Chen-Yu Tsai Reviewed-by: Andre Przywara Reviewed-by: Jernej Skrabec Link: https://lore.kernel.org/r/20240127163247.384439-6-wens@kernel.org Signed-off-by: Jernej Skrabec --- arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts | 2 ++ arch/arm64/boot/dts/allwinner/sun50i-h6-tanix.dtsi | 2 ++ arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi | 7 +++---- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts b/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts index 9ec49ac2f6fd5d..381d58cea092d9 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts @@ -291,6 +291,8 @@ }; &spdif { + pinctrl-names = "default"; + pinctrl-0 = <&spdif_tx_pin>; status = "okay"; }; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6-tanix.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h6-tanix.dtsi index 4903d6358112de..855b7d43bc503a 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h6-tanix.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-tanix.dtsi @@ -166,6 +166,8 @@ }; &spdif { + pinctrl-names = "default"; + pinctrl-0 = <&spdif_tx_pin>; status = "okay"; }; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi index ca1d287a0a01d9..d11e5041bae9a4 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi @@ -406,6 +406,7 @@ function = "spi1"; }; + /omit-if-no-ref/ spdif_tx_pin: spdif-tx-pin { pins = "PH7"; function = "spdif"; @@ -655,10 +656,8 @@ clocks = <&ccu CLK_BUS_SPDIF>, <&ccu CLK_SPDIF>; clock-names = "apb", "spdif"; resets = <&ccu RST_BUS_SPDIF>; - dmas = <&dma 2>; - dma-names = "tx"; - pinctrl-names = "default"; - pinctrl-0 = <&spdif_tx_pin>; + dmas = <&dma 2>, <&dma 2>; + 
dma-names = "rx", "tx"; status = "disabled"; }; From adb3ebfc285eb2c0ad67039bda58683be415647f Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Sun, 28 Jan 2024 00:32:46 +0800 Subject: [PATCH 0097/1406] arm64: dts: allwinner: h616: Add DMA controller and DMA channels The DMA controllers found on the H616 and H618 are the same as the one found on the A100. The only difference is the DMA endpoint (DRQ) layout. Add a device node for it, and add DMA channels for existing peripherals. Signed-off-by: Chen-Yu Tsai Reviewed-by: Andre Przywara Link: https://lore.kernel.org/r/20240127163247.384439-7-wens@kernel.org Signed-off-by: Jernej Skrabec --- .../arm64/boot/dts/allwinner/sun50i-h616.dtsi | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi index d549d277d9729f..885809137b9de3 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi @@ -133,6 +133,19 @@ #reset-cells = <1>; }; + dma: dma-controller@3002000 { + compatible = "allwinner,sun50i-h616-dma", + "allwinner,sun50i-a100-dma"; + reg = <0x03002000 0x1000>; + interrupts = ; + clocks = <&ccu CLK_BUS_DMA>, <&ccu CLK_MBUS_DMA>; + clock-names = "bus", "mbus"; + dma-channels = <16>; + dma-requests = <49>; + resets = <&ccu RST_BUS_DMA>; + #dma-cells = <1>; + }; + sid: efuse@3006000 { compatible = "allwinner,sun50i-h616-sid", "allwinner,sun50i-a64-sid"; reg = <0x03006000 0x1000>; @@ -339,6 +352,8 @@ reg-shift = <2>; reg-io-width = <4>; clocks = <&ccu CLK_BUS_UART0>; + dmas = <&dma 14>, <&dma 14>; + dma-names = "tx", "rx"; resets = <&ccu RST_BUS_UART0>; status = "disabled"; }; @@ -350,6 +365,8 @@ reg-shift = <2>; reg-io-width = <4>; clocks = <&ccu CLK_BUS_UART1>; + dmas = <&dma 15>, <&dma 15>; + dma-names = "tx", "rx"; resets = <&ccu RST_BUS_UART1>; status = "disabled"; }; @@ -361,6 +378,8 @@ reg-shift = <2>; reg-io-width = <4>; clocks = <&ccu CLK_BUS_UART2>; + dmas = <&dma 16>, <&dma 16>; + dma-names = "tx", "rx"; resets = <&ccu RST_BUS_UART2>; status = "disabled"; }; @@ -372,6 +391,8 @@ reg-shift = <2>; reg-io-width = <4>; clocks = <&ccu CLK_BUS_UART3>; + dmas = <&dma 17>, <&dma 17>; + dma-names = "tx", "rx"; resets = <&ccu RST_BUS_UART3>; status = "disabled"; }; @@ -383,6 +404,8 @@ reg-shift = <2>; reg-io-width = <4>; clocks = <&ccu CLK_BUS_UART4>; + dmas = <&dma 18>, <&dma 18>; + dma-names = "tx", "rx"; resets = <&ccu RST_BUS_UART4>; status = "disabled"; }; @@ -394,6 +417,8 @@ reg-shift = <2>; reg-io-width = <4>; clocks = <&ccu CLK_BUS_UART5>; + dmas = <&dma 19>, <&dma 19>; + dma-names = "tx", "rx"; resets = <&ccu RST_BUS_UART5>; status = "disabled"; }; @@ -405,6 +430,8 @@ reg = <0x05002000 0x400>; interrupts = ; clocks = <&ccu CLK_BUS_I2C0>; + dmas = <&dma 43>, <&dma 43>; + dma-names = "rx", "tx"; resets = <&ccu RST_BUS_I2C0>; pinctrl-names = "default"; pinctrl-0 = <&i2c0_pins>; @@ -420,6 +447,8 @@ reg = <0x05002400 0x400>; interrupts = ; clocks = <&ccu CLK_BUS_I2C1>; + dmas = <&dma 44>, <&dma 44>; + dma-names = "rx", "tx"; resets = <&ccu RST_BUS_I2C1>; status = "disabled"; #address-cells = <1>; @@ -433,6 +462,8 @@ reg = <0x05002800 0x400>; interrupts = ; clocks = <&ccu CLK_BUS_I2C2>; + dmas = <&dma 45>, <&dma 45>; + dma-names = "rx", "tx"; resets = <&ccu RST_BUS_I2C2>; status = "disabled"; #address-cells = <1>; @@ -446,6 +477,8 @@ reg = <0x05002c00 0x400>; interrupts = ; clocks = <&ccu CLK_BUS_I2C3>; + dmas = <&dma 46>, <&dma 46>; + dma-names = "rx", "tx"; resets = <&ccu 
RST_BUS_I2C3>; status = "disabled"; #address-cells = <1>; @@ -459,6 +492,8 @@ reg = <0x05003000 0x400>; interrupts = ; clocks = <&ccu CLK_BUS_I2C4>; + dmas = <&dma 47>, <&dma 47>; + dma-names = "rx", "tx"; resets = <&ccu RST_BUS_I2C4>; status = "disabled"; #address-cells = <1>; @@ -472,6 +507,8 @@ interrupts = ; clocks = <&ccu CLK_BUS_SPI0>, <&ccu CLK_SPI0>; clock-names = "ahb", "mod"; + dmas = <&dma 22>, <&dma 22>; + dma-names = "rx", "tx"; resets = <&ccu RST_BUS_SPI0>; status = "disabled"; #address-cells = <1>; @@ -485,6 +522,8 @@ interrupts = ; clocks = <&ccu CLK_BUS_SPI1>, <&ccu CLK_SPI1>; clock-names = "ahb", "mod"; + dmas = <&dma 23>, <&dma 23>; + dma-names = "rx", "tx"; resets = <&ccu RST_BUS_SPI1>; status = "disabled"; #address-cells = <1>; @@ -734,6 +773,8 @@ reg = <0x07081400 0x400>; interrupts = ; clocks = <&r_ccu CLK_R_APB2_I2C>; + dmas = <&dma 48>, <&dma 48>; + dma-names = "rx", "tx"; resets = <&r_ccu RST_R_APB2_I2C>; status = "disabled"; #address-cells = <1>; From 7ef7d495bb10fe338f85f0769e6a3cac4ebf2d74 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Sun, 28 Jan 2024 00:32:47 +0800 Subject: [PATCH 0098/1406] arm64: dts: allwinner: h616: Add SPDIF device node The H616 SoC has an SPDIF transmitter hardware block, which has the same layout as the one in the H6, minus the receiver side. Add a device node for it, and a default pinmux. Signed-off-by: Chen-Yu Tsai Reviewed-by: Andre Przywara Acked-by: Jernej Skrabec Link: https://lore.kernel.org/r/20240127163247.384439-8-wens@kernel.org Signed-off-by: Jernej Skrabec --- .../arm64/boot/dts/allwinner/sun50i-h616.dtsi | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi index 885809137b9de3..b1bf4fb5fc58b8 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi @@ -253,6 +253,11 @@ function = "spi1"; }; + spdif_tx_pin: spdif-tx-pin { + pins = "PH4"; + function = "spdif"; + }; + uart0_ph_pins: uart0-ph-pins { pins = "PH0", "PH1"; function = "uart0"; @@ -550,6 +555,21 @@ }; }; + spdif: spdif@5093000 { + compatible = "allwinner,sun50i-h616-spdif"; + reg = <0x05093000 0x400>; + interrupts = ; + clocks = <&ccu CLK_BUS_SPDIF>, <&ccu CLK_SPDIF>; + clock-names = "apb", "spdif"; + resets = <&ccu RST_BUS_SPDIF>; + dmas = <&dma 2>; + dma-names = "tx"; + pinctrl-names = "default"; + pinctrl-0 = <&spdif_tx_pin>; + #sound-dai-cells = <0>; + status = "disabled"; + }; + usbotg: usb@5100000 { compatible = "allwinner,sun50i-h616-musb", "allwinner,sun8i-h3-musb"; From 1b5af823d703ee183ffdde188aaf584ab93eea19 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Jan 2024 11:26:49 +0100 Subject: [PATCH 0099/1406] soc/tegra: fix build failure on Tegra241 If all the other SoCs are disabled, the driver fails to build: drivers/soc/tegra/fuse/fuse-tegra30.c:684:17: error: 'tegra30_fuse_read' undeclared here (not in a function); did you mean 'tegra_fuse_readl'? 684 | .read = tegra30_fuse_read, | ^~~~~~~~~~~~~~~~~ | tegra_fuse_readl drivers/soc/tegra/fuse/fuse-tegra30.c:694:17: error: 'tegra30_fuse_init' undeclared here (not in a function); did you mean 'tegra_fuse_info'? 694 | .init = tegra30_fuse_init, | ^~~~~~~~~~~~~~~~~ Fix the list of SoCs using this function to include the newly added one. 
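The failure mode generalizes; as a sketch with invented symbols (not the Tegra code): a function compiled only under an #if guard that enumerates configurations must have every reference covered by the same guard, or any configuration outside the list produces exactly the 'undeclared here' errors quoted above.

	#include <linux/types.h>

	struct foo_ops {
		u32 (*read)(unsigned int offset);
	};

	#if defined(CONFIG_ARCH_FOO) || defined(CONFIG_ARCH_BAR)
	static u32 foo_read_early(unsigned int offset)
	{
		return 0;	/* details elided */
	}
	#endif

	/* builds only if the guard above covers every configuration that
	 * compiles this table, e.g. a newly added CONFIG_ARCH_BAZ */
	static const struct foo_ops foo_ops = {
		.read = foo_read_early,
	};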
Fixes: dee509eb9cd5 ("soc/tegra: fuse: Add support for Tegra241") Reviewed-by: Jon Hunter Reviewed-by: Kartik Signed-off-by: Arnd Bergmann --- drivers/soc/tegra/fuse/fuse-tegra30.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/soc/tegra/fuse/fuse-tegra30.c b/drivers/soc/tegra/fuse/fuse-tegra30.c index e94d46372a6396..92ac5693382637 100644 --- a/drivers/soc/tegra/fuse/fuse-tegra30.c +++ b/drivers/soc/tegra/fuse/fuse-tegra30.c @@ -38,7 +38,8 @@ defined(CONFIG_ARCH_TEGRA_210_SOC) || \ defined(CONFIG_ARCH_TEGRA_186_SOC) || \ defined(CONFIG_ARCH_TEGRA_194_SOC) || \ - defined(CONFIG_ARCH_TEGRA_234_SOC) + defined(CONFIG_ARCH_TEGRA_234_SOC) || \ + defined(CONFIG_ARCH_TEGRA_241_SOC) static u32 tegra30_fuse_read_early(struct tegra_fuse *fuse, unsigned int offset) { if (WARN_ON(!fuse->base)) From 25ae5f5f4168bbf91e7b6b126d24c30c91ef952e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Duje=20Mihanovi=C4=87?= Date: Thu, 25 Jan 2024 16:30:53 +0100 Subject: [PATCH 0100/1406] leds: Introduce ExpressWire library MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ExpressWire protocol is shared between at least KTD2692 and KTD2801 with slight differences such as timings and the former not having a defined set of pulses for enabling the protocol (possibly because it does not support PWM unlike KTD2801). Despite these differences the ExpressWire handling code can be shared between the two, so in preparation for adding KTD2801 support introduce a library implementing this protocol. Suggested-by: Daniel Thompson Reviewed-by: Linus Walleij Reviewed-by: Daniel Thompson Signed-off-by: Duje Mihanović Link: https://lore.kernel.org/r/20240125-ktd2801-v5-1-e22da232a825@skole.hr Signed-off-by: Lee Jones --- MAINTAINERS | 7 ++++ drivers/leds/Kconfig | 4 ++ drivers/leds/Makefile | 3 ++ drivers/leds/leds-expresswire.c | 68 ++++++++++++++++++++++++++++++++ include/linux/leds-expresswire.h | 36 +++++++++++++++++ 5 files changed, 118 insertions(+) create mode 100644 drivers/leds/leds-expresswire.c create mode 100644 include/linux/leds-expresswire.h diff --git a/MAINTAINERS b/MAINTAINERS index 8d1052fa6a6924..e1c83e0e837a84 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7979,6 +7979,13 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/linkinjeon/exfat.git F: fs/exfat/ +EXPRESSWIRE PROTOCOL LIBRARY +M: Duje Mihanović +L: linux-leds@vger.kernel.org +S: Maintained +F: drivers/leds/leds-expresswire.c +F: include/linux/leds-expresswire.h + EXT2 FILE SYSTEM M: Jan Kara L: linux-ext4@vger.kernel.org diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index d721b254e1e450..64bb2de237e950 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -186,6 +186,10 @@ config LEDS_EL15203000 To compile this driver as a module, choose M here: the module will be called leds-el15203000. 
+config LEDS_EXPRESSWIRE + bool + depends on GPIOLIB + config LEDS_TURRIS_OMNIA tristate "LED support for CZ.NIC's Turris Omnia" depends on LEDS_CLASS_MULTICOLOR diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile index ce07dc295ff000..effdfc6f1e9510 100644 --- a/drivers/leds/Makefile +++ b/drivers/leds/Makefile @@ -91,6 +91,9 @@ obj-$(CONFIG_LEDS_WM831X_STATUS) += leds-wm831x-status.o obj-$(CONFIG_LEDS_WM8350) += leds-wm8350.o obj-$(CONFIG_LEDS_WRAP) += leds-wrap.o +# Kinetic ExpressWire Protocol +obj-$(CONFIG_LEDS_EXPRESSWIRE) += leds-expresswire.o + # LED SPI Drivers obj-$(CONFIG_LEDS_CR0014114) += leds-cr0014114.o obj-$(CONFIG_LEDS_DAC124S085) += leds-dac124s085.o diff --git a/drivers/leds/leds-expresswire.c b/drivers/leds/leds-expresswire.c new file mode 100644 index 00000000000000..89e147b0e019c4 --- /dev/null +++ b/drivers/leds/leds-expresswire.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Shared library for Kinetic's ExpressWire protocol. + * This protocol works by pulsing the ExpressWire IC's control GPIO. + * ktd2692 and ktd2801 are known to use this protocol. + */ + +#include +#include +#include + +void expresswire_power_off(struct expresswire_common_props *props) +{ + gpiod_set_value_cansleep(props->ctrl_gpio, 0); + usleep_range(props->timing.poweroff_us, props->timing.poweroff_us * 2); +} +EXPORT_SYMBOL_NS_GPL(expresswire_power_off, EXPRESSWIRE); + +void expresswire_enable(struct expresswire_common_props *props) +{ + gpiod_set_value(props->ctrl_gpio, 1); + udelay(props->timing.detect_delay_us); + gpiod_set_value(props->ctrl_gpio, 0); + udelay(props->timing.detect_us); + gpiod_set_value(props->ctrl_gpio, 1); +} +EXPORT_SYMBOL_NS_GPL(expresswire_enable, EXPRESSWIRE); + +void expresswire_start(struct expresswire_common_props *props) +{ + gpiod_set_value(props->ctrl_gpio, 1); + udelay(props->timing.data_start_us); +} +EXPORT_SYMBOL_NS_GPL(expresswire_start, EXPRESSWIRE); + +void expresswire_end(struct expresswire_common_props *props) +{ + gpiod_set_value(props->ctrl_gpio, 0); + udelay(props->timing.end_of_data_low_us); + gpiod_set_value(props->ctrl_gpio, 1); + udelay(props->timing.end_of_data_high_us); +} +EXPORT_SYMBOL_NS_GPL(expresswire_end, EXPRESSWIRE); + +void expresswire_set_bit(struct expresswire_common_props *props, bool bit) +{ + if (bit) { + gpiod_set_value(props->ctrl_gpio, 0); + udelay(props->timing.short_bitset_us); + gpiod_set_value(props->ctrl_gpio, 1); + udelay(props->timing.long_bitset_us); + } else { + gpiod_set_value(props->ctrl_gpio, 0); + udelay(props->timing.long_bitset_us); + gpiod_set_value(props->ctrl_gpio, 1); + udelay(props->timing.short_bitset_us); + } +} +EXPORT_SYMBOL_NS_GPL(expresswire_set_bit, EXPRESSWIRE); + +void expresswire_write_u8(struct expresswire_common_props *props, u8 val) +{ + expresswire_start(props); + for (int i = 7; i >= 0; i--) + expresswire_set_bit(props, val & BIT(i)); + expresswire_end(props); +} +EXPORT_SYMBOL_NS_GPL(expresswire_write_u8, EXPRESSWIRE); diff --git a/include/linux/leds-expresswire.h b/include/linux/leds-expresswire.h new file mode 100644 index 00000000000000..3c61902ccac8c8 --- /dev/null +++ b/include/linux/leds-expresswire.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Shared library for Kinetic's ExpressWire protocol. + * This protocol works by pulsing the ExpressWire IC's control GPIO. + * ktd2692 and ktd2801 are known to use this protocol. 
+ */ + +#ifndef _LEDS_EXPRESSWIRE_H +#define _LEDS_EXPRESSWIRE_H + +#include + +struct expresswire_timing { + unsigned long poweroff_us; + unsigned long detect_delay_us; + unsigned long detect_us; + unsigned long data_start_us; + unsigned long end_of_data_low_us; + unsigned long end_of_data_high_us; + unsigned long short_bitset_us; + unsigned long long_bitset_us; +}; + +struct expresswire_common_props { + struct gpio_desc *ctrl_gpio; + struct expresswire_timing timing; +}; + +void expresswire_power_off(struct expresswire_common_props *props); +void expresswire_enable(struct expresswire_common_props *props); +void expresswire_start(struct expresswire_common_props *props); +void expresswire_end(struct expresswire_common_props *props); +void expresswire_set_bit(struct expresswire_common_props *props, bool bit); +void expresswire_write_u8(struct expresswire_common_props *props, u8 val); + +#endif /* _LEDS_EXPRESSWIRE_H */ From e59a15af7aa690fa0997758df23069a9f0756c49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Duje=20Mihanovi=C4=87?= Date: Thu, 25 Jan 2024 16:30:54 +0100 Subject: [PATCH 0101/1406] leds: ktd2692: Convert to use ExpressWire library MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The KTD2692 uses the ExpressWire protocol implemented in the newly introduced ExpressWire library. Convert the driver to use the library. Suggested-by: Daniel Thompson Reviewed-by: Linus Walleij Reviewed-by: Daniel Thompson Signed-off-by: Duje Mihanović Link: https://lore.kernel.org/r/20240125-ktd2801-v5-2-e22da232a825@skole.hr Signed-off-by: Lee Jones --- drivers/leds/flash/Kconfig | 2 +- drivers/leds/flash/leds-ktd2692.c | 116 +++++++----------------------- 2 files changed, 28 insertions(+), 90 deletions(-) diff --git a/drivers/leds/flash/Kconfig b/drivers/leds/flash/Kconfig index 4e08dbc057096f..a0fb755b58dcc5 100644 --- a/drivers/leds/flash/Kconfig +++ b/drivers/leds/flash/Kconfig @@ -23,7 +23,7 @@ config LEDS_AS3645A config LEDS_KTD2692 tristate "LED support for Kinetic KTD2692 flash LED controller" depends on OF - depends on GPIOLIB || COMPILE_TEST + select LEDS_EXPRESSWIRE help This option enables support for Kinetic KTD2692 LED flash connected through ExpressWire interface. 
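A note on the Kconfig hunk above: LEDS_EXPRESSWIRE was introduced in the previous patch as a non-user-visible bool symbol, so consumer drivers pull it in with select rather than asking users to enable it, and the GPIOLIB requirement the driver previously stated directly is now documented on the library symbol instead.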
diff --git a/drivers/leds/flash/leds-ktd2692.c b/drivers/leds/flash/leds-ktd2692.c index 598eee5daa5271..7bb0aa2753e365 100644 --- a/drivers/leds/flash/leds-ktd2692.c +++ b/drivers/leds/flash/leds-ktd2692.c @@ -6,9 +6,9 @@ * Ingi Kim */ -#include #include #include +#include #include #include #include @@ -37,22 +37,9 @@ #define KTD2692_REG_FLASH_CURRENT_BASE 0x80 #define KTD2692_REG_MODE_BASE 0xA0 -/* Set bit coding time for expresswire interface */ -#define KTD2692_TIME_RESET_US 700 -#define KTD2692_TIME_DATA_START_TIME_US 10 -#define KTD2692_TIME_HIGH_END_OF_DATA_US 350 -#define KTD2692_TIME_LOW_END_OF_DATA_US 10 -#define KTD2692_TIME_SHORT_BITSET_US 4 -#define KTD2692_TIME_LONG_BITSET_US 12 - /* KTD2692 default length of name */ #define KTD2692_NAME_LENGTH 20 -enum ktd2692_bitset { - KTD2692_LOW = 0, - KTD2692_HIGH, -}; - /* Movie / Flash Mode Control */ enum ktd2692_led_mode { KTD2692_MODE_DISABLE = 0, /* default */ @@ -71,7 +58,19 @@ struct ktd2692_led_config_data { enum led_brightness max_brightness; }; +const struct expresswire_timing ktd2692_timing = { + .poweroff_us = 700, + .data_start_us = 10, + .end_of_data_low_us = 10, + .end_of_data_high_us = 350, + .short_bitset_us = 4, + .long_bitset_us = 12 +}; + struct ktd2692_context { + /* Common ExpressWire properties (ctrl GPIO and timing) */ + struct expresswire_common_props props; + /* Related LED Flash class device */ struct led_classdev_flash fled_cdev; @@ -80,7 +79,6 @@ struct ktd2692_context { struct regulator *regulator; struct gpio_desc *aux_gpio; - struct gpio_desc *ctrl_gpio; enum ktd2692_led_mode mode; enum led_brightness torch_brightness; @@ -92,67 +90,6 @@ static struct ktd2692_context *fled_cdev_to_led( return container_of(fled_cdev, struct ktd2692_context, fled_cdev); } -static void ktd2692_expresswire_start(struct ktd2692_context *led) -{ - gpiod_direction_output(led->ctrl_gpio, KTD2692_HIGH); - udelay(KTD2692_TIME_DATA_START_TIME_US); -} - -static void ktd2692_expresswire_reset(struct ktd2692_context *led) -{ - gpiod_direction_output(led->ctrl_gpio, KTD2692_LOW); - udelay(KTD2692_TIME_RESET_US); -} - -static void ktd2692_expresswire_end(struct ktd2692_context *led) -{ - gpiod_direction_output(led->ctrl_gpio, KTD2692_LOW); - udelay(KTD2692_TIME_LOW_END_OF_DATA_US); - gpiod_direction_output(led->ctrl_gpio, KTD2692_HIGH); - udelay(KTD2692_TIME_HIGH_END_OF_DATA_US); -} - -static void ktd2692_expresswire_set_bit(struct ktd2692_context *led, bool bit) -{ - /* - * The Low Bit(0) and High Bit(1) is based on a time detection - * algorithm between time low and time high - * Time_(L_LB) : Low time of the Low Bit(0) - * Time_(H_LB) : High time of the LOW Bit(0) - * Time_(L_HB) : Low time of the High Bit(1) - * Time_(H_HB) : High time of the High Bit(1) - * - * It can be simplified to: - * Low Bit(0) : 2 * Time_(H_LB) < Time_(L_LB) - * High Bit(1) : 2 * Time_(L_HB) < Time_(H_HB) - * HIGH ___ ____ _.. _________ ___ - * |_________| |_.. 
|____| |__| - * LOW - * [ Low Bit (0) ] [ High Bit(1) ] - */ - if (bit) { - gpiod_direction_output(led->ctrl_gpio, KTD2692_LOW); - udelay(KTD2692_TIME_SHORT_BITSET_US); - gpiod_direction_output(led->ctrl_gpio, KTD2692_HIGH); - udelay(KTD2692_TIME_LONG_BITSET_US); - } else { - gpiod_direction_output(led->ctrl_gpio, KTD2692_LOW); - udelay(KTD2692_TIME_LONG_BITSET_US); - gpiod_direction_output(led->ctrl_gpio, KTD2692_HIGH); - udelay(KTD2692_TIME_SHORT_BITSET_US); - } -} - -static void ktd2692_expresswire_write(struct ktd2692_context *led, u8 value) -{ - int i; - - ktd2692_expresswire_start(led); - for (i = 7; i >= 0; i--) - ktd2692_expresswire_set_bit(led, value & BIT(i)); - ktd2692_expresswire_end(led); -} - static int ktd2692_led_brightness_set(struct led_classdev *led_cdev, enum led_brightness brightness) { @@ -163,14 +100,14 @@ static int ktd2692_led_brightness_set(struct led_classdev *led_cdev, if (brightness == LED_OFF) { led->mode = KTD2692_MODE_DISABLE; - gpiod_direction_output(led->aux_gpio, KTD2692_LOW); + gpiod_direction_output(led->aux_gpio, 0); } else { - ktd2692_expresswire_write(led, brightness | + expresswire_write_u8(&led->props, brightness | KTD2692_REG_MOVIE_CURRENT_BASE); led->mode = KTD2692_MODE_MOVIE; } - ktd2692_expresswire_write(led, led->mode | KTD2692_REG_MODE_BASE); + expresswire_write_u8(&led->props, led->mode | KTD2692_REG_MODE_BASE); mutex_unlock(&led->lock); return 0; @@ -187,17 +124,17 @@ static int ktd2692_led_flash_strobe_set(struct led_classdev_flash *fled_cdev, if (state) { flash_tm_reg = GET_TIMEOUT_OFFSET(timeout->val, timeout->step); - ktd2692_expresswire_write(led, flash_tm_reg + expresswire_write_u8(&led->props, flash_tm_reg | KTD2692_REG_FLASH_TIMEOUT_BASE); led->mode = KTD2692_MODE_FLASH; - gpiod_direction_output(led->aux_gpio, KTD2692_HIGH); + gpiod_direction_output(led->aux_gpio, 1); } else { led->mode = KTD2692_MODE_DISABLE; - gpiod_direction_output(led->aux_gpio, KTD2692_LOW); + gpiod_direction_output(led->aux_gpio, 0); } - ktd2692_expresswire_write(led, led->mode | KTD2692_REG_MODE_BASE); + expresswire_write_u8(&led->props, led->mode | KTD2692_REG_MODE_BASE); fled_cdev->led_cdev.brightness = LED_OFF; led->mode = KTD2692_MODE_DISABLE; @@ -247,12 +184,12 @@ static void ktd2692_init_flash_timeout(struct led_classdev_flash *fled_cdev, static void ktd2692_setup(struct ktd2692_context *led) { led->mode = KTD2692_MODE_DISABLE; - ktd2692_expresswire_reset(led); - gpiod_direction_output(led->aux_gpio, KTD2692_LOW); + expresswire_power_off(&led->props); + gpiod_direction_output(led->aux_gpio, 0); - ktd2692_expresswire_write(led, (KTD2692_MM_MIN_CURR_THRESHOLD_SCALE - 1) + expresswire_write_u8(&led->props, (KTD2692_MM_MIN_CURR_THRESHOLD_SCALE - 1) | KTD2692_REG_MM_MIN_CURR_THRESHOLD_BASE); - ktd2692_expresswire_write(led, KTD2692_FLASH_MODE_CURR_PERCENT(45) + expresswire_write_u8(&led->props, KTD2692_FLASH_MODE_CURR_PERCENT(45) | KTD2692_REG_FLASH_CURRENT_BASE); } @@ -277,8 +214,8 @@ static int ktd2692_parse_dt(struct ktd2692_context *led, struct device *dev, if (!np) return -ENXIO; - led->ctrl_gpio = devm_gpiod_get(dev, "ctrl", GPIOD_ASIS); - ret = PTR_ERR_OR_ZERO(led->ctrl_gpio); + led->props.ctrl_gpio = devm_gpiod_get(dev, "ctrl", GPIOD_ASIS); + ret = PTR_ERR_OR_ZERO(led->props.ctrl_gpio); if (ret) return dev_err_probe(dev, ret, "cannot get ctrl-gpios\n"); @@ -412,6 +349,7 @@ static struct platform_driver ktd2692_driver = { module_platform_driver(ktd2692_driver); +MODULE_IMPORT_NS(EXPRESSWIRE); MODULE_AUTHOR("Ingi Kim "); MODULE_DESCRIPTION("Kinetic 
KTD2692 LED driver"); MODULE_LICENSE("GPL v2"); From 4ac621a418ce8f4c562b50ea6f676196bd5262da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Duje=20Mihanovi=C4=87?= Date: Thu, 25 Jan 2024 16:30:55 +0100 Subject: [PATCH 0102/1406] dt-bindings: backlight: Add Kinetic KTD2801 binding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KTD2801 is a LED backlight driver IC found in samsung,coreprimevelte. The brightness can be set using PWM or the ExpressWire protocol. Add a DT binding for the KTD2801. Reviewed-by: Krzysztof Kozlowski Reviewed-by: Linus Walleij Reviewed-by: Daniel Thompson Signed-off-by: Duje Mihanović Link: https://lore.kernel.org/r/20240125-ktd2801-v5-3-e22da232a825@skole.hr Signed-off-by: Lee Jones --- .../leds/backlight/kinetic,ktd2801.yaml | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 Documentation/devicetree/bindings/leds/backlight/kinetic,ktd2801.yaml diff --git a/Documentation/devicetree/bindings/leds/backlight/kinetic,ktd2801.yaml b/Documentation/devicetree/bindings/leds/backlight/kinetic,ktd2801.yaml new file mode 100644 index 00000000000000..b005065e0f48a7 --- /dev/null +++ b/Documentation/devicetree/bindings/leds/backlight/kinetic,ktd2801.yaml @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/leds/backlight/kinetic,ktd2801.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Kinetic Technologies KTD2801 one-wire backlight + +maintainers: + - Duje Mihanović + +description: | + The Kinetic Technologies KTD2801 is a LED backlight driver controlled + by a single GPIO line. The driver can be controlled with a PWM signal + or by pulsing the GPIO line to set the backlight level. This is called + "ExpressWire". + +allOf: + - $ref: common.yaml# + +properties: + compatible: + const: kinetic,ktd2801 + + ctrl-gpios: + maxItems: 1 + + default-brightness: true + max-brightness: true + +required: + - compatible + - ctrl-gpios + +additionalProperties: false + +examples: + - | + #include + + backlight { + compatible = "kinetic,ktd2801"; + ctrl-gpios = <&gpio 97 GPIO_ACTIVE_HIGH>; + max-brightness = <210>; + default-brightness = <100>; + }; From 66c76c1cd984c14660453dfa2118014817924375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Duje=20Mihanovi=C4=87?= Date: Thu, 25 Jan 2024 16:30:56 +0100 Subject: [PATCH 0103/1406] backlight: Add Kinetic KTD2801 backlight support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KTD2801 is a LED backlight driver IC found in samsung,coreprimevelte. The brightness can be set using PWM or the ExpressWire protocol. Add support for the KTD2801. 
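As a rough sketch of the call flow this driver ends up with, using only the expresswire_* helpers introduced two patches earlier (the function name and the was_on bookkeeping here are illustrative, not the driver's exact code):

static void example_set_level(struct expresswire_common_props *props,
			      bool *was_on, u8 level)
{
	if (!*was_on) {
		/* The detect pulse sequence re-arms ExpressWire after a power-off. */
		expresswire_enable(props);
		*was_on = true;
	}
	/* One data byte: the 8-bit brightness code. */
	expresswire_write_u8(props, level);
}

The real update_status implementation below follows the same shape, with backlight_is_blank() deciding when to power the chip off instead.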
Reviewed-by: Linus Walleij Reviewed-by: Daniel Thompson Signed-off-by: Duje Mihanović Link: https://lore.kernel.org/r/20240125-ktd2801-v5-4-e22da232a825@skole.hr Signed-off-by: Lee Jones --- MAINTAINERS | 6 + drivers/video/backlight/Kconfig | 7 ++ drivers/video/backlight/Makefile | 1 + drivers/video/backlight/ktd2801-backlight.c | 128 ++++++++++++++++++++ 4 files changed, 142 insertions(+) create mode 100644 drivers/video/backlight/ktd2801-backlight.c diff --git a/MAINTAINERS b/MAINTAINERS index e1c83e0e837a84..01cd1a46090765 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12052,6 +12052,12 @@ S: Maintained F: Documentation/devicetree/bindings/leds/backlight/kinetic,ktd253.yaml F: drivers/video/backlight/ktd253-backlight.c +KTD2801 BACKLIGHT DRIVER +M: Duje Mihanović +S: Maintained +F: Documentation/devicetree/bindings/leds/backlight/kinetic,ktd2801.yaml +F: drivers/video/backlight/ktd2801-backlight.c + KTEST M: Steven Rostedt M: John Hawley diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig index ea2d0d69bd8cc1..230bca07b09dc9 100644 --- a/drivers/video/backlight/Kconfig +++ b/drivers/video/backlight/Kconfig @@ -183,6 +183,13 @@ config BACKLIGHT_KTD253 which is a 1-wire GPIO-controlled backlight found in some mobile phones. +config BACKLIGHT_KTD2801 + tristate "Backlight Driver for Kinetic KTD2801" + select LEDS_EXPRESSWIRE + help + Say Y to enable the backlight driver for the Kinetic KTD2801 1-wire + GPIO-controlled backlight found in Samsung Galaxy Core Prime VE LTE. + config BACKLIGHT_KTZ8866 tristate "Backlight Driver for Kinetic KTZ8866" depends on I2C diff --git a/drivers/video/backlight/Makefile b/drivers/video/backlight/Makefile index 06966cb204597b..8d2cb252042db7 100644 --- a/drivers/video/backlight/Makefile +++ b/drivers/video/backlight/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_BACKLIGHT_HP680) += hp680_bl.o obj-$(CONFIG_BACKLIGHT_HP700) += jornada720_bl.o obj-$(CONFIG_BACKLIGHT_IPAQ_MICRO) += ipaq_micro_bl.o obj-$(CONFIG_BACKLIGHT_KTD253) += ktd253-backlight.o +obj-$(CONFIG_BACKLIGHT_KTD2801) += ktd2801-backlight.o obj-$(CONFIG_BACKLIGHT_KTZ8866) += ktz8866.o obj-$(CONFIG_BACKLIGHT_LM3533) += lm3533_bl.o obj-$(CONFIG_BACKLIGHT_LM3630A) += lm3630a_bl.o diff --git a/drivers/video/backlight/ktd2801-backlight.c b/drivers/video/backlight/ktd2801-backlight.c new file mode 100644 index 00000000000000..c020acff40f138 --- /dev/null +++ b/drivers/video/backlight/ktd2801-backlight.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Datasheet: + * https://www.kinet-ic.com/uploads/web/KTD2801/KTD2801-04b.pdf + */ +#include +#include +#include +#include +#include + +#define KTD2801_DEFAULT_BRIGHTNESS 100 +#define KTD2801_MAX_BRIGHTNESS 255 + +/* These values have been extracted from Samsung's driver. 
*/ +const struct expresswire_timing ktd2801_timing = { + .poweroff_us = 2600, + .detect_delay_us = 150, + .detect_us = 270, + .data_start_us = 5, + .short_bitset_us = 5, + .long_bitset_us = 15, + .end_of_data_low_us = 10, + .end_of_data_high_us = 350 +}; + +struct ktd2801_backlight { + struct expresswire_common_props props; + struct backlight_device *bd; + bool was_on; +}; + +static int ktd2801_update_status(struct backlight_device *bd) +{ + struct ktd2801_backlight *ktd2801 = bl_get_data(bd); + u8 brightness = (u8) backlight_get_brightness(bd); + + if (backlight_is_blank(bd)) { + expresswire_power_off(&ktd2801->props); + ktd2801->was_on = false; + return 0; + } + + if (!ktd2801->was_on) { + expresswire_enable(&ktd2801->props); + ktd2801->was_on = true; + } + + expresswire_write_u8(&ktd2801->props, brightness); + + return 0; +} + +static const struct backlight_ops ktd2801_backlight_ops = { + .update_status = ktd2801_update_status, +}; + +static int ktd2801_backlight_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct backlight_device *bd; + struct ktd2801_backlight *ktd2801; + u32 brightness, max_brightness; + int ret; + + ktd2801 = devm_kzalloc(dev, sizeof(*ktd2801), GFP_KERNEL); + if (!ktd2801) + return -ENOMEM; + ktd2801->was_on = true; + ktd2801->props.timing = ktd2801_timing; + + ret = device_property_read_u32(dev, "max-brightness", &max_brightness); + if (ret) + max_brightness = KTD2801_MAX_BRIGHTNESS; + if (max_brightness > KTD2801_MAX_BRIGHTNESS) { + dev_err(dev, "illegal max brightness specified\n"); + max_brightness = KTD2801_MAX_BRIGHTNESS; + } + + ret = device_property_read_u32(dev, "default-brightness", &brightness); + if (ret) + brightness = KTD2801_DEFAULT_BRIGHTNESS; + if (brightness > max_brightness) { + dev_err(dev, "default brightness exceeds max\n"); + brightness = max_brightness; + } + + ktd2801->props.ctrl_gpio = devm_gpiod_get(dev, "ctrl", GPIOD_OUT_HIGH); + if (IS_ERR(ktd2801->props.ctrl_gpio)) + return dev_err_probe(dev, PTR_ERR(ktd2801->props.ctrl_gpio), + "failed to get backlight GPIO"); + gpiod_set_consumer_name(ktd2801->props.ctrl_gpio, dev_name(dev)); + + bd = devm_backlight_device_register(dev, dev_name(dev), dev, ktd2801, + &ktd2801_backlight_ops, NULL); + if (IS_ERR(bd)) + return dev_err_probe(dev, PTR_ERR(bd), + "failed to register backlight"); + + bd->props.max_brightness = max_brightness; + bd->props.brightness = brightness; + + ktd2801->bd = bd; + platform_set_drvdata(pdev, bd); + backlight_update_status(bd); + + return 0; +} + +static const struct of_device_id ktd2801_of_match[] = { + { .compatible = "kinetic,ktd2801" }, + { } +}; +MODULE_DEVICE_TABLE(of, ktd2801_of_match); + +static struct platform_driver ktd2801_backlight_driver = { + .driver = { + .name = "ktd2801-backlight", + .of_match_table = ktd2801_of_match, + }, + .probe = ktd2801_backlight_probe, +}; +module_platform_driver(ktd2801_backlight_driver); + +MODULE_IMPORT_NS(EXPRESSWIRE); +MODULE_AUTHOR("Duje Mihanović "); +MODULE_DESCRIPTION("Kinetic KTD2801 Backlight Driver"); +MODULE_LICENSE("GPL"); From 27d87f10e51e88b2cb378ef9afee73108e6a8543 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Dec 2023 08:26:57 +0100 Subject: [PATCH 0104/1406] iomap: clear the per-folio dirty bits on all writeback failures write_cache_pages always clear the page dirty bit before calling into the file systems, and leaves folios with a writeback failure without the dirty bit after return. 
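(For orientation: with per-block dirty tracking, iomap keeps two levels of dirty state, the folio-wide dirty flag and per-block dirty bits in struct iomap_folio_state; the inconsistency described next is the folio flag ending up clear while some per-block bits stay set.)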
We also clear the per-block writeback bits for writeback failures unless no I/O has been submitted, which leaves the folio in an inconsistent state where the folio dirty flag is clear but one or more per-block dirty bits are still set. This seems to be due to the place where the iomap_clear_range_dirty call was inserted into the existing, not very clearly structured code when adding per-block dirty bit support, and not actually intentional. Switch to always clearing the dirty bits on writeback failure. Fixes: 4ce02c679722 ("iomap: Add per-block dirty state tracking to improve performance") Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231207072710.176093-2-hch@lst.de Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 093c4515b22a53..228fd2e05e12f8 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1833,16 +1833,10 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, if (unlikely(error)) { /* * Let the filesystem know what portion of the current page - * failed to map. If the page hasn't been added to ioend, it - * won't be affected by I/O completion and we must unlock it - * now. + * failed to map. */ if (wpc->ops->discard_folio) wpc->ops->discard_folio(folio, pos); - if (!count) { - folio_unlock(folio); - goto done; - } } /* @@ -1851,6 +1845,16 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, * all the dirty bits in the folio here. */ iomap_clear_range_dirty(folio, 0, folio_size(folio)); + + /* + * If the page hasn't been added to the ioend, it won't be affected by + * I/O completion and we must unlock it now. + */ + if (error && !count) { + folio_unlock(folio); + goto done; + } + folio_start_writeback(folio); folio_unlock(folio); From f46da70cb717969a9abfb1d12d03a8b838ecbb9d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Dec 2023 08:26:58 +0100 Subject: [PATCH 0105/1406] iomap: treat inline data in iomap_writepage_map as an I/O error iomap_writepage_map already warns about inline data, but then just ignores it. Treat it as an error and return -EIO. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231207072710.176093-3-hch@lst.de Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 228fd2e05e12f8..1492706cdc3d21 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1808,8 +1808,10 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, if (error) break; trace_iomap_writepage_map(inode, &wpc->iomap); - if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE)) - continue; + if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE)) { + error = -EIO; + break; + } if (wpc->iomap.type == IOMAP_HOLE) continue; iomap_add_to_ioend(inode, pos, folio, ifs, wpc, wbc, From 92a29732aee864f146c476cbec4bd63991cfe0a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Dec 2023 08:26:59 +0100 Subject: [PATCH 0106/1406] iomap: move the io_folios field out of struct iomap_ioend The io_folios member in struct iomap_ioend counts the number of folios added to an ioend. It is only used at submission time and can thus be moved to iomap_writepage_ctx instead.
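(The counter in question caps how many folios a single ioend covers: it is compared against IOEND_BATCH_SIZE in iomap_can_add_to_ioend, visible in the diff below, so that ending writeback on an ioend's folios does not become one long tight loop. Since that decision is only made while an ioend is being built, submission-time state in the writepage context is its natural home.)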
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231207072710.176093-4-hch@lst.de Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 7 ++++--- include/linux/iomap.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 1492706cdc3d21..c013d35b07b78a 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1675,10 +1675,11 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, ioend->io_flags = wpc->iomap.flags; ioend->io_inode = inode; ioend->io_size = 0; - ioend->io_folios = 0; ioend->io_offset = offset; ioend->io_bio = bio; ioend->io_sector = sector; + + wpc->nr_folios = 0; return ioend; } @@ -1722,7 +1723,7 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, * also prevents long tight loops ending page writeback on all the * folios in the ioend. */ - if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE) + if (wpc->nr_folios >= IOEND_BATCH_SIZE) return false; return true; } @@ -1819,7 +1820,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, count++; } if (count) - wpc->ioend->io_folios++; + wpc->nr_folios++; WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list)); WARN_ON_ONCE(!folio_test_locked(folio)); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 96dd0acbba44ac..b2a05dff914d0c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -293,7 +293,6 @@ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ u16 io_type; u16 io_flags; /* IOMAP_F_* */ - u32 io_folios; /* folios added to ioend */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ loff_t io_offset; /* offset in the file */ @@ -329,6 +328,7 @@ struct iomap_writepage_ctx { struct iomap iomap; struct iomap_ioend *ioend; const struct iomap_writeback_ops *ops; + u32 nr_folios; /* folios added to the ioend */ }; void iomap_finish_ioends(struct iomap_ioend *ioend, int error); From 24a55f31c39d65833b502295f410ff7e28516cd0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Dec 2023 08:27:00 +0100 Subject: [PATCH 0107/1406] iomap: move the PF_MEMALLOC check to iomap_writepages The iomap writepage implementation has been removed in commit 478af190cb6c ("iomap: remove iomap_writepage") and this code is now only called through ->writepages, which never happens from memory reclaim. Move the check from iomap_do_writepage to iomap_writepages so that it is only run once per ->writepages invocation. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231207072710.176093-5-hch@lst.de Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index c013d35b07b78a..292ab7dade213a 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1902,20 +1902,6 @@ static int iomap_do_writepage(struct folio *folio, trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio)); - /* - * Refuse to write the folio out if we're called from reclaim context. - * - * This avoids stack overflows when called from deeply used stacks in - * random callers for direct reclaim or memcg reclaim. We explicitly - * allow reclaim from kswapd as the stack usage there is relatively low. - * - * This should never happen except in the case of a VM regression so - * warn about it.
- */ - if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == - PF_MEMALLOC)) - goto redirty; - /* * Is this folio beyond the end of the file? * @@ -1981,8 +1967,6 @@ static int iomap_do_writepage(struct folio *folio, return iomap_writepage_map(wpc, wbc, inode, folio, end_pos); -redirty: - folio_redirty_for_writepage(wbc, folio); unlock: folio_unlock(folio); return 0; @@ -1995,6 +1979,14 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, { int ret; + /* + * Writeback from reclaim context should never happen except in the case + * of a VM regression so warn about it and refuse to write the data. + */ + if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC | PF_KSWAPD)) == + PF_MEMALLOC)) + return -EIO; + wpc->ops = ops; ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc); if (!wpc->ioend) From 3b10c9ec680d19961f626a1923958cb1e712ddcd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Dec 2023 08:27:01 +0100 Subject: [PATCH 0108/1406] iomap: factor out a iomap_writepage_handle_eof helper Most of iomap_do_writepage is dedidcated to handling a folio crossing or beyond i_size. Split this is into a separate helper and update the commens to deal with folios instead of pages and make them more readable. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231207072710.176093-6-hch@lst.de Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 128 ++++++++++++++++++++--------------------- 1 file changed, 62 insertions(+), 66 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 292ab7dade213a..75278e1b05f822 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1758,6 +1758,64 @@ iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, wbc_account_cgroup_owner(wbc, &folio->page, len); } +/* + * Check interaction of the folio with the file end. + * + * If the folio is entirely beyond i_size, return false. If it straddles + * i_size, adjust end_pos and zero all data beyond i_size. + */ +static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode, + u64 *end_pos) +{ + u64 isize = i_size_read(inode); + + if (*end_pos > isize) { + size_t poff = offset_in_folio(folio, isize); + pgoff_t end_index = isize >> PAGE_SHIFT; + + /* + * If the folio is entirely ouside of i_size, skip it. + * + * This can happen due to a truncate operation that is in + * progress and in that case truncate will finish it off once + * we've dropped the folio lock. + * + * Note that the pgoff_t used for end_index is an unsigned long. + * If the given offset is greater than 16TB on a 32-bit system, + * then if we checked if the folio is fully outside i_size with + * "if (folio->index >= end_index + 1)", "end_index + 1" would + * overflow and evaluate to 0. Hence this folio would be + * redirtied and written out repeatedly, which would result in + * an infinite loop; the user program performing this operation + * would hang. Instead, we can detect this situation by + * checking if the folio is totally beyond i_size or if its + * offset is just equal to the EOF. + */ + if (folio->index > end_index || + (folio->index == end_index && poff == 0)) + return false; + + /* + * The folio straddles i_size. + * + * It must be zeroed out on each and every writepage invocation + * because it may be mmapped: + * + * A file is mapped in multiples of the page size. 
For a + * file that is not a multiple of the page size, the + * remaining memory is zeroed when mapped, and writes to that + * region are not written out to the file. + * + * Also adjust the writeback range to skip all blocks entirely + * beyond i_size. + */ + folio_zero_segment(folio, poff, folio_size(folio)); + *end_pos = isize; + } + + return true; +} + /* * We implement an immediate ioend submission policy here to avoid needing to * chain multiple ioends and hence nest mempool allocations which can violate @@ -1898,78 +1956,16 @@ static int iomap_do_writepage(struct folio *folio, { struct iomap_writepage_ctx *wpc = data; struct inode *inode = folio->mapping->host; - u64 end_pos, isize; + u64 end_pos = folio_pos(folio) + folio_size(folio); trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio)); - /* - * Is this folio beyond the end of the file? - * - * The folio index is less than the end_index, adjust the end_pos - * to the highest offset that this folio should represent. - * ----------------------------------------------------- - * | file mapping | | - * ----------------------------------------------------- - * | Page ... | Page N-2 | Page N-1 | Page N | | - * ^--------------------------------^----------|-------- - * | desired writeback range | see else | - * ---------------------------------^------------------| - */ - isize = i_size_read(inode); - end_pos = folio_pos(folio) + folio_size(folio); - if (end_pos > isize) { - /* - * Check whether the page to write out is beyond or straddles - * i_size or not. - * ------------------------------------------------------- - * | file mapping | | - * ------------------------------------------------------- - * | Page ... | Page N-2 | Page N-1 | Page N | Beyond | - * ^--------------------------------^-----------|--------- - * | | Straddles | - * ---------------------------------^-----------|--------| - */ - size_t poff = offset_in_folio(folio, isize); - pgoff_t end_index = isize >> PAGE_SHIFT; - - /* - * Skip the page if it's fully outside i_size, e.g. - * due to a truncate operation that's in progress. We've - * cleaned this page and truncate will finish things off for - * us. - * - * Note that the end_index is unsigned long. If the given - * offset is greater than 16TB on a 32-bit system then if we - * checked if the page is fully outside i_size with - * "if (page->index >= end_index + 1)", "end_index + 1" would - * overflow and evaluate to 0. Hence this page would be - * redirtied and written out repeatedly, which would result in - * an infinite loop; the user program performing this operation - * would hang. Instead, we can detect this situation by - * checking if the page is totally beyond i_size or if its - * offset is just equal to the EOF. - */ - if (folio->index > end_index || - (folio->index == end_index && poff == 0)) - goto unlock; - - /* - * The page straddles i_size. It must be zeroed out on each - * and every writepage invocation because it may be mmapped. - * "A file is mapped in multiples of the page size. For a file - * that is not a multiple of the page size, the remaining - * memory is zeroed when mapped, and writes to that region are - * not written out to the file." 
- */ - folio_zero_segment(folio, poff, folio_size(folio)); - end_pos = isize; + if (!iomap_writepage_handle_eof(folio, inode, &end_pos)) { + folio_unlock(folio); + return 0; } return iomap_writepage_map(wpc, wbc, inode, folio, end_pos); - -unlock: - folio_unlock(folio); - return 0; } int From 58b1ea9dcd6dffafda8757b3c09d4d84d42774c9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Dec 2023 08:27:02 +0100 Subject: [PATCH 0109/1406] iomap: move all remaining per-folio logic into iomap_writepage_map Move the tracepoint and the iomap check from iomap_do_writepage into iomap_writepage_map. This keeps all logic in one places, and leaves iomap_do_writepage just as the wrapper for the callback conventions of write_cache_pages, which will go away when that is converted to an iterator. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231207072710.176093-7-hch@lst.de Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 75278e1b05f822..e3175d3cc0362a 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1832,19 +1832,25 @@ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode, * At the end of a writeback pass, there will be a cached ioend remaining on the * writepage context that the caller will need to submit. */ -static int -iomap_writepage_map(struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct inode *inode, - struct folio *folio, u64 end_pos) +static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct folio *folio) { struct iomap_folio_state *ifs = folio->private; + struct inode *inode = folio->mapping->host; struct iomap_ioend *ioend, *next; unsigned len = i_blocksize(inode); unsigned nblocks = i_blocks_per_folio(inode, folio); u64 pos = folio_pos(folio); + u64 end_pos = pos + folio_size(folio); int error = 0, count = 0, i; LIST_HEAD(submit_list); + trace_iomap_writepage(inode, pos, folio_size(folio)); + + if (!iomap_writepage_handle_eof(folio, inode, &end_pos)) { + folio_unlock(folio); + return 0; + } WARN_ON_ONCE(end_pos <= pos); if (!ifs && nblocks > 1) { @@ -1944,28 +1950,10 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, return error; } -/* - * Write out a dirty page. - * - * For delalloc space on the page, we need to allocate space and flush it. - * For unwritten space on the page, we need to start the conversion to - * regular allocated space. - */ static int iomap_do_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { - struct iomap_writepage_ctx *wpc = data; - struct inode *inode = folio->mapping->host; - u64 end_pos = folio_pos(folio) + folio_size(folio); - - trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio)); - - if (!iomap_writepage_handle_eof(folio, inode, &end_pos)) { - folio_unlock(folio); - return 0; - } - - return iomap_writepage_map(wpc, wbc, inode, folio, end_pos); + return iomap_writepage_map(data, wbc, folio); } int From 1a61893a10720519db9f6a4702c3640a4bbc9c68 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Dec 2023 08:27:03 +0100 Subject: [PATCH 0110/1406] iomap: clean up the iomap_alloc_ioend calling convention Switch to the same argument order as iomap_writepage_map and remove the ifs argument that can be trivially recalculated. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231207072710.176093-8-hch@lst.de Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index e3175d3cc0362a..92f032a12c1400 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1732,11 +1732,11 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, * Test to see if we have an existing ioend structure that we could append to * first; otherwise finish off the current ioend and start another. */ -static void -iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, - struct iomap_folio_state *ifs, struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct list_head *iolist) +static void iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct folio *folio, + struct inode *inode, loff_t pos, struct list_head *iolist) { + struct iomap_folio_state *ifs = folio->private; sector_t sector = iomap_sector(&wpc->iomap, pos); unsigned len = i_blocksize(inode); size_t poff = offset_in_folio(folio, pos); @@ -1879,8 +1879,7 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, } if (wpc->iomap.type == IOMAP_HOLE) continue; - iomap_add_to_ioend(inode, pos, folio, ifs, wpc, wbc, - &submit_list); + iomap_add_to_ioend(wpc, wbc, folio, inode, pos, &submit_list); count++; } if (count) From cff3d22b9827f8bb5846d03fda7c05989ff39d38 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Dec 2023 08:27:04 +0100 Subject: [PATCH 0111/1406] iomap: move the iomap_sector sector calculation out of iomap_add_to_ioend The calculation in iomap_sector is pretty trivial, and each iomap_add_to_ioend invocation typically calls only one of iomap_can_add_to_ioend and iomap_alloc_ioend. Calculate the sector in those two lower-level functions, stop passing it down from iomap_add_to_ioend, and update the iomap_alloc_ioend argument passing order to match that of iomap_add_to_ioend.
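For reference, the helper whose call sites move here is a trivial shift; its definition in include/linux/iomap.h at the time looked like this (quoted from memory, so treat the exact form as approximate):

static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos)
{
	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
}

which is why recomputing it in iomap_alloc_ioend and iomap_can_add_to_ioend is cheaper overall than threading a sector_t through the call chain.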
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231207072710.176093-9-hch@lst.de Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 92f032a12c1400..3a3f3ebc070cbf 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1656,9 +1656,8 @@ iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, return 0; } -static struct iomap_ioend * -iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, - loff_t offset, sector_t sector, struct writeback_control *wbc) +static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct inode *inode, loff_t pos) { struct iomap_ioend *ioend; struct bio *bio; @@ -1666,7 +1665,7 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS, REQ_OP_WRITE | wbc_to_write_flags(wbc), GFP_NOFS, &iomap_ioend_bioset); - bio->bi_iter.bi_sector = sector; + bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos); wbc_init_bio(wbc, bio); ioend = container_of(bio, struct iomap_ioend, io_inline_bio); @@ -1675,9 +1674,9 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, ioend->io_flags = wpc->iomap.flags; ioend->io_inode = inode; ioend->io_size = 0; - ioend->io_offset = offset; + ioend->io_offset = pos; ioend->io_bio = bio; - ioend->io_sector = sector; + ioend->io_sector = bio->bi_iter.bi_sector; wpc->nr_folios = 0; return ioend; @@ -1705,18 +1704,17 @@ iomap_chain_bio(struct bio *prev) return new; } -static bool -iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, - sector_t sector) +static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) { if ((wpc->iomap.flags & IOMAP_F_SHARED) != (wpc->ioend->io_flags & IOMAP_F_SHARED)) return false; if (wpc->iomap.type != wpc->ioend->io_type) return false; - if (offset != wpc->ioend->io_offset + wpc->ioend->io_size) + if (pos != wpc->ioend->io_offset + wpc->ioend->io_size) return false; - if (sector != bio_end_sector(wpc->ioend->io_bio)) + if (iomap_sector(&wpc->iomap, pos) != + bio_end_sector(wpc->ioend->io_bio)) return false; /* * Limit ioend bio chain lengths to minimise IO completion latency. This @@ -1737,14 +1735,13 @@ static void iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct inode *inode, loff_t pos, struct list_head *iolist) { struct iomap_folio_state *ifs = folio->private; - sector_t sector = iomap_sector(&wpc->iomap, pos); unsigned len = i_blocksize(inode); size_t poff = offset_in_folio(folio, pos); - if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, sector)) { + if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) { if (wpc->ioend) list_add(&wpc->ioend->io_list, iolist); - wpc->ioend = iomap_alloc_ioend(inode, wpc, pos, sector, wbc); + wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos); } if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) { From 589509aeaa361d3723ce4262bb11b92942adf6a6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Dec 2023 08:27:05 +0100 Subject: [PATCH 0112/1406] iomap: don't chain bios Back in the days when a single bio could only be filled to the hardware limits, and we scheduled a work item for each bio completion, chaining multiple bios for a single ioend made a lot of sense to reduce the number of completions. 
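To put a number on the claim that follows: assuming BIO_MAX_VECS is 256 and pages are 4 KiB, a bio that gets only a single page per vector still carries 256 * 4096 bytes = 1 MiB; multi-page folios only raise that floor.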
But these days bios can be filled until we reach the number of vectors or total size limit, which means we can always fit at least 1 megabyte worth of data in the worst case, but usually a lot more due to large folios. The only thing bio chaining is buying us now is to reduce the size of the allocation from an ioend with an embedded bio into a plain bio, which is a 52 bytes differences on 64-bit systems. This is not worth the added complexity, so remove the bio chaining and only use the bio embedded into the ioend. This will help to simplify further changes to the iomap writeback code. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231207072710.176093-10-hch@lst.de Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 90 +++++++++++------------------------------- fs/xfs/xfs_aops.c | 6 +-- include/linux/iomap.h | 8 +++- 3 files changed, 32 insertions(+), 72 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 3a3f3ebc070cbf..5f6affbe7056ba 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1479,40 +1479,23 @@ static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) { struct inode *inode = ioend->io_inode; - struct bio *bio = &ioend->io_inline_bio; - struct bio *last = ioend->io_bio, *next; - u64 start = bio->bi_iter.bi_sector; - loff_t offset = ioend->io_offset; - bool quiet = bio_flagged(bio, BIO_QUIET); + struct bio *bio = &ioend->io_bio; + struct folio_iter fi; u32 folio_count = 0; - for (bio = &ioend->io_inline_bio; bio; bio = next) { - struct folio_iter fi; - - /* - * For the last bio, bi_private points to the ioend, so we - * need to explicitly end the iteration here. - */ - if (bio == last) - next = NULL; - else - next = bio->bi_private; - - /* walk all folios in bio, ending page IO on them */ - bio_for_each_folio_all(fi, bio) { - iomap_finish_folio_write(inode, fi.folio, fi.length, - error); - folio_count++; - } - bio_put(bio); + /* walk all folios in bio, ending page IO on them */ + bio_for_each_folio_all(fi, bio) { + iomap_finish_folio_write(inode, fi.folio, fi.length, error); + folio_count++; } - /* The ioend has been freed by bio_put() */ - if (unlikely(error && !quiet)) { + if (unlikely(error && !bio_flagged(bio, BIO_QUIET))) { printk_ratelimited(KERN_ERR "%s: writeback error on inode %lu, offset %lld, sector %llu", - inode->i_sb->s_id, inode->i_ino, offset, start); + inode->i_sb->s_id, inode->i_ino, + ioend->io_offset, ioend->io_sector); } + bio_put(bio); /* frees the ioend */ return folio_count; } @@ -1553,7 +1536,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_ioends); static bool iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) { - if (ioend->io_bio->bi_status != next->io_bio->bi_status) + if (ioend->io_bio.bi_status != next->io_bio.bi_status) return false; if ((ioend->io_flags & IOMAP_F_SHARED) ^ (next->io_flags & IOMAP_F_SHARED)) @@ -1618,9 +1601,8 @@ EXPORT_SYMBOL_GPL(iomap_sort_ioends); static void iomap_writepage_end_bio(struct bio *bio) { - struct iomap_ioend *ioend = bio->bi_private; - - iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status)); + iomap_finish_ioend(iomap_ioend_from_bio(bio), + blk_status_to_errno(bio->bi_status)); } /* @@ -1635,9 +1617,6 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, int error) { - ioend->io_bio->bi_private = ioend; - ioend->io_bio->bi_end_io = iomap_writepage_end_bio; - if (wpc->ops->prepare_ioend) error = wpc->ops->prepare_ioend(ioend, error); if (error) { 
@@ -1647,12 +1626,12 @@ iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, * as there is only one reference to the ioend at this point in * time. */ - ioend->io_bio->bi_status = errno_to_blk_status(error); - bio_endio(ioend->io_bio); + ioend->io_bio.bi_status = errno_to_blk_status(error); + bio_endio(&ioend->io_bio); return error; } - submit_bio(ioend->io_bio); + submit_bio(&ioend->io_bio); return 0; } @@ -1666,44 +1645,22 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, REQ_OP_WRITE | wbc_to_write_flags(wbc), GFP_NOFS, &iomap_ioend_bioset); bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos); + bio->bi_end_io = iomap_writepage_end_bio; wbc_init_bio(wbc, bio); - ioend = container_of(bio, struct iomap_ioend, io_inline_bio); + ioend = iomap_ioend_from_bio(bio); INIT_LIST_HEAD(&ioend->io_list); ioend->io_type = wpc->iomap.type; ioend->io_flags = wpc->iomap.flags; ioend->io_inode = inode; ioend->io_size = 0; ioend->io_offset = pos; - ioend->io_bio = bio; ioend->io_sector = bio->bi_iter.bi_sector; wpc->nr_folios = 0; return ioend; } -/* - * Allocate a new bio, and chain the old bio to the new one. - * - * Note that we have to perform the chaining in this unintuitive order - * so that the bi_private linkage is set up in the right direction for the - * traversal in iomap_finish_ioend(). - */ -static struct bio * -iomap_chain_bio(struct bio *prev) -{ - struct bio *new; - - new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS); - bio_clone_blkg_association(new, prev); - new->bi_iter.bi_sector = bio_end_sector(prev); - - bio_chain(prev, new); - bio_get(prev); /* for iomap_finish_ioend */ - submit_bio(prev); - return new; -} - static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) { if ((wpc->iomap.flags & IOMAP_F_SHARED) != @@ -1714,7 +1671,7 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) if (pos != wpc->ioend->io_offset + wpc->ioend->io_size) return false; if (iomap_sector(&wpc->iomap, pos) != - bio_end_sector(wpc->ioend->io_bio)) + bio_end_sector(&wpc->ioend->io_bio)) return false; /* * Limit ioend bio chain lengths to minimise IO completion latency. This @@ -1739,15 +1696,14 @@ static void iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, size_t poff = offset_in_folio(folio, pos); if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) { +new_ioend: if (wpc->ioend) list_add(&wpc->ioend->io_list, iolist); wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos); } - if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) { - wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio); - bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff); - } + if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff)) + goto new_ioend; if (ifs) atomic_add(len, &ifs->write_bytes_pending); @@ -1978,7 +1934,7 @@ EXPORT_SYMBOL_GPL(iomap_writepages); static int __init iomap_init(void) { return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), - offsetof(struct iomap_ioend, io_inline_bio), + offsetof(struct iomap_ioend, io_bio), BIOSET_NEED_BVECS); } fs_initcall(iomap_init); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 813f85156b0c3b..4fb244bb884dc4 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -112,7 +112,7 @@ xfs_end_ioend( * longer dirty. If we don't remove delalloc blocks here, they become * stale and can corrupt free space accounting on unmount. 
- error = blk_status_to_errno(ioend->io_bio->bi_status); + error = blk_status_to_errno(ioend->io_bio.bi_status); if (unlikely(error)) { if (ioend->io_flags & IOMAP_F_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); @@ -179,7 +179,7 @@ STATIC void xfs_end_bio( struct bio *bio) { - struct iomap_ioend *ioend = bio->bi_private; + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); struct xfs_inode *ip = XFS_I(ioend->io_inode); unsigned long flags; @@ -444,7 +444,7 @@ xfs_prepare_ioend( /* send ioends that might require a transaction to the completion wq */ if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN || (ioend->io_flags & IOMAP_F_SHARED)) - ioend->io_bio->bi_end_io = xfs_end_bio; + ioend->io_bio.bi_end_io = xfs_end_bio; return status; } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b2a05dff914d0c..b8d3b658ad2b03 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -297,10 +297,14 @@ struct iomap_ioend { size_t io_size; /* size of the extent */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ - struct bio *io_bio; /* bio being built */ - struct bio io_inline_bio; /* MUST BE LAST! */ + struct bio io_bio; /* MUST BE LAST! */ }; +static inline struct iomap_ioend *iomap_ioend_from_bio(struct bio *bio) +{ + return container_of(bio, struct iomap_ioend, io_bio); +} + struct iomap_writeback_ops { /* * Required, maps the blocks so that writeback can be performed on From bfd93f03d93850eb4d642257b0ebd86ef4c9627b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Dec 2023 08:27:06 +0100 Subject: [PATCH 0113/1406] iomap: only call mapping_set_error once for each failed bio Instead of calling mapping_set_error once per folio, only do that once per bio, and consolidate all the writeback error handling code in iomap_finish_ioend. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231207072710.176093-11-hch@lst.de Reviewed-by: Darrick J.
Wong Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 5f6affbe7056ba..71f0aafcc2774d 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1454,15 +1454,10 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) EXPORT_SYMBOL_GPL(iomap_page_mkwrite); static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, - size_t len, int error) + size_t len) { struct iomap_folio_state *ifs = folio->private; - if (error) { - folio_set_error(folio); - mapping_set_error(inode->i_mapping, error); - } - WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0); @@ -1483,18 +1478,24 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) struct folio_iter fi; u32 folio_count = 0; + if (error) { + mapping_set_error(inode->i_mapping, error); + if (!bio_flagged(bio, BIO_QUIET)) { + pr_err_ratelimited( +"%s: writeback error on inode %lu, offset %lld, sector %llu", + inode->i_sb->s_id, inode->i_ino, + ioend->io_offset, ioend->io_sector); + } + } + /* walk all folios in bio, ending page IO on them */ bio_for_each_folio_all(fi, bio) { - iomap_finish_folio_write(inode, fi.folio, fi.length, error); + if (error) + folio_set_error(fi.folio); + iomap_finish_folio_write(inode, fi.folio, fi.length); folio_count++; } - if (unlikely(error && !bio_flagged(bio, BIO_QUIET))) { - printk_ratelimited(KERN_ERR -"%s: writeback error on inode %lu, offset %lld, sector %llu", - inode->i_sb->s_id, inode->i_ino, - ioend->io_offset, ioend->io_sector); - } bio_put(bio); /* frees the ioend */ return folio_count; } From 0ea6417161053f5d8f8a51041820a57daeb51f26 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Dec 2023 08:27:07 +0100 Subject: [PATCH 0114/1406] iomap: factor out a iomap_writepage_map_block helper Split the loop body that calls into the file system to map a block and add it to the ioend into a separate helper to prepare for refactoring of the surrounding code. Note that this was the only place in iomap_writepage_map that could return an error, so include the call to ->discard_folio into the new helper as that will help to avoid code duplication in the future. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231207072710.176093-12-hch@lst.de Reviewed-by: Darrick J.
Wong Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 70 ++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 27 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 71f0aafcc2774d..b065df8a7c1f5e 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1712,6 +1712,45 @@ static void iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, wbc_account_cgroup_owner(wbc, &folio->page, len); } +static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct folio *folio, + struct inode *inode, u64 pos, unsigned *count, + struct list_head *submit_list) +{ + int error; + + error = wpc->ops->map_blocks(wpc, inode, pos); + if (error) + goto fail; + trace_iomap_writepage_map(inode, &wpc->iomap); + + switch (wpc->iomap.type) { + case IOMAP_INLINE: + WARN_ON_ONCE(1); + error = -EIO; + break; + case IOMAP_HOLE: + break; + default: + iomap_add_to_ioend(wpc, wbc, folio, inode, pos, submit_list); + (*count)++; + } + +fail: + /* + * We cannot cancel the ioend directly here on error. We may have + * already set other pages under writeback and hence we have to run I/O + * completion to mark the error state of the pages under writeback + * appropriately. + * + * Just let the file system know what portion of the folio failed to + * map. + */ + if (error && wpc->ops->discard_folio) + wpc->ops->discard_folio(folio, pos); + return error; +} + /* * Check interaction of the folio with the file end. * @@ -1796,7 +1835,8 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, unsigned nblocks = i_blocks_per_folio(inode, folio); u64 pos = folio_pos(folio); u64 end_pos = pos + folio_size(folio); - int error = 0, count = 0, i; + unsigned count = 0; + int error = 0, i; LIST_HEAD(submit_list); trace_iomap_writepage(inode, pos, folio_size(folio)); @@ -1822,19 +1862,10 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) { if (ifs && !ifs_block_is_dirty(folio, ifs, i)) continue; - - error = wpc->ops->map_blocks(wpc, inode, pos); + error = iomap_writepage_map_blocks(wpc, wbc, folio, inode, pos, + &count, &submit_list); if (error) break; - trace_iomap_writepage_map(inode, &wpc->iomap); - if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE)) { - error = -EIO; - break; - } - if (wpc->iomap.type == IOMAP_HOLE) - continue; - iomap_add_to_ioend(wpc, wbc, folio, inode, pos, &submit_list); - count++; } if (count) wpc->nr_folios++; @@ -1844,21 +1875,6 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, WARN_ON_ONCE(folio_test_writeback(folio)); WARN_ON_ONCE(folio_test_dirty(folio)); - /* - * We cannot cancel the ioend directly here on error. We may have - * already set other pages under writeback and hence we have to run I/O - * completion to mark the error state of the pages under writeback - * appropriately. - */ - if (unlikely(error)) { - /* - * Let the filesystem know what portion of the current page - * failed to map. - */ - if (wpc->ops->discard_folio) - wpc->ops->discard_folio(folio, pos); - } - /* * We can have dirty bits set past end of file in page_mkwrite path * while mapping the last partial folio. Hence it's better to clear From 73e96b2f42adc04f21c78e94f1a8a297691fc59e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Dec 2023 08:27:08 +0100 Subject: [PATCH 0115/1406] iomap: submit ioends immediately Currently the writeback code delays submitting full ioends until we reach the end of the folio.
The reason for that is that otherwise the end I/O handler could clear the writeback bit before we've even finished submitting all I/O for the folio. Add a bias to ifs->write_bytes_pending while we are submitting I/O for a folio so that it never reaches zero until all I/O is completed to prevent the early writeback bit clearing, and remove the now superfluous submit_list. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231207072710.176093-13-hch@lst.de Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 162 +++++++++++++++++++---------------------- 1 file changed, 76 insertions(+), 86 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index b065df8a7c1f5e..17d46580cec867 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1610,30 +1610,34 @@ static void iomap_writepage_end_bio(struct bio *bio) * Submit the final bio for an ioend. * * If @error is non-zero, it means that we have a situation where some part of - * the submission process has failed after we've marked pages for writeback - * and unlocked them. In this situation, we need to fail the bio instead of - * submitting it. This typically only happens on a filesystem shutdown. + * the submission process has failed after we've marked pages for writeback. + * We cannot cancel the ioend directly in that case, so call the bio end I/O handler + * with the error status here to run the normal I/O completion handler to clear + * the writeback bit and let the file system process the errors. */ -static int -iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, - int error) +static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) { + if (!wpc->ioend) + return error; + + /* + * Let the file systems prepare the I/O submission and hook in an I/O + * completion handler. This also needs to happen after a failure so + * that the file system end I/O handler gets called to clean up. + */ if (wpc->ops->prepare_ioend) - error = wpc->ops->prepare_ioend(ioend, error); + error = wpc->ops->prepare_ioend(wpc->ioend, error); + if (error) { - /* - * If we're failing the IO now, just mark the ioend with an - * error and finish it. This will run IO completion immediately - * as there is only one reference to the ioend at this point in - * time. - */ - ioend->io_bio.bi_status = errno_to_blk_status(error); - bio_endio(&ioend->io_bio); - return error; + wpc->ioend->io_bio.bi_status = errno_to_blk_status(error); + bio_endio(&wpc->ioend->io_bio); + } else { + submit_bio(&wpc->ioend->io_bio); } - submit_bio(&ioend->io_bio); - return 0; + wpc->ioend = NULL; + return error; } static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, @@ -1687,19 +1691,28 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) /* * Test to see if we have an existing ioend structure that we could append to * first; otherwise finish off the current ioend and start another. + * + * If a new ioend is created and cached, the old ioend is submitted to the block + * layer instantly. Batching optimisations are provided by higher level block + * plugging. + * + * At the end of a writeback pass, there will be a cached ioend remaining on the + * writepage context that the caller will need to submit.
*/ -static void iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, +static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct folio *folio, - struct inode *inode, loff_t pos, struct list_head *iolist) + struct inode *inode, loff_t pos) { struct iomap_folio_state *ifs = folio->private; unsigned len = i_blocksize(inode); size_t poff = offset_in_folio(folio, pos); + int error; if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) { new_ioend: - if (wpc->ioend) - list_add(&wpc->ioend->io_list, iolist); + error = iomap_submit_ioend(wpc, 0); + if (error) + return error; wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos); } @@ -1710,12 +1723,12 @@ static void iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, atomic_add(len, &ifs->write_bytes_pending); wpc->ioend->io_size += len; wbc_account_cgroup_owner(wbc, &folio->page, len); + return 0; } static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct folio *folio, - struct inode *inode, u64 pos, unsigned *count, - struct list_head *submit_list) + struct inode *inode, u64 pos, unsigned *count) { int error; @@ -1732,8 +1745,9 @@ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, case IOMAP_HOLE: break; default: - iomap_add_to_ioend(wpc, wbc, folio, inode, pos, submit_list); - (*count)++; + error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos); + if (!error) + (*count)++; } fail: @@ -1809,35 +1823,21 @@ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode, return true; } -/* - * We implement an immediate ioend submission policy here to avoid needing to - * chain multiple ioends and hence nest mempool allocations which can violate - * the forward progress guarantees we need to provide. The current ioend we're - * adding blocks to is cached in the writepage context, and if the new block - * doesn't append to the cached ioend, it will create a new ioend and cache that - * instead. - * - * If a new ioend is created and cached, the old ioend is returned and queued - * locally for submission once the entire page is processed or an error has been - * detected. While ioends are submitted immediately after they are completed, - * batching optimisations are provided by higher level block plugging. - * - * At the end of a writeback pass, there will be a cached ioend remaining on the - * writepage context that the caller will need to submit. 
- */ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct folio *folio) { struct iomap_folio_state *ifs = folio->private; struct inode *inode = folio->mapping->host; - struct iomap_ioend *ioend, *next; unsigned len = i_blocksize(inode); unsigned nblocks = i_blocks_per_folio(inode, folio); u64 pos = folio_pos(folio); u64 end_pos = pos + folio_size(folio); unsigned count = 0; int error = 0, i; - LIST_HEAD(submit_list); + + WARN_ON_ONCE(!folio_test_locked(folio)); + WARN_ON_ONCE(folio_test_dirty(folio)); + WARN_ON_ONCE(folio_test_writeback(folio)); trace_iomap_writepage(inode, pos, folio_size(folio)); @@ -1847,12 +1847,27 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, } WARN_ON_ONCE(end_pos <= pos); - if (!ifs && nblocks > 1) { - ifs = ifs_alloc(inode, folio, 0); - iomap_set_range_dirty(folio, 0, end_pos - pos); + if (nblocks > 1) { + if (!ifs) { + ifs = ifs_alloc(inode, folio, 0); + iomap_set_range_dirty(folio, 0, end_pos - pos); + } + + /* + * Keep the I/O completion handler from clearing the writeback + * bit until we have submitted all blocks by adding a bias to + * ifs->write_bytes_pending, which is dropped after submitting + * all blocks. + */ + WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0); + atomic_inc(&ifs->write_bytes_pending); } - WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0); + /* + * Set the writeback bit ASAP, as the I/O completion for the single + * block per folio case can happen as soon as we submit the bio. + */ + folio_start_writeback(folio); /* * Walk through the folio to find areas to write back. If we @@ -1863,18 +1878,13 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, if (ifs && !ifs_block_is_dirty(folio, ifs, i)) continue; error = iomap_writepage_map_blocks(wpc, wbc, folio, inode, pos, - &count, &submit_list); + &count); if (error) break; } if (count) wpc->nr_folios++; - WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list)); - WARN_ON_ONCE(!folio_test_locked(folio)); - WARN_ON_ONCE(folio_test_writeback(folio)); - WARN_ON_ONCE(folio_test_dirty(folio)); - /* * We can have dirty bits set past end of file in page_mkwrite path * while mapping the last partial folio. Hence it's better to clear @@ -1883,38 +1893,20 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, iomap_clear_range_dirty(folio, 0, folio_size(folio)); /* - * If the page hasn't been added to the ioend, it won't be affected by - * I/O completion and we must unlock it now. + * Usually the writeback bit is cleared by the I/O completion handler. + * But we may end up either not actually writing any blocks, or (when + * there are multiple blocks in a folio) all I/O might have finished + * already at this point. In that case we need to clear the writeback + * bit ourselves right after unlocking the page. */ - if (error && !count) { - folio_unlock(folio); - goto done; - } - - folio_start_writeback(folio); folio_unlock(folio); - - /* - * Preserve the original error if there was one; catch - * submission errors here and propagate into subsequent ioend - * submissions.
- */ - list_for_each_entry_safe(ioend, next, &submit_list, io_list) { - int error2; - - list_del_init(&ioend->io_list); - error2 = iomap_submit_ioend(wpc, ioend, error); - if (error2 && !error) - error = error2; + if (ifs) { + if (atomic_dec_and_test(&ifs->write_bytes_pending)) + folio_end_writeback(folio); + } else { + if (!count) + folio_end_writeback(folio); } - - /* - * We can end up here with no error and nothing to write only if we race - * with a partial page truncate on a sub-page block sized filesystem. - */ - if (!count) - folio_end_writeback(folio); -done: mapping_set_error(inode->i_mapping, error); return error; } @@ -1942,9 +1934,7 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, wpc->ops = ops; ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc); - if (!wpc->ioend) - return ret; - return iomap_submit_ioend(wpc, wpc->ioend, ret); + return iomap_submit_ioend(wpc, ret); } EXPORT_SYMBOL_GPL(iomap_writepages); From fd07e0aa23c444075fe07674630762ccfd9333c7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Dec 2023 08:27:09 +0100 Subject: [PATCH 0116/1406] iomap: map multiple blocks at a time The ->map_blocks interface returns a valid range for writeback, but we still call back into it for every block, which is a bit inefficient. Change iomap_writepage_map to use the valid range in the map until the end of the folio or the dirty range inside the folio instead of calling back into every block. Note that the range is not used over folio boundaries as we need to be able to check the mapping sequence count under the folio lock. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231207072710.176093-14-hch@lst.de Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 116 ++++++++++++++++++++++++++++------------- include/linux/iomap.h | 7 +++ 2 files changed, 88 insertions(+), 35 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 17d46580cec867..3dab060aed6d7b 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. - * Copyright (C) 2016-2019 Christoph Hellwig. + * Copyright (C) 2016-2023 Christoph Hellwig. 
*/ #include #include @@ -95,6 +95,44 @@ static inline bool ifs_block_is_dirty(struct folio *folio, return test_bit(block + blks_per_folio, ifs->state); } +static unsigned ifs_find_dirty_range(struct folio *folio, + struct iomap_folio_state *ifs, u64 *range_start, u64 range_end) +{ + struct inode *inode = folio->mapping->host; + unsigned start_blk = + offset_in_folio(folio, *range_start) >> inode->i_blkbits; + unsigned end_blk = min_not_zero( + offset_in_folio(folio, range_end) >> inode->i_blkbits, + i_blocks_per_folio(inode, folio)); + unsigned nblks = 1; + + while (!ifs_block_is_dirty(folio, ifs, start_blk)) + if (++start_blk == end_blk) + return 0; + + while (start_blk + nblks < end_blk) { + if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks)) + break; + nblks++; + } + + *range_start = folio_pos(folio) + (start_blk << inode->i_blkbits); + return nblks << inode->i_blkbits; +} + +static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start, + u64 range_end) +{ + struct iomap_folio_state *ifs = folio->private; + + if (*range_start >= range_end) + return 0; + + if (ifs) + return ifs_find_dirty_range(folio, ifs, range_start, range_end); + return range_end - *range_start; +} + static void ifs_clear_range_dirty(struct folio *folio, struct iomap_folio_state *ifs, size_t off, size_t len) { @@ -1701,10 +1739,9 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) */ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct folio *folio, - struct inode *inode, loff_t pos) + struct inode *inode, loff_t pos, unsigned len) { struct iomap_folio_state *ifs = folio->private; - unsigned len = i_blocksize(inode); size_t poff = offset_in_folio(folio, pos); int error; @@ -1728,29 +1765,41 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct folio *folio, - struct inode *inode, u64 pos, unsigned *count) + struct inode *inode, u64 pos, unsigned dirty_len, + unsigned *count) { int error; - error = wpc->ops->map_blocks(wpc, inode, pos); - if (error) - goto fail; - trace_iomap_writepage_map(inode, &wpc->iomap); - - switch (wpc->iomap.type) { - case IOMAP_INLINE: - WARN_ON_ONCE(1); - error = -EIO; - break; - case IOMAP_HOLE: - break; - default: - error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos); - if (!error) - (*count)++; - } + do { + unsigned map_len; + + error = wpc->ops->map_blocks(wpc, inode, pos); + if (error) + break; + trace_iomap_writepage_map(inode, &wpc->iomap); + + map_len = min_t(u64, dirty_len, + wpc->iomap.offset + wpc->iomap.length - pos); + WARN_ON_ONCE(!folio->private && map_len < dirty_len); + + switch (wpc->iomap.type) { + case IOMAP_INLINE: + WARN_ON_ONCE(1); + error = -EIO; + break; + case IOMAP_HOLE: + break; + default: + error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos, + map_len); + if (!error) + (*count)++; + break; + } + dirty_len -= map_len; + pos += map_len; + } while (dirty_len && !error); -fail: /* * We cannot cancel the ioend directly here on error. We may have * already set other pages under writeback and hence we have to run I/O @@ -1817,7 +1866,7 @@ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode, * beyond i_size. 
*/ folio_zero_segment(folio, poff, folio_size(folio)); - *end_pos = isize; + *end_pos = round_up(isize, i_blocksize(inode)); } return true; @@ -1828,12 +1877,11 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, { struct iomap_folio_state *ifs = folio->private; struct inode *inode = folio->mapping->host; - unsigned len = i_blocksize(inode); - unsigned nblocks = i_blocks_per_folio(inode, folio); u64 pos = folio_pos(folio); u64 end_pos = pos + folio_size(folio); unsigned count = 0; - int error = 0, i; + int error = 0; + u32 rlen; WARN_ON_ONCE(!folio_test_locked(folio)); WARN_ON_ONCE(folio_test_dirty(folio)); @@ -1847,7 +1895,7 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, } WARN_ON_ONCE(end_pos <= pos); - if (nblocks > 1) { + if (i_blocks_per_folio(inode, folio) > 1) { if (!ifs) { ifs = ifs_alloc(inode, folio, 0); iomap_set_range_dirty(folio, 0, end_pos - pos); @@ -1870,18 +1918,16 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, folio_start_writeback(folio); /* - * Walk through the folio to find areas to write back. If we - * run off the end of the current map or find the current map - * invalid, grab a new one. + * Walk through the folio to find dirty areas to write back. */ - for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) { - if (ifs && !ifs_block_is_dirty(folio, ifs, i)) - continue; - error = iomap_writepage_map_blocks(wpc, wbc, folio, inode, pos, - &count); + while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) { + error = iomap_writepage_map_blocks(wpc, wbc, folio, inode, + pos, rlen, &count); if (error) break; + pos += rlen; } + if (count) wpc->nr_folios++; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b8d3b658ad2b03..49d93f53878565 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -309,6 +309,13 @@ struct iomap_writeback_ops { /* * Required, maps the blocks so that writeback can be performed on * the range starting at offset. + * + * Can return arbitrarily large regions, but we need to call into it at + * least once per folio to allow the file systems to synchronize with + * the write path that could be invalidating mappings. + * + * An existing mapping from a previous call to this method can be reused + * by the file system if it is still valid. */ int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode, loff_t offset); From 4837bb5c535395c6934f82582f7ba4052c870c3c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Dec 2023 08:27:10 +0100 Subject: [PATCH 0117/1406] iomap: pass the length of the dirty region to ->map_blocks Let the file system know how much dirty data exists at the passed in offset. This allows file systems to allocate the right amount of space that actually is written back if they can't eagerly convert (e.g. because they don't support unwritten extents). 
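For illustration, a ->map_blocks implementation could use the new length argument roughly as sketched below. This is not part of the patch; fs_allocate_blocks() is a hypothetical stand-in for a real filesystem's allocator, and a real implementation would also revalidate any cached mapping against the filesystem's sequence counter.

static int fs_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
		loff_t offset, unsigned int len)
{
	/* Reuse the mapping from a previous call if it still covers offset. */
	if (offset >= wpc->iomap.offset &&
	    offset < wpc->iomap.offset + wpc->iomap.length)
		return 0;

	/*
	 * Allocate space for the whole dirty region at once: "len" tells us
	 * how much dirty data writeback will submit at this offset, so a
	 * filesystem that cannot eagerly convert (no unwritten extents) can
	 * allocate exactly what will be written back.
	 */
	return fs_allocate_blocks(inode, offset, len, &wpc->iomap);
}
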
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231207072710.176093-15-hch@lst.de Signed-off-by: Christian Brauner --- block/fops.c | 2 +- fs/gfs2/bmap.c | 2 +- fs/iomap/buffered-io.c | 2 +- fs/xfs/xfs_aops.c | 3 ++- fs/zonefs/file.c | 3 ++- include/linux/iomap.h | 2 +- 6 files changed, 8 insertions(+), 6 deletions(-) diff --git a/block/fops.c b/block/fops.c index 0cf8cf72cdfa10..93bae17ce660c8 100644 --- a/block/fops.c +++ b/block/fops.c @@ -482,7 +482,7 @@ static void blkdev_readahead(struct readahead_control *rac) } static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset) + struct inode *inode, loff_t offset, unsigned int len) { loff_t isize = i_size_read(inode); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index d9ccfd27e4f11f..789af5c8fade9d 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -2465,7 +2465,7 @@ int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length) } static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode, - loff_t offset) + loff_t offset, unsigned int len) { int ret; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 3dab060aed6d7b..2ad0e287c70448 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1773,7 +1773,7 @@ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, do { unsigned map_len; - error = wpc->ops->map_blocks(wpc, inode, pos); + error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len); if (error) break; trace_iomap_writepage_map(inode, &wpc->iomap); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4fb244bb884dc4..1698507d1ac73a 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -276,7 +276,8 @@ static int xfs_map_blocks( struct iomap_writepage_ctx *wpc, struct inode *inode, - loff_t offset) + loff_t offset, + unsigned int len) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 6ab2318a9c8e80..8dab4c2ad3007e 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -125,7 +125,8 @@ static void zonefs_readahead(struct readahead_control *rac) * which implies that the page range can only be within the fixed inode size. */ static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset) + struct inode *inode, loff_t offset, + unsigned int len) { struct zonefs_zone *z = zonefs_inode_zone(inode); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 49d93f53878565..6fc1c858013d1e 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -318,7 +318,7 @@ struct iomap_writeback_ops { * by the file system if it is still valid. */ int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode, - loff_t offset); + loff_t offset, unsigned len); /* * Optional, allows the file systems to perform actions just before From d8c70720ebfd4a0ed066d8c05a4fdc669b5896ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=ADcolas=20F=2E=20R=2E=20A=2E=20Prado?= Date: Wed, 10 Jan 2024 18:05:02 -0300 Subject: [PATCH 0118/1406] pstore/ram: Register to module device table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Register the compatible for this module on the module device table so it can be automatically loaded when a matching DT node is present, allowing logging of panics and oopses without any intervention. Signed-off-by: Nícolas F. R. A. 
Prado Reviewed-by: Kees Cook Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20240110210600.787703-2-nfraprado@collabora.com Signed-off-by: Kees Cook --- fs/pstore/ram.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 88b34fdbf7592f..b1a455f42e9328 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -893,6 +893,7 @@ static const struct of_device_id dt_match[] = { { .compatible = "ramoops" }, {} }; +MODULE_DEVICE_TABLE(of, dt_match); static struct platform_driver ramoops_driver = { .probe = ramoops_probe, From 393bd157dbf8471587f2e7c5fadee971e117dc5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=ADcolas=20F=2E=20R=2E=20A=2E=20Prado?= Date: Wed, 10 Jan 2024 18:05:03 -0300 Subject: [PATCH 0119/1406] arm64: defconfig: Enable PSTORE_RAM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable PSTORE_RAM, that is the ramoops driver, in the defconfig, to allow logging and retrieving panics and oopses to/from RAM automatically for platforms that have a ramoops reserved memory node in DT. Signed-off-by: Nícolas F. R. A. Prado Reviewed-by: David Heidelberg Link: https://lore.kernel.org/r/20240110210600.787703-3-nfraprado@collabora.com Signed-off-by: Kees Cook --- arch/arm64/configs/defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index e6cf3e5d63c301..b0652c52bf3ab8 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1590,6 +1590,7 @@ CONFIG_CONFIGFS_FS=y CONFIG_EFIVAR_FS=y CONFIG_UBIFS_FS=m CONFIG_SQUASHFS=y +CONFIG_PSTORE_RAM=m CONFIG_NFS_FS=y CONFIG_NFS_V4=y CONFIG_NFS_V4_1=y From c3f849caf81b7317d3cc2132294226314c710f62 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Wed, 3 Jan 2024 15:40:32 -0300 Subject: [PATCH 0120/1406] efi: pstore: Allow dynamic initialization based on module parameter The efi-pstore module parameter "pstore_disable" exists so that users can deactivate the backend. There is also a Kconfig option for the default value of this parameter. It was originally added due to some bad UEFI FW implementations that could break when many variables were written. Some distros (such as Arch Linux) still set this in their config files today. Once it is set, even though it is a writable module parameter, there is effectively no way to make use of efi-pstore anymore: if "pstore_disable" is set to true, the init function of the module exits early and is never called again after initcall processing. Let's switch this module parameter to have a callback and perform the pstore backend registration again each time it's set from Y->N (and vice-versa). With this, the writable nature of the parameter starts to make sense, given that users can now enable or disable efi-pstore at runtime by writing into it. Signed-off-by: Guilherme G.
Piccoli Link: https://lore.kernel.org/r/20240103184053.226203-1-gpiccoli@igalia.com Signed-off-by: Kees Cook --- drivers/firmware/efi/efi-pstore.c | 43 +++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c index e7b9ec6f8a86a9..833cbb995dd3f2 100644 --- a/drivers/firmware/efi/efi-pstore.c +++ b/drivers/firmware/efi/efi-pstore.c @@ -14,16 +14,43 @@ static unsigned int record_size = 1024; module_param(record_size, uint, 0444); MODULE_PARM_DESC(record_size, "size of each pstore UEFI var (in bytes, min/default=1024)"); -static bool efivars_pstore_disable = - IS_ENABLED(CONFIG_EFI_VARS_PSTORE_DEFAULT_DISABLE); - -module_param_named(pstore_disable, efivars_pstore_disable, bool, 0644); - #define PSTORE_EFI_ATTRIBUTES \ (EFI_VARIABLE_NON_VOLATILE | \ EFI_VARIABLE_BOOTSERVICE_ACCESS | \ EFI_VARIABLE_RUNTIME_ACCESS) +static bool pstore_disable = IS_ENABLED(CONFIG_EFI_VARS_PSTORE_DEFAULT_DISABLE); + +static int efivars_pstore_init(void); +static void efivars_pstore_exit(void); + +static int efi_pstore_disable_set(const char *val, const struct kernel_param *kp) +{ + int err; + bool old_pstore_disable = pstore_disable; + + err = param_set_bool(val, kp); + if (err) + return err; + + if (old_pstore_disable != pstore_disable) { + if (pstore_disable) + efivars_pstore_exit(); + else + efivars_pstore_init(); + } + + return 0; +} + +static const struct kernel_param_ops pstore_disable_ops = { + .set = efi_pstore_disable_set, + .get = param_get_bool, +}; + +module_param_cb(pstore_disable, &pstore_disable_ops, &pstore_disable, 0644); +__MODULE_PARM_TYPE(pstore_disable, "bool"); + static int efi_pstore_open(struct pstore_info *psi) { int err; @@ -218,12 +245,12 @@ static struct pstore_info efi_pstore_info = { .erase = efi_pstore_erase, }; -static __init int efivars_pstore_init(void) +static int efivars_pstore_init(void) { if (!efivar_supports_writes()) return 0; - if (efivars_pstore_disable) + if (pstore_disable) return 0; /* @@ -250,7 +277,7 @@ static __init int efivars_pstore_init(void) return 0; } -static __exit void efivars_pstore_exit(void) +static void efivars_pstore_exit(void) { if (!efi_pstore_info.bufsize) return; From 69f381e67d6fe94c4f1416fbd5672b715cae098a Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Thu, 18 Jan 2024 18:02:06 +0800 Subject: [PATCH 0121/1406] pstore/zone: Add a null pointer check to the psz_kmsg_read kasprintf() returns a pointer to dynamically allocated memory which can be NULL upon failure. Ensure the allocation was successful by checking the pointer validity. 
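The fix follows the standard pattern for formatted allocations; a minimal, self-contained sketch of that pattern is shown below (the wrapper function is made up for illustration, not from fs/pstore):

static int example_reason_header(struct pstore_record *record, char **out)
{
	/* kasprintf() allocates the buffer; it returns NULL on failure. */
	char *buf = kasprintf(GFP_KERNEL, "%s: Total %d times\n",
			      kmsg_dump_reason_str(record->reason),
			      record->count);

	if (!buf)
		return -ENOMEM;

	*out = buf;	/* caller releases with kfree() */
	return 0;
}
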
Signed-off-by: Kunwu Chan Link: https://lore.kernel.org/r/20240118100206.213928-1-chentao@kylinos.cn Signed-off-by: Kees Cook --- fs/pstore/zone.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index 2770746bb7aa16..abca117725c816 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -973,6 +973,8 @@ static ssize_t psz_kmsg_read(struct pstore_zone *zone, char *buf = kasprintf(GFP_KERNEL, "%s: Total %d times\n", kmsg_dump_reason_str(record->reason), record->count); + if (!buf) + return -ENOMEM; hlen = strlen(buf); record->buf = krealloc(buf, hlen + size, GFP_KERNEL); if (!record->buf) { From f25e87ea4aea3c2d472c54d31318e852043d56fa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Jan 2024 21:01:05 -0500 Subject: [PATCH 0122/1406] fs/pipe: Convert to lockdep_cmp_fn *_lock_nested() is fundamentally broken; lockdep needs to check lock ordering, but we cannot devise a total ordering on an unbounded number of elements with only a few subclasses. The replacement is to define lock ordering with a proper comparison function. fs/pipe.c was already doing everything correctly otherwise, nothing much changes here. Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Signed-off-by: Kent Overstreet Link: https://lore.kernel.org/r/20240127020111.487218-2-kent.overstreet@linux.dev Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/pipe.c | 81 +++++++++++++++++++++++++------------------------ 1 file changed, 36 insertions(+), 45 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index f1adbfe743d4a7..50c8a8596b5245 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -76,18 +76,20 @@ static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; * -- Manfred Spraul 2002-05-09 */ -static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) +#define cmp_int(l, r) ((l > r) - (l < r)) + +#ifdef CONFIG_PROVE_LOCKING +static int pipe_lock_cmp_fn(const struct lockdep_map *a, + const struct lockdep_map *b) { - if (pipe->files) - mutex_lock_nested(&pipe->mutex, subclass); + return cmp_int((unsigned long) a, (unsigned long) b); } +#endif void pipe_lock(struct pipe_inode_info *pipe) { - /* - * pipe_lock() nests non-pipe inode locks (for writing to a file) - */ - pipe_lock_nested(pipe, I_MUTEX_PARENT); + if (pipe->files) + mutex_lock(&pipe->mutex); } EXPORT_SYMBOL(pipe_lock); @@ -98,28 +100,16 @@ void pipe_unlock(struct pipe_inode_info *pipe) } EXPORT_SYMBOL(pipe_unlock); -static inline void __pipe_lock(struct pipe_inode_info *pipe) -{ - mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT); -} - -static inline void __pipe_unlock(struct pipe_inode_info *pipe) -{ - mutex_unlock(&pipe->mutex); -} - void pipe_double_lock(struct pipe_inode_info *pipe1, struct pipe_inode_info *pipe2) { BUG_ON(pipe1 == pipe2); - if (pipe1 < pipe2) { - pipe_lock_nested(pipe1, I_MUTEX_PARENT); - pipe_lock_nested(pipe2, I_MUTEX_CHILD); - } else { - pipe_lock_nested(pipe2, I_MUTEX_PARENT); - pipe_lock_nested(pipe1, I_MUTEX_CHILD); - } + if (pipe1 > pipe2) + swap(pipe1, pipe2); + + pipe_lock(pipe1); + pipe_lock(pipe2); } static void anon_pipe_buf_release(struct pipe_inode_info *pipe, @@ -271,7 +261,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) return 0; ret = 0; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); /* * We only wake up writers if the pipe was full when we started @@ -368,7 +358,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) ret = -EAGAIN; break; } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); /* * We only get here if we didn't
actually read anything. @@ -400,13 +390,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); wake_next_reader = true; } if (pipe_empty(pipe->head, pipe->tail)) wake_next_reader = false; - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); if (was_full) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); @@ -462,7 +452,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) if (unlikely(total_len == 0)) return 0; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); if (!pipe->readers) { send_sig(SIGPIPE, current, 0); @@ -582,19 +572,19 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) * after waiting we need to re-check whether the pipe * become empty while we dropped the lock. */ - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); if (was_empty) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe)); - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); was_empty = pipe_empty(pipe->head, pipe->tail); wake_next_writer = true; } out: if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) wake_next_writer = false; - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); /* * If we do do a wakeup event, we do a 'sync' wakeup, because we @@ -629,7 +619,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) switch (cmd) { case FIONREAD: - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); count = 0; head = pipe->head; tail = pipe->tail; @@ -639,16 +629,16 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) count += pipe->bufs[tail & mask].len; tail++; } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return put_user(count, (int __user *)arg); #ifdef CONFIG_WATCH_QUEUE case IOC_WATCH_QUEUE_SET_SIZE: { int ret; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); ret = watch_queue_set_size(pipe, arg); - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return ret; } @@ -734,7 +724,7 @@ pipe_release(struct inode *inode, struct file *file) { struct pipe_inode_info *pipe = file->private_data; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); if (file->f_mode & FMODE_READ) pipe->readers--; if (file->f_mode & FMODE_WRITE) @@ -747,7 +737,7 @@ pipe_release(struct inode *inode, struct file *file) kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); put_pipe_info(inode, pipe); return 0; @@ -759,7 +749,7 @@ pipe_fasync(int fd, struct file *filp, int on) struct pipe_inode_info *pipe = filp->private_data; int retval = 0; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); if (filp->f_mode & FMODE_READ) retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); if ((filp->f_mode & FMODE_WRITE) && retval >= 0) { @@ -768,7 +758,7 @@ pipe_fasync(int fd, struct file *filp, int on) /* this can happen only if on == T */ fasync_helper(-1, filp, 0, &pipe->fasync_readers); } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return retval; } @@ -834,6 +824,7 @@ struct pipe_inode_info *alloc_pipe_info(void) pipe->nr_accounted = pipe_bufs; pipe->user = user; mutex_init(&pipe->mutex); + lock_set_cmp_fn(&pipe->mutex, pipe_lock_cmp_fn, NULL); return pipe; } @@ -1144,7 +1135,7 @@ static int 
fifo_open(struct inode *inode, struct file *filp) filp->private_data = pipe; /* OK, we have a pipe and it's pinned down */ - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); /* We can only do regular read/write on fifos */ stream_open(inode, filp); @@ -1214,7 +1205,7 @@ static int fifo_open(struct inode *inode, struct file *filp) } /* Ok! */ - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return 0; err_rd: @@ -1230,7 +1221,7 @@ static int fifo_open(struct inode *inode, struct file *filp) goto err; err: - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); put_pipe_info(inode, pipe); return ret; @@ -1411,7 +1402,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg) if (!pipe) return -EBADF; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); switch (cmd) { case F_SETPIPE_SZ: @@ -1425,7 +1416,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg) break; } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return ret; } From e4787a3b8664a0a626a9887a192f7ecd2573b1e4 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 10 Apr 2023 21:04:50 +0900 Subject: [PATCH 0123/1406] sysv: don't call sb_bread() with pointers_lock held syzbot is reporting a sleep-in-atomic-context bug in the SysV filesystem [1], because sb_bread() is called with an rw_spinlock held. A "write_lock(&pointers_lock) => read_lock(&pointers_lock) deadlock" bug and a "sb_bread() with write_lock(&pointers_lock)" bug were introduced by "Replace BKL for chain locking with sysvfs-private rwlock" in Linux 2.5.12. Then, "[PATCH] err1-40: sysvfs locking fix" in Linux 2.6.8 fixed the former bug by moving the pointers_lock locking to the callers, but instead introduced a "sb_bread() with read_lock(&pointers_lock)" bug (which made this problem easier to hit). Al Viro suggested doing what get_branch()/get_block()/find_shared() in the Minix filesystem do. Doing so is almost a revert of "[PATCH] err1-40: sysvfs locking fix", except that get_branch(), when called from within find_shared(), runs without write_lock(&pointers_lock).
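The underlying rule: sb_bread() can sleep, so it must never run inside the rw_spinlock; the lock is taken only around the pointer-chain validation. A simplified before/after sketch of the shape of the fix, condensed from the diff below rather than the literal sysv code:

	/* Broken: sleeping sb_bread() called with pointers_lock held. */
	read_lock(&pointers_lock);
	bh = sb_bread(sb, block);
	read_unlock(&pointers_lock);

	/* Fixed: read the block first, then validate the chain under the lock. */
	bh = sb_bread(sb, block);        /* may sleep: no spinlock held */
	if (!bh)
		return NULL;
	read_lock(&pointers_lock);
	if (!verify_chain(chain, p)) {   /* did the chain change while we slept? */
		read_unlock(&pointers_lock);
		brelse(bh);
		return NULL;             /* caller retries on -EAGAIN */
	}
	add_chain(++p, bh, (sysv_zone_t *)bh->b_data + *++offsets);
	read_unlock(&pointers_lock);
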
Reported-by: syzbot Link: https://syzkaller.appspot.com/bug?extid=69b40dc5fd40f32c199f Suggested-by: Al Viro Signed-off-by: Tetsuo Handa Link: https://lore.kernel.org/r/0d195f93-a22a-49a2-0020-103534d6f7f6@I-love.SAKURA.ne.jp Signed-off-by: Christian Brauner --- fs/sysv/itree.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 410ab2a44d2f60..19bcb51a220366 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -83,9 +83,6 @@ static inline sysv_zone_t *block_end(struct buffer_head *bh) return (sysv_zone_t*)((char*)bh->b_data + bh->b_size); } -/* - * Requires read_lock(&pointers_lock) or write_lock(&pointers_lock) - */ static Indirect *get_branch(struct inode *inode, int depth, int offsets[], @@ -105,15 +102,18 @@ static Indirect *get_branch(struct inode *inode, bh = sb_bread(sb, block); if (!bh) goto failure; + read_lock(&pointers_lock); if (!verify_chain(chain, p)) goto changed; add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets); + read_unlock(&pointers_lock); if (!p->key) goto no_block; } return NULL; changed: + read_unlock(&pointers_lock); brelse(bh); *err = -EAGAIN; goto no_block; @@ -219,9 +219,7 @@ static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *b goto out; reread: - read_lock(&pointers_lock); partial = get_branch(inode, depth, offsets, chain, &err); - read_unlock(&pointers_lock); /* Simplest case - block found, no allocation needed */ if (!partial) { @@ -291,9 +289,9 @@ static Indirect *find_shared(struct inode *inode, *top = 0; for (k = depth; k > 1 && !offsets[k-1]; k--) ; + partial = get_branch(inode, k, offsets, chain, &err); write_lock(&pointers_lock); - partial = get_branch(inode, k, offsets, chain, &err); if (!partial) partial = chain + k-1; /* From 84baca3a56159bbc1fdd1adc54a469f3784ff3f0 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Mon, 29 Jan 2024 19:00:23 +0100 Subject: [PATCH 0124/1406] ntfs3: use file_mnt_idmap helper Let's use file_mnt_idmap() as we do that across the tree. No functional impact. Cc: Christian Brauner Cc: Konstantin Komarov Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20240129180024.219766-1-aleksandr.mikhalitsyn@canonical.com Signed-off-by: Christian Brauner --- fs/ntfs3/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index ee3093be51701e..144aa80cca433e 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -419,7 +419,7 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry, * fnd contains tree's path to insert to. * If fnd is not NULL then dir is locked. */ - inode = ntfs_create_inode(mnt_idmap(file->f_path.mnt), dir, dentry, uni, + inode = ntfs_create_inode(file_mnt_idmap(file), dir, dentry, uni, mode, 0, NULL, 0, fnd); err = IS_ERR(inode) ? PTR_ERR(inode) : finish_open(file, dentry, ntfs_file_open); From 41ef33da2c49e0f36c8a088c6bd123c49f43e0b6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 29 Jan 2024 10:37:29 -0800 Subject: [PATCH 0125/1406] iov_iter: Avoid wrap-around instrumentation in copy_compat_iovec_from_user() The loop counter "i" in copy_compat_iovec_from_user() is an int, but because the nr_segs argument is unsigned long, the signed overflow sanitizer got worried "i" could wrap around. Instead of making "i" an unsigned long (which may enlarge the type size), switch both nr_segs and i to u32. There is no truncation with nr_segs since it is never larger than UIO_MAXIOV anyway.
This keeps sanitizer instrumentation[1] out of a UACCESS path: vmlinux.o: warning: objtool: copy_compat_iovec_from_user+0xa9: call to __ubsan_handle_add_overflow() with UACCESS enabled Link: https://github.com/KSPP/linux/issues/26 [1] Cc: Christian Brauner Cc: Alexander Viro Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20240129183729.work.991-kees@kernel.org Signed-off-by: Christian Brauner --- lib/iov_iter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 15f5040709c36e..73715d10c812bf 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1167,11 +1167,12 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) EXPORT_SYMBOL(dup_iter); static __noclone int copy_compat_iovec_from_user(struct iovec *iov, - const struct iovec __user *uvec, unsigned long nr_segs) + const struct iovec __user *uvec, u32 nr_segs) { const struct compat_iovec __user *uiov = (const struct compat_iovec __user *)uvec; - int ret = -EFAULT, i; + int ret = -EFAULT; + u32 i; if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) return -EFAULT; From 8cf1cc7e3c79bf7582ddc54d7036c231a2979271 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 29 Jan 2024 10:40:15 -0800 Subject: [PATCH 0126/1406] select: Avoid wrap-around instrumentation in do_sys_poll() The mix of int, unsigned int, and unsigned long used by struct poll_list::len and the local variables todo, len, and j meant that the signed overflow sanitizer got worried it needed to instrument several places where arithmetic happens between these variables. Since all of the variables are always positive and bounded by unsigned int, use a single type in all places. Additionally expand the zero-test into an explicit range check before updating "todo". This keeps sanitizer instrumentation[1] out of a UACCESS path: vmlinux.o: warning: objtool: do_sys_poll+0x285: call to __ubsan_handle_sub_overflow() with UACCESS enabled Link: https://github.com/KSPP/linux/issues/26 [1] Cc: Christian Brauner Cc: Alexander Viro Cc: Jan Kara Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20240129184014.work.593-kees@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/select.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/select.c b/fs/select.c index 0ee55af1a55c29..11a3b1312abeff 100644 --- a/fs/select.c +++ b/fs/select.c @@ -839,7 +839,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) struct poll_list { struct poll_list *next; - int len; + unsigned int len; struct pollfd entries[]; }; @@ -975,14 +975,15 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec64 *end_time) { struct poll_wqueues table; - int err = -EFAULT, fdcount, len; + int err = -EFAULT, fdcount; /* Allocate small arguments on the stack to save memory and be faster - use long to make sure the buffer is aligned properly on 64 bit archs to avoid unaligned access */ long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; struct poll_list *const head = (struct poll_list *)stack_pps; struct poll_list *walk = head; - unsigned long todo = nfds; + unsigned int todo = nfds; + unsigned int len; if (nfds > rlimit(RLIMIT_NOFILE)) return -EINVAL; @@ -998,9 +999,9 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, sizeof(struct pollfd) * walk->len)) goto out_fds; - todo -= walk->len; - if (!todo) + if (walk->len >= todo) break; + todo -= walk->len; len = min(todo, POLLFD_PER_PAGE); walk = walk->next = kmalloc(struct_size(walk,
entries, len), @@ -1020,7 +1021,7 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, for (walk = head; walk; walk = walk->next) { struct pollfd *fds = walk->entries; - int j; + unsigned int j; for (j = walk->len; j; fds++, ufds++, j--) unsafe_put_user(fds->revents, &ufds->revents, Efault); From b14310e704918c921d7365992caf7a46af430a7f Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Wed, 31 Jan 2024 15:09:41 +0800 Subject: [PATCH 0127/1406] fs: Use KMEM_CACHE instead of kmem_cache_create Commit 0a31bd5f2bbb ("KMEM_CACHE(): simplify slab cache creation") introduced a new macro. Use the new KMEM_CACHE() macro instead of direct kmem_cache_create to simplify the creation of SLAB caches. Signed-off-by: Kunwu Chan Link: https://lore.kernel.org/r/20240131070941.135178-1-chentao@kylinos.cn Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/backing-file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/backing-file.c b/fs/backing-file.c index a681f38d84d8e1..740185198db347 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -325,9 +325,7 @@ EXPORT_SYMBOL_GPL(backing_file_mmap); static int __init backing_aio_init(void) { - backing_aio_cachep = kmem_cache_create("backing_aio", - sizeof(struct backing_aio), - 0, SLAB_HWCACHE_ALIGN, NULL); + backing_aio_cachep = KMEM_CACHE(backing_aio, SLAB_HWCACHE_ALIGN); if (!backing_aio_cachep) return -ENOMEM; From 3a3dbab01e6f8ebd9921bf1bca4c71f61dee9ee0 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Thu, 1 Feb 2024 17:34:26 +0800 Subject: [PATCH 0128/1406] mbcache: Simplify the allocation of slab caches Use the new KMEM_CACHE() macro instead of direct kmem_cache_create to simplify the creation of SLAB caches. Signed-off-by: Kunwu Chan Link: https://lore.kernel.org/r/20240201093426.207932-1-chentao@kylinos.cn Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/mbcache.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/mbcache.c b/fs/mbcache.c index 82aa7a35db26b3..fe2624e1725339 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -426,9 +426,8 @@ EXPORT_SYMBOL(mb_cache_destroy); static int __init mbcache_init(void) { - mb_entry_cache = kmem_cache_create("mbcache", - sizeof(struct mb_cache_entry), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + mb_entry_cache = KMEM_CACHE(mb_cache_entry, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD); if (!mb_entry_cache) return -ENOMEM; return 0; From 29fe925f36e2909743bc0f25bcf32a9ef80bd5db Mon Sep 17 00:00:00 2001 From: Chen Hanxiao Date: Fri, 2 Feb 2024 15:20:42 +0800 Subject: [PATCH 0129/1406] __fs_parse: Correct a documentation comment Commit 7f5d38141e30 ("new primitive: __fs_parse()") changed the function to take a p_log instead of an fs_context, so update the documentation comment to refer to p_log. Signed-off-by: Chen Hanxiao Link: https://lore.kernel.org/r/20240202072042.906-1-chenhx.fnst@fujitsu.com Signed-off-by: Christian Brauner --- fs/fs_parser.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fs_parser.c b/fs/fs_parser.c index edb3712dcfa580..a4d6ca0b8971e6 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -83,8 +83,8 @@ static const struct fs_parameter_spec *fs_lookup_key( } /* - * fs_parse - Parse a filesystem configuration parameter - * @fc: The filesystem context to log errors through. + * __fs_parse - Parse a filesystem configuration parameter + * @log: The filesystem context to log errors through. * @desc: The parameter description to use. * @param: The parameter.
* @result: Where to place the result of the parse From 6b7ea67e3f20b64730df5bc6dd9243b5ebf9abe4 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 27 Dec 2023 11:10:03 +0100 Subject: [PATCH 0130/1406] Bluetooth: hci_bcm4377: do not mark valid bd_addr as invalid A recent commit restored the original (and still documented) semantics for the HCI_QUIRK_USE_BDADDR_PROPERTY quirk so that the device address is considered invalid unless an address is provided by firmware. This specifically means that this flag must only be set for devices with invalid addresses, but the Broadcom BCM4377 driver has so far been setting this flag unconditionally. Fortunately the driver already checks for invalid addresses during setup and sets the HCI_QUIRK_INVALID_BDADDR flag, which can simply be replaced with HCI_QUIRK_USE_BDADDR_PROPERTY to indicate that the default address is invalid but can be overridden by firmware (long term, this should probably just always be allowed). Fixes: 6945795bc81a ("Bluetooth: fix use-bdaddr-property quirk") Cc: stable@vger.kernel.org # 6.5 Reported-by: Felix Zhang Link: https://lore.kernel.org/r/77419ffacc5b4875e920e038332575a2a5bff29f.camel@mrman314.tech/ Signed-off-by: Johan Hovold Reported-by: Felix Zhang Reviewed-by: Neal Gompa Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/hci_bcm4377.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/bluetooth/hci_bcm4377.c b/drivers/bluetooth/hci_bcm4377.c index a617578356953c..9a7243d5db71ff 100644 --- a/drivers/bluetooth/hci_bcm4377.c +++ b/drivers/bluetooth/hci_bcm4377.c @@ -1417,7 +1417,7 @@ static int bcm4377_check_bdaddr(struct bcm4377_data *bcm4377) bda = (struct hci_rp_read_bd_addr *)skb->data; if (!bcm4377_is_valid_bdaddr(bcm4377, &bda->bdaddr)) - set_bit(HCI_QUIRK_INVALID_BDADDR, &bcm4377->hdev->quirks); + set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &bcm4377->hdev->quirks); kfree_skb(skb); return 0; @@ -2368,7 +2368,6 @@ static int bcm4377_probe(struct pci_dev *pdev, const struct pci_device_id *id) hdev->set_bdaddr = bcm4377_hci_set_bdaddr; hdev->setup = bcm4377_hci_setup; - set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); if (bcm4377->hw->broken_mws_transport_config) set_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &hdev->quirks); if (bcm4377->hw->broken_ext_scan) From efe0ec2934ec398b65ceac53940d7af34dac087e Mon Sep 17 00:00:00 2001 From: Max Chou Date: Tue, 26 Dec 2023 19:45:17 +0800 Subject: [PATCH 0131/1406] Bluetooth: btrtl: Add the support for RTL8852BT/RTL8852BE-VT Add the support for RTL8852BT/RTL8852BE-VT BT controller on USB interface. The necessary firmware will be submitted to linux-firmware project. The device info from /sys/kernel/debug/usb/devices as below. T: Bus=02 Lev=02 Prnt=02 Port=05 Cnt=01 Dev#= 8 Spd=12 MxCh= 0 D: Ver= 1.00 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=0bda ProdID=8520 Rev= 0.00 S: Manufacturer=Realtek S: Product=Bluetooth Radio S: SerialNumber=00e04c000001 C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) 
MxPS= 16 Ivl=1ms E: Ad=02(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms Signed-off-by: Max Chou Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btrtl.c | 14 ++++++++++++++ drivers/bluetooth/btusb.c | 3 +++ 2 files changed, 17 insertions(+) diff --git a/drivers/bluetooth/btrtl.c b/drivers/bluetooth/btrtl.c index 277d039ecbb429..cc50de69e8dc98 100644 --- a/drivers/bluetooth/btrtl.c +++ b/drivers/bluetooth/btrtl.c @@ -69,6 +69,7 @@ enum btrtl_chip_id { CHIP_ID_8852B = 20, CHIP_ID_8852C = 25, CHIP_ID_8851B = 36, + CHIP_ID_8852BT = 47, }; struct id_table { @@ -307,6 +308,15 @@ static const struct id_table ic_id_table[] = { .fw_name = "rtl_bt/rtl8851bu_fw", .cfg_name = "rtl_bt/rtl8851bu_config", .hw_info = "rtl8851bu" }, + + /* 8852BT/8852BE-VT */ + { IC_INFO(RTL_ROM_LMP_8852A, 0x87, 0xc, HCI_USB), + .config_needed = false, + .has_rom_version = true, + .has_msft_ext = true, + .fw_name = "rtl_bt/rtl8852btu_fw", + .cfg_name = "rtl_bt/rtl8852btu_config", + .hw_info = "rtl8852btu" }, }; static const struct id_table *btrtl_match_ic(u16 lmp_subver, u16 hci_rev, @@ -645,6 +655,7 @@ static int rtlbt_parse_firmware(struct hci_dev *hdev, { RTL_ROM_LMP_8852A, 20 }, /* 8852B */ { RTL_ROM_LMP_8852A, 25 }, /* 8852C */ { RTL_ROM_LMP_8851B, 36 }, /* 8851B */ + { RTL_ROM_LMP_8852A, 47 }, /* 8852BT */ }; if (btrtl_dev->fw_len <= 8) @@ -1275,6 +1286,7 @@ void btrtl_set_quirks(struct hci_dev *hdev, struct btrtl_device_info *btrtl_dev) case CHIP_ID_8852B: case CHIP_ID_8852C: case CHIP_ID_8851B: + case CHIP_ID_8852BT: set_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks); set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); @@ -1505,6 +1517,8 @@ MODULE_FIRMWARE("rtl_bt/rtl8852bs_fw.bin"); MODULE_FIRMWARE("rtl_bt/rtl8852bs_config.bin"); MODULE_FIRMWARE("rtl_bt/rtl8852bu_fw.bin"); MODULE_FIRMWARE("rtl_bt/rtl8852bu_config.bin"); +MODULE_FIRMWARE("rtl_bt/rtl8852btu_fw.bin"); +MODULE_FIRMWARE("rtl_bt/rtl8852btu_config.bin"); MODULE_FIRMWARE("rtl_bt/rtl8852cu_fw.bin"); MODULE_FIRMWARE("rtl_bt/rtl8852cu_fw_v2.bin"); MODULE_FIRMWARE("rtl_bt/rtl8852cu_config.bin"); diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index d31edad7a05607..c4e0456153d8e3 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -553,6 +553,9 @@ static const struct usb_device_id quirks_table[] = { { USB_DEVICE(0x13d3, 0x3572), .driver_info = BTUSB_REALTEK | BTUSB_WIDEBAND_SPEECH }, + /* Realtek 8852BT/8852BE-VT Bluetooth devices */ + { USB_DEVICE(0x0bda, 0x8520), .driver_info = BTUSB_REALTEK | + BTUSB_WIDEBAND_SPEECH }, /* Realtek Bluetooth devices */ { 
USB_VENDOR_AND_INTERFACE_INFO(0x0bda, 0xe0, 0x01, 0x01), .driver_info = BTUSB_REALTEK }, From 7479b901338689f21866ef9db0b5158dc78e2103 Mon Sep 17 00:00:00 2001 From: Neeraj Sanjay Kale Date: Wed, 27 Dec 2023 18:59:27 +0530 Subject: [PATCH 0132/1406] Bluetooth: btnxpuart: Resolve TX timeout error in power save stress test This fixes the TX timeout issue seen while running a stress test on btnxpuart for a couple of hours, with the interval between two HCI commands coinciding with the power save timeout value of 2 seconds. Test procedure using bash script: hciconfig hci0 up //Enable Power Save feature hcitool -i hci0 cmd 3f 23 02 00 00 while (true) do hciconfig hci0 leadv sleep 2 hciconfig hci0 noleadv sleep 2 done Error log, after adding a few more debug prints: Bluetooth: btnxpuart_queue_skb(): 01 0A 20 01 00 Bluetooth: hci0: Set UART break: on, status=0 Bluetooth: hci0: btnxpuart_tx_wakeup() tx_work scheduled Bluetooth: hci0: btnxpuart_tx_work() dequeue: 01 0A 20 01 00 Can't set advertise mode on hci0: Connection timed out (110) Bluetooth: hci0: command 0x200a tx timeout When the power save mechanism turns on UART break and btnxpuart_tx_work() is scheduled simultaneously, psdata->ps_state is read as PS_STATE_AWAKE, which prevents psdata->work, the work item responsible for turning off UART break, from being scheduled. This issue is fixed by adding a ps_lock mutex around UART break on/off as well as around ps_state read/write. btnxpuart_tx_wakeup() will now read the updated ps_state value. If ps_state is PS_STATE_SLEEP, it will first schedule psdata->work and then reschedule itself once UART break has been turned off and ps_state is PS_STATE_AWAKE. The above script was tested for 50,000 iterations and the TX timeout error was no longer observed. Signed-off-by: Neeraj Sanjay Kale Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btnxpuart.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/drivers/bluetooth/btnxpuart.c b/drivers/bluetooth/btnxpuart.c index 1d592ac413d1ff..55b6e3dcd4ecf4 100644 --- a/drivers/bluetooth/btnxpuart.c +++ b/drivers/bluetooth/btnxpuart.c @@ -126,6 +126,7 @@ struct ps_data { struct hci_dev *hdev; struct work_struct work; struct timer_list ps_timer; + struct mutex ps_lock; }; struct wakeup_cmd_payload { @@ -317,6 +318,9 @@ static void ps_start_timer(struct btnxpuart_dev *nxpdev) if (psdata->cur_psmode == PS_MODE_ENABLE) mod_timer(&psdata->ps_timer, jiffies + msecs_to_jiffies(psdata->h2c_ps_interval)); + + if (psdata->ps_state == PS_STATE_AWAKE && psdata->ps_cmd == PS_CMD_ENTER_PS) + cancel_work_sync(&psdata->work); } static void ps_cancel_timer(struct btnxpuart_dev *nxpdev) @@ -337,6 +341,7 @@ static void ps_control(struct hci_dev *hdev, u8 ps_state) !test_bit(BTNXPUART_SERDEV_OPEN, &nxpdev->tx_state)) return; + mutex_lock(&psdata->ps_lock); switch (psdata->cur_h2c_wakeupmode) { case WAKEUP_METHOD_DTR: if (ps_state == PS_STATE_AWAKE) @@ -350,12 +355,15 @@ static void ps_control(struct hci_dev *hdev, u8 ps_state) status = serdev_device_break_ctl(nxpdev->serdev, 0); else status = serdev_device_break_ctl(nxpdev->serdev, -1); + msleep(20); /* Allow chip to detect UART-break and enter sleep */ bt_dev_dbg(hdev, "Set UART break: %s, status=%d", str_on_off(ps_state == PS_STATE_SLEEP), status); break; } if (!status) psdata->ps_state = ps_state; + mutex_unlock(&psdata->ps_lock); + if (ps_state == PS_STATE_AWAKE) btnxpuart_tx_wakeup(nxpdev); } @@ -391,17 +399,25 @@ static void ps_setup(struct hci_dev *hdev) psdata->hdev = hdev;
INIT_WORK(&psdata->work, ps_work_func); + mutex_init(&psdata->ps_lock); timer_setup(&psdata->ps_timer, ps_timeout_func, 0); } -static void ps_wakeup(struct btnxpuart_dev *nxpdev) +static bool ps_wakeup(struct btnxpuart_dev *nxpdev) { struct ps_data *psdata = &nxpdev->psdata; + u8 ps_state; - if (psdata->ps_state != PS_STATE_AWAKE) { + mutex_lock(&psdata->ps_lock); + ps_state = psdata->ps_state; + mutex_unlock(&psdata->ps_lock); + + if (ps_state != PS_STATE_AWAKE) { psdata->ps_cmd = PS_CMD_EXIT_PS; schedule_work(&psdata->work); + return true; } + return false; } static int send_ps_cmd(struct hci_dev *hdev, void *data) @@ -1171,7 +1187,6 @@ static struct sk_buff *nxp_dequeue(void *data) { struct btnxpuart_dev *nxpdev = (struct btnxpuart_dev *)data; - ps_wakeup(nxpdev); ps_start_timer(nxpdev); return skb_dequeue(&nxpdev->txq); } @@ -1186,6 +1201,9 @@ static void btnxpuart_tx_work(struct work_struct *work) struct sk_buff *skb; int len; + if (ps_wakeup(nxpdev)) + return; + while ((skb = nxp_dequeue(nxpdev))) { len = serdev_device_write_buf(serdev, skb->data, skb->len); hdev->stat.byte_tx += len; From e82fd71a581857dd4fa2693a9c96c3871415244f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= Date: Tue, 2 Jan 2024 19:08:08 +0100 Subject: [PATCH 0133/1406] Bluetooth: hci_sync: Check the correct flag before starting a scan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's a very confusing mistake in the code starting a HCI inquiry: We're calling hci_dev_test_flag() to test for HCI_INQUIRY, but hci_dev_test_flag() checks hdev->dev_flags instead of hdev->flags. HCI_INQUIRY is a bit that's set on hdev->flags, not on hdev->dev_flags though. HCI_INQUIRY equals the integer 7, and in hdev->dev_flags, 7 means HCI_BONDABLE, so we were actually checking for HCI_BONDABLE here. The mistake is only present in the synchronous code for starting an inquiry, not in the async one. Also devices are typically bondable while doing an inquiry, so that might be the reason why nobody noticed it so far. Fixes: abfeea476c68 ("Bluetooth: hci_sync: Convert MGMT_OP_START_DISCOVERY") Signed-off-by: Jonas Dreßler Reviewed-by: Simon Horman Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index a6fc8a2a5c673d..b3141e3f9cf620 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -5559,7 +5559,7 @@ static int hci_inquiry_sync(struct hci_dev *hdev, u8 length) bt_dev_dbg(hdev, ""); - if (hci_dev_test_flag(hdev, HCI_INQUIRY)) + if (test_bit(HCI_INQUIRY, &hdev->flags)) return 0; hci_dev_lock(hdev); From e6e6b4163d47cc43903efd57eb0554dffaabd11b Mon Sep 17 00:00:00 2001 From: Ying Hsu Date: Thu, 4 Jan 2024 11:56:32 +0000 Subject: [PATCH 0134/1406] Bluetooth: Avoid potential use-after-free in hci_error_reset While handling the HCI_EV_HARDWARE_ERROR event, if the underlying BT controller is not responding, the GPIO reset mechanism would free the hci_dev and lead to a use-after-free in hci_error_reset. Here's the call trace observed on a ChromeOS device with Intel AX201: queue_work_on+0x3e/0x6c __hci_cmd_sync_sk+0x2ee/0x4c0 [bluetooth ] ? init_wait_entry+0x31/0x31 __hci_cmd_sync+0x16/0x20 [bluetooth ] hci_error_reset+0x4f/0xa4 [bluetooth ] process_one_work+0x1d8/0x33f worker_thread+0x21b/0x373 kthread+0x13a/0x152 ? pr_cont_work+0x54/0x54 ? 
kthread_blkcg+0x31/0x31 ret_from_fork+0x1f/0x30 This patch holds the reference count on the hci_dev while processing an HCI_EV_HARDWARE_ERROR event to avoid a potential crash. Fixes: c7741d16a57c ("Bluetooth: Perform a power cycle when receiving hardware error event") Signed-off-by: Ying Hsu Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 65601aa52e0d8b..2821a42cefdc6e 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1049,6 +1049,7 @@ static void hci_error_reset(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, error_reset); + hci_dev_hold(hdev); BT_DBG("%s", hdev->name); if (hdev->hw_error) @@ -1056,10 +1057,10 @@ static void hci_error_reset(struct work_struct *work) else bt_dev_err(hdev, "hardware error 0x%2.2x", hdev->hw_error_code); - if (hci_dev_do_close(hdev)) - return; + if (!hci_dev_do_close(hdev)) + hci_dev_do_open(hdev); - hci_dev_do_open(hdev); + hci_dev_put(hdev); } void hci_uuids_clear(struct hci_dev *hdev) From 15bf14ad2cab73015c1df2c183afdafbe7123ee3 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Jan 2024 10:43:26 -0500 Subject: [PATCH 0135/1406] Bluetooth: hci_sync: Fix accept_list when attempting to suspend During suspend, only wakeable devices can be in the accept list, so if the device was previously added it needs to be removed, otherwise the device can end up waking up the system prematurely. Fixes: 3b42055388c3 ("Bluetooth: hci_sync: Fix attempting to suspend with unfiltered passive scan") Signed-off-by: Clancy Shang Signed-off-by: Luiz Augusto von Dentz Reviewed-by: Paul Menzel --- net/bluetooth/hci_sync.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index b3141e3f9cf620..5716345a26dfb7 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -2206,8 +2206,11 @@ static int hci_le_add_accept_list_sync(struct hci_dev *hdev, /* During suspend, only wakeable devices can be in acceptlist */ if (hdev->suspended && - !(params->flags & HCI_CONN_FLAG_REMOTE_WAKEUP)) + !(params->flags & HCI_CONN_FLAG_REMOTE_WAKEUP)) { + hci_le_del_accept_list_sync(hdev, &params->addr, + params->addr_type); return 0; + } /* Select filter policy to accept all advertising */ if (*num_entries >= hdev->le_accept_list_size) From ae7298405ce1c0e3624642fb454ebe2543632f07 Mon Sep 17 00:00:00 2001 From: Yuxuan Hu <20373622@buaa.edu.cn> Date: Wed, 3 Jan 2024 17:10:43 +0800 Subject: [PATCH 0136/1406] Bluetooth: rfcomm: Fix null-ptr-deref in rfcomm_check_security During our fuzz testing of the connection and disconnection process at the RFCOMM layer, we discovered this bug. By comparing the packets from a normal connection and disconnection process with the testcase that triggered a KASAN report, we analyzed the cause of this bug as follows: 1. In the packets captured during a normal connection, the host sends a `Read Encryption Key Size` type of `HCI_CMD` packet (Command Opcode: 0x1408) to the controller to inquire the length of the encryption key. After receiving this packet, the controller immediately replies with a Command Complete packet (Event Code: 0x0e) to return the Encryption Key Size. 2. In our fuzz test case, the timing of the controller's response to this packet was delayed to an unexpected point: after the RFCOMM and L2CAP layers had disconnected but before the HCI layer had disconnected. 3.
After receiving the Encryption Key Size Response at the time described in point 2, the host still called the rfcomm_check_security function. However, by this time `struct l2cap_conn *conn = l2cap_pi(sk)->chan->conn;` had already been released, and when the function executed `return hci_conn_security(conn->hcon, d->sec_level, auth_type, d->out);`, specifically when accessing `conn->hcon`, a null-ptr-deref error occurred. To fix this bug, check if `sk->sk_state` is BT_CLOSED before calling rfcomm_recv_frame in rfcomm_process_rx. Signed-off-by: Yuxuan Hu <20373622@buaa.edu.cn> Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/rfcomm/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 053ef8f25fae47..1d34d849703329 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -1941,7 +1941,7 @@ static struct rfcomm_session *rfcomm_process_rx(struct rfcomm_session *s) /* Get data directly from socket receive queue without copying it. */ while ((skb = skb_dequeue(&sk->sk_receive_queue))) { skb_orphan(skb); - if (!skb_linearize(skb)) { + if (!skb_linearize(skb) && sk->sk_state != BT_CLOSED) { s = rfcomm_recv_frame(s, skb); if (!s) break; From 2a0ccde3b14119a5028d6ec5402806f9130156ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= Date: Sun, 7 Jan 2024 19:02:47 +0100 Subject: [PATCH 0137/1406] Bluetooth: Remove HCI_POWER_OFF_TIMEOUT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With commit cf75ad8b41d2 ("Bluetooth: hci_sync: Convert MGMT_SET_POWERED"), the power off sequence got refactored so that this timeout was no longer necessary; let's remove the leftover define from the header too. Fixes: cf75ad8b41d2 ("Bluetooth: hci_sync: Convert MGMT_SET_POWERED") Signed-off-by: Jonas Dreßler Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index bdee5d649cc61d..f7918c7551834b 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -437,7 +437,6 @@ enum { #define HCI_NCMD_TIMEOUT msecs_to_jiffies(4000) /* 4 seconds */ #define HCI_ACL_TX_TIMEOUT msecs_to_jiffies(45000) /* 45 seconds */ #define HCI_AUTO_OFF_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ -#define HCI_POWER_OFF_TIMEOUT msecs_to_jiffies(5000) /* 5 seconds */ #define HCI_LE_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ #define HCI_LE_AUTOCONN_TIMEOUT msecs_to_jiffies(4000) /* 4 seconds */ From 4d195d129dd4072af3fe8019e3b2da13be95cb08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= Date: Sun, 7 Jan 2024 19:02:48 +0100 Subject: [PATCH 0138/1406] Bluetooth: mgmt: Remove leftover queuing of power_off work MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Queuing of power_off work was introduced in these functions with commits 8b064a3ad377 ("Bluetooth: Clean up HCI state when doing power off") and c9910d0fb4fc ("Bluetooth: Fix disconnecting connections in non-connected states") in an effort to clean up state and do things like disconnecting devices before actually powering off the device. After that, commit a3172b7eb4a2 ("Bluetooth: Add timer to force power off") introduced a timeout to ensure that the device actually got powered off, even if some of the cleanup work would never complete.
This code later got refactored with commit cf75ad8b41d2 ("Bluetooth: hci_sync: Convert MGMT_SET_POWERED"), which made powering off the device synchronous and removed the need for initiating the power_off work from other places. The timeout mentioned above got removed too, because we now also made use of the command timeout during power on/off. These days the power_off work still exists, but it seems to only be used for HCI_AUTO_OFF functionality, which is why we never noticed those two leftover places where we queue power_off work. So let's remove that code. Fixes: cf75ad8b41d2 ("Bluetooth: hci_sync: Convert MGMT_SET_POWERED") Signed-off-by: Jonas Dreßler Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index bb72ff6eb22f4b..d1c55e409659f0 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -9764,14 +9764,6 @@ void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr, struct mgmt_ev_device_disconnected ev; struct sock *sk = NULL; - /* The connection is still in hci_conn_hash so test for 1 - * instead of 0 to know if this is the last one. - */ - if (mgmt_powering_down(hdev) && hci_conn_count(hdev) == 1) { - cancel_delayed_work(&hdev->power_off); - queue_work(hdev->req_workqueue, &hdev->power_off.work); - } - if (!mgmt_connected) return; @@ -9828,14 +9820,6 @@ void mgmt_connect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, { struct mgmt_ev_connect_failed ev; - /* The connection is still in hci_conn_hash so test for 1 - * instead of 0 to know if this is the last one. - */ - if (mgmt_powering_down(hdev) && hci_conn_count(hdev) == 1) { - cancel_delayed_work(&hdev->power_off); - queue_work(hdev->req_workqueue, &hdev->power_off.work); - } - bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = link_to_bdaddr(link_type, addr_type); ev.status = mgmt_status(status); From 61d49033fbddf4541c6254413d2047179b8fc818 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= Date: Sun, 7 Jan 2024 19:02:49 +0100 Subject: [PATCH 0139/1406] Bluetooth: Add new state HCI_POWERING_DOWN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new state HCI_POWERING_DOWN that indicates that the device is currently powering down; this will be useful for the next commit.
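For illustration, a minimal sketch of the guard pattern this flag enables (hypothetical helper name; the real changes are in the hunks below, with error handling elided):

static int power_off_guarded(struct hci_dev *hdev)
{
        int err;

        /* Mark the device as powering down for the whole sequence so
         * other code paths can detect an in-progress power-off.
         */
        hci_dev_set_flag(hdev, HCI_POWERING_DOWN);

        err = hci_dev_close_sync(hdev); /* stand-in for the shutdown steps */

        hci_dev_clear_flag(hdev, HCI_POWERING_DOWN);
        return err;
}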
Signed-off-by: Jonas Dreßler Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 + net/bluetooth/hci_sync.c | 16 +++++++++++----- net/bluetooth/mgmt.c | 14 ++++++++++++++ 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index f7918c7551834b..a94a8491ec7a1a 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -372,6 +372,7 @@ enum { HCI_SETUP, HCI_CONFIG, HCI_DEBUGFS_CREATED, + HCI_POWERING_DOWN, HCI_AUTO_OFF, HCI_RFKILLED, HCI_MGMT, diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 5716345a26dfb7..b146562a65fc40 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -5403,27 +5403,33 @@ static int hci_power_off_sync(struct hci_dev *hdev) if (!test_bit(HCI_UP, &hdev->flags)) return 0; + hci_dev_set_flag(hdev, HCI_POWERING_DOWN); + if (test_bit(HCI_ISCAN, &hdev->flags) || test_bit(HCI_PSCAN, &hdev->flags)) { err = hci_write_scan_enable_sync(hdev, 0x00); if (err) - return err; + goto out; } err = hci_clear_adv_sync(hdev, NULL, false); if (err) - return err; + goto out; err = hci_stop_discovery_sync(hdev); if (err) - return err; + goto out; /* Terminated due to Power Off */ err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF); if (err) - return err; + goto out; + + err = hci_dev_close_sync(hdev); - return hci_dev_close_sync(hdev); +out: + hci_dev_clear_flag(hdev, HCI_POWERING_DOWN); + return err; } int hci_set_powered_sync(struct hci_dev *hdev, u8 val) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index d1c55e409659f0..cabc5466401754 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1388,6 +1388,14 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_lock(hdev); + if (!cp->val) { + if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED, + MGMT_STATUS_BUSY); + goto failed; + } + } + if (pending_find(MGMT_OP_SET_POWERED, hdev)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED, MGMT_STATUS_BUSY); @@ -9746,6 +9754,9 @@ bool mgmt_powering_down(struct hci_dev *hdev) struct mgmt_pending_cmd *cmd; struct mgmt_mode *cp; + if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN)) + return true; + cmd = pending_find(MGMT_OP_SET_POWERED, hdev); if (!cmd) return false; @@ -10053,6 +10064,9 @@ void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status) /* If this is a HCI command related to powering on the * HCI dev don't send any mgmt signals. */ + if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN)) + return; + if (pending_find(MGMT_OP_SET_POWERED, hdev)) return; } From 1a239efd6bc480057c6b722bd8c1925081cc2c3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= Date: Sun, 7 Jan 2024 19:02:50 +0100 Subject: [PATCH 0140/1406] Bluetooth: Disconnect connected devices before rfkilling adapter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On a lot of platforms (at least the MS Surface devices, M1 macbooks, and a few ThinkPads) firmware doesn't do its job when rfkilling a device and the bluetooth adapter is not actually shut down properly on rfkill. This leads to connected devices remaining in connected state and the bluetooth connection eventually timing out after rfkilling an adapter. 
Use the rfkill hook in the HCI driver to go through the full power-off sequence (including stopping scans and disconnecting devices) before rfkilling it, just like MGMT_OP_SET_POWERED would do. In case anything during the larger power-off sequence fails, make sure the device is still closed and the rfkill ends up being effective in the end. Signed-off-by: Jonas Dreßler Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 2821a42cefdc6e..e5cb618fa6d39c 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -940,20 +940,51 @@ int hci_get_dev_info(void __user *arg) /* ---- Interface to HCI drivers ---- */ +static int hci_dev_do_poweroff(struct hci_dev *hdev) +{ + int err; + + BT_DBG("%s %p", hdev->name, hdev); + + hci_req_sync_lock(hdev); + + err = hci_set_powered_sync(hdev, false); + + hci_req_sync_unlock(hdev); + + return err; +} + static int hci_rfkill_set_block(void *data, bool blocked) { struct hci_dev *hdev = data; + int err; BT_DBG("%p name %s blocked %d", hdev, hdev->name, blocked); if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return -EBUSY; + if (blocked == hci_dev_test_flag(hdev, HCI_RFKILLED)) + return 0; + if (blocked) { hci_dev_set_flag(hdev, HCI_RFKILLED); + if (!hci_dev_test_flag(hdev, HCI_SETUP) && - !hci_dev_test_flag(hdev, HCI_CONFIG)) - hci_dev_do_close(hdev); + !hci_dev_test_flag(hdev, HCI_CONFIG)) { + err = hci_dev_do_poweroff(hdev); + if (err) { + bt_dev_err(hdev, "Error when powering off device on rfkill (%d)", + err); + + /* Make sure the device is still closed even if + * anything during power off sequence (eg. + * disconnecting devices) failed. + */ + hci_dev_do_close(hdev); + } + } } else { hci_dev_clear_flag(hdev, HCI_RFKILLED); } From 1fccf2b5c0fadc0c49725fa0d88a46dd807da52f Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Tue, 9 Jan 2024 19:03:23 +0800 Subject: [PATCH 0141/1406] Bluetooth: hci_event: Fix wrongly recorded wakeup BD_ADDR hci_store_wake_reason() wrongly parses event HCI_Connection_Request as HCI_Connection_Complete and HCI_Connection_Complete as HCI_Connection_Request, which causes the wakeup BD_ADDR to be recorded wrongly and is a potential stability issue; fix it by using the correct field. Fixes: 2f20216c1d6f ("Bluetooth: Emit controller suspend and resume events") Signed-off-by: Zijun Hu Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index ef8c3bed73617e..22b22c264c2a5e 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -7420,10 +7420,10 @@ static void hci_store_wake_reason(struct hci_dev *hdev, u8 event, * keep track of the bdaddr of the connection event that woke us up.
*/ if (event == HCI_EV_CONN_REQUEST) { - bacpy(&hdev->wake_addr, &conn_complete->bdaddr); + bacpy(&hdev->wake_addr, &conn_request->bdaddr); hdev->wake_addr_type = BDADDR_BREDR; } else if (event == HCI_EV_CONN_COMPLETE) { - bacpy(&hdev->wake_addr, &conn_request->bdaddr); + bacpy(&hdev->wake_addr, &conn_complete->bdaddr); hdev->wake_addr_type = BDADDR_BREDR; } else if (event == HCI_EV_LE_META) { struct hci_ev_le_meta *le_ev = (void *)skb->data; From 06084cdd45cc8d7adf8917cec8748c7c79e7567a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= Date: Mon, 8 Jan 2024 23:46:06 +0100 Subject: [PATCH 0142/1406] Bluetooth: Remove superfluous call to hci_conn_check_pending() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "pending connections" feature was originally introduced with commit 4c67bc74f016 ("[Bluetooth] Support concurrent connect requests") and 6bd57416127e ("[Bluetooth] Handling pending connect attempts after inquiry") to handle controllers supporting only a single connection request at a time. Later things were extended to also cancel ongoing inquiries on connect() with commit 89e65975fea5 ("Bluetooth: Cancel Inquiry before Create Connection"). With commit a9de9248064b ("[Bluetooth] Switch from OGF+OCF to using only opcodes"), hci_conn_check_pending() was introduced as a helper to consolidate a few places where we check for pending connections (indicated by the BT_CONNECT2 flag) and then try to connect. This refactoring commit also snuck in two more calls to hci_conn_check_pending(): - One is in the failure callback of hci_cs_inquiry(), this one probably makes sense: If we send an "HCI Inquiry" command and then immediately after a "Create Connection" command, the "Create Connection" command might fail before the "HCI Inquiry" command, and then we want to retry the "Create Connection" on failure of the "HCI Inquiry". - The other added call to hci_conn_check_pending() is in the event handler for the "Remote Name" event, this seems unrelated and is possibly a copy-paste error, so remove that one. Fixes: a9de9248064b ("[Bluetooth] Switch from OGF+OCF to using only opcodes") Signed-off-by: Jonas Dreßler Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 22b22c264c2a5e..23e0e63ac312be 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3556,8 +3556,6 @@ static void hci_remote_name_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); - hci_conn_check_pending(hdev); - hci_dev_lock(hdev); conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); From 1a11f88be5412f67ed745863b33270dc985ae16a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= Date: Mon, 8 Jan 2024 23:46:07 +0100 Subject: [PATCH 0143/1406] Bluetooth: hci_event: Use HCI error defines instead of magic values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have error defines already, so let's use them. 
Signed-off-by: Jonas Dreßler Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 2 ++ net/bluetooth/hci_event.c | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index a94a8491ec7a1a..1cd212bb378916 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -653,6 +653,7 @@ enum { #define HCI_ERROR_PIN_OR_KEY_MISSING 0x06 #define HCI_ERROR_MEMORY_EXCEEDED 0x07 #define HCI_ERROR_CONNECTION_TIMEOUT 0x08 +#define HCI_ERROR_COMMAND_DISALLOWED 0x0c #define HCI_ERROR_REJ_LIMITED_RESOURCES 0x0d #define HCI_ERROR_REJ_BAD_ADDR 0x0f #define HCI_ERROR_INVALID_PARAMETERS 0x12 @@ -661,6 +662,7 @@ enum { #define HCI_ERROR_REMOTE_POWER_OFF 0x15 #define HCI_ERROR_LOCAL_HOST_TERM 0x16 #define HCI_ERROR_PAIRING_NOT_ALLOWED 0x18 +#define HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE 0x1e #define HCI_ERROR_INVALID_LL_PARAMS 0x1e #define HCI_ERROR_UNSPECIFIED 0x1f #define HCI_ERROR_ADVERTISING_TIMEOUT 0x3c diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 23e0e63ac312be..6130c969f361a7 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -95,11 +95,11 @@ static u8 hci_cc_inquiry_cancel(struct hci_dev *hdev, void *data, /* It is possible that we receive Inquiry Complete event right * before we receive Inquiry Cancel Command Complete event, in * which case the latter event should have status of Command - * Disallowed (0x0c). This should not be treated as error, since + * Disallowed. This should not be treated as error, since * we actually achieve what Inquiry Cancel wants to achieve, * which is to end the last Inquiry session. */ - if (rp->status == 0x0c && !test_bit(HCI_INQUIRY, &hdev->flags)) { + if (rp->status == HCI_ERROR_COMMAND_DISALLOWED && !test_bit(HCI_INQUIRY, &hdev->flags)) { bt_dev_warn(hdev, "Ignoring error of Inquiry Cancel command"); rp->status = 0x00; } @@ -2342,7 +2342,7 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status) if (status) { if (conn && conn->state == BT_CONNECT) { - if (status != 0x0c || conn->attempt > 2) { + if (status != HCI_ERROR_COMMAND_DISALLOWED || conn->attempt > 2) { conn->state = BT_CLOSED; hci_connect_cfm(conn, status); hci_conn_del(conn); @@ -6679,7 +6679,7 @@ static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, void *data, * transition into connected state and mark it as * successful. */ - if (!conn->out && ev->status == 0x1a && + if (!conn->out && ev->status == HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE && (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES)) status = 0x00; else From 5cfa8e3633d1ef62bfa323b2d7dc78ac6d3a2248 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 9 Jan 2024 13:45:40 -0500 Subject: [PATCH 0144/1406] Bluetooth: hci_core: Cancel request on command timeout If a command has timed out, call __hci_cmd_sync_cancel to notify the hci_req, since it will inevitably cause a timeout. This also reworks the code around __hci_cmd_sync_cancel, since it wrongly assumed it needs to cancel the timers as well; sometimes the timers have not been started, or they have in fact already timed out, in which case they don't need to be cancelled yet again.
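In short, cancellation now has an asynchronous and a synchronous entry point. A rough sketch of the resulting contract (simplified from the hunks below; locking and debug output elided):

/* Asynchronous: safe to call from any context, defers the actual
 * cancellation to cmd_sync_cancel_work on hdev->workqueue.
 */
void hci_cmd_sync_cancel(struct hci_dev *hdev, int err)
{
        if (hdev->req_status == HCI_REQ_PEND) {
                hdev->req_result = err;
                hdev->req_status = HCI_REQ_CANCELED;
                queue_work(hdev->workqueue, &hdev->cmd_sync_cancel_work);
        }
}

/* Synchronous: wakes the waiter directly, e.g. from hci_cmd_timeout()
 * where the command timer is already known to have fired.
 */
void hci_cmd_sync_cancel_sync(struct hci_dev *hdev, int err)
{
        if (hdev->req_status == HCI_REQ_PEND) {
                hdev->req_result = err;
                hdev->req_status = HCI_REQ_CANCELED;
                wake_up_interruptible(&hdev->req_wait_q);
        }
}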
Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_sync.h | 2 +- net/bluetooth/hci_core.c | 84 ++++++++++++++++++++++---------- net/bluetooth/hci_request.c | 2 +- net/bluetooth/hci_sync.c | 20 ++++---- net/bluetooth/mgmt.c | 2 +- 5 files changed, 71 insertions(+), 39 deletions(-) diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index 6efbc2152146bd..e2582c24254498 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -42,7 +42,7 @@ int __hci_cmd_sync_status_sk(struct hci_dev *hdev, u16 opcode, u32 plen, void hci_cmd_sync_init(struct hci_dev *hdev); void hci_cmd_sync_clear(struct hci_dev *hdev); void hci_cmd_sync_cancel(struct hci_dev *hdev, int err); -void __hci_cmd_sync_cancel(struct hci_dev *hdev, int err); +void hci_cmd_sync_cancel_sync(struct hci_dev *hdev, int err); int hci_cmd_sync_submit(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index e5cb618fa6d39c..de730d210ccb45 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1523,10 +1523,11 @@ static void hci_cmd_timeout(struct work_struct *work) cmd_timer.work); if (hdev->sent_cmd) { - struct hci_command_hdr *sent = (void *) hdev->sent_cmd->data; - u16 opcode = __le16_to_cpu(sent->opcode); + u16 opcode = hci_skb_opcode(hdev->sent_cmd); bt_dev_err(hdev, "command 0x%4.4x tx timeout", opcode); + + hci_cmd_sync_cancel_sync(hdev, ETIMEDOUT); } else { bt_dev_err(hdev, "command tx timeout"); } @@ -2857,6 +2858,23 @@ int hci_unregister_suspend_notifier(struct hci_dev *hdev) return ret; } +/* Cancel ongoing command synchronously: + * + * - Cancel command timer + * - Reset command counter + * - Cancel command request + */ +static void hci_cancel_cmd_sync(struct hci_dev *hdev, int err) +{ + bt_dev_dbg(hdev, "err 0x%2.2x", err); + + cancel_delayed_work_sync(&hdev->cmd_timer); + cancel_delayed_work_sync(&hdev->ncmd_timer); + atomic_set(&hdev->cmd_cnt, 1); + + hci_cmd_sync_cancel_sync(hdev, -err); +} + /* Suspend HCI device */ int hci_suspend_dev(struct hci_dev *hdev) { @@ -2874,7 +2892,7 @@ int hci_suspend_dev(struct hci_dev *hdev) return 0; /* Cancel potentially blocking sync operation before suspend */ - __hci_cmd_sync_cancel(hdev, -EHOSTDOWN); + hci_cancel_cmd_sync(hdev, -EHOSTDOWN); hci_req_sync_lock(hdev); ret = hci_suspend_sync(hdev); @@ -4159,6 +4177,33 @@ static void hci_rx_work(struct work_struct *work) } } +static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) +{ + int err; + + bt_dev_dbg(hdev, "skb %p", skb); + + kfree_skb(hdev->sent_cmd); + + hdev->sent_cmd = skb_clone(skb, GFP_KERNEL); + if (!hdev->sent_cmd) { + skb_queue_head(&hdev->cmd_q, skb); + queue_work(hdev->workqueue, &hdev->cmd_work); + return; + } + + err = hci_send_frame(hdev, skb); + if (err < 0) { + hci_cmd_sync_cancel_sync(hdev, err); + return; + } + + if (hci_req_status_pend(hdev)) + hci_dev_set_flag(hdev, HCI_CMD_PENDING); + + atomic_dec(&hdev->cmd_cnt); +} + static void hci_cmd_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_work); @@ -4173,30 +4218,15 @@ static void hci_cmd_work(struct work_struct *work) if (!skb) return; - kfree_skb(hdev->sent_cmd); - - hdev->sent_cmd = skb_clone(skb, GFP_KERNEL); - if (hdev->sent_cmd) { - int res; - if (hci_req_status_pend(hdev)) - hci_dev_set_flag(hdev, HCI_CMD_PENDING); - atomic_dec(&hdev->cmd_cnt); + hci_send_cmd_sync(hdev, skb); - res = 
hci_send_frame(hdev, skb); - if (res < 0) - __hci_cmd_sync_cancel(hdev, -res); - - rcu_read_lock(); - if (test_bit(HCI_RESET, &hdev->flags) || - hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE)) - cancel_delayed_work(&hdev->cmd_timer); - else - queue_delayed_work(hdev->workqueue, &hdev->cmd_timer, - HCI_CMD_TIMEOUT); - rcu_read_unlock(); - } else { - skb_queue_head(&hdev->cmd_q, skb); - queue_work(hdev->workqueue, &hdev->cmd_work); - } + hci_send_cmd_sync(hdev, skb); + rcu_read_lock(); + if (test_bit(HCI_RESET, &hdev->flags) || + hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE)) + cancel_delayed_work(&hdev->cmd_timer); + else + queue_delayed_work(hdev->workqueue, &hdev->cmd_timer, + HCI_CMD_TIMEOUT); + rcu_read_unlock(); } } diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 6e023b0104b039..00e02138003ece 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -895,7 +895,7 @@ void hci_request_setup(struct hci_dev *hdev) void hci_request_cancel_all(struct hci_dev *hdev) { - __hci_cmd_sync_cancel(hdev, ENODEV); + hci_cmd_sync_cancel_sync(hdev, ENODEV); cancel_interleave_scan(hdev); } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index b146562a65fc40..1122296ce3fa3f 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -584,7 +584,7 @@ void hci_cmd_sync_clear(struct hci_dev *hdev) mutex_unlock(&hdev->cmd_sync_work_lock); } -void __hci_cmd_sync_cancel(struct hci_dev *hdev, int err) +void hci_cmd_sync_cancel(struct hci_dev *hdev, int err) { bt_dev_dbg(hdev, "err 0x%2.2x", err); @@ -592,15 +592,17 @@ void __hci_cmd_sync_cancel(struct hci_dev *hdev, int err) hdev->req_result = err; hdev->req_status = HCI_REQ_CANCELED; - cancel_delayed_work_sync(&hdev->cmd_timer); - cancel_delayed_work_sync(&hdev->ncmd_timer); - atomic_set(&hdev->cmd_cnt, 1); - - wake_up_interruptible(&hdev->req_wait_q); + queue_work(hdev->workqueue, &hdev->cmd_sync_cancel_work); } } +EXPORT_SYMBOL(hci_cmd_sync_cancel); -void hci_cmd_sync_cancel(struct hci_dev *hdev, int err) +/* Cancel ongoing command request synchronously: + * + * - Set result and mark status to HCI_REQ_CANCELED + * - Wakeup command sync thread + */ +void hci_cmd_sync_cancel_sync(struct hci_dev *hdev, int err) { bt_dev_dbg(hdev, "err 0x%2.2x", err); @@ -608,10 +610,10 @@ void hci_cmd_sync_cancel(struct hci_dev *hdev, int err) hdev->req_result = err; hdev->req_status = HCI_REQ_CANCELED; - queue_work(hdev->workqueue, &hdev->cmd_sync_cancel_work); + wake_up_interruptible(&hdev->req_wait_q); } } -EXPORT_SYMBOL(hci_cmd_sync_cancel); +EXPORT_SYMBOL(hci_cmd_sync_cancel_sync); /* Submit HCI command to be run in as cmd_sync_work: * diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index cabc5466401754..173986f3405f7a 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1415,7 +1415,7 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, /* Cancel potentially blocking sync operation before power off */ if (cp->val == 0x00) { - __hci_cmd_sync_cancel(hdev, -EHOSTDOWN); + hci_cmd_sync_cancel_sync(hdev, -EHOSTDOWN); err = hci_cmd_sync_queue(hdev, set_powered_sync, cmd, mgmt_set_powered_complete); } else { From e35f9744e33270429dbc62180814640b437f02d5 Mon Sep 17 00:00:00 2001 From: Ulrik Strid Date: Sat, 13 Jan 2024 15:27:38 +0800 Subject: [PATCH 0145/1406] Bluetooth: btusb: Add new VID/PID 13d3/3602 for MT7925 Add VID 13d3 & PID 3602 for MediaTek MT7925 USB Bluetooth chip. The information in /sys/kernel/debug/usb/devices about the Bluetooth device is listed below.
T: Bus=07 Lev=01 Prnt=01 Port=10 Cnt=02 Dev#= 2 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=13d3 ProdID=3602 Rev= 1.00 S: Manufacturer=MediaTek Inc. S: Product=Wireless_Device S: SerialNumber=000000000 C:* #Ifs= 3 Cfg#= 1 Atr=e0 MxPwr=100mA A: FirstIf#= 0 IfCount= 3 Cls=e0(wlcon) Sub=01 Prot=01 I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=125us E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms I: If#= 1 Alt= 6 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 63 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 63 Ivl=1ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none) E: Ad=8a(I) Atr=03(Int.) MxPS= 64 Ivl=125us E: Ad=0a(O) Atr=03(Int.) MxPS= 64 Ivl=125us I: If#= 2 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none) E: Ad=8a(I) Atr=03(Int.) MxPS= 512 Ivl=125us E: Ad=0a(O) Atr=03(Int.) MxPS= 512 Ivl=125us Signed-off-by: Ulrik Strid Signed-off-by: Deren Wu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btusb.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index c4e0456153d8e3..edfb49bbaa28fd 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -658,6 +658,11 @@ static const struct usb_device_id quirks_table[] = { BTUSB_WIDEBAND_SPEECH | BTUSB_VALID_LE_STATES }, + /* Additional MediaTek MT7925 Bluetooth devices */ + { USB_DEVICE(0x13d3, 0x3602), .driver_info = BTUSB_MEDIATEK | + BTUSB_WIDEBAND_SPEECH | + BTUSB_VALID_LE_STATES }, + /* Additional Realtek 8723AE Bluetooth devices */ { USB_DEVICE(0x0930, 0x021d), .driver_info = BTUSB_REALTEK }, { USB_DEVICE(0x13d3, 0x3394), .driver_info = BTUSB_REALTEK }, From 7cf588b0cec9d4cffcaaf85dd8d598258b02ec7c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 15 Jan 2024 21:12:19 +0100 Subject: [PATCH 0146/1406] Bluetooth: Remove usage of the deprecated ida_simple_xx() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). Note that the upper limit of ida_simple_get() is exclusive, but the one of ida_alloc_max() is inclusive. So a -1 has been added when needed. 
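Because ida_simple_get() treats its end parameter as exclusive while ida_alloc_max() treats max as inclusive, the conversion needs the explicit -1 to keep the usable ID range identical. A minimal sketch of the equivalence (illustrative only, assuming the usual <linux/idr.h> semantics):

/* Old: allocates an ID in [0, HCI_MAX_ID), upper bound exclusive */
id = ida_simple_get(&hci_index_ida, 0, HCI_MAX_ID, GFP_KERNEL);

/* New: allocates an ID in [0, HCI_MAX_ID - 1], upper bound inclusive */
id = ida_alloc_max(&hci_index_ida, HCI_MAX_ID - 1, GFP_KERNEL);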
Signed-off-by: Christophe JAILLET Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 9 +++++---- net/bluetooth/hci_sock.c | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index de730d210ccb45..34c8dca2069f6b 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2640,10 +2640,11 @@ int hci_register_dev(struct hci_dev *hdev) */ switch (hdev->dev_type) { case HCI_PRIMARY: - id = ida_simple_get(&hci_index_ida, 0, HCI_MAX_ID, GFP_KERNEL); + id = ida_alloc_max(&hci_index_ida, HCI_MAX_ID - 1, GFP_KERNEL); break; case HCI_AMP: - id = ida_simple_get(&hci_index_ida, 1, HCI_MAX_ID, GFP_KERNEL); + id = ida_alloc_range(&hci_index_ida, 1, HCI_MAX_ID - 1, + GFP_KERNEL); break; default: return -EINVAL; @@ -2742,7 +2743,7 @@ int hci_register_dev(struct hci_dev *hdev) destroy_workqueue(hdev->workqueue); destroy_workqueue(hdev->req_workqueue); err: - ida_simple_remove(&hci_index_ida, hdev->id); + ida_free(&hci_index_ida, hdev->id); return error; } @@ -2825,7 +2826,7 @@ void hci_release_dev(struct hci_dev *hdev) hci_dev_unlock(hdev); ida_destroy(&hdev->unset_handle_ida); - ida_simple_remove(&hci_index_ida, hdev->id); + ida_free(&hci_index_ida, hdev->id); kfree_skb(hdev->sent_cmd); kfree_skb(hdev->recv_event); kfree(hdev); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 3e7cd330d731ac..4ee1b976678b25 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -101,7 +101,7 @@ static bool hci_sock_gen_cookie(struct sock *sk) int id = hci_pi(sk)->cookie; if (!id) { - id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL); + id = ida_alloc_min(&sock_cookie_ida, 1, GFP_KERNEL); if (id < 0) id = 0xffffffff; @@ -119,7 +119,7 @@ static void hci_sock_free_cookie(struct sock *sk) if (id) { hci_pi(sk)->cookie = 0xffffffff; - ida_simple_remove(&sock_cookie_ida, id); + ida_free(&sock_cookie_ida, id); } } From 62a9afa086a7207ea66906826cf2b487c08dd579 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Danis?= Date: Mon, 22 Jan 2024 17:59:55 +0100 Subject: [PATCH 0147/1406] Bluetooth: mgmt: Fix limited discoverable off timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The LIMITED_DISCOVERABLE flag is not reset from the Class of Device and advertisement on limited discoverable timeout. This prevents passing PTS test GAP/DISC/LIMM/BV-02-C. Call set_discoverable_sync, as is done when limited discovery is set, to correctly update the Class of Device and advertisement.
Signed-off-by: Frédéric Danis Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 173986f3405f7a..8c4493255f92ab 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1045,6 +1045,8 @@ static void rpa_expired(struct work_struct *work) hci_cmd_sync_queue(hdev, rpa_expired_sync, NULL, NULL); } +static int set_discoverable_sync(struct hci_dev *hdev, void *data); + static void discov_off(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, @@ -1063,7 +1065,7 @@ static void discov_off(struct work_struct *work) hci_dev_clear_flag(hdev, HCI_DISCOVERABLE); hdev->discov_timeout = 0; - hci_update_discoverable(hdev); + hci_cmd_sync_queue(hdev, set_discoverable_sync, NULL, NULL); mgmt_new_settings(hdev); From 2b46caebfee4825b2cc8490cd7cc01bb8339a08f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 22 Jan 2024 09:02:47 -0500 Subject: [PATCH 0148/1406] Bluetooth: hci_event: Fix handling of HCI_EV_IO_CAPA_REQUEST If we receive HCI_EV_IO_CAPA_REQUEST while HCI_OP_READ_REMOTE_EXT_FEATURES has not yet been responded to, assume the remote does support SSP, since otherwise this event shouldn't be generated. Link: https://lore.kernel.org/linux-bluetooth/CABBYNZ+9UdG1cMZVmdtN3U2aS16AKMCyTARZZyFX7xTEDWcMOw@mail.gmail.com/T/#t Fixes: c7f59461f5a7 ("Bluetooth: Fix a refcnt underflow problem for hci_conn") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 6130c969f361a7..a15924db83d9fe 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5327,9 +5327,12 @@ static void hci_io_capa_request_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); - if (!conn || !hci_conn_ssp_enabled(conn)) + if (!conn || !hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) goto unlock; + /* Assume remote supports SSP since it has triggered this event */ + set_bit(HCI_CONN_SSP_ENABLED, &conn->flags); + hci_conn_hold(conn); if (!hci_dev_test_flag(hdev, HCI_MGMT)) From 7f96febb813f7ef1bce5c249f843c60abca56cf1 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Thu, 25 Jan 2024 14:50:28 +0800 Subject: [PATCH 0149/1406] Bluetooth: Enforce validation on max value of connection interval Right now the Linux BT stack cannot pass test case "GAP/CONN/CPUP/BV-05-C 'Connection Parameter Update Procedure Invalid Parameters Central Responder'" in Bluetooth Test Suite revision GAP.TS.p44. [0] That was resolved by commit c49a8682fc5d ("Bluetooth: validate BLE connection interval updates"), but it later got reverted because devices like keyboards and mice may require a low connection interval. So only validate the max value of the connection interval to pass the Test Suite, and let devices request a low connection interval if needed.
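A condensed sketch of the added check (hypothetical helper name; the actual change is open-coded in the two hunks below):

static bool le_conn_interval_acceptable(const struct hci_conn *hcon,
                                        u16 min, u16 max, u16 latency,
                                        u16 to_multiplier)
{
        /* Reject requests that would raise the interval above the
         * current maximum, but still allow lowering it.
         */
        if (max > hcon->le_conn_max_interval)
                return false;

        return hci_check_conn_params(min, max, latency, to_multiplier) == 0;
}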
[0] https://www.bluetooth.org/docman/handlers/DownloadDoc.ashx?doc_id=229869 Fixes: 68d19d7d9957 ("Revert "Bluetooth: validate BLE connection interval updates"") Signed-off-by: Kai-Heng Feng Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 4 ++++ net/bluetooth/l2cap_core.c | 8 +++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index a15924db83d9fe..31df5f5b799455 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6795,6 +6795,10 @@ static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev, void *data, return send_conn_param_neg_reply(hdev, handle, HCI_ERROR_UNKNOWN_CONN_ID); + if (max > hcon->le_conn_max_interval) + return send_conn_param_neg_reply(hdev, handle, + HCI_ERROR_INVALID_LL_PARAMS); + if (hci_check_conn_params(min, max, latency, timeout)) return send_conn_param_neg_reply(hdev, handle, HCI_ERROR_INVALID_LL_PARAMS); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 60298975d5c456..656f49b299d20d 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -5613,7 +5613,13 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, memset(&rsp, 0, sizeof(rsp)); - err = hci_check_conn_params(min, max, latency, to_multiplier); + if (max > hcon->le_conn_max_interval) { + BT_DBG("requested connection interval exceeds current bounds."); + err = -EINVAL; + } else { + err = hci_check_conn_params(min, max, latency, to_multiplier); + } + if (err) rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_REJECTED); else From 36618e0d5a6db48bbddf6ee947440a6e19eace53 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Thu, 18 Jan 2024 12:40:34 +0800 Subject: [PATCH 0150/1406] Bluetooth: btintel: Fix null ptr deref in btintel_read_version If hci_cmd_sync_complete() is triggered and skb is NULL, then hdev->req_skb is NULL, which will cause this issue. Reported-and-tested-by: syzbot+830d9e3fa61968246abd@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btintel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/bluetooth/btintel.c b/drivers/bluetooth/btintel.c index cdc5c08824a0ad..e5b043d9620730 100644 --- a/drivers/bluetooth/btintel.c +++ b/drivers/bluetooth/btintel.c @@ -435,7 +435,7 @@ int btintel_read_version(struct hci_dev *hdev, struct intel_version *ver) struct sk_buff *skb; skb = __hci_cmd_sync(hdev, 0xfc05, 0, NULL, HCI_CMD_TIMEOUT); - if (IS_ERR(skb)) { + if (IS_ERR_OR_NULL(skb)) { bt_dev_err(hdev, "Reading Intel version information failed (%ld)", PTR_ERR(skb)); return PTR_ERR(skb); From d6cc80d0954b4f35bf727664191b3987c77853e7 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 19 Jan 2024 17:45:30 +0800 Subject: [PATCH 0151/1406] Bluetooth: qca: Fix wrong event type for patch config command The vendor-specific patch config command has an HCI_Command_Complete event as its response, but qca_send_patch_config_cmd() wrongly expects a vendor-specific event for the command; fix it by using the right event type.
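The key detail is the event argument of __hci_cmd_sync_ev(): passing 0 instead of HCI_EV_VENDOR makes the core wait for the command's own completion event, which matches what the controller actually sends here. A sketch of the corrected call, mirroring the hunk below (the inline comment is an assumption based on the hci_sync semantics):

skb = __hci_cmd_sync_ev(hdev, EDL_PATCH_CMD_OPCODE, sizeof(cmd), cmd,
                        0 /* wait for Command Complete, not a vendor event */,
                        HCI_INIT_TIMEOUT);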
The btmon log for the vendor-specific command is shown below: < HCI Command: Vendor (0x3f|0x0000) plen 5 28 01 00 00 00 > HCI Event: Command Complete (0x0e) plen 5 Vendor (0x3f|0x0000) ncmd 1 Status: Success (0x00) 28 Fixes: 4fac8a7ac80b ("Bluetooth: btqca: sequential validation") Signed-off-by: Zijun Hu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btqca.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/bluetooth/btqca.c b/drivers/bluetooth/btqca.c index fdb0fae88d1c58..b40b32fa7f1c38 100644 --- a/drivers/bluetooth/btqca.c +++ b/drivers/bluetooth/btqca.c @@ -152,7 +152,7 @@ static int qca_send_patch_config_cmd(struct hci_dev *hdev) bt_dev_dbg(hdev, "QCA Patch config"); skb = __hci_cmd_sync_ev(hdev, EDL_PATCH_CMD_OPCODE, sizeof(cmd), - cmd, HCI_EV_VENDOR, HCI_INIT_TIMEOUT); + cmd, 0, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { err = PTR_ERR(skb); bt_dev_err(hdev, "Sending QCA Patch config failed (%d)", err); From cec9f3c5561d62c883553583e6152932bd14c59f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 1 Feb 2024 11:18:58 -0500 Subject: [PATCH 0152/1406] Bluetooth: Remove BT_HS The High Speed, Alternate MAC and PHY (AMP) extension has been removed from the Bluetooth Core specification in 5.3: https://www.bluetooth.com/blog/new-core-specification-v5-3-feature-enhancements/ Fixes: 244bc377591c ("Bluetooth: Add BT_HS config option") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 - include/net/bluetooth/l2cap.h | 42 -- net/bluetooth/Kconfig | 8 - net/bluetooth/Makefile | 1 - net/bluetooth/a2mp.c | 1054 -------------------------------- net/bluetooth/a2mp.h | 154 ----- net/bluetooth/amp.c | 590 ------------------ net/bluetooth/amp.h | 60 -- net/bluetooth/hci_conn.c | 4 - net/bluetooth/hci_event.c | 2 - net/bluetooth/l2cap_core.c | 1069 +-------------------------------- net/bluetooth/l2cap_sock.c | 18 +- net/bluetooth/mgmt.c | 73 +-- 13 files changed, 20 insertions(+), 3056 deletions(-) delete mode 100644 net/bluetooth/a2mp.c delete mode 100644 net/bluetooth/a2mp.h delete mode 100644 net/bluetooth/amp.c delete mode 100644 net/bluetooth/amp.h diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 1cd212bb378916..aa6c69053d7cd6 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -394,7 +394,6 @@ enum { HCI_LIMITED_PRIVACY, HCI_RPA_EXPIRED, HCI_RPA_RESOLVING, - HCI_HS_ENABLED, HCI_LE_ENABLED, HCI_ADVERTISING, HCI_ADVERTISING_CONNECTABLE, diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index cf393e72d6ed67..92d7197f9a5636 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -59,8 +59,6 @@ #define L2CAP_WAIT_ACK_POLL_PERIOD msecs_to_jiffies(200) #define L2CAP_WAIT_ACK_TIMEOUT msecs_to_jiffies(10000) -#define L2CAP_A2MP_DEFAULT_MTU 670 - /* L2CAP socket address */ struct sockaddr_l2 { sa_family_t l2_family; @@ -109,12 +107,6 @@ struct l2cap_conninfo { #define L2CAP_ECHO_RSP 0x09 #define L2CAP_INFO_REQ 0x0a #define L2CAP_INFO_RSP 0x0b -#define L2CAP_CREATE_CHAN_REQ 0x0c -#define L2CAP_CREATE_CHAN_RSP 0x0d -#define L2CAP_MOVE_CHAN_REQ 0x0e -#define L2CAP_MOVE_CHAN_RSP 0x0f -#define L2CAP_MOVE_CHAN_CFM 0x10 -#define L2CAP_MOVE_CHAN_CFM_RSP 0x11 #define L2CAP_CONN_PARAM_UPDATE_REQ 0x12 #define L2CAP_CONN_PARAM_UPDATE_RSP 0x13 #define L2CAP_LE_CONN_REQ 0x14 @@ -144,7 +136,6 @@ struct l2cap_conninfo { /* L2CAP fixed channels */ #define L2CAP_FC_SIG_BREDR 0x02 #define L2CAP_FC_CONNLESS 0x04 -#define L2CAP_FC_A2MP 0x08 #define L2CAP_FC_ATT
0x10 #define L2CAP_FC_SIG_LE 0x20 #define L2CAP_FC_SMP_LE 0x40 @@ -267,7 +258,6 @@ struct l2cap_conn_rsp { /* channel identifier */ #define L2CAP_CID_SIGNALING 0x0001 #define L2CAP_CID_CONN_LESS 0x0002 -#define L2CAP_CID_A2MP 0x0003 #define L2CAP_CID_ATT 0x0004 #define L2CAP_CID_LE_SIGNALING 0x0005 #define L2CAP_CID_SMP 0x0006 @@ -282,7 +272,6 @@ struct l2cap_conn_rsp { #define L2CAP_CR_BAD_PSM 0x0002 #define L2CAP_CR_SEC_BLOCK 0x0003 #define L2CAP_CR_NO_MEM 0x0004 -#define L2CAP_CR_BAD_AMP 0x0005 #define L2CAP_CR_INVALID_SCID 0x0006 #define L2CAP_CR_SCID_IN_USE 0x0007 @@ -404,29 +393,6 @@ struct l2cap_info_rsp { __u8 data[]; } __packed; -struct l2cap_create_chan_req { - __le16 psm; - __le16 scid; - __u8 amp_id; -} __packed; - -struct l2cap_create_chan_rsp { - __le16 dcid; - __le16 scid; - __le16 result; - __le16 status; -} __packed; - -struct l2cap_move_chan_req { - __le16 icid; - __u8 dest_amp_id; -} __packed; - -struct l2cap_move_chan_rsp { - __le16 icid; - __le16 result; -} __packed; - #define L2CAP_MR_SUCCESS 0x0000 #define L2CAP_MR_PEND 0x0001 #define L2CAP_MR_BAD_ID 0x0002 @@ -539,8 +505,6 @@ struct l2cap_seq_list { struct l2cap_chan { struct l2cap_conn *conn; - struct hci_conn *hs_hcon; - struct hci_chan *hs_hchan; struct kref kref; atomic_t nesting; @@ -591,12 +555,6 @@ struct l2cap_chan { unsigned long conn_state; unsigned long flags; - __u8 remote_amp_id; - __u8 local_amp_id; - __u8 move_id; - __u8 move_state; - __u8 move_role; - __u16 next_tx_seq; __u16 expected_ack_seq; __u16 expected_tx_seq; diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index da7cac0a1b716b..6b2b65a667008b 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -62,14 +62,6 @@ source "net/bluetooth/cmtp/Kconfig" source "net/bluetooth/hidp/Kconfig" -config BT_HS - bool "Bluetooth High Speed (HS) features" - depends on BT_BREDR - help - Bluetooth High Speed includes support for off-loading - Bluetooth connections via 802.11 (wifi) physical layer - available with Bluetooth version 3.0 or later. - config BT_LE bool "Bluetooth Low Energy (LE) features" depends on BT diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index 141ac1fda0bfa5..628d448d78be3a 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -21,7 +21,6 @@ bluetooth-$(CONFIG_DEV_COREDUMP) += coredump.o bluetooth-$(CONFIG_BT_BREDR) += sco.o bluetooth-$(CONFIG_BT_LE) += iso.o -bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o bluetooth-$(CONFIG_BT_LEDS) += leds.o bluetooth-$(CONFIG_BT_MSFTEXT) += msft.o bluetooth-$(CONFIG_BT_AOSPEXT) += aosp.o diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c deleted file mode 100644 index e7adb8a98cf90f..00000000000000 --- a/net/bluetooth/a2mp.c +++ /dev/null @@ -1,1054 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - Copyright (c) 2010,2011 Code Aurora Forum. All rights reserved. - Copyright (c) 2011,2012 Intel Corp. 
- -*/ - -#include <net/bluetooth/bluetooth.h> -#include <net/bluetooth/hci_core.h> -#include <net/bluetooth/l2cap.h> - -#include "hci_request.h" -#include "a2mp.h" -#include "amp.h" - -#define A2MP_FEAT_EXT 0x8000 - -/* Global AMP Manager list */ -static LIST_HEAD(amp_mgr_list); -static DEFINE_MUTEX(amp_mgr_list_lock); - -/* A2MP build & send command helper functions */ -static struct a2mp_cmd *__a2mp_build(u8 code, u8 ident, u16 len, void *data) -{ - struct a2mp_cmd *cmd; - int plen; - - plen = sizeof(*cmd) + len; - cmd = kzalloc(plen, GFP_KERNEL); - if (!cmd) - return NULL; - - cmd->code = code; - cmd->ident = ident; - cmd->len = cpu_to_le16(len); - - memcpy(cmd->data, data, len); - - return cmd; -} - -static void a2mp_send(struct amp_mgr *mgr, u8 code, u8 ident, u16 len, void *data) -{ - struct l2cap_chan *chan = mgr->a2mp_chan; - struct a2mp_cmd *cmd; - u16 total_len = len + sizeof(*cmd); - struct kvec iv; - struct msghdr msg; - - cmd = __a2mp_build(code, ident, len, data); - if (!cmd) - return; - - iv.iov_base = cmd; - iv.iov_len = total_len; - - memset(&msg, 0, sizeof(msg)); - - iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iv, 1, total_len); - - l2cap_chan_send(chan, &msg, total_len); - - kfree(cmd); -} - -static u8 __next_ident(struct amp_mgr *mgr) -{ - if (++mgr->ident == 0) - mgr->ident = 1; - - return mgr->ident; -} - -static struct amp_mgr *amp_mgr_lookup_by_state(u8 state) -{ - struct amp_mgr *mgr; - - mutex_lock(&amp_mgr_list_lock); - list_for_each_entry(mgr, &amp_mgr_list, list) { - if (test_and_clear_bit(state, &mgr->state)) { - amp_mgr_get(mgr); - mutex_unlock(&amp_mgr_list_lock); - return mgr; - } - } - mutex_unlock(&amp_mgr_list_lock); - - return NULL; -} - -/* hci_dev_list shall be locked */ -static void __a2mp_add_cl(struct amp_mgr *mgr, struct a2mp_cl *cl) -{ - struct hci_dev *hdev; - int i = 1; - - cl[0].id = AMP_ID_BREDR; - cl[0].type = AMP_TYPE_BREDR; - cl[0].status = AMP_STATUS_BLUETOOTH_ONLY; - - list_for_each_entry(hdev, &hci_dev_list, list) { - if (hdev->dev_type == HCI_AMP) { - cl[i].id = hdev->id; - cl[i].type = hdev->amp_type; - if (test_bit(HCI_UP, &hdev->flags)) - cl[i].status = hdev->amp_status; - else - cl[i].status = AMP_STATUS_POWERED_DOWN; - i++; - } - } -} - -/* Processing A2MP messages */ -static int a2mp_command_rej(struct amp_mgr *mgr, struct sk_buff *skb, - struct a2mp_cmd *hdr) -{ - struct a2mp_cmd_rej *rej = (void *) skb->data; - - if (le16_to_cpu(hdr->len) < sizeof(*rej)) - return -EINVAL; - - BT_DBG("ident %u reason %d", hdr->ident, le16_to_cpu(rej->reason)); - - skb_pull(skb, sizeof(*rej)); - - return 0; -} - -static int a2mp_discover_req(struct amp_mgr *mgr, struct sk_buff *skb, - struct a2mp_cmd *hdr) -{ - struct a2mp_discov_req *req = (void *) skb->data; - u16 len = le16_to_cpu(hdr->len); - struct a2mp_discov_rsp *rsp; - u16 ext_feat; - u8 num_ctrl; - struct hci_dev *hdev; - - if (len < sizeof(*req)) - return -EINVAL; - - skb_pull(skb, sizeof(*req)); - - ext_feat = le16_to_cpu(req->ext_feat); - - BT_DBG("mtu %d efm 0x%4.4x", le16_to_cpu(req->mtu), ext_feat); - - /* check that packet is not broken for now */ - while (ext_feat & A2MP_FEAT_EXT) { - if (len < sizeof(ext_feat)) - return -EINVAL; - - ext_feat = get_unaligned_le16(skb->data); - BT_DBG("efm 0x%4.4x", ext_feat); - len -= sizeof(ext_feat); - skb_pull(skb, sizeof(ext_feat)); - } - - read_lock(&hci_dev_list_lock); - - /* at minimum the BR/EDR needs to be listed */ - num_ctrl = 1; - - list_for_each_entry(hdev, &hci_dev_list, list) { - if (hdev->dev_type == HCI_AMP) - num_ctrl++; - } - - len = struct_size(rsp, cl, num_ctrl); - rsp = kmalloc(len, GFP_ATOMIC); - if (!rsp) { -
read_unlock(&hci_dev_list_lock); - return -ENOMEM; - } - - rsp->mtu = cpu_to_le16(L2CAP_A2MP_DEFAULT_MTU); - rsp->ext_feat = 0; - - __a2mp_add_cl(mgr, rsp->cl); - - read_unlock(&hci_dev_list_lock); - - a2mp_send(mgr, A2MP_DISCOVER_RSP, hdr->ident, len, rsp); - - kfree(rsp); - return 0; -} - -static int a2mp_discover_rsp(struct amp_mgr *mgr, struct sk_buff *skb, - struct a2mp_cmd *hdr) -{ - struct a2mp_discov_rsp *rsp = (void *) skb->data; - u16 len = le16_to_cpu(hdr->len); - struct a2mp_cl *cl; - u16 ext_feat; - bool found = false; - - if (len < sizeof(*rsp)) - return -EINVAL; - - len -= sizeof(*rsp); - skb_pull(skb, sizeof(*rsp)); - - ext_feat = le16_to_cpu(rsp->ext_feat); - - BT_DBG("mtu %d efm 0x%4.4x", le16_to_cpu(rsp->mtu), ext_feat); - - /* check that packet is not broken for now */ - while (ext_feat & A2MP_FEAT_EXT) { - if (len < sizeof(ext_feat)) - return -EINVAL; - - ext_feat = get_unaligned_le16(skb->data); - BT_DBG("efm 0x%4.4x", ext_feat); - len -= sizeof(ext_feat); - skb_pull(skb, sizeof(ext_feat)); - } - - cl = (void *) skb->data; - while (len >= sizeof(*cl)) { - BT_DBG("Remote AMP id %u type %u status %u", cl->id, cl->type, - cl->status); - - if (cl->id != AMP_ID_BREDR && cl->type != AMP_TYPE_BREDR) { - struct a2mp_info_req req; - - found = true; - - memset(&req, 0, sizeof(req)); - - req.id = cl->id; - a2mp_send(mgr, A2MP_GETINFO_REQ, __next_ident(mgr), - sizeof(req), &req); - } - - len -= sizeof(*cl); - cl = skb_pull(skb, sizeof(*cl)); - } - - /* Fall back to L2CAP init sequence */ - if (!found) { - struct l2cap_conn *conn = mgr->l2cap_conn; - struct l2cap_chan *chan; - - mutex_lock(&conn->chan_lock); - - list_for_each_entry(chan, &conn->chan_l, list) { - - BT_DBG("chan %p state %s", chan, - state_to_string(chan->state)); - - if (chan->scid == L2CAP_CID_A2MP) - continue; - - l2cap_chan_lock(chan); - - if (chan->state == BT_CONNECT) - l2cap_send_conn_req(chan); - - l2cap_chan_unlock(chan); - } - - mutex_unlock(&conn->chan_lock); - } - - return 0; -} - -static int a2mp_change_notify(struct amp_mgr *mgr, struct sk_buff *skb, - struct a2mp_cmd *hdr) -{ - struct a2mp_cl *cl = (void *) skb->data; - - while (skb->len >= sizeof(*cl)) { - BT_DBG("Controller id %u type %u status %u", cl->id, cl->type, - cl->status); - cl = skb_pull(skb, sizeof(*cl)); - } - - /* TODO send A2MP_CHANGE_RSP */ - - return 0; -} - -static void read_local_amp_info_complete(struct hci_dev *hdev, u8 status, - u16 opcode) -{ - BT_DBG("%s status 0x%2.2x", hdev->name, status); - - a2mp_send_getinfo_rsp(hdev); -} - -static int a2mp_getinfo_req(struct amp_mgr *mgr, struct sk_buff *skb, - struct a2mp_cmd *hdr) -{ - struct a2mp_info_req *req = (void *) skb->data; - struct hci_dev *hdev; - struct hci_request hreq; - int err = 0; - - if (le16_to_cpu(hdr->len) < sizeof(*req)) - return -EINVAL; - - BT_DBG("id %u", req->id); - - hdev = hci_dev_get(req->id); - if (!hdev || hdev->dev_type != HCI_AMP) { - struct a2mp_info_rsp rsp; - - memset(&rsp, 0, sizeof(rsp)); - - rsp.id = req->id; - rsp.status = A2MP_STATUS_INVALID_CTRL_ID; - - a2mp_send(mgr, A2MP_GETINFO_RSP, hdr->ident, sizeof(rsp), - &rsp); - - goto done; - } - - set_bit(READ_LOC_AMP_INFO, &mgr->state); - hci_req_init(&hreq, hdev); - hci_req_add(&hreq, HCI_OP_READ_LOCAL_AMP_INFO, 0, NULL); - err = hci_req_run(&hreq, read_local_amp_info_complete); - if (err < 0) - a2mp_send_getinfo_rsp(hdev); - -done: - if (hdev) - hci_dev_put(hdev); - - skb_pull(skb, sizeof(*req)); - return 0; -} - -static int a2mp_getinfo_rsp(struct amp_mgr *mgr, struct sk_buff *skb, - struct 
a2mp_cmd *hdr) -{ - struct a2mp_info_rsp *rsp = (struct a2mp_info_rsp *) skb->data; - struct a2mp_amp_assoc_req req; - struct amp_ctrl *ctrl; - - if (le16_to_cpu(hdr->len) < sizeof(*rsp)) - return -EINVAL; - - BT_DBG("id %u status 0x%2.2x", rsp->id, rsp->status); - - if (rsp->status) - return -EINVAL; - - ctrl = amp_ctrl_add(mgr, rsp->id); - if (!ctrl) - return -ENOMEM; - - memset(&req, 0, sizeof(req)); - - req.id = rsp->id; - a2mp_send(mgr, A2MP_GETAMPASSOC_REQ, __next_ident(mgr), sizeof(req), - &req); - - skb_pull(skb, sizeof(*rsp)); - return 0; -} - -static int a2mp_getampassoc_req(struct amp_mgr *mgr, struct sk_buff *skb, - struct a2mp_cmd *hdr) -{ - struct a2mp_amp_assoc_req *req = (void *) skb->data; - struct hci_dev *hdev; - struct amp_mgr *tmp; - - if (le16_to_cpu(hdr->len) < sizeof(*req)) - return -EINVAL; - - BT_DBG("id %u", req->id); - - /* Make sure that other request is not processed */ - tmp = amp_mgr_lookup_by_state(READ_LOC_AMP_ASSOC); - - hdev = hci_dev_get(req->id); - if (!hdev || hdev->amp_type == AMP_TYPE_BREDR || tmp) { - struct a2mp_amp_assoc_rsp rsp; - - memset(&rsp, 0, sizeof(rsp)); - rsp.id = req->id; - - if (tmp) { - rsp.status = A2MP_STATUS_COLLISION_OCCURED; - amp_mgr_put(tmp); - } else { - rsp.status = A2MP_STATUS_INVALID_CTRL_ID; - } - - a2mp_send(mgr, A2MP_GETAMPASSOC_RSP, hdr->ident, sizeof(rsp), - &rsp); - - goto done; - } - - amp_read_loc_assoc(hdev, mgr); - -done: - if (hdev) - hci_dev_put(hdev); - - skb_pull(skb, sizeof(*req)); - return 0; -} - -static int a2mp_getampassoc_rsp(struct amp_mgr *mgr, struct sk_buff *skb, - struct a2mp_cmd *hdr) -{ - struct a2mp_amp_assoc_rsp *rsp = (void *) skb->data; - u16 len = le16_to_cpu(hdr->len); - struct hci_dev *hdev; - struct amp_ctrl *ctrl; - struct hci_conn *hcon; - size_t assoc_len; - - if (len < sizeof(*rsp)) - return -EINVAL; - - assoc_len = len - sizeof(*rsp); - - BT_DBG("id %u status 0x%2.2x assoc len %zu", rsp->id, rsp->status, - assoc_len); - - if (rsp->status) - return -EINVAL; - - /* Save remote ASSOC data */ - ctrl = amp_ctrl_lookup(mgr, rsp->id); - if (ctrl) { - u8 *assoc; - - assoc = kmemdup(rsp->amp_assoc, assoc_len, GFP_KERNEL); - if (!assoc) { - amp_ctrl_put(ctrl); - return -ENOMEM; - } - - ctrl->assoc = assoc; - ctrl->assoc_len = assoc_len; - ctrl->assoc_rem_len = assoc_len; - ctrl->assoc_len_so_far = 0; - - amp_ctrl_put(ctrl); - } - - /* Create Phys Link */ - hdev = hci_dev_get(rsp->id); - if (!hdev) - return -EINVAL; - - hcon = phylink_add(hdev, mgr, rsp->id, true); - if (!hcon) - goto done; - - BT_DBG("Created hcon %p: loc:%u -> rem:%u", hcon, hdev->id, rsp->id); - - mgr->bredr_chan->remote_amp_id = rsp->id; - - amp_create_phylink(hdev, mgr, hcon); - -done: - hci_dev_put(hdev); - skb_pull(skb, len); - return 0; -} - -static int a2mp_createphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb, - struct a2mp_cmd *hdr) -{ - struct a2mp_physlink_req *req = (void *) skb->data; - struct a2mp_physlink_rsp rsp; - struct hci_dev *hdev; - struct hci_conn *hcon; - struct amp_ctrl *ctrl; - - if (le16_to_cpu(hdr->len) < sizeof(*req)) - return -EINVAL; - - BT_DBG("local_id %u, remote_id %u", req->local_id, req->remote_id); - - memset(&rsp, 0, sizeof(rsp)); - - rsp.local_id = req->remote_id; - rsp.remote_id = req->local_id; - - hdev = hci_dev_get(req->remote_id); - if (!hdev || hdev->amp_type == AMP_TYPE_BREDR) { - rsp.status = A2MP_STATUS_INVALID_CTRL_ID; - goto send_rsp; - } - - ctrl = amp_ctrl_lookup(mgr, rsp.remote_id); - if (!ctrl) { - ctrl = amp_ctrl_add(mgr, rsp.remote_id); - if (ctrl) { - 
amp_ctrl_get(ctrl); - } else { - rsp.status = A2MP_STATUS_UNABLE_START_LINK_CREATION; - goto send_rsp; - } - } - - if (ctrl) { - size_t assoc_len = le16_to_cpu(hdr->len) - sizeof(*req); - u8 *assoc; - - assoc = kmemdup(req->amp_assoc, assoc_len, GFP_KERNEL); - if (!assoc) { - amp_ctrl_put(ctrl); - hci_dev_put(hdev); - return -ENOMEM; - } - - ctrl->assoc = assoc; - ctrl->assoc_len = assoc_len; - ctrl->assoc_rem_len = assoc_len; - ctrl->assoc_len_so_far = 0; - - amp_ctrl_put(ctrl); - } - - hcon = phylink_add(hdev, mgr, req->local_id, false); - if (hcon) { - amp_accept_phylink(hdev, mgr, hcon); - rsp.status = A2MP_STATUS_SUCCESS; - } else { - rsp.status = A2MP_STATUS_UNABLE_START_LINK_CREATION; - } - -send_rsp: - if (hdev) - hci_dev_put(hdev); - - /* Reply error now and success after HCI Write Remote AMP Assoc - command complete with success status - */ - if (rsp.status != A2MP_STATUS_SUCCESS) { - a2mp_send(mgr, A2MP_CREATEPHYSLINK_RSP, hdr->ident, - sizeof(rsp), &rsp); - } else { - set_bit(WRITE_REMOTE_AMP_ASSOC, &mgr->state); - mgr->ident = hdr->ident; - } - - skb_pull(skb, le16_to_cpu(hdr->len)); - return 0; -} - -static int a2mp_discphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb, - struct a2mp_cmd *hdr) -{ - struct a2mp_physlink_req *req = (void *) skb->data; - struct a2mp_physlink_rsp rsp; - struct hci_dev *hdev; - struct hci_conn *hcon; - - if (le16_to_cpu(hdr->len) < sizeof(*req)) - return -EINVAL; - - BT_DBG("local_id %u remote_id %u", req->local_id, req->remote_id); - - memset(&rsp, 0, sizeof(rsp)); - - rsp.local_id = req->remote_id; - rsp.remote_id = req->local_id; - rsp.status = A2MP_STATUS_SUCCESS; - - hdev = hci_dev_get(req->remote_id); - if (!hdev) { - rsp.status = A2MP_STATUS_INVALID_CTRL_ID; - goto send_rsp; - } - - hcon = hci_conn_hash_lookup_ba(hdev, AMP_LINK, - &mgr->l2cap_conn->hcon->dst); - if (!hcon) { - bt_dev_err(hdev, "no phys link exist"); - rsp.status = A2MP_STATUS_NO_PHYSICAL_LINK_EXISTS; - goto clean; - } - - /* TODO Disconnect Phys Link here */ - -clean: - hci_dev_put(hdev); - -send_rsp: - a2mp_send(mgr, A2MP_DISCONNPHYSLINK_RSP, hdr->ident, sizeof(rsp), &rsp); - - skb_pull(skb, sizeof(*req)); - return 0; -} - -static inline int a2mp_cmd_rsp(struct amp_mgr *mgr, struct sk_buff *skb, - struct a2mp_cmd *hdr) -{ - BT_DBG("ident %u code 0x%2.2x", hdr->ident, hdr->code); - - skb_pull(skb, le16_to_cpu(hdr->len)); - return 0; -} - -/* Handle A2MP signalling */ -static int a2mp_chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) -{ - struct a2mp_cmd *hdr; - struct amp_mgr *mgr = chan->data; - int err = 0; - - amp_mgr_get(mgr); - - while (skb->len >= sizeof(*hdr)) { - u16 len; - - hdr = (void *) skb->data; - len = le16_to_cpu(hdr->len); - - BT_DBG("code 0x%2.2x id %u len %u", hdr->code, hdr->ident, len); - - skb_pull(skb, sizeof(*hdr)); - - if (len > skb->len || !hdr->ident) { - err = -EINVAL; - break; - } - - mgr->ident = hdr->ident; - - switch (hdr->code) { - case A2MP_COMMAND_REJ: - a2mp_command_rej(mgr, skb, hdr); - break; - - case A2MP_DISCOVER_REQ: - err = a2mp_discover_req(mgr, skb, hdr); - break; - - case A2MP_CHANGE_NOTIFY: - err = a2mp_change_notify(mgr, skb, hdr); - break; - - case A2MP_GETINFO_REQ: - err = a2mp_getinfo_req(mgr, skb, hdr); - break; - - case A2MP_GETAMPASSOC_REQ: - err = a2mp_getampassoc_req(mgr, skb, hdr); - break; - - case A2MP_CREATEPHYSLINK_REQ: - err = a2mp_createphyslink_req(mgr, skb, hdr); - break; - - case A2MP_DISCONNPHYSLINK_REQ: - err = a2mp_discphyslink_req(mgr, skb, hdr); - break; - - case A2MP_DISCOVER_RSP: - err = 
a2mp_discover_rsp(mgr, skb, hdr); - break; - - case A2MP_GETINFO_RSP: - err = a2mp_getinfo_rsp(mgr, skb, hdr); - break; - - case A2MP_GETAMPASSOC_RSP: - err = a2mp_getampassoc_rsp(mgr, skb, hdr); - break; - - case A2MP_CHANGE_RSP: - case A2MP_CREATEPHYSLINK_RSP: - case A2MP_DISCONNPHYSLINK_RSP: - err = a2mp_cmd_rsp(mgr, skb, hdr); - break; - - default: - BT_ERR("Unknown A2MP sig cmd 0x%2.2x", hdr->code); - err = -EINVAL; - break; - } - } - - if (err) { - struct a2mp_cmd_rej rej; - - memset(&rej, 0, sizeof(rej)); - - rej.reason = cpu_to_le16(0); - hdr = (void *) skb->data; - - BT_DBG("Send A2MP Rej: cmd 0x%2.2x err %d", hdr->code, err); - - a2mp_send(mgr, A2MP_COMMAND_REJ, hdr->ident, sizeof(rej), - &rej); - } - - /* Always free skb and return success error code to prevent - from sending L2CAP Disconnect over A2MP channel */ - kfree_skb(skb); - - amp_mgr_put(mgr); - - return 0; -} - -static void a2mp_chan_close_cb(struct l2cap_chan *chan) -{ - l2cap_chan_put(chan); -} - -static void a2mp_chan_state_change_cb(struct l2cap_chan *chan, int state, - int err) -{ - struct amp_mgr *mgr = chan->data; - - if (!mgr) - return; - - BT_DBG("chan %p state %s", chan, state_to_string(state)); - - chan->state = state; - - switch (state) { - case BT_CLOSED: - if (mgr) - amp_mgr_put(mgr); - break; - } -} - -static struct sk_buff *a2mp_chan_alloc_skb_cb(struct l2cap_chan *chan, - unsigned long hdr_len, - unsigned long len, int nb) -{ - struct sk_buff *skb; - - skb = bt_skb_alloc(hdr_len + len, GFP_KERNEL); - if (!skb) - return ERR_PTR(-ENOMEM); - - return skb; -} - -static const struct l2cap_ops a2mp_chan_ops = { - .name = "L2CAP A2MP channel", - .recv = a2mp_chan_recv_cb, - .close = a2mp_chan_close_cb, - .state_change = a2mp_chan_state_change_cb, - .alloc_skb = a2mp_chan_alloc_skb_cb, - - /* Not implemented for A2MP */ - .new_connection = l2cap_chan_no_new_connection, - .teardown = l2cap_chan_no_teardown, - .ready = l2cap_chan_no_ready, - .defer = l2cap_chan_no_defer, - .resume = l2cap_chan_no_resume, - .set_shutdown = l2cap_chan_no_set_shutdown, - .get_sndtimeo = l2cap_chan_no_get_sndtimeo, -}; - -static struct l2cap_chan *a2mp_chan_open(struct l2cap_conn *conn, bool locked) -{ - struct l2cap_chan *chan; - int err; - - chan = l2cap_chan_create(); - if (!chan) - return NULL; - - BT_DBG("chan %p", chan); - - chan->chan_type = L2CAP_CHAN_FIXED; - chan->scid = L2CAP_CID_A2MP; - chan->dcid = L2CAP_CID_A2MP; - chan->omtu = L2CAP_A2MP_DEFAULT_MTU; - chan->imtu = L2CAP_A2MP_DEFAULT_MTU; - chan->flush_to = L2CAP_DEFAULT_FLUSH_TO; - - chan->ops = &a2mp_chan_ops; - - l2cap_chan_set_defaults(chan); - chan->remote_max_tx = chan->max_tx; - chan->remote_tx_win = chan->tx_win; - - chan->retrans_timeout = L2CAP_DEFAULT_RETRANS_TO; - chan->monitor_timeout = L2CAP_DEFAULT_MONITOR_TO; - - skb_queue_head_init(&chan->tx_q); - - chan->mode = L2CAP_MODE_ERTM; - - err = l2cap_ertm_init(chan); - if (err < 0) { - l2cap_chan_del(chan, 0); - return NULL; - } - - chan->conf_state = 0; - - if (locked) - __l2cap_chan_add(conn, chan); - else - l2cap_chan_add(conn, chan); - - chan->remote_mps = chan->omtu; - chan->mps = chan->omtu; - - chan->state = BT_CONNECTED; - - return chan; -} - -/* AMP Manager functions */ -struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr) -{ - BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref)); - - kref_get(&mgr->kref); - - return mgr; -} - -static void amp_mgr_destroy(struct kref *kref) -{ - struct amp_mgr *mgr = container_of(kref, struct amp_mgr, kref); - - BT_DBG("mgr %p", mgr); - - 
mutex_lock(&amp_mgr_list_lock); - list_del(&mgr->list); - mutex_unlock(&amp_mgr_list_lock); - - amp_ctrl_list_flush(mgr); - kfree(mgr); -} - -int amp_mgr_put(struct amp_mgr *mgr) -{ - BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref)); - - return kref_put(&mgr->kref, &amp_mgr_destroy); -} - -static struct amp_mgr *amp_mgr_create(struct l2cap_conn *conn, bool locked) -{ - struct amp_mgr *mgr; - struct l2cap_chan *chan; - - mgr = kzalloc(sizeof(*mgr), GFP_KERNEL); - if (!mgr) - return NULL; - - BT_DBG("conn %p mgr %p", conn, mgr); - - mgr->l2cap_conn = conn; - - chan = a2mp_chan_open(conn, locked); - if (!chan) { - kfree(mgr); - return NULL; - } - - mgr->a2mp_chan = chan; - chan->data = mgr; - - conn->hcon->amp_mgr = mgr; - - kref_init(&mgr->kref); - - /* Remote AMP ctrl list initialization */ - INIT_LIST_HEAD(&mgr->amp_ctrls); - mutex_init(&mgr->amp_ctrls_lock); - - mutex_lock(&amp_mgr_list_lock); - list_add(&mgr->list, &amp_mgr_list); - mutex_unlock(&amp_mgr_list_lock); - - return mgr; -} - -struct l2cap_chan *a2mp_channel_create(struct l2cap_conn *conn, - struct sk_buff *skb) -{ - struct amp_mgr *mgr; - - if (conn->hcon->type != ACL_LINK) - return NULL; - - mgr = amp_mgr_create(conn, false); - if (!mgr) { - BT_ERR("Could not create AMP manager"); - return NULL; - } - - BT_DBG("mgr: %p chan %p", mgr, mgr->a2mp_chan); - - return mgr->a2mp_chan; -} - -void a2mp_send_getinfo_rsp(struct hci_dev *hdev) -{ - struct amp_mgr *mgr; - struct a2mp_info_rsp rsp; - - mgr = amp_mgr_lookup_by_state(READ_LOC_AMP_INFO); - if (!mgr) - return; - - BT_DBG("%s mgr %p", hdev->name, mgr); - - memset(&rsp, 0, sizeof(rsp)); - - rsp.id = hdev->id; - rsp.status = A2MP_STATUS_INVALID_CTRL_ID; - - if (hdev->amp_type != AMP_TYPE_BREDR) { - rsp.status = 0; - rsp.total_bw = cpu_to_le32(hdev->amp_total_bw); - rsp.max_bw = cpu_to_le32(hdev->amp_max_bw); - rsp.min_latency = cpu_to_le32(hdev->amp_min_latency); - rsp.pal_cap = cpu_to_le16(hdev->amp_pal_cap); - rsp.assoc_size = cpu_to_le16(hdev->amp_assoc_size); - } - - a2mp_send(mgr, A2MP_GETINFO_RSP, mgr->ident, sizeof(rsp), &rsp); - amp_mgr_put(mgr); -} - -void a2mp_send_getampassoc_rsp(struct hci_dev *hdev, u8 status) -{ - struct amp_mgr *mgr; - struct amp_assoc *loc_assoc = &hdev->loc_assoc; - struct a2mp_amp_assoc_rsp *rsp; - size_t len; - - mgr = amp_mgr_lookup_by_state(READ_LOC_AMP_ASSOC); - if (!mgr) - return; - - BT_DBG("%s mgr %p", hdev->name, mgr); - - len = sizeof(struct a2mp_amp_assoc_rsp) + loc_assoc->len; - rsp = kzalloc(len, GFP_KERNEL); - if (!rsp) { - amp_mgr_put(mgr); - return; - } - - rsp->id = hdev->id; - - if (status) { - rsp->status = A2MP_STATUS_INVALID_CTRL_ID; - } else { - rsp->status = A2MP_STATUS_SUCCESS; - memcpy(rsp->amp_assoc, loc_assoc->data, loc_assoc->len); - } - - a2mp_send(mgr, A2MP_GETAMPASSOC_RSP, mgr->ident, len, rsp); - amp_mgr_put(mgr); - kfree(rsp); -} - -void a2mp_send_create_phy_link_req(struct hci_dev *hdev, u8 status) -{ - struct amp_mgr *mgr; - struct amp_assoc *loc_assoc = &hdev->loc_assoc; - struct a2mp_physlink_req *req; - struct l2cap_chan *bredr_chan; - size_t len; - - mgr = amp_mgr_lookup_by_state(READ_LOC_AMP_ASSOC_FINAL); - if (!mgr) - return; - - len = sizeof(*req) + loc_assoc->len; - - BT_DBG("%s mgr %p assoc_len %zu", hdev->name, mgr, len); - - req = kzalloc(len, GFP_KERNEL); - if (!req) { - amp_mgr_put(mgr); - return; - } - - bredr_chan = mgr->bredr_chan; - if (!bredr_chan) - goto clean; - - req->local_id = hdev->id; - req->remote_id = bredr_chan->remote_amp_id; - memcpy(req->amp_assoc, loc_assoc->data, loc_assoc->len); -
a2mp_send(mgr, A2MP_CREATEPHYSLINK_REQ, __next_ident(mgr), len, req); - -clean: - amp_mgr_put(mgr); - kfree(req); -} - -void a2mp_send_create_phy_link_rsp(struct hci_dev *hdev, u8 status) -{ - struct amp_mgr *mgr; - struct a2mp_physlink_rsp rsp; - struct hci_conn *hs_hcon; - - mgr = amp_mgr_lookup_by_state(WRITE_REMOTE_AMP_ASSOC); - if (!mgr) - return; - - memset(&rsp, 0, sizeof(rsp)); - - hs_hcon = hci_conn_hash_lookup_state(hdev, AMP_LINK, BT_CONNECT); - if (!hs_hcon) { - rsp.status = A2MP_STATUS_UNABLE_START_LINK_CREATION; - } else { - rsp.remote_id = hs_hcon->remote_id; - rsp.status = A2MP_STATUS_SUCCESS; - } - - BT_DBG("%s mgr %p hs_hcon %p status %u", hdev->name, mgr, hs_hcon, - status); - - rsp.local_id = hdev->id; - a2mp_send(mgr, A2MP_CREATEPHYSLINK_RSP, mgr->ident, sizeof(rsp), &rsp); - amp_mgr_put(mgr); -} - -void a2mp_discover_amp(struct l2cap_chan *chan) -{ - struct l2cap_conn *conn = chan->conn; - struct amp_mgr *mgr = conn->hcon->amp_mgr; - struct a2mp_discov_req req; - - BT_DBG("chan %p conn %p mgr %p", chan, conn, mgr); - - if (!mgr) { - mgr = amp_mgr_create(conn, true); - if (!mgr) - return; - } - - mgr->bredr_chan = chan; - - memset(&req, 0, sizeof(req)); - - req.mtu = cpu_to_le16(L2CAP_A2MP_DEFAULT_MTU); - req.ext_feat = 0; - a2mp_send(mgr, A2MP_DISCOVER_REQ, 1, sizeof(req), &req); -} diff --git a/net/bluetooth/a2mp.h b/net/bluetooth/a2mp.h deleted file mode 100644 index 2fd253a61a2a16..00000000000000 --- a/net/bluetooth/a2mp.h +++ /dev/null @@ -1,154 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - Copyright (c) 2010,2011 Code Aurora Forum. All rights reserved. - Copyright (c) 2011,2012 Intel Corp. - -*/ - -#ifndef __A2MP_H -#define __A2MP_H - -#include <net/bluetooth/l2cap.h> - -enum amp_mgr_state { - READ_LOC_AMP_INFO, - READ_LOC_AMP_ASSOC, - READ_LOC_AMP_ASSOC_FINAL, - WRITE_REMOTE_AMP_ASSOC, -}; - -struct amp_mgr { - struct list_head list; - struct l2cap_conn *l2cap_conn; - struct l2cap_chan *a2mp_chan; - struct l2cap_chan *bredr_chan; - struct kref kref; - __u8 ident; - __u8 handle; - unsigned long state; - unsigned long flags; - - struct list_head amp_ctrls; - struct mutex amp_ctrls_lock; -}; - -struct a2mp_cmd { - __u8 code; - __u8 ident; - __le16 len; - __u8 data[]; -} __packed; - -/* A2MP command codes */ -#define A2MP_COMMAND_REJ 0x01 -struct a2mp_cmd_rej { - __le16 reason; - __u8 data[]; -} __packed; - -#define A2MP_DISCOVER_REQ 0x02 -struct a2mp_discov_req { - __le16 mtu; - __le16 ext_feat; -} __packed; - -struct a2mp_cl { - __u8 id; - __u8 type; - __u8 status; -} __packed; - -#define A2MP_DISCOVER_RSP 0x03 -struct a2mp_discov_rsp { - __le16 mtu; - __le16 ext_feat; - struct a2mp_cl cl[]; -} __packed; - -#define A2MP_CHANGE_NOTIFY 0x04 -#define A2MP_CHANGE_RSP 0x05 - -#define A2MP_GETINFO_REQ 0x06 -struct a2mp_info_req { - __u8 id; -} __packed; - -#define A2MP_GETINFO_RSP 0x07 -struct a2mp_info_rsp { - __u8 id; - __u8 status; - __le32 total_bw; - __le32 max_bw; - __le32 min_latency; - __le16 pal_cap; - __le16 assoc_size; -} __packed; - -#define A2MP_GETAMPASSOC_REQ 0x08 -struct a2mp_amp_assoc_req { - __u8 id; -} __packed; - -#define A2MP_GETAMPASSOC_RSP 0x09 -struct a2mp_amp_assoc_rsp { - __u8 id; - __u8 status; - __u8 amp_assoc[]; -} __packed; - -#define A2MP_CREATEPHYSLINK_REQ 0x0A -#define A2MP_DISCONNPHYSLINK_REQ 0x0C -struct a2mp_physlink_req { - __u8 local_id; - __u8 remote_id; - __u8 amp_assoc[]; -} __packed; - -#define A2MP_CREATEPHYSLINK_RSP 0x0B -#define A2MP_DISCONNPHYSLINK_RSP 0x0D -struct a2mp_physlink_rsp { - __u8 local_id; - __u8 remote_id; - __u8 status;
-} __packed; - -/* A2MP response status */ -#define A2MP_STATUS_SUCCESS 0x00 -#define A2MP_STATUS_INVALID_CTRL_ID 0x01 -#define A2MP_STATUS_UNABLE_START_LINK_CREATION 0x02 -#define A2MP_STATUS_NO_PHYSICAL_LINK_EXISTS 0x02 -#define A2MP_STATUS_COLLISION_OCCURED 0x03 -#define A2MP_STATUS_DISCONN_REQ_RECVD 0x04 -#define A2MP_STATUS_PHYS_LINK_EXISTS 0x05 -#define A2MP_STATUS_SECURITY_VIOLATION 0x06 - -struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr); - -#if IS_ENABLED(CONFIG_BT_HS) -int amp_mgr_put(struct amp_mgr *mgr); -struct l2cap_chan *a2mp_channel_create(struct l2cap_conn *conn, - struct sk_buff *skb); -void a2mp_discover_amp(struct l2cap_chan *chan); -#else -static inline int amp_mgr_put(struct amp_mgr *mgr) -{ - return 0; -} - -static inline struct l2cap_chan *a2mp_channel_create(struct l2cap_conn *conn, - struct sk_buff *skb) -{ - return NULL; -} - -static inline void a2mp_discover_amp(struct l2cap_chan *chan) -{ -} -#endif - -void a2mp_send_getinfo_rsp(struct hci_dev *hdev); -void a2mp_send_getampassoc_rsp(struct hci_dev *hdev, u8 status); -void a2mp_send_create_phy_link_req(struct hci_dev *hdev, u8 status); -void a2mp_send_create_phy_link_rsp(struct hci_dev *hdev, u8 status); - -#endif /* __A2MP_H */ diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c deleted file mode 100644 index 5d698f19868c5f..00000000000000 --- a/net/bluetooth/amp.c +++ /dev/null @@ -1,590 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - Copyright (c) 2011,2012 Intel Corp. - -*/ - -#include <net/bluetooth/bluetooth.h> -#include <net/bluetooth/hci.h> -#include <net/bluetooth/hci_core.h> -#include <crypto/hash.h> - -#include "hci_request.h" -#include "a2mp.h" -#include "amp.h" - -/* Remote AMP Controllers interface */ -void amp_ctrl_get(struct amp_ctrl *ctrl) -{ - BT_DBG("ctrl %p orig refcnt %d", ctrl, - kref_read(&ctrl->kref)); - - kref_get(&ctrl->kref); -} - -static void amp_ctrl_destroy(struct kref *kref) -{ - struct amp_ctrl *ctrl = container_of(kref, struct amp_ctrl, kref); - - BT_DBG("ctrl %p", ctrl); - - kfree(ctrl->assoc); - kfree(ctrl); -} - -int amp_ctrl_put(struct amp_ctrl *ctrl) -{ - BT_DBG("ctrl %p orig refcnt %d", ctrl, - kref_read(&ctrl->kref)); - - return kref_put(&ctrl->kref, &amp_ctrl_destroy); -} - -struct amp_ctrl *amp_ctrl_add(struct amp_mgr *mgr, u8 id) -{ - struct amp_ctrl *ctrl; - - ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); - if (!ctrl) - return NULL; - - kref_init(&ctrl->kref); - ctrl->id = id; - - mutex_lock(&mgr->amp_ctrls_lock); - list_add(&ctrl->list, &mgr->amp_ctrls); - mutex_unlock(&mgr->amp_ctrls_lock); - - BT_DBG("mgr %p ctrl %p", mgr, ctrl); - - return ctrl; -} - -void amp_ctrl_list_flush(struct amp_mgr *mgr) -{ - struct amp_ctrl *ctrl, *n; - - BT_DBG("mgr %p", mgr); - - mutex_lock(&mgr->amp_ctrls_lock); - list_for_each_entry_safe(ctrl, n, &mgr->amp_ctrls, list) { - list_del(&ctrl->list); - amp_ctrl_put(ctrl); - } - mutex_unlock(&mgr->amp_ctrls_lock); -} - -struct amp_ctrl *amp_ctrl_lookup(struct amp_mgr *mgr, u8 id) -{ - struct amp_ctrl *ctrl; - - BT_DBG("mgr %p id %u", mgr, id); - - mutex_lock(&mgr->amp_ctrls_lock); - list_for_each_entry(ctrl, &mgr->amp_ctrls, list) { - if (ctrl->id == id) { - amp_ctrl_get(ctrl); - mutex_unlock(&mgr->amp_ctrls_lock); - return ctrl; - } - } - mutex_unlock(&mgr->amp_ctrls_lock); - - return NULL; -} - -/* Physical Link interface */ -static u8 __next_handle(struct amp_mgr *mgr) -{ - if (++mgr->handle == 0) - mgr->handle = 1; - - return mgr->handle; -} - -struct hci_conn *phylink_add(struct hci_dev *hdev, struct amp_mgr *mgr, - u8 remote_id, bool out) -{ - bdaddr_t *dst = &mgr->l2cap_conn->hcon->dst; - struct hci_conn *hcon; - u8
role = out ? HCI_ROLE_MASTER : HCI_ROLE_SLAVE; - - hcon = hci_conn_add(hdev, AMP_LINK, dst, role, __next_handle(mgr)); - if (!hcon) - return NULL; - - BT_DBG("hcon %p dst %pMR", hcon, dst); - - hcon->state = BT_CONNECT; - hcon->attempt++; - hcon->remote_id = remote_id; - hcon->amp_mgr = amp_mgr_get(mgr); - - return hcon; -} - -/* AMP crypto key generation interface */ -static int hmac_sha256(u8 *key, u8 ksize, char *plaintext, u8 psize, u8 *output) -{ - struct crypto_shash *tfm; - struct shash_desc *shash; - int ret; - - if (!ksize) - return -EINVAL; - - tfm = crypto_alloc_shash("hmac(sha256)", 0, 0); - if (IS_ERR(tfm)) { - BT_DBG("crypto_alloc_ahash failed: err %ld", PTR_ERR(tfm)); - return PTR_ERR(tfm); - } - - ret = crypto_shash_setkey(tfm, key, ksize); - if (ret) { - BT_DBG("crypto_ahash_setkey failed: err %d", ret); - goto failed; - } - - shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(tfm), - GFP_KERNEL); - if (!shash) { - ret = -ENOMEM; - goto failed; - } - - shash->tfm = tfm; - - ret = crypto_shash_digest(shash, plaintext, psize, output); - - kfree(shash); - -failed: - crypto_free_shash(tfm); - return ret; -} - -int phylink_gen_key(struct hci_conn *conn, u8 *data, u8 *len, u8 *type) -{ - struct hci_dev *hdev = conn->hdev; - struct link_key *key; - u8 keybuf[HCI_AMP_LINK_KEY_SIZE]; - u8 gamp_key[HCI_AMP_LINK_KEY_SIZE]; - int err; - - if (!hci_conn_check_link_mode(conn)) - return -EACCES; - - BT_DBG("conn %p key_type %d", conn, conn->key_type); - - /* Legacy key */ - if (conn->key_type < 3) { - bt_dev_err(hdev, "legacy key type %u", conn->key_type); - return -EACCES; - } - - *type = conn->key_type; - *len = HCI_AMP_LINK_KEY_SIZE; - - key = hci_find_link_key(hdev, &conn->dst); - if (!key) { - BT_DBG("No Link key for conn %p dst %pMR", conn, &conn->dst); - return -EACCES; - } - - /* BR/EDR Link Key concatenated together with itself */ - memcpy(&keybuf[0], key->val, HCI_LINK_KEY_SIZE); - memcpy(&keybuf[HCI_LINK_KEY_SIZE], key->val, HCI_LINK_KEY_SIZE); - - /* Derive Generic AMP Link Key (gamp) */ - err = hmac_sha256(keybuf, HCI_AMP_LINK_KEY_SIZE, "gamp", 4, gamp_key); - if (err) { - bt_dev_err(hdev, "could not derive Generic AMP Key: err %d", err); - return err; - } - - if (conn->key_type == HCI_LK_DEBUG_COMBINATION) { - BT_DBG("Use Generic AMP Key (gamp)"); - memcpy(data, gamp_key, HCI_AMP_LINK_KEY_SIZE); - return err; - } - - /* Derive Dedicated AMP Link Key: "802b" is 802.11 PAL keyID */ - return hmac_sha256(gamp_key, HCI_AMP_LINK_KEY_SIZE, "802b", 4, data); -} - -static void read_local_amp_assoc_complete(struct hci_dev *hdev, u8 status, - u16 opcode, struct sk_buff *skb) -{ - struct hci_rp_read_local_amp_assoc *rp = (void *)skb->data; - struct amp_assoc *assoc = &hdev->loc_assoc; - size_t rem_len, frag_len; - - BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); - - if (rp->status) - goto send_rsp; - - frag_len = skb->len - sizeof(*rp); - rem_len = __le16_to_cpu(rp->rem_len); - - if (rem_len > frag_len) { - BT_DBG("frag_len %zu rem_len %zu", frag_len, rem_len); - - memcpy(assoc->data + assoc->offset, rp->frag, frag_len); - assoc->offset += frag_len; - - /* Read other fragments */ - amp_read_loc_assoc_frag(hdev, rp->phy_handle); - - return; - } - - memcpy(assoc->data + assoc->offset, rp->frag, rem_len); - assoc->len = assoc->offset + rem_len; - assoc->offset = 0; - -send_rsp: - /* Send A2MP Rsp when all fragments are received */ - a2mp_send_getampassoc_rsp(hdev, rp->status); - a2mp_send_create_phy_link_req(hdev, rp->status); -} - -void amp_read_loc_assoc_frag(struct hci_dev 
*hdev, u8 phy_handle) -{ - struct hci_cp_read_local_amp_assoc cp; - struct amp_assoc *loc_assoc = &hdev->loc_assoc; - struct hci_request req; - int err; - - BT_DBG("%s handle %u", hdev->name, phy_handle); - - cp.phy_handle = phy_handle; - cp.max_len = cpu_to_le16(hdev->amp_assoc_size); - cp.len_so_far = cpu_to_le16(loc_assoc->offset); - - hci_req_init(&req, hdev); - hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp); - err = hci_req_run_skb(&req, read_local_amp_assoc_complete); - if (err < 0) - a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID); -} - -void amp_read_loc_assoc(struct hci_dev *hdev, struct amp_mgr *mgr) -{ - struct hci_cp_read_local_amp_assoc cp; - struct hci_request req; - int err; - - memset(&hdev->loc_assoc, 0, sizeof(struct amp_assoc)); - memset(&cp, 0, sizeof(cp)); - - cp.max_len = cpu_to_le16(hdev->amp_assoc_size); - - set_bit(READ_LOC_AMP_ASSOC, &mgr->state); - hci_req_init(&req, hdev); - hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp); - err = hci_req_run_skb(&req, read_local_amp_assoc_complete); - if (err < 0) - a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID); -} - -void amp_read_loc_assoc_final_data(struct hci_dev *hdev, - struct hci_conn *hcon) -{ - struct hci_cp_read_local_amp_assoc cp; - struct amp_mgr *mgr = hcon->amp_mgr; - struct hci_request req; - int err; - - if (!mgr) - return; - - cp.phy_handle = hcon->handle; - cp.len_so_far = cpu_to_le16(0); - cp.max_len = cpu_to_le16(hdev->amp_assoc_size); - - set_bit(READ_LOC_AMP_ASSOC_FINAL, &mgr->state); - - /* Read Local AMP Assoc final link information data */ - hci_req_init(&req, hdev); - hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp); - err = hci_req_run_skb(&req, read_local_amp_assoc_complete); - if (err < 0) - a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID); -} - -static void write_remote_amp_assoc_complete(struct hci_dev *hdev, u8 status, - u16 opcode, struct sk_buff *skb) -{ - struct hci_rp_write_remote_amp_assoc *rp = (void *)skb->data; - - BT_DBG("%s status 0x%2.2x phy_handle 0x%2.2x", - hdev->name, rp->status, rp->phy_handle); - - if (rp->status) - return; - - amp_write_rem_assoc_continue(hdev, rp->phy_handle); -} - -/* Write AMP Assoc data fragments, returns true with last fragment written*/ -static bool amp_write_rem_assoc_frag(struct hci_dev *hdev, - struct hci_conn *hcon) -{ - struct hci_cp_write_remote_amp_assoc *cp; - struct amp_mgr *mgr = hcon->amp_mgr; - struct amp_ctrl *ctrl; - struct hci_request req; - u16 frag_len, len; - - ctrl = amp_ctrl_lookup(mgr, hcon->remote_id); - if (!ctrl) - return false; - - if (!ctrl->assoc_rem_len) { - BT_DBG("all fragments are written"); - ctrl->assoc_rem_len = ctrl->assoc_len; - ctrl->assoc_len_so_far = 0; - - amp_ctrl_put(ctrl); - return true; - } - - frag_len = min_t(u16, 248, ctrl->assoc_rem_len); - len = frag_len + sizeof(*cp); - - cp = kzalloc(len, GFP_KERNEL); - if (!cp) { - amp_ctrl_put(ctrl); - return false; - } - - BT_DBG("hcon %p ctrl %p frag_len %u assoc_len %u rem_len %u", - hcon, ctrl, frag_len, ctrl->assoc_len, ctrl->assoc_rem_len); - - cp->phy_handle = hcon->handle; - cp->len_so_far = cpu_to_le16(ctrl->assoc_len_so_far); - cp->rem_len = cpu_to_le16(ctrl->assoc_rem_len); - memcpy(cp->frag, ctrl->assoc, frag_len); - - ctrl->assoc_len_so_far += frag_len; - ctrl->assoc_rem_len -= frag_len; - - amp_ctrl_put(ctrl); - - hci_req_init(&req, hdev); - hci_req_add(&req, HCI_OP_WRITE_REMOTE_AMP_ASSOC, len, cp); - hci_req_run_skb(&req, write_remote_amp_assoc_complete); - - kfree(cp); 
- - return false; -} - -void amp_write_rem_assoc_continue(struct hci_dev *hdev, u8 handle) -{ - struct hci_conn *hcon; - - BT_DBG("%s phy handle 0x%2.2x", hdev->name, handle); - - hcon = hci_conn_hash_lookup_handle(hdev, handle); - if (!hcon) - return; - - /* Send A2MP create phylink rsp when all fragments are written */ - if (amp_write_rem_assoc_frag(hdev, hcon)) - a2mp_send_create_phy_link_rsp(hdev, 0); -} - -void amp_write_remote_assoc(struct hci_dev *hdev, u8 handle) -{ - struct hci_conn *hcon; - - BT_DBG("%s phy handle 0x%2.2x", hdev->name, handle); - - hcon = hci_conn_hash_lookup_handle(hdev, handle); - if (!hcon) - return; - - BT_DBG("%s phy handle 0x%2.2x hcon %p", hdev->name, handle, hcon); - - amp_write_rem_assoc_frag(hdev, hcon); -} - -static void create_phylink_complete(struct hci_dev *hdev, u8 status, - u16 opcode) -{ - struct hci_cp_create_phy_link *cp; - - BT_DBG("%s status 0x%2.2x", hdev->name, status); - - cp = hci_sent_cmd_data(hdev, HCI_OP_CREATE_PHY_LINK); - if (!cp) - return; - - hci_dev_lock(hdev); - - if (status) { - struct hci_conn *hcon; - - hcon = hci_conn_hash_lookup_handle(hdev, cp->phy_handle); - if (hcon) - hci_conn_del(hcon); - } else { - amp_write_remote_assoc(hdev, cp->phy_handle); - } - - hci_dev_unlock(hdev); -} - -void amp_create_phylink(struct hci_dev *hdev, struct amp_mgr *mgr, - struct hci_conn *hcon) -{ - struct hci_cp_create_phy_link cp; - struct hci_request req; - - cp.phy_handle = hcon->handle; - - BT_DBG("%s hcon %p phy handle 0x%2.2x", hdev->name, hcon, - hcon->handle); - - if (phylink_gen_key(mgr->l2cap_conn->hcon, cp.key, &cp.key_len, - &cp.key_type)) { - BT_DBG("Cannot create link key"); - return; - } - - hci_req_init(&req, hdev); - hci_req_add(&req, HCI_OP_CREATE_PHY_LINK, sizeof(cp), &cp); - hci_req_run(&req, create_phylink_complete); -} - -static void accept_phylink_complete(struct hci_dev *hdev, u8 status, - u16 opcode) -{ - struct hci_cp_accept_phy_link *cp; - - BT_DBG("%s status 0x%2.2x", hdev->name, status); - - if (status) - return; - - cp = hci_sent_cmd_data(hdev, HCI_OP_ACCEPT_PHY_LINK); - if (!cp) - return; - - amp_write_remote_assoc(hdev, cp->phy_handle); -} - -void amp_accept_phylink(struct hci_dev *hdev, struct amp_mgr *mgr, - struct hci_conn *hcon) -{ - struct hci_cp_accept_phy_link cp; - struct hci_request req; - - cp.phy_handle = hcon->handle; - - BT_DBG("%s hcon %p phy handle 0x%2.2x", hdev->name, hcon, - hcon->handle); - - if (phylink_gen_key(mgr->l2cap_conn->hcon, cp.key, &cp.key_len, - &cp.key_type)) { - BT_DBG("Cannot create link key"); - return; - } - - hci_req_init(&req, hdev); - hci_req_add(&req, HCI_OP_ACCEPT_PHY_LINK, sizeof(cp), &cp); - hci_req_run(&req, accept_phylink_complete); -} - -void amp_physical_cfm(struct hci_conn *bredr_hcon, struct hci_conn *hs_hcon) -{ - struct hci_dev *bredr_hdev = hci_dev_hold(bredr_hcon->hdev); - struct amp_mgr *mgr = hs_hcon->amp_mgr; - struct l2cap_chan *bredr_chan; - - BT_DBG("bredr_hcon %p hs_hcon %p mgr %p", bredr_hcon, hs_hcon, mgr); - - if (!bredr_hdev || !mgr || !mgr->bredr_chan) - return; - - bredr_chan = mgr->bredr_chan; - - l2cap_chan_lock(bredr_chan); - - set_bit(FLAG_EFS_ENABLE, &bredr_chan->flags); - bredr_chan->remote_amp_id = hs_hcon->remote_id; - bredr_chan->local_amp_id = hs_hcon->hdev->id; - bredr_chan->hs_hcon = hs_hcon; - bredr_chan->conn->mtu = hs_hcon->hdev->block_mtu; - - __l2cap_physical_cfm(bredr_chan, 0); - - l2cap_chan_unlock(bredr_chan); - - hci_dev_put(bredr_hdev); -} - -void amp_create_logical_link(struct l2cap_chan *chan) -{ - struct hci_conn *hs_hcon 
= chan->hs_hcon; - struct hci_cp_create_accept_logical_link cp; - struct hci_dev *hdev; - - BT_DBG("chan %p hs_hcon %p dst %pMR", chan, hs_hcon, - &chan->conn->hcon->dst); - - if (!hs_hcon) - return; - - hdev = hci_dev_hold(chan->hs_hcon->hdev); - if (!hdev) - return; - - cp.phy_handle = hs_hcon->handle; - - cp.tx_flow_spec.id = chan->local_id; - cp.tx_flow_spec.stype = chan->local_stype; - cp.tx_flow_spec.msdu = cpu_to_le16(chan->local_msdu); - cp.tx_flow_spec.sdu_itime = cpu_to_le32(chan->local_sdu_itime); - cp.tx_flow_spec.acc_lat = cpu_to_le32(chan->local_acc_lat); - cp.tx_flow_spec.flush_to = cpu_to_le32(chan->local_flush_to); - - cp.rx_flow_spec.id = chan->remote_id; - cp.rx_flow_spec.stype = chan->remote_stype; - cp.rx_flow_spec.msdu = cpu_to_le16(chan->remote_msdu); - cp.rx_flow_spec.sdu_itime = cpu_to_le32(chan->remote_sdu_itime); - cp.rx_flow_spec.acc_lat = cpu_to_le32(chan->remote_acc_lat); - cp.rx_flow_spec.flush_to = cpu_to_le32(chan->remote_flush_to); - - if (hs_hcon->out) - hci_send_cmd(hdev, HCI_OP_CREATE_LOGICAL_LINK, sizeof(cp), - &cp); - else - hci_send_cmd(hdev, HCI_OP_ACCEPT_LOGICAL_LINK, sizeof(cp), - &cp); - - hci_dev_put(hdev); -} - -void amp_disconnect_logical_link(struct hci_chan *hchan) -{ - struct hci_conn *hcon = hchan->conn; - struct hci_cp_disconn_logical_link cp; - - if (hcon->state != BT_CONNECTED) { - BT_DBG("hchan %p not connected", hchan); - return; - } - - cp.log_handle = cpu_to_le16(hchan->handle); - hci_send_cmd(hcon->hdev, HCI_OP_DISCONN_LOGICAL_LINK, sizeof(cp), &cp); -} - -void amp_destroy_logical_link(struct hci_chan *hchan, u8 reason) -{ - BT_DBG("hchan %p", hchan); - - hci_chan_del(hchan); -} diff --git a/net/bluetooth/amp.h b/net/bluetooth/amp.h deleted file mode 100644 index 97c87abd129f64..00000000000000 --- a/net/bluetooth/amp.h +++ /dev/null @@ -1,60 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - Copyright (c) 2011,2012 Intel Corp. 
- -*/ - -#ifndef __AMP_H -#define __AMP_H - -struct amp_ctrl { - struct list_head list; - struct kref kref; - __u8 id; - __u16 assoc_len_so_far; - __u16 assoc_rem_len; - __u16 assoc_len; - __u8 *assoc; -}; - -int amp_ctrl_put(struct amp_ctrl *ctrl); -void amp_ctrl_get(struct amp_ctrl *ctrl); -struct amp_ctrl *amp_ctrl_add(struct amp_mgr *mgr, u8 id); -struct amp_ctrl *amp_ctrl_lookup(struct amp_mgr *mgr, u8 id); -void amp_ctrl_list_flush(struct amp_mgr *mgr); - -struct hci_conn *phylink_add(struct hci_dev *hdev, struct amp_mgr *mgr, - u8 remote_id, bool out); - -int phylink_gen_key(struct hci_conn *hcon, u8 *data, u8 *len, u8 *type); - -void amp_read_loc_assoc_frag(struct hci_dev *hdev, u8 phy_handle); -void amp_read_loc_assoc(struct hci_dev *hdev, struct amp_mgr *mgr); -void amp_read_loc_assoc_final_data(struct hci_dev *hdev, - struct hci_conn *hcon); -void amp_create_phylink(struct hci_dev *hdev, struct amp_mgr *mgr, - struct hci_conn *hcon); -void amp_accept_phylink(struct hci_dev *hdev, struct amp_mgr *mgr, - struct hci_conn *hcon); - -#if IS_ENABLED(CONFIG_BT_HS) -void amp_create_logical_link(struct l2cap_chan *chan); -void amp_disconnect_logical_link(struct hci_chan *hchan); -#else -static inline void amp_create_logical_link(struct l2cap_chan *chan) -{ -} - -static inline void amp_disconnect_logical_link(struct hci_chan *hchan) -{ -} -#endif - -void amp_write_remote_assoc(struct hci_dev *hdev, u8 handle); -void amp_write_rem_assoc_continue(struct hci_dev *hdev, u8 handle); -void amp_physical_cfm(struct hci_conn *bredr_hcon, struct hci_conn *hs_hcon); -void amp_create_logical_link(struct l2cap_chan *chan); -void amp_disconnect_logical_link(struct hci_chan *hchan); -void amp_destroy_logical_link(struct hci_chan *hchan, u8 reason); - -#endif /* __AMP_H */ diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index a41d2693f4d8c7..fc4d72f83ac25f 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -36,7 +36,6 @@ #include "hci_request.h" #include "smp.h" -#include "a2mp.h" #include "eir.h" struct sco_param { @@ -1175,9 +1174,6 @@ void hci_conn_del(struct hci_conn *conn) } } - if (conn->amp_mgr) - amp_mgr_put(conn->amp_mgr); - skb_queue_purge(&conn->data_q); /* Remove the connection from the list and cleanup its remaining diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 31df5f5b799455..11b55d1f977272 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -36,8 +36,6 @@ #include "hci_request.h" #include "hci_debugfs.h" #include "hci_codec.h" -#include "a2mp.h" -#include "amp.h" #include "smp.h" #include "msft.h" #include "eir.h" diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 656f49b299d20d..ab5a9d42fae71a 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -39,8 +39,6 @@ #include <net/bluetooth/l2cap.h> #include "smp.h" -#include "a2mp.h" -#include "amp.h" #define LE_FLOWCTL_MAX_CREDITS 65535 @@ -167,24 +165,6 @@ static struct l2cap_chan *__l2cap_get_chan_by_ident(struct l2cap_conn *conn, return NULL; } -static struct l2cap_chan *l2cap_get_chan_by_ident(struct l2cap_conn *conn, - u8 ident) -{ - struct l2cap_chan *c; - - mutex_lock(&conn->chan_lock); - c = __l2cap_get_chan_by_ident(conn, ident); - if (c) { - /* Only lock if chan reference is not 0 */ - c = l2cap_chan_hold_unless_zero(c); - if (c) - l2cap_chan_lock(c); - } - mutex_unlock(&conn->chan_lock); - - return c; -} - -static struct l2cap_chan *__l2cap_global_chan_by_addr(__le16 psm, bdaddr_t *src, u8 src_type) { @@ -651,7 +631,6 @@ void
l2cap_chan_del(struct l2cap_chan *chan, int err) chan->ops->teardown(chan, err); if (conn) { - struct amp_mgr *mgr = conn->hcon->amp_mgr; /* Delete from channel list */ list_del(&chan->list); @@ -666,16 +645,6 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) if (chan->chan_type != L2CAP_CHAN_FIXED || test_bit(FLAG_HOLD_HCI_CONN, &chan->flags)) hci_conn_drop(conn->hcon); - - if (mgr && mgr->bredr_chan == chan) - mgr->bredr_chan = NULL; - } - - if (chan->hs_hchan) { - struct hci_chan *hs_hchan = chan->hs_hchan; - - BT_DBG("chan %p disconnect hs_hchan %p", chan, hs_hchan); - amp_disconnect_logical_link(hs_hchan); } if (test_bit(CONF_NOT_COMPLETE, &chan->conf_state)) @@ -977,12 +946,6 @@ static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, hci_send_acl(conn->hchan, skb, flags); } -static bool __chan_is_moving(struct l2cap_chan *chan) -{ - return chan->move_state != L2CAP_MOVE_STABLE && - chan->move_state != L2CAP_MOVE_WAIT_PREPARE; -} - static void l2cap_do_send(struct l2cap_chan *chan, struct sk_buff *skb) { struct hci_conn *hcon = chan->conn->hcon; @@ -991,15 +954,6 @@ static void l2cap_do_send(struct l2cap_chan *chan, struct sk_buff *skb) BT_DBG("chan %p, skb %p len %d priority %u", chan, skb, skb->len, skb->priority); - if (chan->hs_hcon && !__chan_is_moving(chan)) { - if (chan->hs_hchan) - hci_send_acl(chan->hs_hchan, skb, ACL_COMPLETE); - else - kfree_skb(skb); - - return; - } - /* Use NO_FLUSH for LE links (where this is the only option) or * if the BR/EDR link supports it and flushing has not been * explicitly requested (through FLAG_FLUSHABLE). @@ -1180,9 +1134,6 @@ static void l2cap_send_sframe(struct l2cap_chan *chan, if (!control->sframe) return; - if (__chan_is_moving(chan)) - return; - if (test_and_clear_bit(CONN_SEND_FBIT, &chan->conn_state) && !control->poll) control->final = 1; @@ -1237,40 +1188,6 @@ static inline int __l2cap_no_conn_pending(struct l2cap_chan *chan) return !test_bit(CONF_CONNECT_PEND, &chan->conf_state); } -static bool __amp_capable(struct l2cap_chan *chan) -{ - struct l2cap_conn *conn = chan->conn; - struct hci_dev *hdev; - bool amp_available = false; - - if (!(conn->local_fixed_chan & L2CAP_FC_A2MP)) - return false; - - if (!(conn->remote_fixed_chan & L2CAP_FC_A2MP)) - return false; - - read_lock(&hci_dev_list_lock); - list_for_each_entry(hdev, &hci_dev_list, list) { - if (hdev->amp_type != AMP_TYPE_BREDR && - test_bit(HCI_UP, &hdev->flags)) { - amp_available = true; - break; - } - } - read_unlock(&hci_dev_list_lock); - - if (chan->chan_policy == BT_CHANNEL_POLICY_AMP_PREFERRED) - return amp_available; - - return false; -} - -static bool l2cap_check_efs(struct l2cap_chan *chan) -{ - /* Check EFS parameters */ - return true; -} - void l2cap_send_conn_req(struct l2cap_chan *chan) { struct l2cap_conn *conn = chan->conn; @@ -1286,76 +1203,6 @@ void l2cap_send_conn_req(struct l2cap_chan *chan) l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_REQ, sizeof(req), &req); } -static void l2cap_send_create_chan_req(struct l2cap_chan *chan, u8 amp_id) -{ - struct l2cap_create_chan_req req; - req.scid = cpu_to_le16(chan->scid); - req.psm = chan->psm; - req.amp_id = amp_id; - - chan->ident = l2cap_get_ident(chan->conn); - - l2cap_send_cmd(chan->conn, chan->ident, L2CAP_CREATE_CHAN_REQ, - sizeof(req), &req); -} - -static void l2cap_move_setup(struct l2cap_chan *chan) -{ - struct sk_buff *skb; - - BT_DBG("chan %p", chan); - - if (chan->mode != L2CAP_MODE_ERTM) - return; - - __clear_retrans_timer(chan); - __clear_monitor_timer(chan); - 
__clear_ack_timer(chan); - - chan->retry_count = 0; - skb_queue_walk(&chan->tx_q, skb) { - if (bt_cb(skb)->l2cap.retries) - bt_cb(skb)->l2cap.retries = 1; - else - break; - } - - chan->expected_tx_seq = chan->buffer_seq; - - clear_bit(CONN_REJ_ACT, &chan->conn_state); - clear_bit(CONN_SREJ_ACT, &chan->conn_state); - l2cap_seq_list_clear(&chan->retrans_list); - l2cap_seq_list_clear(&chan->srej_list); - skb_queue_purge(&chan->srej_q); - - chan->tx_state = L2CAP_TX_STATE_XMIT; - chan->rx_state = L2CAP_RX_STATE_MOVE; - - set_bit(CONN_REMOTE_BUSY, &chan->conn_state); -} - -static void l2cap_move_done(struct l2cap_chan *chan) -{ - u8 move_role = chan->move_role; - BT_DBG("chan %p", chan); - - chan->move_state = L2CAP_MOVE_STABLE; - chan->move_role = L2CAP_MOVE_ROLE_NONE; - - if (chan->mode != L2CAP_MODE_ERTM) - return; - - switch (move_role) { - case L2CAP_MOVE_ROLE_INITIATOR: - l2cap_tx(chan, NULL, NULL, L2CAP_EV_EXPLICIT_POLL); - chan->rx_state = L2CAP_RX_STATE_WAIT_F; - break; - case L2CAP_MOVE_ROLE_RESPONDER: - chan->rx_state = L2CAP_RX_STATE_WAIT_P; - break; - } -} - static void l2cap_chan_ready(struct l2cap_chan *chan) { /* The channel may have already been flagged as connected in @@ -1505,10 +1352,7 @@ static void l2cap_le_start(struct l2cap_chan *chan) static void l2cap_start_connection(struct l2cap_chan *chan) { - if (__amp_capable(chan)) { - BT_DBG("chan %p AMP capable: discover AMPs", chan); - a2mp_discover_amp(chan); - } else if (chan->conn->hcon->type == LE_LINK) { + if (chan->conn->hcon->type == LE_LINK) { l2cap_le_start(chan); } else { l2cap_send_conn_req(chan); @@ -1611,11 +1455,6 @@ static void l2cap_send_disconn_req(struct l2cap_chan *chan, int err) __clear_ack_timer(chan); } - if (chan->scid == L2CAP_CID_A2MP) { - l2cap_state_change(chan, BT_DISCONN); - return; - } - req.dcid = cpu_to_le16(chan->dcid); req.scid = cpu_to_le16(chan->scid); l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_DISCONN_REQ, @@ -1754,11 +1593,6 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) l2cap_chan_lock(chan); - if (chan->scid == L2CAP_CID_A2MP) { - l2cap_chan_unlock(chan); - continue; - } - if (hcon->type == LE_LINK) { l2cap_le_start(chan); } else if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { @@ -2067,9 +1901,6 @@ static void l2cap_streaming_send(struct l2cap_chan *chan, BT_DBG("chan %p, skbs %p", chan, skbs); - if (__chan_is_moving(chan)) - return; - skb_queue_splice_tail_init(skbs, &chan->tx_q); while (!skb_queue_empty(&chan->tx_q)) { @@ -2112,9 +1943,6 @@ static int l2cap_ertm_send(struct l2cap_chan *chan) if (test_bit(CONN_REMOTE_BUSY, &chan->conn_state)) return 0; - if (__chan_is_moving(chan)) - return 0; - while (chan->tx_send_head && chan->unacked_frames < chan->remote_tx_win && chan->tx_state == L2CAP_TX_STATE_XMIT) { @@ -2180,9 +2008,6 @@ static void l2cap_ertm_resend(struct l2cap_chan *chan) if (test_bit(CONN_REMOTE_BUSY, &chan->conn_state)) return; - if (__chan_is_moving(chan)) - return; - while (chan->retrans_list.head != L2CAP_SEQ_LIST_CLEAR) { seq = l2cap_seq_list_pop(&chan->retrans_list); @@ -2522,8 +2347,7 @@ static int l2cap_segment_sdu(struct l2cap_chan *chan, pdu_len = chan->conn->mtu; /* Constrain PDU size for BR/EDR connections */ - if (!chan->hs_hcon) - pdu_len = min_t(size_t, pdu_len, L2CAP_BREDR_MAX_PAYLOAD); + pdu_len = min_t(size_t, pdu_len, L2CAP_BREDR_MAX_PAYLOAD); /* Adjust for largest possible L2CAP overhead. 
*/ if (chan->fcs) @@ -3287,11 +3111,6 @@ int l2cap_ertm_init(struct l2cap_chan *chan) skb_queue_head_init(&chan->tx_q); - chan->local_amp_id = AMP_ID_BREDR; - chan->move_id = AMP_ID_BREDR; - chan->move_state = L2CAP_MOVE_STABLE; - chan->move_role = L2CAP_MOVE_ROLE_NONE; - if (chan->mode != L2CAP_MODE_ERTM) return 0; @@ -3326,52 +3145,19 @@ static inline __u8 l2cap_select_mode(__u8 mode, __u16 remote_feat_mask) static inline bool __l2cap_ews_supported(struct l2cap_conn *conn) { - return ((conn->local_fixed_chan & L2CAP_FC_A2MP) && - (conn->feat_mask & L2CAP_FEAT_EXT_WINDOW)); + return (conn->feat_mask & L2CAP_FEAT_EXT_WINDOW); } static inline bool __l2cap_efs_supported(struct l2cap_conn *conn) { - return ((conn->local_fixed_chan & L2CAP_FC_A2MP) && - (conn->feat_mask & L2CAP_FEAT_EXT_FLOW)); + return (conn->feat_mask & L2CAP_FEAT_EXT_FLOW); } static void __l2cap_set_ertm_timeouts(struct l2cap_chan *chan, struct l2cap_conf_rfc *rfc) { - if (chan->local_amp_id != AMP_ID_BREDR && chan->hs_hcon) { - u64 ertm_to = chan->hs_hcon->hdev->amp_be_flush_to; - - /* Class 1 devices have must have ERTM timeouts - * exceeding the Link Supervision Timeout. The - * default Link Supervision Timeout for AMP - * controllers is 10 seconds. - * - * Class 1 devices use 0xffffffff for their - * best-effort flush timeout, so the clamping logic - * will result in a timeout that meets the above - * requirement. ERTM timeouts are 16-bit values, so - * the maximum timeout is 65.535 seconds. - */ - - /* Convert timeout to milliseconds and round */ - ertm_to = DIV_ROUND_UP_ULL(ertm_to, 1000); - - /* This is the recommended formula for class 2 devices - * that start ERTM timers when packets are sent to the - * controller. - */ - ertm_to = 3 * ertm_to + 500; - - if (ertm_to > 0xffff) - ertm_to = 0xffff; - - rfc->retrans_timeout = cpu_to_le16((u16) ertm_to); - rfc->monitor_timeout = rfc->retrans_timeout; - } else { - rfc->retrans_timeout = cpu_to_le16(L2CAP_DEFAULT_RETRANS_TO); - rfc->monitor_timeout = cpu_to_le16(L2CAP_DEFAULT_MONITOR_TO); - } + rfc->retrans_timeout = cpu_to_le16(L2CAP_DEFAULT_RETRANS_TO); + rfc->monitor_timeout = cpu_to_le16(L2CAP_DEFAULT_MONITOR_TO); } static inline void l2cap_txwin_setup(struct l2cap_chan *chan) @@ -3623,13 +3409,7 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data case L2CAP_CONF_EWS: if (olen != 2) break; - if (!(chan->conn->local_fixed_chan & L2CAP_FC_A2MP)) - return -ECONNREFUSED; - set_bit(FLAG_EXT_CTRL, &chan->flags); - set_bit(CONF_EWS_RECV, &chan->conf_state); - chan->tx_win_max = L2CAP_DEFAULT_EXT_WINDOW; - chan->remote_tx_win = val; - break; + return -ECONNREFUSED; default: if (hint) @@ -4027,11 +3807,7 @@ void __l2cap_connect_rsp_defer(struct l2cap_chan *chan) rsp.dcid = cpu_to_le16(chan->scid); rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS); rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); - - if (chan->hs_hcon) - rsp_code = L2CAP_CREATE_CHAN_RSP; - else - rsp_code = L2CAP_CONN_RSP; + rsp_code = L2CAP_CONN_RSP; BT_DBG("chan %p rsp_code %u", chan, rsp_code); @@ -4190,7 +3966,6 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn, chan->dst_type = bdaddr_dst_type(conn->hcon); chan->psm = psm; chan->dcid = scid; - chan->local_amp_id = amp_id; __l2cap_chan_add(conn, chan); @@ -4516,10 +4291,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, /* check compatibility */ /* Send rsp for BR/EDR channel */ - if (!chan->hs_hcon) - l2cap_send_efs_conf_rsp(chan, rsp, cmd->ident, flags); - else - chan->ident = cmd->ident; + 
l2cap_send_efs_conf_rsp(chan, rsp, cmd->ident, flags); } unlock: @@ -4571,15 +4343,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, goto done; } - if (!chan->hs_hcon) { - l2cap_send_efs_conf_rsp(chan, buf, cmd->ident, - 0); - } else { - if (l2cap_check_efs(chan)) { - amp_create_logical_link(chan); - chan->ident = cmd->ident; - } - } + l2cap_send_efs_conf_rsp(chan, buf, cmd->ident, 0); } goto done; @@ -4750,9 +4514,6 @@ static inline int l2cap_information_req(struct l2cap_conn *conn, if (!disable_ertm) feat_mask |= L2CAP_FEAT_ERTM | L2CAP_FEAT_STREAMING | L2CAP_FEAT_FCS; - if (conn->local_fixed_chan & L2CAP_FC_A2MP) - feat_mask |= L2CAP_FEAT_EXT_FLOW - | L2CAP_FEAT_EXT_WINDOW; put_unaligned_le32(feat_mask, rsp->data); l2cap_send_cmd(conn, cmd->ident, L2CAP_INFO_RSP, sizeof(buf), @@ -4841,751 +4602,6 @@ static inline int l2cap_information_rsp(struct l2cap_conn *conn, return 0; } -static int l2cap_create_channel_req(struct l2cap_conn *conn, - struct l2cap_cmd_hdr *cmd, - u16 cmd_len, void *data) -{ - struct l2cap_create_chan_req *req = data; - struct l2cap_create_chan_rsp rsp; - struct l2cap_chan *chan; - struct hci_dev *hdev; - u16 psm, scid; - - if (cmd_len != sizeof(*req)) - return -EPROTO; - - if (!(conn->local_fixed_chan & L2CAP_FC_A2MP)) - return -EINVAL; - - psm = le16_to_cpu(req->psm); - scid = le16_to_cpu(req->scid); - - BT_DBG("psm 0x%2.2x, scid 0x%4.4x, amp_id %d", psm, scid, req->amp_id); - - /* For controller id 0 make BR/EDR connection */ - if (req->amp_id == AMP_ID_BREDR) { - l2cap_connect(conn, cmd, data, L2CAP_CREATE_CHAN_RSP, - req->amp_id); - return 0; - } - - /* Validate AMP controller id */ - hdev = hci_dev_get(req->amp_id); - if (!hdev) - goto error; - - if (hdev->dev_type != HCI_AMP || !test_bit(HCI_UP, &hdev->flags)) { - hci_dev_put(hdev); - goto error; - } - - chan = l2cap_connect(conn, cmd, data, L2CAP_CREATE_CHAN_RSP, - req->amp_id); - if (chan) { - struct amp_mgr *mgr = conn->hcon->amp_mgr; - struct hci_conn *hs_hcon; - - hs_hcon = hci_conn_hash_lookup_ba(hdev, AMP_LINK, - &conn->hcon->dst); - if (!hs_hcon) { - hci_dev_put(hdev); - cmd_reject_invalid_cid(conn, cmd->ident, chan->scid, - chan->dcid); - return 0; - } - - BT_DBG("mgr %p bredr_chan %p hs_hcon %p", mgr, chan, hs_hcon); - - mgr->bredr_chan = chan; - chan->hs_hcon = hs_hcon; - chan->fcs = L2CAP_FCS_NONE; - conn->mtu = hdev->block_mtu; - } - - hci_dev_put(hdev); - - return 0; - -error: - rsp.dcid = 0; - rsp.scid = cpu_to_le16(scid); - rsp.result = cpu_to_le16(L2CAP_CR_BAD_AMP); - rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); - - l2cap_send_cmd(conn, cmd->ident, L2CAP_CREATE_CHAN_RSP, - sizeof(rsp), &rsp); - - return 0; -} - -static void l2cap_send_move_chan_req(struct l2cap_chan *chan, u8 dest_amp_id) -{ - struct l2cap_move_chan_req req; - u8 ident; - - BT_DBG("chan %p, dest_amp_id %d", chan, dest_amp_id); - - ident = l2cap_get_ident(chan->conn); - chan->ident = ident; - - req.icid = cpu_to_le16(chan->scid); - req.dest_amp_id = dest_amp_id; - - l2cap_send_cmd(chan->conn, ident, L2CAP_MOVE_CHAN_REQ, sizeof(req), - &req); - - __set_chan_timer(chan, L2CAP_MOVE_TIMEOUT); -} - -static void l2cap_send_move_chan_rsp(struct l2cap_chan *chan, u16 result) -{ - struct l2cap_move_chan_rsp rsp; - - BT_DBG("chan %p, result 0x%4.4x", chan, result); - - rsp.icid = cpu_to_le16(chan->dcid); - rsp.result = cpu_to_le16(result); - - l2cap_send_cmd(chan->conn, chan->ident, L2CAP_MOVE_CHAN_RSP, - sizeof(rsp), &rsp); -} - -static void l2cap_send_move_chan_cfm(struct l2cap_chan *chan, u16 result) -{ - struct 
l2cap_move_chan_cfm cfm; - - BT_DBG("chan %p, result 0x%4.4x", chan, result); - - chan->ident = l2cap_get_ident(chan->conn); - - cfm.icid = cpu_to_le16(chan->scid); - cfm.result = cpu_to_le16(result); - - l2cap_send_cmd(chan->conn, chan->ident, L2CAP_MOVE_CHAN_CFM, - sizeof(cfm), &cfm); - - __set_chan_timer(chan, L2CAP_MOVE_TIMEOUT); -} - -static void l2cap_send_move_chan_cfm_icid(struct l2cap_conn *conn, u16 icid) -{ - struct l2cap_move_chan_cfm cfm; - - BT_DBG("conn %p, icid 0x%4.4x", conn, icid); - - cfm.icid = cpu_to_le16(icid); - cfm.result = cpu_to_le16(L2CAP_MC_UNCONFIRMED); - - l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_MOVE_CHAN_CFM, - sizeof(cfm), &cfm); -} - -static void l2cap_send_move_chan_cfm_rsp(struct l2cap_conn *conn, u8 ident, - u16 icid) -{ - struct l2cap_move_chan_cfm_rsp rsp; - - BT_DBG("icid 0x%4.4x", icid); - - rsp.icid = cpu_to_le16(icid); - l2cap_send_cmd(conn, ident, L2CAP_MOVE_CHAN_CFM_RSP, sizeof(rsp), &rsp); -} - -static void __release_logical_link(struct l2cap_chan *chan) -{ - chan->hs_hchan = NULL; - chan->hs_hcon = NULL; - - /* Placeholder - release the logical link */ -} - -static void l2cap_logical_fail(struct l2cap_chan *chan) -{ - /* Logical link setup failed */ - if (chan->state != BT_CONNECTED) { - /* Create channel failure, disconnect */ - l2cap_send_disconn_req(chan, ECONNRESET); - return; - } - - switch (chan->move_role) { - case L2CAP_MOVE_ROLE_RESPONDER: - l2cap_move_done(chan); - l2cap_send_move_chan_rsp(chan, L2CAP_MR_NOT_SUPP); - break; - case L2CAP_MOVE_ROLE_INITIATOR: - if (chan->move_state == L2CAP_MOVE_WAIT_LOGICAL_COMP || - chan->move_state == L2CAP_MOVE_WAIT_LOGICAL_CFM) { - /* Remote has only sent pending or - * success responses, clean up - */ - l2cap_move_done(chan); - } - - /* Other amp move states imply that the move - * has already aborted - */ - l2cap_send_move_chan_cfm(chan, L2CAP_MC_UNCONFIRMED); - break; - } -} - -static void l2cap_logical_finish_create(struct l2cap_chan *chan, - struct hci_chan *hchan) -{ - struct l2cap_conf_rsp rsp; - - chan->hs_hchan = hchan; - chan->hs_hcon->l2cap_data = chan->conn; - - l2cap_send_efs_conf_rsp(chan, &rsp, chan->ident, 0); - - if (test_bit(CONF_INPUT_DONE, &chan->conf_state)) { - int err; - - set_default_fcs(chan); - - err = l2cap_ertm_init(chan); - if (err < 0) - l2cap_send_disconn_req(chan, -err); - else - l2cap_chan_ready(chan); - } -} - -static void l2cap_logical_finish_move(struct l2cap_chan *chan, - struct hci_chan *hchan) -{ - chan->hs_hcon = hchan->conn; - chan->hs_hcon->l2cap_data = chan->conn; - - BT_DBG("move_state %d", chan->move_state); - - switch (chan->move_state) { - case L2CAP_MOVE_WAIT_LOGICAL_COMP: - /* Move confirm will be sent after a success - * response is received - */ - chan->move_state = L2CAP_MOVE_WAIT_RSP_SUCCESS; - break; - case L2CAP_MOVE_WAIT_LOGICAL_CFM: - if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) { - chan->move_state = L2CAP_MOVE_WAIT_LOCAL_BUSY; - } else if (chan->move_role == L2CAP_MOVE_ROLE_INITIATOR) { - chan->move_state = L2CAP_MOVE_WAIT_CONFIRM_RSP; - l2cap_send_move_chan_cfm(chan, L2CAP_MC_CONFIRMED); - } else if (chan->move_role == L2CAP_MOVE_ROLE_RESPONDER) { - chan->move_state = L2CAP_MOVE_WAIT_CONFIRM; - l2cap_send_move_chan_rsp(chan, L2CAP_MR_SUCCESS); - } - break; - default: - /* Move was not in expected state, free the channel */ - __release_logical_link(chan); - - chan->move_state = L2CAP_MOVE_STABLE; - } -} - -/* Call with chan locked */ -void l2cap_logical_cfm(struct l2cap_chan *chan, struct hci_chan *hchan, - u8 status) -{ - 
BT_DBG("chan %p, hchan %p, status %d", chan, hchan, status); - - if (status) { - l2cap_logical_fail(chan); - __release_logical_link(chan); - return; - } - - if (chan->state != BT_CONNECTED) { - /* Ignore logical link if channel is on BR/EDR */ - if (chan->local_amp_id != AMP_ID_BREDR) - l2cap_logical_finish_create(chan, hchan); - } else { - l2cap_logical_finish_move(chan, hchan); - } -} - -void l2cap_move_start(struct l2cap_chan *chan) -{ - BT_DBG("chan %p", chan); - - if (chan->local_amp_id == AMP_ID_BREDR) { - if (chan->chan_policy != BT_CHANNEL_POLICY_AMP_PREFERRED) - return; - chan->move_role = L2CAP_MOVE_ROLE_INITIATOR; - chan->move_state = L2CAP_MOVE_WAIT_PREPARE; - /* Placeholder - start physical link setup */ - } else { - chan->move_role = L2CAP_MOVE_ROLE_INITIATOR; - chan->move_state = L2CAP_MOVE_WAIT_RSP_SUCCESS; - chan->move_id = 0; - l2cap_move_setup(chan); - l2cap_send_move_chan_req(chan, 0); - } -} - -static void l2cap_do_create(struct l2cap_chan *chan, int result, - u8 local_amp_id, u8 remote_amp_id) -{ - BT_DBG("chan %p state %s %u -> %u", chan, state_to_string(chan->state), - local_amp_id, remote_amp_id); - - chan->fcs = L2CAP_FCS_NONE; - - /* Outgoing channel on AMP */ - if (chan->state == BT_CONNECT) { - if (result == L2CAP_CR_SUCCESS) { - chan->local_amp_id = local_amp_id; - l2cap_send_create_chan_req(chan, remote_amp_id); - } else { - /* Revert to BR/EDR connect */ - l2cap_send_conn_req(chan); - } - - return; - } - - /* Incoming channel on AMP */ - if (__l2cap_no_conn_pending(chan)) { - struct l2cap_conn_rsp rsp; - char buf[128]; - rsp.scid = cpu_to_le16(chan->dcid); - rsp.dcid = cpu_to_le16(chan->scid); - - if (result == L2CAP_CR_SUCCESS) { - /* Send successful response */ - rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS); - rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); - } else { - /* Send negative response */ - rsp.result = cpu_to_le16(L2CAP_CR_NO_MEM); - rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); - } - - l2cap_send_cmd(chan->conn, chan->ident, L2CAP_CREATE_CHAN_RSP, - sizeof(rsp), &rsp); - - if (result == L2CAP_CR_SUCCESS) { - l2cap_state_change(chan, BT_CONFIG); - set_bit(CONF_REQ_SENT, &chan->conf_state); - l2cap_send_cmd(chan->conn, l2cap_get_ident(chan->conn), - L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); - chan->num_conf_req++; - } - } -} - -static void l2cap_do_move_initiate(struct l2cap_chan *chan, u8 local_amp_id, - u8 remote_amp_id) -{ - l2cap_move_setup(chan); - chan->move_id = local_amp_id; - chan->move_state = L2CAP_MOVE_WAIT_RSP; - - l2cap_send_move_chan_req(chan, remote_amp_id); -} - -static void l2cap_do_move_respond(struct l2cap_chan *chan, int result) -{ - struct hci_chan *hchan = NULL; - - /* Placeholder - get hci_chan for logical link */ - - if (hchan) { - if (hchan->state == BT_CONNECTED) { - /* Logical link is ready to go */ - chan->hs_hcon = hchan->conn; - chan->hs_hcon->l2cap_data = chan->conn; - chan->move_state = L2CAP_MOVE_WAIT_CONFIRM; - l2cap_send_move_chan_rsp(chan, L2CAP_MR_SUCCESS); - - l2cap_logical_cfm(chan, hchan, L2CAP_MR_SUCCESS); - } else { - /* Wait for logical link to be ready */ - chan->move_state = L2CAP_MOVE_WAIT_LOGICAL_CFM; - } - } else { - /* Logical link not available */ - l2cap_send_move_chan_rsp(chan, L2CAP_MR_NOT_ALLOWED); - } -} - -static void l2cap_do_move_cancel(struct l2cap_chan *chan, int result) -{ - if (chan->move_role == L2CAP_MOVE_ROLE_RESPONDER) { - u8 rsp_result; - if (result == -EINVAL) - rsp_result = L2CAP_MR_BAD_ID; - else - rsp_result = L2CAP_MR_NOT_ALLOWED; - - 
l2cap_send_move_chan_rsp(chan, rsp_result); - } - - chan->move_role = L2CAP_MOVE_ROLE_NONE; - chan->move_state = L2CAP_MOVE_STABLE; - - /* Restart data transmission */ - l2cap_ertm_send(chan); -} - -/* Invoke with locked chan */ -void __l2cap_physical_cfm(struct l2cap_chan *chan, int result) -{ - u8 local_amp_id = chan->local_amp_id; - u8 remote_amp_id = chan->remote_amp_id; - - BT_DBG("chan %p, result %d, local_amp_id %d, remote_amp_id %d", - chan, result, local_amp_id, remote_amp_id); - - if (chan->state == BT_DISCONN || chan->state == BT_CLOSED) - return; - - if (chan->state != BT_CONNECTED) { - l2cap_do_create(chan, result, local_amp_id, remote_amp_id); - } else if (result != L2CAP_MR_SUCCESS) { - l2cap_do_move_cancel(chan, result); - } else { - switch (chan->move_role) { - case L2CAP_MOVE_ROLE_INITIATOR: - l2cap_do_move_initiate(chan, local_amp_id, - remote_amp_id); - break; - case L2CAP_MOVE_ROLE_RESPONDER: - l2cap_do_move_respond(chan, result); - break; - default: - l2cap_do_move_cancel(chan, result); - break; - } - } -} - -static inline int l2cap_move_channel_req(struct l2cap_conn *conn, - struct l2cap_cmd_hdr *cmd, - u16 cmd_len, void *data) -{ - struct l2cap_move_chan_req *req = data; - struct l2cap_move_chan_rsp rsp; - struct l2cap_chan *chan; - u16 icid = 0; - u16 result = L2CAP_MR_NOT_ALLOWED; - - if (cmd_len != sizeof(*req)) - return -EPROTO; - - icid = le16_to_cpu(req->icid); - - BT_DBG("icid 0x%4.4x, dest_amp_id %d", icid, req->dest_amp_id); - - if (!(conn->local_fixed_chan & L2CAP_FC_A2MP)) - return -EINVAL; - - chan = l2cap_get_chan_by_dcid(conn, icid); - if (!chan) { - rsp.icid = cpu_to_le16(icid); - rsp.result = cpu_to_le16(L2CAP_MR_NOT_ALLOWED); - l2cap_send_cmd(conn, cmd->ident, L2CAP_MOVE_CHAN_RSP, - sizeof(rsp), &rsp); - return 0; - } - - chan->ident = cmd->ident; - - if (chan->scid < L2CAP_CID_DYN_START || - chan->chan_policy == BT_CHANNEL_POLICY_BREDR_ONLY || - (chan->mode != L2CAP_MODE_ERTM && - chan->mode != L2CAP_MODE_STREAMING)) { - result = L2CAP_MR_NOT_ALLOWED; - goto send_move_response; - } - - if (chan->local_amp_id == req->dest_amp_id) { - result = L2CAP_MR_SAME_ID; - goto send_move_response; - } - - if (req->dest_amp_id != AMP_ID_BREDR) { - struct hci_dev *hdev; - hdev = hci_dev_get(req->dest_amp_id); - if (!hdev || hdev->dev_type != HCI_AMP || - !test_bit(HCI_UP, &hdev->flags)) { - if (hdev) - hci_dev_put(hdev); - - result = L2CAP_MR_BAD_ID; - goto send_move_response; - } - hci_dev_put(hdev); - } - - /* Detect a move collision. Only send a collision response - * if this side has "lost", otherwise proceed with the move. - * The winner has the larger bd_addr. 
- */ - if ((__chan_is_moving(chan) || - chan->move_role != L2CAP_MOVE_ROLE_NONE) && - bacmp(&conn->hcon->src, &conn->hcon->dst) > 0) { - result = L2CAP_MR_COLLISION; - goto send_move_response; - } - - chan->move_role = L2CAP_MOVE_ROLE_RESPONDER; - l2cap_move_setup(chan); - chan->move_id = req->dest_amp_id; - - if (req->dest_amp_id == AMP_ID_BREDR) { - /* Moving to BR/EDR */ - if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) { - chan->move_state = L2CAP_MOVE_WAIT_LOCAL_BUSY; - result = L2CAP_MR_PEND; - } else { - chan->move_state = L2CAP_MOVE_WAIT_CONFIRM; - result = L2CAP_MR_SUCCESS; - } - } else { - chan->move_state = L2CAP_MOVE_WAIT_PREPARE; - /* Placeholder - uncomment when amp functions are available */ - /*amp_accept_physical(chan, req->dest_amp_id);*/ - result = L2CAP_MR_PEND; - } - -send_move_response: - l2cap_send_move_chan_rsp(chan, result); - - l2cap_chan_unlock(chan); - l2cap_chan_put(chan); - - return 0; -} - -static void l2cap_move_continue(struct l2cap_conn *conn, u16 icid, u16 result) -{ - struct l2cap_chan *chan; - struct hci_chan *hchan = NULL; - - chan = l2cap_get_chan_by_scid(conn, icid); - if (!chan) { - l2cap_send_move_chan_cfm_icid(conn, icid); - return; - } - - __clear_chan_timer(chan); - if (result == L2CAP_MR_PEND) - __set_chan_timer(chan, L2CAP_MOVE_ERTX_TIMEOUT); - - switch (chan->move_state) { - case L2CAP_MOVE_WAIT_LOGICAL_COMP: - /* Move confirm will be sent when logical link - * is complete. - */ - chan->move_state = L2CAP_MOVE_WAIT_LOGICAL_CFM; - break; - case L2CAP_MOVE_WAIT_RSP_SUCCESS: - if (result == L2CAP_MR_PEND) { - break; - } else if (test_bit(CONN_LOCAL_BUSY, - &chan->conn_state)) { - chan->move_state = L2CAP_MOVE_WAIT_LOCAL_BUSY; - } else { - /* Logical link is up or moving to BR/EDR, - * proceed with move - */ - chan->move_state = L2CAP_MOVE_WAIT_CONFIRM_RSP; - l2cap_send_move_chan_cfm(chan, L2CAP_MC_CONFIRMED); - } - break; - case L2CAP_MOVE_WAIT_RSP: - /* Moving to AMP */ - if (result == L2CAP_MR_SUCCESS) { - /* Remote is ready, send confirm immediately - * after logical link is ready - */ - chan->move_state = L2CAP_MOVE_WAIT_LOGICAL_CFM; - } else { - /* Both logical link and move success - * are required to confirm - */ - chan->move_state = L2CAP_MOVE_WAIT_LOGICAL_COMP; - } - - /* Placeholder - get hci_chan for logical link */ - if (!hchan) { - /* Logical link not available */ - l2cap_send_move_chan_cfm(chan, L2CAP_MC_UNCONFIRMED); - break; - } - - /* If the logical link is not yet connected, do not - * send confirmation. - */ - if (hchan->state != BT_CONNECTED) - break; - - /* Logical link is already ready to go */ - - chan->hs_hcon = hchan->conn; - chan->hs_hcon->l2cap_data = chan->conn; - - if (result == L2CAP_MR_SUCCESS) { - /* Can confirm now */ - l2cap_send_move_chan_cfm(chan, L2CAP_MC_CONFIRMED); - } else { - /* Now only need move success - * to confirm - */ - chan->move_state = L2CAP_MOVE_WAIT_RSP_SUCCESS; - } - - l2cap_logical_cfm(chan, hchan, L2CAP_MR_SUCCESS); - break; - default: - /* Any other amp move state means the move failed. 
*/ - chan->move_id = chan->local_amp_id; - l2cap_move_done(chan); - l2cap_send_move_chan_cfm(chan, L2CAP_MC_UNCONFIRMED); - } - - l2cap_chan_unlock(chan); - l2cap_chan_put(chan); -} - -static void l2cap_move_fail(struct l2cap_conn *conn, u8 ident, u16 icid, - u16 result) -{ - struct l2cap_chan *chan; - - chan = l2cap_get_chan_by_ident(conn, ident); - if (!chan) { - /* Could not locate channel, icid is best guess */ - l2cap_send_move_chan_cfm_icid(conn, icid); - return; - } - - __clear_chan_timer(chan); - - if (chan->move_role == L2CAP_MOVE_ROLE_INITIATOR) { - if (result == L2CAP_MR_COLLISION) { - chan->move_role = L2CAP_MOVE_ROLE_RESPONDER; - } else { - /* Cleanup - cancel move */ - chan->move_id = chan->local_amp_id; - l2cap_move_done(chan); - } - } - - l2cap_send_move_chan_cfm(chan, L2CAP_MC_UNCONFIRMED); - - l2cap_chan_unlock(chan); - l2cap_chan_put(chan); -} - -static int l2cap_move_channel_rsp(struct l2cap_conn *conn, - struct l2cap_cmd_hdr *cmd, - u16 cmd_len, void *data) -{ - struct l2cap_move_chan_rsp *rsp = data; - u16 icid, result; - - if (cmd_len != sizeof(*rsp)) - return -EPROTO; - - icid = le16_to_cpu(rsp->icid); - result = le16_to_cpu(rsp->result); - - BT_DBG("icid 0x%4.4x, result 0x%4.4x", icid, result); - - if (result == L2CAP_MR_SUCCESS || result == L2CAP_MR_PEND) - l2cap_move_continue(conn, icid, result); - else - l2cap_move_fail(conn, cmd->ident, icid, result); - - return 0; -} - -static int l2cap_move_channel_confirm(struct l2cap_conn *conn, - struct l2cap_cmd_hdr *cmd, - u16 cmd_len, void *data) -{ - struct l2cap_move_chan_cfm *cfm = data; - struct l2cap_chan *chan; - u16 icid, result; - - if (cmd_len != sizeof(*cfm)) - return -EPROTO; - - icid = le16_to_cpu(cfm->icid); - result = le16_to_cpu(cfm->result); - - BT_DBG("icid 0x%4.4x, result 0x%4.4x", icid, result); - - chan = l2cap_get_chan_by_dcid(conn, icid); - if (!chan) { - /* Spec requires a response even if the icid was not found */ - l2cap_send_move_chan_cfm_rsp(conn, cmd->ident, icid); - return 0; - } - - if (chan->move_state == L2CAP_MOVE_WAIT_CONFIRM) { - if (result == L2CAP_MC_CONFIRMED) { - chan->local_amp_id = chan->move_id; - if (chan->local_amp_id == AMP_ID_BREDR) - __release_logical_link(chan); - } else { - chan->move_id = chan->local_amp_id; - } - - l2cap_move_done(chan); - } - - l2cap_send_move_chan_cfm_rsp(conn, cmd->ident, icid); - - l2cap_chan_unlock(chan); - l2cap_chan_put(chan); - - return 0; -} - -static inline int l2cap_move_channel_confirm_rsp(struct l2cap_conn *conn, - struct l2cap_cmd_hdr *cmd, - u16 cmd_len, void *data) -{ - struct l2cap_move_chan_cfm_rsp *rsp = data; - struct l2cap_chan *chan; - u16 icid; - - if (cmd_len != sizeof(*rsp)) - return -EPROTO; - - icid = le16_to_cpu(rsp->icid); - - BT_DBG("icid 0x%4.4x", icid); - - chan = l2cap_get_chan_by_scid(conn, icid); - if (!chan) - return 0; - - __clear_chan_timer(chan); - - if (chan->move_state == L2CAP_MOVE_WAIT_CONFIRM_RSP) { - chan->local_amp_id = chan->move_id; - - if (chan->local_amp_id == AMP_ID_BREDR && chan->hs_hchan) - __release_logical_link(chan); - - l2cap_move_done(chan); - } - - l2cap_chan_unlock(chan); - l2cap_chan_put(chan); - - return 0; -} - static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data) @@ -5745,7 +4761,6 @@ static inline int l2cap_bredr_sig_cmd(struct l2cap_conn *conn, break; case L2CAP_CONN_RSP: - case L2CAP_CREATE_CHAN_RSP: l2cap_connect_create_rsp(conn, cmd, cmd_len, data); break; @@ -5780,26 +4795,6 @@ static inline int 
l2cap_bredr_sig_cmd(struct l2cap_conn *conn, l2cap_information_rsp(conn, cmd, cmd_len, data); break; - case L2CAP_CREATE_CHAN_REQ: - err = l2cap_create_channel_req(conn, cmd, cmd_len, data); - break; - - case L2CAP_MOVE_CHAN_REQ: - err = l2cap_move_channel_req(conn, cmd, cmd_len, data); - break; - - case L2CAP_MOVE_CHAN_RSP: - l2cap_move_channel_rsp(conn, cmd, cmd_len, data); - break; - - case L2CAP_MOVE_CHAN_CFM: - err = l2cap_move_channel_confirm(conn, cmd, cmd_len, data); - break; - - case L2CAP_MOVE_CHAN_CFM_RSP: - l2cap_move_channel_confirm_rsp(conn, cmd, cmd_len, data); - break; - default: BT_ERR("Unknown BR/EDR signaling command 0x%2.2x", cmd->code); err = -EINVAL; @@ -7051,8 +6046,8 @@ static int l2cap_rx_state_recv(struct l2cap_chan *chan, if (control->final) { clear_bit(CONN_REMOTE_BUSY, &chan->conn_state); - if (!test_and_clear_bit(CONN_REJ_ACT, &chan->conn_state) && - !__chan_is_moving(chan)) { + if (!test_and_clear_bit(CONN_REJ_ACT, + &chan->conn_state)) { control->final = 0; l2cap_retransmit_all(chan, control); } @@ -7245,11 +6240,7 @@ static int l2cap_finish_move(struct l2cap_chan *chan) BT_DBG("chan %p", chan); chan->rx_state = L2CAP_RX_STATE_RECV; - - if (chan->hs_hcon) - chan->conn->mtu = chan->hs_hcon->hdev->block_mtu; - else - chan->conn->mtu = chan->conn->hcon->hdev->acl_mtu; + chan->conn->mtu = chan->conn->hcon->hdev->acl_mtu; return l2cap_resegment(chan); } @@ -7316,11 +6307,7 @@ static int l2cap_rx_state_wait_f(struct l2cap_chan *chan, */ chan->next_tx_seq = control->reqseq; chan->unacked_frames = 0; - - if (chan->hs_hcon) - chan->conn->mtu = chan->hs_hcon->hdev->block_mtu; - else - chan->conn->mtu = chan->conn->hcon->hdev->acl_mtu; + chan->conn->mtu = chan->conn->hcon->hdev->acl_mtu; err = l2cap_resegment(chan); @@ -7672,21 +6659,10 @@ static void l2cap_data_channel(struct l2cap_conn *conn, u16 cid, chan = l2cap_get_chan_by_scid(conn, cid); if (!chan) { - if (cid == L2CAP_CID_A2MP) { - chan = a2mp_channel_create(conn, skb); - if (!chan) { - kfree_skb(skb); - return; - } - - l2cap_chan_hold(chan); - l2cap_chan_lock(chan); - } else { - BT_DBG("unknown cid 0x%4.4x", cid); - /* Drop packet and return */ - kfree_skb(skb); - return; - } + BT_DBG("unknown cid 0x%4.4x", cid); + /* Drop packet and return */ + kfree_skb(skb); + return; } BT_DBG("chan %p, len %d", chan, skb->len); @@ -7887,10 +6863,6 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) conn->local_fixed_chan = L2CAP_FC_SIG_BREDR | L2CAP_FC_CONNLESS; - if (hcon->type == ACL_LINK && - hci_dev_test_flag(hcon->hdev, HCI_HS_ENABLED)) - conn->local_fixed_chan |= L2CAP_FC_A2MP; - if (hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED) && (bredr_sc_enabled(hcon->hdev) || hci_dev_test_flag(hcon->hdev, HCI_FORCE_BREDR_SMP))) @@ -8355,11 +7327,6 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) BT_DBG("chan %p scid 0x%4.4x state %s", chan, chan->scid, state_to_string(chan->state)); - if (chan->scid == L2CAP_CID_A2MP) { - l2cap_chan_unlock(chan); - continue; - } - if (!status && encrypt) chan->sec_level = hcon->sec_level; diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index e50d3d102078ec..ee7a41d6994fc2 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1027,23 +1027,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (opt > BT_CHANNEL_POLICY_AMP_PREFERRED) { - err = -EINVAL; - break; - } - - if (chan->mode != L2CAP_MODE_ERTM && - chan->mode != L2CAP_MODE_STREAMING) { - err = -EOPNOTSUPP; - 
break; - } - - chan->chan_policy = (u8) opt; - - if (sk->sk_state == BT_CONNECTED && - chan->move_role == L2CAP_MOVE_ROLE_NONE) - l2cap_move_start(chan); - + err = -EOPNOTSUPP; break; case BT_SNDMTU: diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 8c4493255f92ab..7713e2cdf9e12b 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -835,8 +835,6 @@ static u32 get_supported_settings(struct hci_dev *hdev) if (lmp_ssp_capable(hdev)) { settings |= MGMT_SETTING_SSP; - if (IS_ENABLED(CONFIG_BT_HS)) - settings |= MGMT_SETTING_HS; } if (lmp_sc_capable(hdev)) @@ -901,9 +899,6 @@ static u32 get_current_settings(struct hci_dev *hdev) if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) settings |= MGMT_SETTING_SSP; - if (hci_dev_test_flag(hdev, HCI_HS_ENABLED)) - settings |= MGMT_SETTING_HS; - if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) settings |= MGMT_SETTING_ADVERTISING; @@ -1938,7 +1933,6 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) if (enable && hci_dev_test_and_clear_flag(hdev, HCI_SSP_ENABLED)) { - hci_dev_clear_flag(hdev, HCI_HS_ENABLED); new_settings(hdev, NULL); } @@ -1951,12 +1945,6 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) changed = !hci_dev_test_and_set_flag(hdev, HCI_SSP_ENABLED); } else { changed = hci_dev_test_and_clear_flag(hdev, HCI_SSP_ENABLED); - - if (!changed) - changed = hci_dev_test_and_clear_flag(hdev, - HCI_HS_ENABLED); - else - hci_dev_clear_flag(hdev, HCI_HS_ENABLED); } mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, settings_rsp, &match); @@ -2020,11 +2008,6 @@ static int set_ssp(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) } else { changed = hci_dev_test_and_clear_flag(hdev, HCI_SSP_ENABLED); - if (!changed) - changed = hci_dev_test_and_clear_flag(hdev, - HCI_HS_ENABLED); - else - hci_dev_clear_flag(hdev, HCI_HS_ENABLED); } err = send_settings_rsp(sk, MGMT_OP_SET_SSP, hdev); @@ -2070,63 +2053,10 @@ static int set_ssp(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) static int set_hs(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { - struct mgmt_mode *cp = data; - bool changed; - u8 status; - int err; - bt_dev_dbg(hdev, "sock %p", sk); - if (!IS_ENABLED(CONFIG_BT_HS)) - return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS, + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS, MGMT_STATUS_NOT_SUPPORTED); - - status = mgmt_bredr_support(hdev); - if (status) - return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS, status); - - if (!lmp_ssp_capable(hdev)) - return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS, - MGMT_STATUS_NOT_SUPPORTED); - - if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) - return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS, - MGMT_STATUS_REJECTED); - - if (cp->val != 0x00 && cp->val != 0x01) - return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS, - MGMT_STATUS_INVALID_PARAMS); - - hci_dev_lock(hdev); - - if (pending_find(MGMT_OP_SET_SSP, hdev)) { - err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS, - MGMT_STATUS_BUSY); - goto unlock; - } - - if (cp->val) { - changed = !hci_dev_test_and_set_flag(hdev, HCI_HS_ENABLED); - } else { - if (hdev_is_powered(hdev)) { - err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS, - MGMT_STATUS_REJECTED); - goto unlock; - } - - changed = hci_dev_test_and_clear_flag(hdev, HCI_HS_ENABLED); - } - - err = send_settings_rsp(sk, MGMT_OP_SET_HS, hdev); - if (err < 0) - goto unlock; - - if (changed) - err = new_settings(hdev, sk); - -unlock: - hci_dev_unlock(hdev); - return err; } static void set_le_complete(struct 
hci_dev *hdev, void *data, int err) @@ -6774,7 +6704,6 @@ static int set_bredr(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) hci_dev_clear_flag(hdev, HCI_SSP_ENABLED); hci_dev_clear_flag(hdev, HCI_LINK_SECURITY); hci_dev_clear_flag(hdev, HCI_FAST_CONNECTABLE); - hci_dev_clear_flag(hdev, HCI_HS_ENABLED); } hci_dev_change_flag(hdev, HCI_BREDR_ENABLED);

From 874870f318a4a8afcd4b4c180329079add03c039 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz
Date: Wed, 31 Jan 2024 11:24:19 -0500
Subject: [PATCH 0153/1406] Bluetooth: hci_event: Fix not indicating new connection for BIG Sync

BIG Sync (aka Broadcast Sink) requires the host to indicate that the device is connected while a data path is active; otherwise, userspace could attempt to free resources allocated to the device object while scanning.

Fixes: 1d11d70d1f6b ("Bluetooth: ISO: Pass BIG encryption info through QoS")
Signed-off-by: Luiz Augusto von Dentz
---
 net/bluetooth/hci_event.c | 21 ++++++++++++++-------
 net/bluetooth/mgmt.c | 4 ++++
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 11b55d1f977272..271c00792801c4 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -2524,9 +2524,7 @@ static void hci_check_pending_name(struct hci_dev *hdev, struct hci_conn *conn, * Only those in BT_CONFIG or BT_CONNECTED states can be * considered connected. */ - if (conn && - (conn->state == BT_CONFIG || conn->state == BT_CONNECTED) && - !test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) + if (conn && (conn->state == BT_CONFIG || conn->state == BT_CONNECTED)) mgmt_device_connected(hdev, conn, name, name_len); if (discov->state == DISCOVERY_STOPPED)
@@ -3758,8 +3756,9 @@ static void hci_remote_features_evt(struct hci_dev *hdev, void *data, bacpy(&cp.bdaddr, &conn->dst); cp.pscan_rep_mode = 0x02; hci_send_cmd(hdev, HCI_OP_REMOTE_NAME_REQ, sizeof(cp), &cp); - } else if (!test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) + } else { mgmt_device_connected(hdev, conn, NULL, 0); + } if (!hci_outgoing_auth_needed(hdev, conn)) { conn->state = BT_CONNECTED;
@@ -3932,6 +3931,11 @@ static u8 hci_cc_le_setup_iso_path(struct hci_dev *hdev, void *data, * last.
*/ hci_connect_cfm(conn, rp->status); + + /* Notify device connected in case it is a BIG Sync */ + if (!rp->status && test_bit(HCI_CONN_BIG_SYNC, &conn->flags)) + mgmt_device_connected(hdev, conn, NULL, 0); + break; }
@@ -5006,8 +5010,9 @@ static void hci_remote_ext_features_evt(struct hci_dev *hdev, void *data, bacpy(&cp.bdaddr, &conn->dst); cp.pscan_rep_mode = 0x02; hci_send_cmd(hdev, HCI_OP_REMOTE_NAME_REQ, sizeof(cp), &cp); - } else if (!test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) + } else { mgmt_device_connected(hdev, conn, NULL, 0); + } if (!hci_outgoing_auth_needed(hdev, conn)) { conn->state = BT_CONNECTED;
@@ -5980,8 +5985,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, goto unlock; } - if (!test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) - mgmt_device_connected(hdev, conn, NULL, 0); + mgmt_device_connected(hdev, conn, NULL, 0); conn->sec_level = BT_SECURITY_LOW; conn->state = BT_CONFIG;
@@ -7210,6 +7214,9 @@ static void hci_le_big_info_adv_report_evt(struct hci_dev *hdev, void *data, /* Notify iso layer */ hci_connect_cfm(pa_sync, 0x00); + /* Notify MGMT layer */ + mgmt_device_connected(hdev, pa_sync, NULL, 0); + unlock: hci_dev_unlock(hdev); }

diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 7713e2cdf9e12b..064a67157d438b 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -3126,6 +3126,7 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data, static u8 link_to_bdaddr(u8 link_type, u8 addr_type) { switch (link_type) { + case ISO_LINK: case LE_LINK: switch (addr_type) { case ADDR_LE_DEV_PUBLIC:
@@ -9618,6 +9619,9 @@ void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn, u16 eir_len = 0; u32 flags = 0; + if (test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) + return; + /* allocate buff for LE or BR/EDR adv */ if (conn->le_adv_data_len > 0) skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_CONNECTED,

From 3103af4848b8968f8e128ea590138ec65fe1662a Mon Sep 17 00:00:00 2001
From: Janaki Ramaiah Thota
Date: Wed, 24 Jan 2024 20:00:42 +0530
Subject: [PATCH 0154/1406] Bluetooth: hci_qca: Set BDA quirk bit if fwnode exists in DT

The BT adapter goes into the UNCONFIGURED state during BT turn-on when the devicetree has no local-bd-address node. Bluetooth will not work out of the box on such devices. To avoid this problem, add a check that sets HCI_QUIRK_USE_BDADDR_PROPERTY based on the presence of the local-bd-address node.

When this quirk is not set, the public Bluetooth address read by the host from the controller through the HCI Read BD Address command is considered valid.

Fixes: e668eb1e1578 ("Bluetooth: hci_core: Don't stop BT if the BD address missing in dts")
Signed-off-by: Janaki Ramaiah Thota
Signed-off-by: Luiz Augusto von Dentz
---
 drivers/bluetooth/hci_qca.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c
index 94b8c406f0c0ed..06193546ebb65d 100644
--- a/drivers/bluetooth/hci_qca.c
+++ b/drivers/bluetooth/hci_qca.c
@@ -7,6 +7,7 @@ * * Copyright (C) 2007 Texas Instruments, Inc. * Copyright (c) 2010, 2012, 2018 The Linux Foundation. All rights reserved. + * Copyright (c) 2023 Qualcomm Innovation Center, Inc. All rights reserved. * * Acknowledgements: * This file is based on hci_ll.c, which was...
@@ -1904,7 +1905,17 @@ static int qca_setup(struct hci_uart *hu) case QCA_WCN6750: case QCA_WCN6855: case QCA_WCN7850: - set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); + + /* Set BDA quirk bit for reading BDA value from fwnode property + * only if that property exist in DT. + */ + if (fwnode_property_present(dev_fwnode(hdev->dev.parent), "local-bd-address")) { + set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); + bt_dev_info(hdev, "setting quirk bit to read BDA from fwnode later"); + } else { + bt_dev_dbg(hdev, "local-bd-address` is not present in the devicetree so not setting quirk bit for BDA"); + } + hci_set_aosp_capable(hdev); ret = qca_read_soc_version(hdev, &ver, soc_type); From a93e9d51834e2cdfe357fcf875e895b3d2605f10 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 26 Jan 2024 17:00:24 +0800 Subject: [PATCH 0155/1406] Bluetooth: qca: Fix triggering coredump implementation hci_coredump_qca() uses __hci_cmd_sync() to send a vendor-specific command to trigger firmware coredump, but the command does not have any event as its sync response, so it is not suitable to use __hci_cmd_sync(), fixed by using __hci_cmd_send(). Fixes: 06d3fdfcdf5c ("Bluetooth: hci_qca: Add qcom devcoredump support") Signed-off-by: Zijun Hu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/hci_qca.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 06193546ebb65d..edd2a81b4d5ed7 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -1807,13 +1807,12 @@ static int qca_power_on(struct hci_dev *hdev) static void hci_coredump_qca(struct hci_dev *hdev) { + int err; static const u8 param[] = { 0x26 }; - struct sk_buff *skb; - skb = __hci_cmd_sync(hdev, 0xfc0c, 1, param, HCI_CMD_TIMEOUT); - if (IS_ERR(skb)) - bt_dev_err(hdev, "%s: trigger crash failed (%ld)", __func__, PTR_ERR(skb)); - kfree_skb(skb); + err = __hci_cmd_send(hdev, 0xfc0c, 1, param); + if (err < 0) + bt_dev_err(hdev, "%s: trigger crash failed (%d)", __func__, err); } static int qca_get_data_path_id(struct hci_dev *hdev, __u8 *data_path_id) From 8680970410625875c34d6b97b6fd89d7d62a74ec Mon Sep 17 00:00:00 2001 From: Marco Pagani Date: Thu, 11 Jan 2024 17:02:42 +0100 Subject: [PATCH 0156/1406] fpga: add an owner and use it to take the low-level module's refcount Add a module owner field to the fpga_manager struct to take the low-level control module refcount instead of assuming that the parent device has a driver and using its owner pointer. The owner is now passed as an additional argument at registration time. To this end, the functions for registration have been modified to take an additional owner parameter and renamed to avoid conflicts. The old function names are now used for helper macros that automatically set the module that registers the fpga manager as the owner. This ensures compatibility with existing low-level control modules and reduces the chances of registering a manager without setting the owner. To detect when the owner module pointer becomes stale, set the mops pointer to null during fpga_mgr_unregister() and test it before taking the module's refcount. Use a mutex to protect against a crash that can happen if __fpga_mgr_get() gets suspended between testing the mops pointer and taking the refcount while the low-level module is being unloaded. Update the documentation to keep it consistent with the new interface for registering an fpga manager. 
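For illustration, a minimal sketch of a low-level driver probe under the new interface; the example_* names are hypothetical, and the helper macro keeps the old parameter syntax while supplying THIS_MODULE as the owner:

    static const struct fpga_manager_ops example_fpga_ops = {
            .write_init = example_write_init,
            .write = example_write,
            .write_complete = example_write_complete,
    };

    static int example_fpga_probe(struct platform_device *pdev)
    {
            struct fpga_manager *mgr;

            /* devm_fpga_mgr_register() now expands to
             * __devm_fpga_mgr_register(parent, name, mops, priv,
             * THIS_MODULE), so this module is recorded as the mops owner.
             */
            mgr = devm_fpga_mgr_register(&pdev->dev, "example FPGA manager",
                                         &example_fpga_ops, NULL);
            return PTR_ERR_OR_ZERO(mgr);
    }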
Other changes: opportunistically move put_device() from __fpga_mgr_get() to fpga_mgr_get() and of_fpga_mgr_get() to improve code clarity since the device refcount is taken in these functions. Fixes: 654ba4cc0f3e ("fpga manager: ensure lifetime with of_fpga_mgr_get") Suggested-by: Xu Yilun Signed-off-by: Marco Pagani Acked-by: Xu Yilun Link: https://lore.kernel.org/r/20240111160242.149265-2-marpagan@redhat.com Signed-off-by: Xu Yilun --- Documentation/driver-api/fpga/fpga-mgr.rst | 34 ++++---- drivers/fpga/fpga-mgr.c | 93 ++++++++++++++-------- include/linux/fpga/fpga-mgr.h | 29 +++++-- 3 files changed, 103 insertions(+), 53 deletions(-) diff --git a/Documentation/driver-api/fpga/fpga-mgr.rst b/Documentation/driver-api/fpga/fpga-mgr.rst index 49c0a951265320..8d2b79f696c1fb 100644 --- a/Documentation/driver-api/fpga/fpga-mgr.rst +++ b/Documentation/driver-api/fpga/fpga-mgr.rst @@ -24,7 +24,8 @@ How to support a new FPGA device -------------------------------- To add another FPGA manager, write a driver that implements a set of ops. The -probe function calls fpga_mgr_register() or fpga_mgr_register_full(), such as:: +probe function calls ``fpga_mgr_register()`` or ``fpga_mgr_register_full()``, +such as:: static const struct fpga_manager_ops socfpga_fpga_ops = { .write_init = socfpga_fpga_ops_configure_init, @@ -69,10 +70,11 @@ probe function calls fpga_mgr_register() or fpga_mgr_register_full(), such as:: } Alternatively, the probe function could call one of the resource managed -register functions, devm_fpga_mgr_register() or devm_fpga_mgr_register_full(). -When these functions are used, the parameter syntax is the same, but the call -to fpga_mgr_unregister() should be removed. In the above example, the -socfpga_fpga_remove() function would not be required. +register functions, ``devm_fpga_mgr_register()`` or +``devm_fpga_mgr_register_full()``. When these functions are used, the +parameter syntax is the same, but the call to ``fpga_mgr_unregister()`` should be +removed. In the above example, the ``socfpga_fpga_remove()`` function would not be +required. The ops will implement whatever device specific register writes are needed to do the programming sequence for this particular FPGA. These ops return 0 for @@ -125,15 +127,19 @@ API for implementing a new FPGA Manager driver * struct fpga_manager - the FPGA manager struct * struct fpga_manager_ops - Low level FPGA manager driver ops * struct fpga_manager_info - Parameter structure for fpga_mgr_register_full() -* fpga_mgr_register_full() - Create and register an FPGA manager using the +* __fpga_mgr_register_full() - Create and register an FPGA manager using the fpga_mgr_info structure to provide the full flexibility of options -* fpga_mgr_register() - Create and register an FPGA manager using standard +* __fpga_mgr_register() - Create and register an FPGA manager using standard arguments -* devm_fpga_mgr_register_full() - Resource managed version of - fpga_mgr_register_full() -* devm_fpga_mgr_register() - Resource managed version of fpga_mgr_register() +* __devm_fpga_mgr_register_full() - Resource managed version of + __fpga_mgr_register_full() +* __devm_fpga_mgr_register() - Resource managed version of __fpga_mgr_register() * fpga_mgr_unregister() - Unregister an FPGA manager +Helper macros ``fpga_mgr_register_full()``, ``fpga_mgr_register()``, +``devm_fpga_mgr_register_full()``, and ``devm_fpga_mgr_register()`` are available +to ease the registration. + .. 
kernel-doc:: include/linux/fpga/fpga-mgr.h :functions: fpga_mgr_states @@ -147,16 +153,16 @@ API for implementing a new FPGA Manager driver :functions: fpga_manager_info .. kernel-doc:: drivers/fpga/fpga-mgr.c - :functions: fpga_mgr_register_full + :functions: __fpga_mgr_register_full .. kernel-doc:: drivers/fpga/fpga-mgr.c - :functions: fpga_mgr_register + :functions: __fpga_mgr_register .. kernel-doc:: drivers/fpga/fpga-mgr.c - :functions: devm_fpga_mgr_register_full + :functions: __devm_fpga_mgr_register_full .. kernel-doc:: drivers/fpga/fpga-mgr.c - :functions: devm_fpga_mgr_register + :functions: __devm_fpga_mgr_register .. kernel-doc:: drivers/fpga/fpga-mgr.c :functions: fpga_mgr_unregister diff --git a/drivers/fpga/fpga-mgr.c b/drivers/fpga/fpga-mgr.c index 06651389c59262..d7bfbdfdf2fc89 100644 --- a/drivers/fpga/fpga-mgr.c +++ b/drivers/fpga/fpga-mgr.c @@ -664,20 +664,20 @@ static struct attribute *fpga_mgr_attrs[] = { }; ATTRIBUTE_GROUPS(fpga_mgr); -static struct fpga_manager *__fpga_mgr_get(struct device *dev) +static struct fpga_manager *__fpga_mgr_get(struct device *mgr_dev) { struct fpga_manager *mgr; - mgr = to_fpga_manager(dev); + mgr = to_fpga_manager(mgr_dev); - if (!try_module_get(dev->parent->driver->owner)) - goto err_dev; + mutex_lock(&mgr->mops_mutex); - return mgr; + if (!mgr->mops || !try_module_get(mgr->mops_owner)) + mgr = ERR_PTR(-ENODEV); -err_dev: - put_device(dev); - return ERR_PTR(-ENODEV); + mutex_unlock(&mgr->mops_mutex); + + return mgr; } static int fpga_mgr_dev_match(struct device *dev, const void *data) @@ -693,12 +693,18 @@ static int fpga_mgr_dev_match(struct device *dev, const void *data) */ struct fpga_manager *fpga_mgr_get(struct device *dev) { - struct device *mgr_dev = class_find_device(&fpga_mgr_class, NULL, dev, - fpga_mgr_dev_match); + struct fpga_manager *mgr; + struct device *mgr_dev; + + mgr_dev = class_find_device(&fpga_mgr_class, NULL, dev, fpga_mgr_dev_match); if (!mgr_dev) return ERR_PTR(-ENODEV); - return __fpga_mgr_get(mgr_dev); + mgr = __fpga_mgr_get(mgr_dev); + if (IS_ERR(mgr)) + put_device(mgr_dev); + + return mgr; } EXPORT_SYMBOL_GPL(fpga_mgr_get); @@ -711,13 +717,18 @@ EXPORT_SYMBOL_GPL(fpga_mgr_get); */ struct fpga_manager *of_fpga_mgr_get(struct device_node *node) { - struct device *dev; + struct fpga_manager *mgr; + struct device *mgr_dev; - dev = class_find_device_by_of_node(&fpga_mgr_class, node); - if (!dev) + mgr_dev = class_find_device_by_of_node(&fpga_mgr_class, node); + if (!mgr_dev) return ERR_PTR(-ENODEV); - return __fpga_mgr_get(dev); + mgr = __fpga_mgr_get(mgr_dev); + if (IS_ERR(mgr)) + put_device(mgr_dev); + + return mgr; } EXPORT_SYMBOL_GPL(of_fpga_mgr_get); @@ -727,7 +738,7 @@ EXPORT_SYMBOL_GPL(of_fpga_mgr_get); */ void fpga_mgr_put(struct fpga_manager *mgr) { - module_put(mgr->dev.parent->driver->owner); + module_put(mgr->mops_owner); put_device(&mgr->dev); } EXPORT_SYMBOL_GPL(fpga_mgr_put); @@ -766,9 +777,10 @@ void fpga_mgr_unlock(struct fpga_manager *mgr) EXPORT_SYMBOL_GPL(fpga_mgr_unlock); /** - * fpga_mgr_register_full - create and register an FPGA Manager device + * __fpga_mgr_register_full - create and register an FPGA Manager device * @parent: fpga manager device from pdev * @info: parameters for fpga manager + * @owner: owner module containing the ops * * The caller of this function is responsible for calling fpga_mgr_unregister(). * Using devm_fpga_mgr_register_full() instead is recommended. 
@@ -776,7 +788,8 @@ EXPORT_SYMBOL_GPL(fpga_mgr_unlock); * Return: pointer to struct fpga_manager pointer or ERR_PTR() */ struct fpga_manager * -fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *info) +__fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *info, + struct module *owner) { const struct fpga_manager_ops *mops = info->mops; struct fpga_manager *mgr; @@ -803,6 +816,9 @@ fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *in } mutex_init(&mgr->ref_mutex); + mutex_init(&mgr->mops_mutex); + + mgr->mops_owner = owner; mgr->name = info->name; mgr->mops = info->mops; @@ -841,14 +857,15 @@ fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *in return ERR_PTR(ret); } -EXPORT_SYMBOL_GPL(fpga_mgr_register_full); +EXPORT_SYMBOL_GPL(__fpga_mgr_register_full); /** - * fpga_mgr_register - create and register an FPGA Manager device + * __fpga_mgr_register - create and register an FPGA Manager device * @parent: fpga manager device from pdev * @name: fpga manager name * @mops: pointer to structure of fpga manager ops * @priv: fpga manager private data + * @owner: owner module containing the ops * * The caller of this function is responsible for calling fpga_mgr_unregister(). * Using devm_fpga_mgr_register() instead is recommended. This simple @@ -859,8 +876,8 @@ EXPORT_SYMBOL_GPL(fpga_mgr_register_full); * Return: pointer to struct fpga_manager pointer or ERR_PTR() */ struct fpga_manager * -fpga_mgr_register(struct device *parent, const char *name, - const struct fpga_manager_ops *mops, void *priv) +__fpga_mgr_register(struct device *parent, const char *name, + const struct fpga_manager_ops *mops, void *priv, struct module *owner) { struct fpga_manager_info info = { 0 }; @@ -868,9 +885,9 @@ fpga_mgr_register(struct device *parent, const char *name, info.mops = mops; info.priv = priv; - return fpga_mgr_register_full(parent, &info); + return __fpga_mgr_register_full(parent, &info, owner); } -EXPORT_SYMBOL_GPL(fpga_mgr_register); +EXPORT_SYMBOL_GPL(__fpga_mgr_register); /** * fpga_mgr_unregister - unregister an FPGA manager @@ -888,6 +905,12 @@ void fpga_mgr_unregister(struct fpga_manager *mgr) */ fpga_mgr_fpga_remove(mgr); + mutex_lock(&mgr->mops_mutex); + + mgr->mops = NULL; + + mutex_unlock(&mgr->mops_mutex); + device_unregister(&mgr->dev); } EXPORT_SYMBOL_GPL(fpga_mgr_unregister); @@ -900,9 +923,10 @@ static void devm_fpga_mgr_unregister(struct device *dev, void *res) } /** - * devm_fpga_mgr_register_full - resource managed variant of fpga_mgr_register() + * __devm_fpga_mgr_register_full - resource managed variant of fpga_mgr_register() * @parent: fpga manager device from pdev * @info: parameters for fpga manager + * @owner: owner module containing the ops * * Return: fpga manager pointer on success, negative error code otherwise. * @@ -910,7 +934,8 @@ static void devm_fpga_mgr_unregister(struct device *dev, void *res) * function will be called automatically when the managing device is detached. 
*/ struct fpga_manager * -devm_fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *info) +__devm_fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *info, + struct module *owner) { struct fpga_mgr_devres *dr; struct fpga_manager *mgr; @@ -919,7 +944,7 @@ devm_fpga_mgr_register_full(struct device *parent, const struct fpga_manager_inf if (!dr) return ERR_PTR(-ENOMEM); - mgr = fpga_mgr_register_full(parent, info); + mgr = __fpga_mgr_register_full(parent, info, owner); if (IS_ERR(mgr)) { devres_free(dr); return mgr; @@ -930,14 +955,15 @@ devm_fpga_mgr_register_full(struct device *parent, const struct fpga_manager_inf return mgr; } -EXPORT_SYMBOL_GPL(devm_fpga_mgr_register_full); +EXPORT_SYMBOL_GPL(__devm_fpga_mgr_register_full); /** - * devm_fpga_mgr_register - resource managed variant of fpga_mgr_register() + * __devm_fpga_mgr_register - resource managed variant of fpga_mgr_register() * @parent: fpga manager device from pdev * @name: fpga manager name * @mops: pointer to structure of fpga manager ops * @priv: fpga manager private data + * @owner: owner module containing the ops * * Return: fpga manager pointer on success, negative error code otherwise. * @@ -946,8 +972,9 @@ EXPORT_SYMBOL_GPL(devm_fpga_mgr_register_full); * device is detached. */ struct fpga_manager * -devm_fpga_mgr_register(struct device *parent, const char *name, - const struct fpga_manager_ops *mops, void *priv) +__devm_fpga_mgr_register(struct device *parent, const char *name, + const struct fpga_manager_ops *mops, void *priv, + struct module *owner) { struct fpga_manager_info info = { 0 }; @@ -955,9 +982,9 @@ devm_fpga_mgr_register(struct device *parent, const char *name, info.mops = mops; info.priv = priv; - return devm_fpga_mgr_register_full(parent, &info); + return __devm_fpga_mgr_register_full(parent, &info, owner); } -EXPORT_SYMBOL_GPL(devm_fpga_mgr_register); +EXPORT_SYMBOL_GPL(__devm_fpga_mgr_register); static void fpga_mgr_dev_release(struct device *dev) { diff --git a/include/linux/fpga/fpga-mgr.h b/include/linux/fpga/fpga-mgr.h index 54f63459efd6e2..844dded7a5304c 100644 --- a/include/linux/fpga/fpga-mgr.h +++ b/include/linux/fpga/fpga-mgr.h @@ -201,6 +201,8 @@ struct fpga_manager_ops { * @state: state of fpga manager * @compat_id: FPGA manager id for compatibility check. 
* @mops: pointer to struct of fpga manager ops + * @mops_mutex: protects mops from low-level module removal + * @mops_owner: module containing the mops * @priv: low level driver private date */ struct fpga_manager { @@ -210,6 +212,8 @@ struct fpga_manager { enum fpga_mgr_states state; struct fpga_compat_id *compat_id; const struct fpga_manager_ops *mops; + struct mutex mops_mutex; + struct module *mops_owner; void *priv; }; @@ -230,18 +234,31 @@ struct fpga_manager *fpga_mgr_get(struct device *dev); void fpga_mgr_put(struct fpga_manager *mgr); +#define fpga_mgr_register_full(parent, info) \ + __fpga_mgr_register_full(parent, info, THIS_MODULE) struct fpga_manager * -fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *info); +__fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *info, + struct module *owner); +#define fpga_mgr_register(parent, name, mops, priv) \ + __fpga_mgr_register(parent, name, mops, priv, THIS_MODULE) struct fpga_manager * -fpga_mgr_register(struct device *parent, const char *name, - const struct fpga_manager_ops *mops, void *priv); +__fpga_mgr_register(struct device *parent, const char *name, + const struct fpga_manager_ops *mops, void *priv, struct module *owner); + void fpga_mgr_unregister(struct fpga_manager *mgr); +#define devm_fpga_mgr_register_full(parent, info) \ + __devm_fpga_mgr_register_full(parent, info, THIS_MODULE) struct fpga_manager * -devm_fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *info); +__devm_fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *info, + struct module *owner); + +#define devm_fpga_mgr_register(parent, name, mops, priv) \ + __devm_fpga_mgr_register(parent, name, mops, priv, THIS_MODULE) struct fpga_manager * -devm_fpga_mgr_register(struct device *parent, const char *name, - const struct fpga_manager_ops *mops, void *priv); +__devm_fpga_mgr_register(struct device *parent, const char *name, + const struct fpga_manager_ops *mops, void *priv, + struct module *owner); #endif /*_LINUX_FPGA_MGR_H */ From 2a42e144dd0b62eaf79148394ab057145afbc3c5 Mon Sep 17 00:00:00 2001 From: JonasZhou Date: Fri, 2 Feb 2024 16:33:04 +0800 Subject: [PATCH 0157/1406] fs/address_space: move i_mmap_rwsem to mitigate a false sharing with i_mmap. In the struct address_space, there is a 32-byte gap between i_mmap and i_mmap_rwsem. Due to the alignment of struct address_space variables to 8 bytes, in certain situations, i_mmap and i_mmap_rwsem may end up in the same CACHE line. While running Unixbench/execl, we observe high false sharing issues when accessing i_mmap against i_mmap_rwsem. We move i_mmap_rwsem after i_private_list, ensuring a 64-byte gap between i_mmap and i_mmap_rwsem. For Intel Silver machines (2 sockets) using kernel v6.8 rc-2, the score of Unixbench/execl improves by ~3.94%, and the score of Unixbench/shell improves by ~3.26%. 
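As a quick sanity check of the new layout (a sketch, not part of this patch, assuming the usual 64-byte cache line), the member offsets can be compared at compile time:

    /* e.g. a local debug assertion against include/linux/fs.h */
    static_assert(offsetof(struct address_space, i_mmap) / 64 !=
                  offsetof(struct address_space, i_mmap_rwsem) / 64,
                  "i_mmap and i_mmap_rwsem share a cache line");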
Baseline:
-------------------------------------------------------------
 162 546 748 11374 21 0xffff92e266af90c0
-------------------------------------------------------------
 46.89% 44.65% 0.00% 0.00% 0x0 1 1 0xffffffff86d5fb96 460 258 271 1069 32 [k] __handle_mm_fault [kernel.vmlinux] memory.c:2940 0 1
 4.21% 4.41% 0.00% 0.00% 0x4 1 1 0xffffffff86d0ed54 473 311 288 95 28 [k] filemap_read [kernel.vmlinux] atomic.h:23 0 1
 0.00% 0.00% 0.04% 4.76% 0x8 1 1 0xffffffff86d4bcf1 0 0 0 5 4 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:204 0 1
 6.41% 6.02% 0.00% 0.00% 0x8 1 1 0xffffffff86d4ba85 411 271 339 210 32 [k] vma_interval_tree_insert [kernel.vmlinux] interval_tree.c:23 0 1
 0.00% 0.00% 0.47% 95.24% 0x10 1 1 0xffffffff86d4bd34 0 0 0 74 32 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:339 0 1
 0.37% 0.13% 0.00% 0.00% 0x10 1 1 0xffffffff86d4bb4f 328 212 380 7 5 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:338 0 1
 5.13% 5.08% 0.00% 0.00% 0x10 1 1 0xffffffff86d4bb4b 416 255 357 197 32 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:338 0 1
 1.10% 0.53% 0.00% 0.00% 0x28 1 1 0xffffffff86e06eb8 395 228 351 24 14 [k] do_dentry_open [kernel.vmlinux] open.c:966 0 1
 1.10% 2.14% 57.07% 0.00% 0x38 1 1 0xffffffff878c9225 1364 792 462 7003 32 [k] down_write [kernel.vmlinux] atomic64_64.h:109 0 1
 0.00% 0.00% 0.01% 0.00% 0x38 1 1 0xffffffff878c8e75 0 0 252 3 2 [k] rwsem_down_write_slowpath [kernel.vmlinux] atomic64_64.h:109 0 1
 0.00% 0.13% 0.00% 0.00% 0x38 1 1 0xffffffff878c8e23 0 596 63 2 2 [k] rwsem_down_write_slowpath [kernel.vmlinux] atomic64_64.h:15 0 1
 2.38% 2.94% 6.53% 0.00% 0x38 1 1 0xffffffff878c8ccb 1150 818 570 1197 32 [k] rwsem_down_write_slowpath [kernel.vmlinux] atomic64_64.h:109 0 1
 30.59% 32.22% 0.00% 0.00% 0x38 1 1 0xffffffff878c8cb4 423 251 380 648 32 [k] rwsem_down_write_slowpath [kernel.vmlinux] atomic64_64.h:15 0 1
 1.83% 1.74% 35.88% 0.00% 0x38 1 1 0xffffffff86b4f833 1217 1112 565 4586 32 [k] up_write [kernel.vmlinux] atomic64_64.h:91 0 1

with this change:
-------------------------------------------------------------
 360 12 300 57 35 0xffff982cdae76400
-------------------------------------------------------------
 50.00% 59.67% 0.00% 0.00% 0x0 1 1 0xffffffff8215fb86 352 200 191 558 32 [k] __handle_mm_fault [kernel.vmlinux] memory.c:2940 0 1
 8.33% 5.00% 0.00% 0.00% 0x4 1 1 0xffffffff8210ed44 370 284 263 42 24 [k] filemap_read [kernel.vmlinux] atomic.h:23 0 1
 0.00% 0.00% 5.26% 2.86% 0x8 1 1 0xffffffff8214bce1 0 0 0 4 4 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:204 0 1
 33.33% 14.33% 0.00% 0.00% 0x8 1 1 0xffffffff8214ba75 344 186 219 140 32 [k] vma_interval_tree_insert [kernel.vmlinux] interval_tree.c:23 0 1
 0.00% 0.00% 94.74% 97.14% 0x10 1 1 0xffffffff8214bd24 0 0 0 88 29 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:339 0 1
 8.33% 20.00% 0.00% 0.00% 0x10 1 1 0xffffffff8214bb3b 296 209 226 167 31 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:338 0 1
 0.00% 0.67% 0.00% 0.00% 0x28 1 1 0xffffffff82206f45 0 140 334 4 3 [k] do_dentry_open [kernel.vmlinux] open.c:966 0 1
 0.00% 0.33% 0.00% 0.00% 0x38 1 1 0xffffffff8250a6c4 0 286 126 5 5 [k] errseq_sample [kernel.vmlinux] errseq.c:125 0

Signed-off-by: JonasZhou
Link: https://lore.kernel.org/r/20240202083304.10995-1-JonasZhou-oc@zhaoxin.com
Signed-off-by: Christian Brauner
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index
ebce4763b4bb9a..9efd6220b7c64b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -482,10 +482,10 @@ struct address_space { pgoff_t writeback_index; const struct address_space_operations *a_ops; unsigned long flags; - struct rw_semaphore i_mmap_rwsem; errseq_t wb_err; spinlock_t i_private_lock; struct list_head i_private_list; + struct rw_semaphore i_mmap_rwsem; void * i_private_data; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* From ff49b00e9621402cf723c3cb11489dff2d09a738 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Sun, 4 Feb 2024 13:14:05 -0300 Subject: [PATCH 0158/1406] fpga: dfl: make dfl_bus_type const Now that the driver core can properly handle constant struct bus_type, move the dfl_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. Marliere Reviewed-by: Greg Kroah-Hartman Acked-by: Xu Yilun Link: https://lore.kernel.org/r/20240204-bus_cleanup-fpga-v1-1-dae8b5bf7220@marliere.net Signed-off-by: Xu Yilun --- drivers/fpga/dfl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/fpga/dfl.c b/drivers/fpga/dfl.c index e6d12fbab653fb..094ee97ea26cb1 100644 --- a/drivers/fpga/dfl.c +++ b/drivers/fpga/dfl.c @@ -327,7 +327,7 @@ static struct attribute *dfl_dev_attrs[] = { }; ATTRIBUTE_GROUPS(dfl_dev); -static struct bus_type dfl_bus_type = { +static const struct bus_type dfl_bus_type = { .name = "dfl", .match = dfl_bus_match, .probe = dfl_bus_probe, From 91d5bbf6d41eea2f1f84705f2ebcc308bbcf6c7e Mon Sep 17 00:00:00 2001 From: Huang Xiaojia Date: Tue, 6 Feb 2024 09:43:53 +0800 Subject: [PATCH 0159/1406] epoll: Remove ep_scan_ready_list() in comments Since commit 443f1a042233 ("lift the calls of ep_send_events_proc() into the callers"), ep_scan_ready_list() has been removed. But there are still several in comments. All of them should be replaced with other caller functions. Signed-off-by: Huang Xiaojia Link: https://lore.kernel.org/r/20240206014353.4191262-1-huangxiaojia2@huawei.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/eventpoll.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 3534d36a147400..786e023a48b2f8 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -206,7 +206,7 @@ struct eventpoll { */ struct epitem *ovflist; - /* wakeup_source used when ep_scan_ready_list is running */ + /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */ struct wakeup_source *ws; /* The user that created the eventpoll descriptor */ @@ -1153,7 +1153,7 @@ static inline bool chain_epi_lockless(struct epitem *epi) * This callback takes a read lock in order not to contend with concurrent * events from another file descriptor, thus all modifications to ->rdllist * or ->ovflist are lockless. Read lock is paired with the write lock from - * ep_scan_ready_list(), which stops all list modifications and guarantees + * ep_start/done_scan(), which stops all list modifications and guarantees * that lists state is seen correctly. * * Another thing worth to mention is that ep_poll_callback() can be called @@ -1751,7 +1751,7 @@ static int ep_send_events(struct eventpoll *ep, * availability. At this point, no one can insert * into ep->rdllist besides us. 
The epoll_ctl() * callers are locked out by - * ep_scan_ready_list() holding "mtx" and the + * ep_send_events() holding "mtx" and the * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); @@ -1904,7 +1904,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, __set_current_state(TASK_INTERRUPTIBLE); /* - * Do the final check under the lock. ep_scan_ready_list() + * Do the final check under the lock. ep_start/done_scan() * plays with two lists (->rdllist and ->ovflist) and there * is always a race when both lists are empty for short * period of time although events are pending, so lock is From 29bb008cb2968b803f09de31f43a2147fd394616 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:07 +0000 Subject: [PATCH 0160/1406] iommufd/iova_bitmap: Bounds check mapped::pages access Dirty IOMMU hugepages reported on a base page page-size granularity can lead to an attempt to set dirty pages in the bitmap beyond the limits that are pinned. Bounds check the page index of the array we are trying to access is within the limits before we kmap() and return otherwise. While it is also a defensive check, this is also in preparation to defer setting bits (outside the mapped range) to the next iteration(s) when the pages become available. Fixes: b058ea3ab5af ("vfio/iova_bitmap: refactor iova_bitmap_set() to better handle page boundaries") Link: https://lore.kernel.org/r/20240202133415.23819-2-joao.m.martins@oracle.com Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iova_bitmap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index 0a92c9eeaf7f50..a3606b4c222920 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -409,6 +409,7 @@ void iova_bitmap_set(struct iova_bitmap *bitmap, mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; unsigned long last_bit = (((iova + length - 1) - mapped->iova) >> mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; + unsigned long last_page_idx = mapped->npages - 1; do { unsigned int page_idx = cur_bit / BITS_PER_PAGE; @@ -417,6 +418,9 @@ void iova_bitmap_set(struct iova_bitmap *bitmap, last_bit - cur_bit + 1); void *kaddr; + if (unlikely(page_idx > last_page_idx)) + break; + kaddr = kmap_local_page(mapped->pages[page_idx]); bitmap_set(kaddr, offset, nbits); kunmap_local(kaddr); From 9b232fb23a6681ca966dee3b91cecf3d0e72639a Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:08 +0000 Subject: [PATCH 0161/1406] iommufd/iova_bitmap: Switch iova_bitmap::bitmap to an u8 array iova_bitmap_mapped_length() don't deal correctly with the small bitmaps (< 2M bitmaps) when the starting address isn't u64 aligned, leading to skipping a tiny part of the IOVA range. This is materialized as not marking data dirty that should otherwise have been. Fix that by using a u8 * in the internal state of IOVA bitmap. Most of the data structures use the type of the bitmap to adjust its indexes, thus changing the type of the bitmap decreases the granularity of the bitmap indexes. 
Fixes: b058ea3ab5af ("vfio/iova_bitmap: refactor iova_bitmap_set() to better handle page boundaries") Link: https://lore.kernel.org/r/20240202133415.23819-3-joao.m.martins@oracle.com Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iova_bitmap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index a3606b4c222920..9d42ab51a6bb36 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -100,7 +100,7 @@ struct iova_bitmap { struct iova_bitmap_map mapped; /* userspace address of the bitmap */ - u64 __user *bitmap; + u8 __user *bitmap; /* u64 index that @mapped points to */ unsigned long mapped_base_index; @@ -162,7 +162,7 @@ static int iova_bitmap_get(struct iova_bitmap *bitmap) { struct iova_bitmap_map *mapped = &bitmap->mapped; unsigned long npages; - u64 __user *addr; + u8 __user *addr; long ret; /* @@ -247,7 +247,7 @@ struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length, mapped = &bitmap->mapped; mapped->pgshift = __ffs(page_size); - bitmap->bitmap = data; + bitmap->bitmap = (u8 __user *)data; bitmap->mapped_total_index = iova_bitmap_offset_to_index(bitmap, length - 1) + 1; bitmap->iova = iova; @@ -304,7 +304,7 @@ static unsigned long iova_bitmap_mapped_remaining(struct iova_bitmap *bitmap) remaining = bitmap->mapped_total_index - bitmap->mapped_base_index; remaining = min_t(unsigned long, remaining, - bytes / sizeof(*bitmap->bitmap)); + DIV_ROUND_UP(bytes, sizeof(*bitmap->bitmap))); return remaining; } From 82bbd2dc660fd05be1ea39fef7ab4e49ab4600b4 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:09 +0000 Subject: [PATCH 0162/1406] iommufd/selftest: Test u64 unaligned bitmaps Exercise the dirty tracking bitmaps with byte unaligned addresses in addition to the PAGE_SIZE unaligned bitmaps, using a address towards the end of the page boundary. In doing so, increase the tailroom we allocate for the bitmap from MOCK_PAGE_SIZE(2K) into PAGE_SIZE(4K), such that we can test end of bitmap boundary. 
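The 0xff1 offset used below is chosen deliberately; a sketch of the arithmetic, assuming a 4KiB PAGE_SIZE:

    unsigned long off = 0xff1;			/* 4081 */

    assert(off % sizeof(uint64_t) == 1);	/* u64-unaligned start */
    assert(4096 - off == 0xf);			/* only 15 bytes to the page
						 * boundary, so accesses spill
						 * into the extra PAGE_SIZE of
						 * tailroom allocated above */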
Link: https://lore.kernel.org/r/20240202133415.23819-4-joao.m.martins@oracle.com
Signed-off-by: Joao Martins
Signed-off-by: Jason Gunthorpe
---
 tools/testing/selftests/iommu/iommufd.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 1a881e7a21d1b2..49774a720314e5 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -1741,9 +1741,9 @@ FIXTURE_SETUP(iommufd_dirty_tracking) self->bitmap_size = variant->buffer_size / self->page_size / BITS_PER_BYTE; - /* Provision with an extra (MOCK_PAGE_SIZE) for the unaligned case */ + /* Provision with an extra (PAGE_SIZE) for the unaligned case */ rc = posix_memalign(&self->bitmap, PAGE_SIZE, - self->bitmap_size + MOCK_PAGE_SIZE); + self->bitmap_size + PAGE_SIZE); assert(!rc); assert(self->bitmap); assert((uintptr_t)self->bitmap % PAGE_SIZE == 0);
@@ -1873,6 +1873,13 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) self->bitmap + MOCK_PAGE_SIZE, self->bitmap_size, 0, _metadata); + /* u64 unaligned bitmap */ + test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, + MOCK_APERTURE_START, self->page_size, + self->bitmap + 0xff1, + self->bitmap_size, 0, _metadata); + + test_ioctl_destroy(stddev_id); test_ioctl_destroy(hwpt_id); }
@@ -1907,6 +1914,14 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear) IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR, _metadata); + /* u64 unaligned bitmap */ + test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, + MOCK_APERTURE_START, self->page_size, + self->bitmap + 0xff1, + self->bitmap_size, + IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR, + _metadata); + test_ioctl_destroy(stddev_id); test_ioctl_destroy(hwpt_id); }

From 0a553cd32254b7c1d85fc5c4388ab086cae40d31 Mon Sep 17 00:00:00 2001
From: Joao Martins
Date: Fri, 2 Feb 2024 13:34:10 +0000
Subject: [PATCH 0163/1406] iommufd/iova_bitmap: Handle recording beyond the mapped pages

IOVA bitmap is a zero-copy scheme of recording dirty bits that iterates the different bitmap user pages at chunks of a maximum of PAGE_SIZE/sizeof(struct page*) pages.

When the iterations are split up into 64G, the end of the range may be broken up in a way that's aligned with a non-base-page PTE size. This leads to only part of the huge page being recorded in the bitmap. Note that in practice this is only a problem for IOMMU dirty tracking, i.e. when the backing PTEs are in IOMMU hugepages and the bitmap is in base-page granularity. So far this is not something that affects VF dirty trackers (which report and record at the same granularity).

To fix that, if there is a remainder of bits left to set which the current IOVA bitmap doesn't cover, make a copy of the bitmap structure and iterate-and-set the rest of the bits remaining. Finally, when advancing the iterator, skip all the bits that were set ahead.
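A worked example of the mechanism, with illustrative numbers (4KiB bitmap granularity, a dirty 2MiB IOMMU hugepage needing 512 bits of which only 100 fit in the pinned window):

    /* at the end of iova_bitmap_set(), once the pinned pages ran out */
    if (unlikely(cur_bit <= last_bit))
            bitmap->set_ahead_length =
                    (last_bit - cur_bit + 1) << bitmap->mapped.pgshift;
    /* (512 - 100) bits << 12 = 412 * 4KiB of IOVA left for
     * iova_bitmap_set_ahead() to record when the iterator advances
     */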
Link: https://lore.kernel.org/r/20240202133415.23819-5-joao.m.martins@oracle.com Reported-by: Avihai Horon Fixes: f35f22cc760e ("iommu/vt-d: Access/Dirty bit support for SS domains") Fixes: 421a511a293f ("iommu/amd: Access/Dirty bit support in IOPTEs") Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iova_bitmap.c | 43 +++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index 9d42ab51a6bb36..b370e8ee886654 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -113,6 +113,9 @@ struct iova_bitmap { /* length of the IOVA range for the whole bitmap */ size_t length; + + /* length of the IOVA range set ahead the pinned pages */ + unsigned long set_ahead_length; }; /* @@ -341,6 +344,32 @@ static bool iova_bitmap_done(struct iova_bitmap *bitmap) return bitmap->mapped_base_index >= bitmap->mapped_total_index; } +static int iova_bitmap_set_ahead(struct iova_bitmap *bitmap, + size_t set_ahead_length) +{ + int ret = 0; + + while (set_ahead_length > 0 && !iova_bitmap_done(bitmap)) { + unsigned long length = iova_bitmap_mapped_length(bitmap); + unsigned long iova = iova_bitmap_mapped_iova(bitmap); + + ret = iova_bitmap_get(bitmap); + if (ret) + break; + + length = min(length, set_ahead_length); + iova_bitmap_set(bitmap, iova, length); + + set_ahead_length -= length; + bitmap->mapped_base_index += + iova_bitmap_offset_to_index(bitmap, length - 1) + 1; + iova_bitmap_put(bitmap); + } + + bitmap->set_ahead_length = 0; + return ret; +} + /* * Advances to the next range, releases the current pinned * pages and pins the next set of bitmap pages. @@ -357,6 +386,15 @@ static int iova_bitmap_advance(struct iova_bitmap *bitmap) if (iova_bitmap_done(bitmap)) return 0; + /* Iterate, set and skip any bits requested for next iteration */ + if (bitmap->set_ahead_length) { + int ret; + + ret = iova_bitmap_set_ahead(bitmap, bitmap->set_ahead_length); + if (ret) + return ret; + } + /* When advancing the index we pin the next set of bitmap pages */ return iova_bitmap_get(bitmap); } @@ -426,5 +464,10 @@ void iova_bitmap_set(struct iova_bitmap *bitmap, kunmap_local(kaddr); cur_bit += nbits; } while (cur_bit <= last_bit); + + if (unlikely(cur_bit <= last_bit)) { + bitmap->set_ahead_length = + ((last_bit - cur_bit + 1) << bitmap->mapped.pgshift); + } } EXPORT_SYMBOL_NS_GPL(iova_bitmap_set, IOMMUFD); From 686b09f3ae3f35fc9c449c923207162699ffd83a Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:11 +0000 Subject: [PATCH 0164/1406] iommufd/selftest: Refactor dirty bitmap tests Rework the functions that test and set the bitmaps to receive a new parameter (the pte_page_size) that reflects the expected PTE size in the page tables. The same scheme is still used i.e. even bits are dirty and odd page indexes aren't dirty. Here it just refactors to consider the size of the PTE rather than hardcoded to IOMMU mock base page assumptions. While at it, refactor dirty bitmap tests to use the idev_id created by the fixture instead of creating a new one. This is in preparation for doing tests with IOMMU hugepages where multiple bits set as part of recording a whole hugepage as dirty and thus the pte_page_size will vary depending on io hugepages or io base pages. 
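Sketch of the resulting expected-dirty rule (illustrative; bit is a bitmap bit index): the "even dirty, odd clean" scheme now applies per PTE-sized chunk, so one dirty hugepage accounts for a run of consecutive bits:

    unsigned long bits_per_pte = pte_page_size / page_size; /* e.g. 2MiB / 4KiB = 512 */
    unsigned long pte_idx = bit / bits_per_pte;
    bool expected_dirty = !(pte_idx % 2);	/* even PTE index -> all of its
						 * bits_per_pte bits must be set */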
Link: https://lore.kernel.org/r/20240202133415.23819-6-joao.m.martins@oracle.com Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe --- tools/testing/selftests/iommu/iommufd.c | 28 ++++++------- tools/testing/selftests/iommu/iommufd_utils.h | 39 ++++++++++++------- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 49774a720314e5..56c3e511a0ab7d 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1849,7 +1849,7 @@ TEST_F(iommufd_dirty_tracking, device_dirty_capability) TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) { - uint32_t stddev_id; + uint32_t page_size = MOCK_PAGE_SIZE; uint32_t hwpt_id; uint32_t ioas_id; @@ -1859,34 +1859,31 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) test_cmd_hwpt_alloc(self->idev_id, ioas_id, IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); - test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); test_cmd_set_dirty_tracking(hwpt_id, true); test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, - MOCK_APERTURE_START, self->page_size, + MOCK_APERTURE_START, self->page_size, page_size, self->bitmap, self->bitmap_size, 0, _metadata); /* PAGE_SIZE unaligned bitmap */ test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, - MOCK_APERTURE_START, self->page_size, + MOCK_APERTURE_START, self->page_size, page_size, self->bitmap + MOCK_PAGE_SIZE, self->bitmap_size, 0, _metadata); /* u64 unaligned bitmap */ test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, - MOCK_APERTURE_START, self->page_size, - self->bitmap + 0xff1, - self->bitmap_size, 0, _metadata); - + MOCK_APERTURE_START, self->page_size, page_size, + self->bitmap + 0xff1, self->bitmap_size, 0, + _metadata); - test_ioctl_destroy(stddev_id); test_ioctl_destroy(hwpt_id); } TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear) { - uint32_t stddev_id; + uint32_t page_size = MOCK_PAGE_SIZE; uint32_t hwpt_id; uint32_t ioas_id; @@ -1896,19 +1893,18 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear) test_cmd_hwpt_alloc(self->idev_id, ioas_id, IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); - test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); test_cmd_set_dirty_tracking(hwpt_id, true); test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, - MOCK_APERTURE_START, self->page_size, + MOCK_APERTURE_START, self->page_size, page_size, self->bitmap, self->bitmap_size, IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR, _metadata); /* Unaligned bitmap */ test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, - MOCK_APERTURE_START, self->page_size, + MOCK_APERTURE_START, self->page_size, page_size, self->bitmap + MOCK_PAGE_SIZE, self->bitmap_size, IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR, @@ -1916,13 +1912,11 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear) /* u64 unaligned bitmap */ test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, - MOCK_APERTURE_START, self->page_size, - self->bitmap + 0xff1, - self->bitmap_size, + MOCK_APERTURE_START, self->page_size, page_size, + self->bitmap + 0xff1, self->bitmap_size, IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR, _metadata); - test_ioctl_destroy(stddev_id); test_ioctl_destroy(hwpt_id); } diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index c646264aa41fdc..8d2b46b2114da8 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -344,16 +344,19 @@ static int _test_cmd_mock_domain_set_dirty(int fd, __u32 hwpt_id, 
size_t length, page_size, bitmap, nr)) static int _test_mock_dirty_bitmaps(int fd, __u32 hwpt_id, size_t length, - __u64 iova, size_t page_size, __u64 *bitmap, + __u64 iova, size_t page_size, + size_t pte_page_size, __u64 *bitmap, __u64 bitmap_size, __u32 flags, struct __test_metadata *_metadata) { - unsigned long i, nbits = bitmap_size * BITS_PER_BYTE; - unsigned long nr = nbits / 2; + unsigned long npte = pte_page_size / page_size, pteset = 2 * npte; + unsigned long nbits = bitmap_size * BITS_PER_BYTE; + unsigned long j, i, nr = nbits / pteset ?: 1; __u64 out_dirty = 0; /* Mark all even bits as dirty in the mock domain */ - for (i = 0; i < nbits; i += 2) + memset(bitmap, 0, bitmap_size); + for (i = 0; i < nbits; i += pteset) set_bit(i, (unsigned long *)bitmap); test_cmd_mock_domain_set_dirty(fd, hwpt_id, length, iova, page_size, @@ -365,8 +368,12 @@ static int _test_mock_dirty_bitmaps(int fd, __u32 hwpt_id, size_t length, test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size, bitmap, flags); /* Beware ASSERT_EQ() is two statements -- braces are not redundant! */ - for (i = 0; i < nbits; i++) { - ASSERT_EQ(!(i % 2), test_bit(i, (unsigned long *)bitmap)); + for (i = 0; i < nbits; i += pteset) { + for (j = 0; j < pteset; j++) { + ASSERT_EQ(j < npte, + test_bit(i + j, (unsigned long *)bitmap)); + } + ASSERT_EQ(!(i % pteset), test_bit(i, (unsigned long *)bitmap)); } memset(bitmap, 0, bitmap_size); @@ -374,19 +381,23 @@ static int _test_mock_dirty_bitmaps(int fd, __u32 hwpt_id, size_t length, flags); /* It as read already -- expect all zeroes */ - for (i = 0; i < nbits; i++) { - ASSERT_EQ(!(i % 2) && (flags & - IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR), - test_bit(i, (unsigned long *)bitmap)); + for (i = 0; i < nbits; i += pteset) { + for (j = 0; j < pteset; j++) { + ASSERT_EQ( + (j < npte) && + (flags & + IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR), + test_bit(i + j, (unsigned long *)bitmap)); + } } return 0; } -#define test_mock_dirty_bitmaps(hwpt_id, length, iova, page_size, bitmap, \ - bitmap_size, flags, _metadata) \ +#define test_mock_dirty_bitmaps(hwpt_id, length, iova, page_size, pte_size,\ + bitmap, bitmap_size, flags, _metadata) \ ASSERT_EQ(0, _test_mock_dirty_bitmaps(self->fd, hwpt_id, length, iova, \ - page_size, bitmap, bitmap_size, \ - flags, _metadata)) + page_size, pte_size, bitmap, \ + bitmap_size, flags, _metadata)) static int _test_cmd_create_access(int fd, unsigned int ioas_id, __u32 *access_id, unsigned int flags) From 63edc142411d8861e549d302fdeb902ae73f5a7e Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:12 +0000 Subject: [PATCH 0165/1406] iommufd/selftest: Refactor mock_domain_read_and_clear_dirty() Move the clearing of the dirty bit of the mock domain into mock_domain_test_and_clear_dirty() helper, simplifying the caller function. Additionally, rework the mock_domain_read_and_clear_dirty() loop to iterate over a potentially variable IO page size. No functional change intended with the loop refactor. This is in preparation for dirty tracking support for IOMMU hugepage mock domains. 
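A minimal sketch of the reworked walk follows, assuming a simplified model where a lookup reports each entry's page size. The lookup and record helpers are hypothetical stand-ins for the XArray and iommu_dirty_bitmap_record(), and the dirty test/clear step is elided so only the variable-stride iteration is shown:

/* Sketch of the variable page-size walk: align the head down to the
 * entry's page size and step by whole pages of that size.
 */
#include <stdint.h>
#include <stdio.h>

#define BASE_PGSIZE 2048u
#define HUGE_PGSIZE (512u * BASE_PGSIZE)

/* Pretend entries at 1M boundaries are huge; everything else is base. */
static uint32_t entry_pgsize(uint64_t iova)
{
	return (iova % HUGE_PGSIZE) == 0 ? HUGE_PGSIZE : BASE_PGSIZE;
}

static void record_dirty(uint64_t head, uint32_t pgsize)
{
	printf("dirty [%#llx, %#llx)\n", (unsigned long long)head,
	       (unsigned long long)(head + pgsize));
}

static void read_and_clear(uint64_t iova, uint64_t size)
{
	uint64_t end = iova + size;

	do {
		uint32_t pgsize = entry_pgsize(iova);
		uint64_t head = iova & ~(uint64_t)(pgsize - 1); /* align down */

		record_dirty(head, pgsize);	/* real code tests+clears first */
		iova = head + pgsize;		/* skip the rest of this page */
	} while (iova < end);
}

int main(void)
{
	/* Two base pages, the second of which lands on a huge entry. */
	read_and_clear(HUGE_PGSIZE - BASE_PGSIZE, 2 * BASE_PGSIZE);
	return 0;
}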
Link: https://lore.kernel.org/r/20240202133415.23819-7-joao.m.martins@oracle.com Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/selftest.c | 64 ++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index d9e9920c7eba41..796e7e3ec0cfd8 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -191,6 +191,34 @@ static int mock_domain_set_dirty_tracking(struct iommu_domain *domain, return 0; } +static bool mock_test_and_clear_dirty(struct mock_iommu_domain *mock, + unsigned long iova, size_t page_size, + unsigned long flags) +{ + unsigned long cur, end = iova + page_size - 1; + bool dirty = false; + void *ent, *old; + + for (cur = iova; cur < end; cur += MOCK_IO_PAGE_SIZE) { + ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE); + if (!ent || !(xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA)) + continue; + + dirty = true; + /* Clear dirty */ + if (!(flags & IOMMU_DIRTY_NO_CLEAR)) { + unsigned long val; + + val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA; + old = xa_store(&mock->pfns, cur / MOCK_IO_PAGE_SIZE, + xa_mk_value(val), GFP_KERNEL); + WARN_ON_ONCE(ent != old); + } + } + + return dirty; +} + static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, unsigned long iova, size_t size, unsigned long flags, @@ -198,31 +226,29 @@ static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, { struct mock_iommu_domain *mock = container_of(domain, struct mock_iommu_domain, domain); - unsigned long i, max = size / MOCK_IO_PAGE_SIZE; - void *ent, *old; + unsigned long end = iova + size; + void *ent; if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap) return -EINVAL; - for (i = 0; i < max; i++) { - unsigned long cur = iova + i * MOCK_IO_PAGE_SIZE; + do { + unsigned long pgsize = MOCK_IO_PAGE_SIZE; + unsigned long head; - ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE); - if (ent && (xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA)) { - /* Clear dirty */ - if (!(flags & IOMMU_DIRTY_NO_CLEAR)) { - unsigned long val; - - val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA; - old = xa_store(&mock->pfns, - cur / MOCK_IO_PAGE_SIZE, - xa_mk_value(val), GFP_KERNEL); - WARN_ON_ONCE(ent != old); - } - iommu_dirty_bitmap_record(dirty, cur, - MOCK_IO_PAGE_SIZE); + ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); + if (!ent) { + iova += pgsize; + continue; } - } + + head = iova & ~(pgsize - 1); + + /* Clear dirty */ + if (mock_test_and_clear_dirty(mock, head, pgsize, flags)) + iommu_dirty_bitmap_record(dirty, head, pgsize); + iova = head + pgsize; + } while (iova < end); return 0; } From ffd651702c5e388320cbb25b6823c4820bcad82c Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:13 +0000 Subject: [PATCH 0166/1406] iommufd/selftest: Hugepage mock domain support Add support to mock iommu hugepages of 1M (for a 2K mock io page size). To avoid breaking test suite defaults, the way this is done is by explicitly creating an iommu mock device which has hugepage support (i.e. through MOCK_FLAGS_DEVICE_HUGE_IOVA). The same scheme of mock base page index tracking in the XArray is maintained, except that an extra bit is added to mark an entry as a hugepage. One subpage containing the dirty bit means that the whole hugepage is dirty (similar to AMD IOMMU non-standard page sizes). For clearing, the same thing applies: it must clear all dirty subpages.
This is in preparation for dirty tracking to mark mock hugepages as dirty to exercise all the iova-bitmap fixes. Link: https://lore.kernel.org/r/20240202133415.23819-8-joao.m.martins@oracle.com Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iommufd_test.h | 1 + drivers/iommu/iommufd/selftest.c | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 482d4059f5db6a..e854d3f672051b 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -45,6 +45,7 @@ enum { enum { MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0, + MOCK_FLAGS_DEVICE_HUGE_IOVA = 1 << 1, }; enum { diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 796e7e3ec0cfd8..8abf9747773eb8 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -41,6 +41,7 @@ static atomic_t mock_dev_num; enum { MOCK_DIRTY_TRACK = 1, MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2, + MOCK_HUGE_PAGE_SIZE = 512 * MOCK_IO_PAGE_SIZE, /* * Like a real page table alignment requires the low bits of the address @@ -53,6 +54,7 @@ enum { MOCK_PFN_START_IOVA = _MOCK_PFN_START, MOCK_PFN_LAST_IOVA = _MOCK_PFN_START, MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1, + MOCK_PFN_HUGE_IOVA = _MOCK_PFN_START << 2, }; /* @@ -242,6 +244,8 @@ static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, continue; } + if (xa_to_value(ent) & MOCK_PFN_HUGE_IOVA) + pgsize = MOCK_HUGE_PAGE_SIZE; head = iova & ~(pgsize - 1); /* Clear dirty */ @@ -260,6 +264,7 @@ const struct iommu_dirty_ops dirty_ops = { static struct iommu_domain *mock_domain_alloc_paging(struct device *dev) { + struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); struct mock_iommu_domain *mock; mock = kzalloc(sizeof(*mock), GFP_KERNEL); @@ -268,6 +273,8 @@ static struct iommu_domain *mock_domain_alloc_paging(struct device *dev) mock->domain.geometry.aperture_start = MOCK_APERTURE_START; mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; + if (dev && mdev->flags & MOCK_FLAGS_DEVICE_HUGE_IOVA) + mock->domain.pgsize_bitmap |= MOCK_HUGE_PAGE_SIZE; mock->domain.ops = mock_ops.default_domain_ops; mock->domain.type = IOMMU_DOMAIN_UNMANAGED; xa_init(&mock->pfns); @@ -313,7 +320,7 @@ mock_domain_alloc_user(struct device *dev, u32 flags, return ERR_PTR(-EOPNOTSUPP); if (user_data || (has_dirty_flag && no_dirty_ops)) return ERR_PTR(-EOPNOTSUPP); - domain = mock_domain_alloc_paging(NULL); + domain = mock_domain_alloc_paging(dev); if (!domain) return ERR_PTR(-ENOMEM); if (has_dirty_flag) @@ -376,6 +383,9 @@ static int mock_domain_map_pages(struct iommu_domain *domain, if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize) flags = MOCK_PFN_LAST_IOVA; + if (pgsize != MOCK_IO_PAGE_SIZE) { + flags |= MOCK_PFN_HUGE_IOVA; + } old = xa_store(&mock->pfns, iova / MOCK_IO_PAGE_SIZE, xa_mk_value((paddr / MOCK_IO_PAGE_SIZE) | flags), @@ -630,7 +640,8 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags) struct mock_dev *mdev; int rc; - if (dev_flags & ~(MOCK_FLAGS_DEVICE_NO_DIRTY)) + if (dev_flags & + ~(MOCK_FLAGS_DEVICE_NO_DIRTY | MOCK_FLAGS_DEVICE_HUGE_IOVA)) return ERR_PTR(-EINVAL); mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); From f6f3fed26eeae9f523ad2b9cc10e33dbd78fd1ca Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:14 +0000 Subject: [PATCH 0167/1406] iommufd/selftest: Add mock IO hugepages tests 
Leverage the previously added MOCK_FLAGS_DEVICE_HUGE_IOVA flag to create an IOMMU domain that supports page sizes larger than MOCK_IO_PAGE_SIZE. Plumb the hugetlb backing memory for buffer allocation and change the expected page size to MOCK_HUGE_PAGE_SIZE (1M) when hugepage variant test cases are used. These so far are limited to the 128M and 256M IOVA range test cases, which is when 1M hugepages can be used. Link: https://lore.kernel.org/r/20240202133415.23819-9-joao.m.martins@oracle.com Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe --- tools/testing/selftests/iommu/iommufd.c | 45 +++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 56c3e511a0ab7d..edf1c99c9936c8 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -12,6 +12,7 @@ static unsigned long HUGEPAGE_SIZE; #define MOCK_PAGE_SIZE (PAGE_SIZE / 2) +#define MOCK_HUGE_PAGE_SIZE (512 * MOCK_PAGE_SIZE) static unsigned long get_huge_page_size(void) { @@ -1716,10 +1717,12 @@ FIXTURE(iommufd_dirty_tracking) FIXTURE_VARIANT(iommufd_dirty_tracking) { unsigned long buffer_size; + bool hugepages; }; FIXTURE_SETUP(iommufd_dirty_tracking) { + int mmap_flags; void *vrc; int rc; @@ -1732,9 +1735,17 @@ FIXTURE_SETUP(iommufd_dirty_tracking) variant->buffer_size, rc); } + mmap_flags = MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED; + if (variant->hugepages) { + /* + * MAP_POPULATE will cause the kernel to fail mmap if THPs are + * not available. + */ + mmap_flags |= MAP_HUGETLB | MAP_POPULATE; + } assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0); vrc = mmap(self->buffer, variant->buffer_size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + mmap_flags, -1, 0); assert(vrc == self->buffer); self->page_size = MOCK_PAGE_SIZE; @@ -1749,8 +1760,16 @@ FIXTURE_SETUP(iommufd_dirty_tracking) assert((uintptr_t)self->bitmap % PAGE_SIZE == 0); test_ioctl_ioas_alloc(&self->ioas_id); - test_cmd_mock_domain(self->ioas_id, &self->stdev_id, &self->hwpt_id, - &self->idev_id); + /* Enable 1M mock IOMMU hugepages */ + if (variant->hugepages) { + test_cmd_mock_domain_flags(self->ioas_id, + MOCK_FLAGS_DEVICE_HUGE_IOVA, + &self->stdev_id, &self->hwpt_id, + &self->idev_id); + } else { + test_cmd_mock_domain(self->ioas_id, &self->stdev_id, + &self->hwpt_id, &self->idev_id); + } } FIXTURE_TEARDOWN(iommufd_dirty_tracking) @@ -1784,12 +1803,26 @@ FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128M) .buffer_size = 128UL * 1024UL * 1024UL, }; +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128M_huge) +{ + /* 4K bitmap (128M IOVA range) */ + .buffer_size = 128UL * 1024UL * 1024UL, + .hugepages = true, +}; + FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty256M) { /* 8K bitmap (256M IOVA range) */ .buffer_size = 256UL * 1024UL * 1024UL, }; +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty256M_huge) +{ + /* 8K bitmap (256M IOVA range) */ + .buffer_size = 256UL * 1024UL * 1024UL, + .hugepages = true, +}; + TEST_F(iommufd_dirty_tracking, enforce_dirty) { uint32_t ioas_id, stddev_id, idev_id; @@ -1853,6 +1886,9 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) uint32_t hwpt_id; uint32_t ioas_id; + if (variant->hugepages) + page_size = MOCK_HUGE_PAGE_SIZE; + test_ioctl_ioas_alloc(&ioas_id); test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer, variant->buffer_size, MOCK_APERTURE_START); @@ -1887,6 +1923,9 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear)
uint32_t hwpt_id; uint32_t ioas_id; + if (variant->hugepages) + page_size = MOCK_HUGE_PAGE_SIZE; + test_ioctl_ioas_alloc(&ioas_id); test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer, variant->buffer_size, MOCK_APERTURE_START); From 28b9f669e10f5584aba9856c5aa9d86d64ec9f69 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:15 +0000 Subject: [PATCH 0168/1406] iommufd/iova_bitmap: Consider page offset for the pages to be pinned For small bitmaps that aren't PAGE_SIZE aligned *and* that are less than 512 pages in bitmap length, use an extra page to be able to cover the entire range e.g. [1M..3G] which would be iterated more efficiently in a single iteration, rather than two. Fixes: b058ea3ab5af ("vfio/iova_bitmap: refactor iova_bitmap_set() to better handle page boundaries") Link: https://lore.kernel.org/r/20240202133415.23819-10-joao.m.martins@oracle.com Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iova_bitmap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index b370e8ee886654..db8c46bee1559a 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -178,18 +178,19 @@ static int iova_bitmap_get(struct iova_bitmap *bitmap) bitmap->mapped_base_index) * sizeof(*bitmap->bitmap), PAGE_SIZE); - /* - * We always cap at max number of 'struct page' a base page can fit. - * This is, for example, on x86 means 2M of bitmap data max. - */ - npages = min(npages, PAGE_SIZE / sizeof(struct page *)); - /* * Bitmap address to be pinned is calculated via pointer arithmetic * with bitmap u64 word index. */ addr = bitmap->bitmap + bitmap->mapped_base_index; + /* + * We always cap at max number of 'struct page' a base page can fit. + * This is, for example, on x86 means 2M of bitmap data max. + */ + npages = min(npages + !!offset_in_page(addr), + PAGE_SIZE / sizeof(struct page *)); + ret = pin_user_pages_fast((unsigned long)addr, npages, FOLL_WRITE, mapped->pages); if (ret <= 0) From c6c14f926fbe37330af6271d26f98e70d1a07372 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 2 Feb 2024 13:01:31 +0200 Subject: [PATCH 0169/1406] fs: make file_dentry() a simple accessor file_dentry() is a relic from the days that overlayfs was using files with a "fake" path, meaning, f_path on overlayfs and f_inode on underlying fs. In those days, file_dentry() was needed to get the underlying fs dentry that matches f_inode. Files with "fake" path should not exist nowadays, so make file_dentry() a simple accessor and use an assertion to make sure that file_dentry() was not papering over filesystem bugs. Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20240202110132.1584111-2-amir73il@gmail.com Tested-by: Stefan Berger Signed-off-by: Christian Brauner --- include/linux/fs.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 9efd6220b7c64b..2e07cbbf92e3d6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1084,9 +1084,20 @@ static inline struct inode *file_inode(const struct file *f) return f->f_inode; } +/* + * file_dentry() is a relic from the days that overlayfs was using files with a + * "fake" path, meaning, f_path on overlayfs and f_inode on underlying fs. + * In those days, file_dentry() was needed to get the underlying fs dentry that + * matches f_inode. 
+ * Files with "fake" path should not exist nowadays, so use an assertion to make + * sure that file_dentry() was not papering over filesystem bugs. + */ static inline struct dentry *file_dentry(const struct file *file) { - return d_real(file->f_path.dentry, file_inode(file)); + struct dentry *dentry = file->f_path.dentry; + + WARN_ON_ONCE(d_inode(dentry) != file_inode(file)); + return dentry; } struct fasync_struct { From 2109cc619e733c8709250b62d7f1d43461589f57 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 2 Feb 2024 13:01:32 +0200 Subject: [PATCH 0170/1406] fs: remove the inode argument to ->d_real() method The only remaining user of ->d_real() method is d_real_inode(), which passed NULL inode argument to get the real data dentry. There are no longer any users that call ->d_real() with a non-NULL inode argument for getting a detry from a specific underlying layer. Remove the inode argument of the method and replace it with an integer 'type' argument, to allow callers to request the real metadata dentry instead of the real data dentry. All the current users of d_real_inode() (e.g. uprobe) continue to get the real data inode. Caller that need to get the real metadata inode (e.g. IMA/EVM) can use d_inode(d_real(dentry, D_REAL_METADATA)). Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20240202110132.1584111-3-amir73il@gmail.com Tested-by: Stefan Berger Signed-off-by: Al Viro Signed-off-by: Christian Brauner --- Documentation/filesystems/locking.rst | 2 +- Documentation/filesystems/vfs.rst | 16 ++++----- fs/overlayfs/super.c | 52 ++++++++++++--------------- include/linux/dcache.h | 18 ++++++---- 4 files changed, 41 insertions(+), 47 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index d5bf4b6b7509b0..e664061ed55dc1 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -29,7 +29,7 @@ prototypes:: char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); struct vfsmount *(*d_automount)(struct path *path); int (*d_manage)(const struct path *, bool); - struct dentry *(*d_real)(struct dentry *, const struct inode *); + struct dentry *(*d_real)(struct dentry *, enum d_real_type type); locking rules: diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index eebcc0f9e2bcd1..6e903a903f8f69 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -1264,7 +1264,7 @@ defined: char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); - struct dentry *(*d_real)(struct dentry *, const struct inode *); + struct dentry *(*d_real)(struct dentry *, enum d_real_type type); }; ``d_revalidate`` @@ -1419,16 +1419,14 @@ defined: the dentry being transited from. ``d_real`` - overlay/union type filesystems implement this method to return - one of the underlying dentries hidden by the overlay. It is - used in two different modes: + overlay/union type filesystems implement this method to return one + of the underlying dentries of a regular file hidden by the overlay. - Called from file_dentry() it returns the real dentry matching - the inode argument. The real dentry may be from a lower layer - already copied up, but still referenced from the file. This - mode is selected with a non-NULL inode argument. 
+ The 'type' argument takes the values D_REAL_DATA or D_REAL_METADATA + for returning the real underlying dentry that refers to the inode + hosting the file's data or metadata respectively. - With NULL inode the topmost real underlying dentry is returned. + For non-regular files, the 'dentry' argument is returned. Each dentry has a pointer to its parent dentry, as well as a hash list of child dentries. Child dentries are basically like files in a diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 4ab66e3d4cff98..df2ad2f6079829 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -28,41 +28,38 @@ MODULE_LICENSE("GPL"); struct ovl_dir_cache; -static struct dentry *ovl_d_real(struct dentry *dentry, - const struct inode *inode) +static struct dentry *ovl_d_real(struct dentry *dentry, enum d_real_type type) { - struct dentry *real = NULL, *lower; + struct dentry *upper, *lower; int err; - /* - * vfs is only expected to call d_real() with NULL from d_real_inode() - * and with overlay inode from file_dentry() on an overlay file. - * - * TODO: remove @inode argument from d_real() API, remove code in this - * function that deals with non-NULL @inode and remove d_real() call - * from file_dentry(). - */ - if (inode && d_inode(dentry) == inode) - return dentry; - else if (inode) + switch (type) { + case D_REAL_DATA: + case D_REAL_METADATA: + break; + default: goto bug; + } if (!d_is_reg(dentry)) { /* d_real_inode() is only relevant for regular files */ return dentry; } - real = ovl_dentry_upper(dentry); - if (real && (inode == d_inode(real))) - return real; + upper = ovl_dentry_upper(dentry); + if (upper && (type == D_REAL_METADATA || + ovl_has_upperdata(d_inode(dentry)))) + return upper; - if (real && !inode && ovl_has_upperdata(d_inode(dentry))) - return real; + if (type == D_REAL_METADATA) { + lower = ovl_dentry_lower(dentry); + goto real_lower; + } /* - * Best effort lazy lookup of lowerdata for !inode case to return + * Best effort lazy lookup of lowerdata for D_REAL_DATA case to return * the real lowerdata dentry. The only current caller of d_real() with - * NULL inode is d_real_inode() from trace_uprobe and this caller is + * D_REAL_DATA is d_real_inode() from trace_uprobe and this caller is * likely going to be followed reading from the file, before placing * uprobes on offset within the file, so lowerdata should be available * when setting the uprobe. @@ -73,18 +70,13 @@ static struct dentry *ovl_d_real(struct dentry *dentry, lower = ovl_dentry_lowerdata(dentry); if (!lower) goto bug; - real = lower; - /* Handle recursion */ - real = d_real(real, inode); +real_lower: + /* Handle recursion into stacked lower fs */ + return d_real(lower, type); - if (!inode || inode == d_inode(real)) - return real; bug: - WARN(1, "%s(%pd4, %s:%lu): real dentry (%p/%lu) not found\n", - __func__, dentry, inode ? inode->i_sb->s_id : "NULL", - inode ? inode->i_ino : 0, real, - real && d_inode(real) ? 
d_inode(real)->i_ino : 0); + WARN(1, "%s(%pd4, %d): real dentry not found\n", __func__, dentry, type); return dentry; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 1666c387861f7a..d616a745a34c69 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -125,6 +125,11 @@ enum dentry_d_lock_class DENTRY_D_LOCK_NESTED }; +enum d_real_type { + D_REAL_DATA, + D_REAL_METADATA, +}; + struct dentry_operations { int (*d_revalidate)(struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); @@ -139,7 +144,7 @@ struct dentry_operations { char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); - struct dentry *(*d_real)(struct dentry *, const struct inode *); + struct dentry *(*d_real)(struct dentry *, enum d_real_type type); } ____cacheline_aligned; /* @@ -546,24 +551,23 @@ static inline struct inode *d_backing_inode(const struct dentry *upper) /** * d_real - Return the real dentry * @dentry: the dentry to query - * @inode: inode to select the dentry from multiple layers (can be NULL) + * @type: the type of real dentry (data or metadata) * * If dentry is on a union/overlay, then return the underlying, real dentry. * Otherwise return the dentry itself. * * See also: Documentation/filesystems/vfs.rst */ -static inline struct dentry *d_real(struct dentry *dentry, - const struct inode *inode) +static inline struct dentry *d_real(struct dentry *dentry, enum d_real_type type) { if (unlikely(dentry->d_flags & DCACHE_OP_REAL)) - return dentry->d_op->d_real(dentry, inode); + return dentry->d_op->d_real(dentry, type); else return dentry; } /** - * d_real_inode - Return the real inode + * d_real_inode - Return the real inode hosting the data * @dentry: The dentry to query * * If dentry is on a union/overlay, then return the underlying, real inode. @@ -572,7 +576,7 @@ static inline struct dentry *d_real(struct dentry *dentry, static inline struct inode *d_real_inode(const struct dentry *dentry) { /* This usage of d_real() results in const dentry */ - return d_backing_inode(d_real((struct dentry *) dentry, NULL)); + return d_inode(d_real((struct dentry *) dentry, D_REAL_DATA)); } struct name_snapshot { From 456561ba8e495e9320c1f304bf1cd3d1043cbe7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= Date: Tue, 6 Feb 2024 12:08:13 +0100 Subject: [PATCH 0171/1406] Bluetooth: hci_conn: Only do ACL connections sequentially MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pretty much all Bluetooth chipsets only support paging a single device at a time, and if they don't reject a secondary "Create Connection" request while another is still ongoing, they'll most likely serialize those requests in the firmware. With commit 4c67bc74f016 ("[Bluetooth] Support concurrent connect requests") we started adding some serialization of our own in case the adapter returns the "Command Disallowed" HCI error. This commit was using the BT_CONNECT2 state for the serialization; this state is also used for a few more things (most notably to indicate we're waiting for an inquiry to cancel) and is therefore a bit unreliable. Also, not all BT firmwares would respond with "Command Disallowed" on too many connection requests; some will respond with "Hardware Failure" (BCM4378), and others will error out later and send a "Connect Complete" event with error "Rejected Limited Resources" (Marvell 88W8897).
We can clean things up a bit and also make the serialization more reliable by using our hci_sync machinery to always do "Create Connection" requests in a sequential manner. This is very similar to what we're already doing for establishing LE connections, and it works well there. Note that this causes a test failure in mgmt-tester (test "Pair Device - Power off 1") because the hci_abort_conn_sync() changes the error we return on timeout of the "Create Connection". We'll fix this on the mgmt-tester side by adjusting the expected error for the test. Signed-off-by: Jonas Dreßler Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 + include/net/bluetooth/hci_sync.h | 3 ++ net/bluetooth/hci_conn.c | 69 ++++--------------------------- net/bluetooth/hci_sync.c | 70 ++++++++++++++++++++++++++++++++ 4 files changed, 83 insertions(+), 60 deletions(-) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index aa6c69053d7cd6..08cb5cb249a497 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -437,6 +437,7 @@ enum { #define HCI_NCMD_TIMEOUT msecs_to_jiffies(4000) /* 4 seconds */ #define HCI_ACL_TX_TIMEOUT msecs_to_jiffies(45000) /* 45 seconds */ #define HCI_AUTO_OFF_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ +#define HCI_ACL_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ #define HCI_LE_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ #define HCI_LE_AUTOCONN_TIMEOUT msecs_to_jiffies(4000) /* 4 seconds */ diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index e2582c24254498..824660f8f30da6 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -138,3 +138,6 @@ int hci_le_terminate_big_sync(struct hci_dev *hdev, u8 handle, u8 reason); int hci_le_big_terminate_sync(struct hci_dev *hdev, u8 handle); int hci_le_pa_terminate_sync(struct hci_dev *hdev, u16 handle); + +int hci_acl_create_connection_sync(struct hci_dev *hdev, + struct hci_conn *conn); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index fc4d72f83ac25f..816be7667a8cfc 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -178,64 +178,6 @@ static void hci_conn_cleanup(struct hci_conn *conn) hci_dev_put(hdev); } -static void hci_acl_create_connection(struct hci_conn *conn) -{ - struct hci_dev *hdev = conn->hdev; - struct inquiry_entry *ie; - struct hci_cp_create_conn cp; - - BT_DBG("hcon %p", conn); - - /* Many controllers disallow HCI Create Connection while it is doing - * HCI Inquiry. So we cancel the Inquiry first before issuing HCI Create - * Connection. This may cause the MGMT discovering state to become false - * without user space's request but it is okay since the MGMT Discovery - * APIs do not promise that discovery should be done forever. Instead, - * the user space monitors the status of MGMT discovering and it may - * request for discovery again when this flag becomes false. - */ - if (test_bit(HCI_INQUIRY, &hdev->flags)) { - /* Put this connection to "pending" state so that it will be - * executed after the inquiry cancel command complete event. 
- */ - conn->state = BT_CONNECT2; - hci_send_cmd(hdev, HCI_OP_INQUIRY_CANCEL, 0, NULL); - return; - } - - conn->state = BT_CONNECT; - conn->out = true; - conn->role = HCI_ROLE_MASTER; - - conn->attempt++; - - conn->link_policy = hdev->link_policy; - - memset(&cp, 0, sizeof(cp)); - bacpy(&cp.bdaddr, &conn->dst); - cp.pscan_rep_mode = 0x02; - - ie = hci_inquiry_cache_lookup(hdev, &conn->dst); - if (ie) { - if (inquiry_entry_age(ie) <= INQUIRY_ENTRY_AGE_MAX) { - cp.pscan_rep_mode = ie->data.pscan_rep_mode; - cp.pscan_mode = ie->data.pscan_mode; - cp.clock_offset = ie->data.clock_offset | - cpu_to_le16(0x8000); - } - - memcpy(conn->dev_class, ie->data.dev_class, 3); - } - - cp.pkt_type = cpu_to_le16(conn->pkt_type); - if (lmp_rswitch_capable(hdev) && !(hdev->link_mode & HCI_LM_MASTER)) - cp.role_switch = 0x01; - else - cp.role_switch = 0x00; - - hci_send_cmd(hdev, HCI_OP_CREATE_CONN, sizeof(cp), &cp); -} - int hci_disconnect(struct hci_conn *conn, __u8 reason) { BT_DBG("hcon %p", conn); @@ -1696,10 +1638,17 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, acl->conn_reason = conn_reason; if (acl->state == BT_OPEN || acl->state == BT_CLOSED) { + int err; + acl->sec_level = BT_SECURITY_LOW; acl->pending_sec_level = sec_level; acl->auth_type = auth_type; - hci_acl_create_connection(acl); + + err = hci_acl_create_connection_sync(hdev, acl); + if (err) { + hci_conn_del(acl); + return ERR_PTR(err); + } } return acl; @@ -2654,7 +2603,7 @@ void hci_conn_check_pending(struct hci_dev *hdev) conn = hci_conn_hash_lookup_state(hdev, ACL_LINK, BT_CONNECT2); if (conn) - hci_acl_create_connection(conn); + hci_acl_create_connection_sync(hdev, conn); hci_dev_unlock(hdev); } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 1122296ce3fa3f..617407b81ffeca 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -6492,3 +6492,73 @@ int hci_update_adv_data(struct hci_dev *hdev, u8 instance) return hci_cmd_sync_queue(hdev, _update_adv_data_sync, UINT_PTR(instance), NULL); } + +static int __hci_acl_create_connection_sync(struct hci_dev *hdev, void *data) +{ + struct hci_conn *conn = data; + struct inquiry_entry *ie; + struct hci_cp_create_conn cp; + int err; + + /* Many controllers disallow HCI Create Connection while it is doing + * HCI Inquiry. So we cancel the Inquiry first before issuing HCI Create + * Connection. This may cause the MGMT discovering state to become false + * without user space's request but it is okay since the MGMT Discovery + * APIs do not promise that discovery should be done forever. Instead, + * the user space monitors the status of MGMT discovering and it may + * request for discovery again when this flag becomes false. 
+ */ + if (test_bit(HCI_INQUIRY, &hdev->flags)) { + err = __hci_cmd_sync_status(hdev, HCI_OP_INQUIRY_CANCEL, 0, + NULL, HCI_CMD_TIMEOUT); + if (err) + bt_dev_warn(hdev, "Failed to cancel inquiry %d", err); + } + + conn->state = BT_CONNECT; + conn->out = true; + conn->role = HCI_ROLE_MASTER; + + conn->attempt++; + + conn->link_policy = hdev->link_policy; + + memset(&cp, 0, sizeof(cp)); + bacpy(&cp.bdaddr, &conn->dst); + cp.pscan_rep_mode = 0x02; + + ie = hci_inquiry_cache_lookup(hdev, &conn->dst); + if (ie) { + if (inquiry_entry_age(ie) <= INQUIRY_ENTRY_AGE_MAX) { + cp.pscan_rep_mode = ie->data.pscan_rep_mode; + cp.pscan_mode = ie->data.pscan_mode; + cp.clock_offset = ie->data.clock_offset | + cpu_to_le16(0x8000); + } + + memcpy(conn->dev_class, ie->data.dev_class, 3); + } + + cp.pkt_type = cpu_to_le16(conn->pkt_type); + if (lmp_rswitch_capable(hdev) && !(hdev->link_mode & HCI_LM_MASTER)) + cp.role_switch = 0x01; + else + cp.role_switch = 0x00; + + err = __hci_cmd_sync_status_sk(hdev, HCI_OP_CREATE_CONN, + sizeof(cp), &cp, + HCI_EV_CONN_COMPLETE, + HCI_ACL_CONN_TIMEOUT, NULL); + + if (err == -ETIMEDOUT) + hci_abort_conn_sync(hdev, conn, HCI_ERROR_LOCAL_HOST_TERM); + + return err; +} + +int hci_acl_create_connection_sync(struct hci_dev *hdev, + struct hci_conn *conn) +{ + return hci_cmd_sync_queue(hdev, __hci_acl_create_connection_sync, + conn, NULL); +} From 8e14d581d125a11e62bf837ad709b106095e5557 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= Date: Tue, 6 Feb 2024 12:08:14 +0100 Subject: [PATCH 0172/1406] Bluetooth: Remove pending ACL connection attempts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the last commit we moved to using the hci_sync queue for "Create Connection" requests, removing the need for retrying the paging after finished/failed "Create Connection" requests and after the end of inquiries. hci_conn_check_pending() was used to trigger this retry; we can remove it now. Note that we can also remove the special handling for COMMAND_DISALLOWED errors in the completion handler of "Create Connection", because "Create Connection" requests are now always serialized. This is somewhat reverting commit 4c67bc74f016 ("[Bluetooth] Support concurrent connect requests"). With this, the BT_CONNECT2 state of ACL hci_conn objects should now be back to meaning only one thing: that we received a "Connection Request" from another device (see hci_conn_request_evt), but the response to that is going to be deferred.
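As a rough model of the serialization these two patches rely on, here is a standalone sketch of issuing requests strictly one at a time from a FIFO, with the next request only sent from the completion path. The queue and callbacks are hypothetical stand-ins for the hci_sync machinery, and the sketch omits overflow handling:

/* Only one "Create Connection" in flight; the rest wait in order. */
#include <stdio.h>

#define MAX_PENDING 8

struct conn_req { int handle; };

static struct conn_req queue[MAX_PENDING];
static int head, tail;

static void issue(struct conn_req *req)
{
	printf("paging device %d\n", req->handle);
}

static void submit(struct conn_req req)
{
	queue[tail++ % MAX_PENDING] = req;
	if (tail - head == 1)		/* idle: start immediately */
		issue(&queue[head % MAX_PENDING]);
	/* else: the completion handler will issue it in order */
}

static void on_complete(void)
{
	head++;
	if (tail - head > 0)		/* kick the next queued attempt */
		issue(&queue[head % MAX_PENDING]);
}

int main(void)
{
	submit((struct conn_req){ .handle = 1 });
	submit((struct conn_req){ .handle = 2 });	/* waits for #1 */
	on_complete();					/* #2 pages now */
	on_complete();
	return 0;
}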
Signed-off-by: Jonas Dreßler Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 1 - net/bluetooth/hci_conn.c | 16 ---------------- net/bluetooth/hci_event.c | 21 ++++----------------- 3 files changed, 4 insertions(+), 34 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 8f8dd917371423..34aa9d0290fee6 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1480,7 +1480,6 @@ struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type, bdaddr_t *dst, u8 role); void hci_conn_del(struct hci_conn *conn); void hci_conn_hash_flush(struct hci_dev *hdev); -void hci_conn_check_pending(struct hci_dev *hdev); struct hci_chan *hci_chan_create(struct hci_conn *conn); void hci_chan_del(struct hci_chan *chan); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 816be7667a8cfc..a4beed8587eb76 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -2592,22 +2592,6 @@ void hci_conn_hash_flush(struct hci_dev *hdev) } } -/* Check pending connect attempts */ -void hci_conn_check_pending(struct hci_dev *hdev) -{ - struct hci_conn *conn; - - BT_DBG("hdev %s", hdev->name); - - hci_dev_lock(hdev); - - conn = hci_conn_hash_lookup_state(hdev, ACL_LINK, BT_CONNECT2); - if (conn) - hci_acl_create_connection_sync(hdev, conn); - - hci_dev_unlock(hdev); -} - static u32 get_link_mode(struct hci_conn *conn) { u32 link_mode = 0; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 271c00792801c4..e7887bae334b28 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -118,8 +118,6 @@ static u8 hci_cc_inquiry_cancel(struct hci_dev *hdev, void *data, hci_discovery_set_state(hdev, DISCOVERY_STOPPED); hci_dev_unlock(hdev); - hci_conn_check_pending(hdev); - return rp->status; } @@ -150,8 +148,6 @@ static u8 hci_cc_exit_periodic_inq(struct hci_dev *hdev, void *data, hci_dev_clear_flag(hdev, HCI_PERIODIC_INQ); - hci_conn_check_pending(hdev); - return rp->status; } @@ -2312,10 +2308,8 @@ static void hci_cs_inquiry(struct hci_dev *hdev, __u8 status) { bt_dev_dbg(hdev, "status 0x%2.2x", status); - if (status) { - hci_conn_check_pending(hdev); + if (status) return; - } if (hci_sent_cmd_data(hdev, HCI_OP_INQUIRY)) set_bit(HCI_INQUIRY, &hdev->flags); @@ -2340,12 +2334,9 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status) if (status) { if (conn && conn->state == BT_CONNECT) { - if (status != HCI_ERROR_COMMAND_DISALLOWED || conn->attempt > 2) { - conn->state = BT_CLOSED; - hci_connect_cfm(conn, status); - hci_conn_del(conn); - } else - conn->state = BT_CONNECT2; + conn->state = BT_CLOSED; + hci_connect_cfm(conn, status); + hci_conn_del(conn); } } else { if (!conn) { @@ -3035,8 +3026,6 @@ static void hci_inquiry_complete_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); - hci_conn_check_pending(hdev); - if (!test_and_clear_bit(HCI_INQUIRY, &hdev->flags)) return; @@ -3258,8 +3247,6 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, unlock: hci_dev_unlock(hdev); - - hci_conn_check_pending(hdev); } static void hci_reject_conn(struct hci_dev *hdev, bdaddr_t *bdaddr) From 2c312f328894df239807173b5b34f1db4cfc397f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 18 Jan 2024 15:06:05 -0800 Subject: [PATCH 0173/1406] ubsan: Reintroduce signed overflow sanitizer In order to mitigate unexpected signed wrap-around[1], bring back the signed integer overflow sanitizer. 
It was removed in commit 6aaa31aeb9cf ("ubsan: remove overflow checks") because it was effectively a no-op when combined with -fno-strict-overflow (which correctly changes signed overflow from being "undefined" to being explicitly "wrap around"). Compilers are adjusting their sanitizers to trap wrap-around and to detect common code patterns that should not be instrumented (e.g. "var + offset < var"). Prepare for this and explicitly rename the option from "OVERFLOW" to "WRAP". To annotate intentional wrap-around arithmetic, the add/sub/mul_wrap() helpers can be used for individual statements. At the function level, the __signed_wrap attribute can be used to mark an entire function as expecting its signed arithmetic to wrap around. For a single object file the Makefile can use "UBSAN_WRAP_SIGNED_target.o := n" to mark it as wrapping, and for an entire directory, "UBSAN_WRAP_SIGNED := n" can be used. Additionally, keep these disabled under CONFIG_COMPILE_TEST for now. Link: https://github.com/KSPP/linux/issues/26 [1] Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Peter Zijlstra Cc: Hao Luo Reviewed-by: Marco Elver Reviewed-by: Justin Stitt Signed-off-by: Kees Cook --- include/linux/compiler_types.h | 9 ++++- lib/Kconfig.ubsan | 14 +++++++ lib/test_ubsan.c | 37 ++++++++++++++++++ lib/ubsan.c | 68 ++++++++++++++++++++++++++++++++++ lib/ubsan.h | 4 ++ scripts/Makefile.lib | 3 ++ scripts/Makefile.ubsan | 3 ++ 7 files changed, 137 insertions(+), 1 deletion(-) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 6f1ca49306d2f7..ee9d272008a5ed 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -282,11 +282,18 @@ struct ftrace_likely_data { #define __no_sanitize_or_inline __always_inline #endif +/* Do not trap wrapping arithmetic within an annotated function. */ +#ifdef CONFIG_UBSAN_SIGNED_WRAP +# define __signed_wrap __attribute__((no_sanitize("signed-integer-overflow"))) +#else +# define __signed_wrap +#endif + /* Section for code which can't be instrumented at all */ #define __noinstr_section(section) \ noinline notrace __attribute((__section__(section))) \ __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage \ - __no_sanitize_memory + __no_sanitize_memory __signed_wrap #define noinstr __noinstr_section(".noinstr.text") diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index 56d7653f494138..129e9bc21877bc 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -116,6 +116,20 @@ config UBSAN_UNREACHABLE This option enables -fsanitize=unreachable which checks for control flow reaching an expected-to-be-unreachable position. +config UBSAN_SIGNED_WRAP + bool "Perform checking for signed arithmetic wrap-around" + default UBSAN + depends on !COMPILE_TEST + depends on $(cc-option,-fsanitize=signed-integer-overflow) + help + This option enables -fsanitize=signed-integer-overflow which checks + for wrap-around of any arithmetic operations with signed integers. + This currently performs nearly no instrumentation due to the + kernel's use of -fno-strict-overflow which converts all would-be + arithmetic undefined behavior into wrap-around arithmetic. Future + sanitizer versions will allow for wrap-around checking (rather than + exclusively undefined behavior).
+ config UBSAN_BOOL bool "Perform checking for non-boolean values used as boolean" default UBSAN diff --git a/lib/test_ubsan.c b/lib/test_ubsan.c index f4ee2484d4b5e3..276c12140ee26d 100644 --- a/lib/test_ubsan.c +++ b/lib/test_ubsan.c @@ -11,6 +11,39 @@ typedef void(*test_ubsan_fp)(void); #config, IS_ENABLED(config) ? "y" : "n"); \ } while (0) +static void test_ubsan_add_overflow(void) +{ + volatile int val = INT_MAX; + + UBSAN_TEST(CONFIG_UBSAN_SIGNED_WRAP); + val += 2; +} + +static void test_ubsan_sub_overflow(void) +{ + volatile int val = INT_MIN; + volatile int val2 = 2; + + UBSAN_TEST(CONFIG_UBSAN_SIGNED_WRAP); + val -= val2; +} + +static void test_ubsan_mul_overflow(void) +{ + volatile int val = INT_MAX / 2; + + UBSAN_TEST(CONFIG_UBSAN_SIGNED_WRAP); + val *= 3; +} + +static void test_ubsan_negate_overflow(void) +{ + volatile int val = INT_MIN; + + UBSAN_TEST(CONFIG_UBSAN_SIGNED_WRAP); + val = -val; +} + static void test_ubsan_divrem_overflow(void) { volatile int val = 16; @@ -90,6 +123,10 @@ static void test_ubsan_misaligned_access(void) } static const test_ubsan_fp test_ubsan_array[] = { + test_ubsan_add_overflow, + test_ubsan_sub_overflow, + test_ubsan_mul_overflow, + test_ubsan_negate_overflow, test_ubsan_shift_out_of_bounds, test_ubsan_out_of_bounds, test_ubsan_load_invalid_value, diff --git a/lib/ubsan.c b/lib/ubsan.c index df4f8d1354bbf4..5fc107f61934c2 100644 --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -222,6 +222,74 @@ static void ubsan_epilogue(void) check_panic_on_warn("UBSAN"); } +static void handle_overflow(struct overflow_data *data, void *lhs, + void *rhs, char op) +{ + + struct type_descriptor *type = data->type; + char lhs_val_str[VALUE_LENGTH]; + char rhs_val_str[VALUE_LENGTH]; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, type_is_signed(type) ? 
+ "signed-integer-overflow" : + "unsigned-integer-overflow"); + + val_to_string(lhs_val_str, sizeof(lhs_val_str), type, lhs); + val_to_string(rhs_val_str, sizeof(rhs_val_str), type, rhs); + pr_err("%s %c %s cannot be represented in type %s\n", + lhs_val_str, + op, + rhs_val_str, + type->type_name); + + ubsan_epilogue(); +} + +void __ubsan_handle_add_overflow(void *data, + void *lhs, void *rhs) +{ + + handle_overflow(data, lhs, rhs, '+'); +} +EXPORT_SYMBOL(__ubsan_handle_add_overflow); + +void __ubsan_handle_sub_overflow(void *data, + void *lhs, void *rhs) +{ + handle_overflow(data, lhs, rhs, '-'); +} +EXPORT_SYMBOL(__ubsan_handle_sub_overflow); + +void __ubsan_handle_mul_overflow(void *data, + void *lhs, void *rhs) +{ + handle_overflow(data, lhs, rhs, '*'); +} +EXPORT_SYMBOL(__ubsan_handle_mul_overflow); + +void __ubsan_handle_negate_overflow(void *_data, void *old_val) +{ + struct overflow_data *data = _data; + char old_val_str[VALUE_LENGTH]; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, "negation-overflow"); + + val_to_string(old_val_str, sizeof(old_val_str), data->type, old_val); + + pr_err("negation of %s cannot be represented in type %s:\n", + old_val_str, data->type->type_name); + + ubsan_epilogue(); +} +EXPORT_SYMBOL(__ubsan_handle_negate_overflow); + + void __ubsan_handle_divrem_overflow(void *_data, void *lhs, void *rhs) { struct overflow_data *data = _data; diff --git a/lib/ubsan.h b/lib/ubsan.h index 5d99ab81913bbd..0abbbac8700d19 100644 --- a/lib/ubsan.h +++ b/lib/ubsan.h @@ -124,6 +124,10 @@ typedef s64 s_max; typedef u64 u_max; #endif +void __ubsan_handle_add_overflow(void *data, void *lhs, void *rhs); +void __ubsan_handle_sub_overflow(void *data, void *lhs, void *rhs); +void __ubsan_handle_mul_overflow(void *data, void *lhs, void *rhs); +void __ubsan_handle_negate_overflow(void *_data, void *old_val); void __ubsan_handle_divrem_overflow(void *_data, void *lhs, void *rhs); void __ubsan_handle_type_mismatch(struct type_mismatch_data *data, void *ptr); void __ubsan_handle_type_mismatch_v1(void *_data, void *ptr); diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 52efc520ae4fa3..7ce8ecccc65a25 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -177,6 +177,9 @@ ifeq ($(CONFIG_UBSAN),y) _c_flags += $(if $(patsubst n%,, \ $(UBSAN_SANITIZE_$(basetarget).o)$(UBSAN_SANITIZE)y), \ $(CFLAGS_UBSAN)) +_c_flags += $(if $(patsubst n%,, \ + $(UBSAN_WRAP_SIGNED_$(basetarget).o)$(UBSAN_SANITIZE_$(basetarget).o)$(UBSAN_WRAP_SIGNED)$(UBSAN_SANITIZE)y), \ + $(CFLAGS_UBSAN_WRAP_SIGNED)) endif ifeq ($(CONFIG_KCOV),y) diff --git a/scripts/Makefile.ubsan b/scripts/Makefile.ubsan index 7cf42231042b67..bc957add0b4d94 100644 --- a/scripts/Makefile.ubsan +++ b/scripts/Makefile.ubsan @@ -13,3 +13,6 @@ ubsan-cflags-$(CONFIG_UBSAN_ENUM) += -fsanitize=enum ubsan-cflags-$(CONFIG_UBSAN_TRAP) += $(call cc-option,-fsanitize-trap=undefined,-fsanitize-undefined-trap-on-error) export CFLAGS_UBSAN := $(ubsan-cflags-y) + +ubsan-wrap-signed-cflags-$(CONFIG_UBSAN_SIGNED_WRAP) += -fsanitize=signed-integer-overflow +export CFLAGS_UBSAN_WRAP_SIGNED := $(ubsan-wrap-signed-cflags-y) From 8a05fa1fd114b2e274e43829ce64d3fbea16a283 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Feb 2024 03:18:14 -0800 Subject: [PATCH 0174/1406] string: Redefine strscpy_pad() as a macro In preparation for making strscpy_pad()'s 3rd argument optional, redefine it as a macro. 
This also has the benefit of allowing greater FORTIFY introspection, as it couldn't see into the strscpy() nor the memset() within strscpy_pad(). Cc: Andy Shevchenko Cc: Andrew Morton Cc: Reviewed-by: Justin Stitt Signed-off-by: Kees Cook --- include/linux/string.h | 33 +++++++++++++++++++++++++++++++-- lib/string_helpers.c | 34 ---------------------------------- 2 files changed, 31 insertions(+), 36 deletions(-) diff --git a/include/linux/string.h b/include/linux/string.h index ab148d8dbfc146..78b28004c5ba76 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -70,8 +70,37 @@ extern char * strncpy(char *,const char *, __kernel_size_t); ssize_t strscpy(char *, const char *, size_t); #endif -/* Wraps calls to strscpy()/memset(), no arch specific code required */ -ssize_t strscpy_pad(char *dest, const char *src, size_t count); +/** + * strscpy_pad() - Copy a C-string into a sized buffer + * @dest: Where to copy the string to + * @src: Where to copy the string from + * @count: Size of destination buffer + * + * Copy the string, or as much of it as fits, into the dest buffer. The + * behavior is undefined if the string buffers overlap. The destination + * buffer is always %NUL terminated, unless it's zero-sized. + * + * If the source string is shorter than the destination buffer, the + * remaining bytes in the buffer will be filled with %NUL bytes. + * + * For full explanation of why you may want to consider using the + * 'strscpy' functions please see the function docstring for strscpy(). + * + * Returns: + * * The number of characters copied (not including the trailing %NULs) + * * -E2BIG if count is 0 or @src was truncated. + */ +#define strscpy_pad(dest, src, count) ({ \ + char *__dst = (dest); \ + const char *__src = (src); \ + const size_t __count = (count); \ + ssize_t __wrote; \ + \ + __wrote = strscpy(__dst, __src, __count); \ + if (__wrote >= 0 && __wrote < __count) \ + memset(__dst + __wrote + 1, 0, __count - __wrote - 1); \ + __wrote; \ +}) #ifndef __HAVE_ARCH_STRCAT extern char * strcat(char *, const char *); diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 7713f73e66b0f3..606c3099013fdd 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -825,40 +825,6 @@ char **devm_kasprintf_strarray(struct device *dev, const char *prefix, size_t n) } EXPORT_SYMBOL_GPL(devm_kasprintf_strarray); -/** - * strscpy_pad() - Copy a C-string into a sized buffer - * @dest: Where to copy the string to - * @src: Where to copy the string from - * @count: Size of destination buffer - * - * Copy the string, or as much of it as fits, into the dest buffer. The - * behavior is undefined if the string buffers overlap. The destination - * buffer is always %NUL terminated, unless it's zero-sized. - * - * If the source string is shorter than the destination buffer, zeros - * the tail of the destination buffer. - * - * For full explanation of why you may want to consider using the - * 'strscpy' functions please see the function docstring for strscpy(). - * - * Returns: - * * The number of characters copied (not including the trailing %NUL) - * * -E2BIG if count is 0 or @src was truncated. - */ -ssize_t strscpy_pad(char *dest, const char *src, size_t count) -{ - ssize_t written; - - written = strscpy(dest, src, count); - if (written < 0 || written == count - 1) - return written; - - memset(dest + written + 1, 0, count - written - 1); - - return written; -} -EXPORT_SYMBOL(strscpy_pad); - /** * skip_spaces - Removes leading whitespace from @str.
* @str: The string to be stripped. From 671fa62f7a99df877f89ee77c394e966ee4e631a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 20 Sep 2023 12:38:14 -0700 Subject: [PATCH 0175/1406] string: Allow 2-argument strscpy() Using sizeof(dst) for the "size" argument in strscpy() is the overwhelmingly common case. Instead of requiring this everywhere, allow a 2-argument version to be used that will use the sizeof() internally. There are other functions in the kernel with optional arguments[1], so this isn't unprecedented, and improves readability. Update and relocate the kern-doc for strscpy() too, and drop __HAVE_ARCH_STRSCPY as it is unused. Adjust ARCH=um build to notice the changed export name, as it doesn't do full header includes for the string helpers. This could additionally let us save a few hundred lines of code: 1177 files changed, 2455 insertions(+), 3026 deletions(-) with a treewide cleanup using Coccinelle: @needless_arg@ expression DST, SRC; @@ strscpy(DST, SRC -, sizeof(DST) ) Link: https://elixir.bootlin.com/linux/v6.7/source/include/linux/pci.h#L1517 [1] Reviewed-by: Justin Stitt Cc: Andy Shevchenko Cc: linux-hardening@vger.kernel.org Signed-off-by: Kees Cook --- arch/um/include/shared/user.h | 3 ++- include/linux/fortify-string.h | 22 ++------------------ include/linux/string.h | 38 +++++++++++++++++++++++++++++++--- lib/string.c | 6 ++---- 4 files changed, 41 insertions(+), 28 deletions(-) diff --git a/arch/um/include/shared/user.h b/arch/um/include/shared/user.h index 981e11d8e02543..9568cc04cbb7b0 100644 --- a/arch/um/include/shared/user.h +++ b/arch/um/include/shared/user.h @@ -51,7 +51,8 @@ static inline int printk(const char *fmt, ...) extern int in_aton(char *str); extern size_t strlcat(char *, const char *, size_t); -extern size_t strscpy(char *, const char *, size_t); +extern size_t sized_strscpy(char *, const char *, size_t); +#define strscpy(dst, src, size) sized_strscpy(dst, src, size) /* Copied from linux/compiler-gcc.h since we can't include it directly */ #define barrier() __asm__ __volatile__("": : :"memory") diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 89a6888f2f9e50..06b3aaa63724d0 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -215,26 +215,8 @@ __kernel_size_t __fortify_strlen(const char * const POS p) } /* Defined after fortified strnlen() to reuse it. */ -extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy); -/** - * strscpy - Copy a C-string into a sized buffer - * - * @p: Where to copy the string to - * @q: Where to copy the string from - * @size: Size of destination buffer - * - * Copy the source string @q, or as much of it as fits, into the destination - * @p buffer. The behavior is undefined if the string buffers overlap. The - * destination @p buffer is always NUL terminated, unless it's zero-sized. - * - * Preferred to strncpy() since it always returns a valid string, and - * doesn't unnecessarily force the tail of the destination buffer to be - * zero padded. If padding is desired please use strscpy_pad(). - * - * Returns the number of characters copied in @p (not including the - * trailing %NUL) or -E2BIG if @size is 0 or the copy of @q was truncated. 
- */ -__FORTIFY_INLINE ssize_t strscpy(char * const POS p, const char * const POS q, size_t size) +extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(sized_strscpy); +__FORTIFY_INLINE ssize_t sized_strscpy(char * const POS p, const char * const POS q, size_t size) { /* Use string size rather than possible enclosing struct size. */ const size_t p_size = __member_size(p); diff --git a/include/linux/string.h b/include/linux/string.h index 78b28004c5ba76..0d66bf9407fdd4 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -2,6 +2,7 @@ #ifndef _LINUX_STRING_H_ #define _LINUX_STRING_H_ +#include #include #include /* for inline */ #include /* for size_t */ @@ -66,9 +67,40 @@ extern char * strcpy(char *,const char *); #ifndef __HAVE_ARCH_STRNCPY extern char * strncpy(char *,const char *, __kernel_size_t); #endif -#ifndef __HAVE_ARCH_STRSCPY -ssize_t strscpy(char *, const char *, size_t); -#endif +ssize_t sized_strscpy(char *, const char *, size_t); + +/* + * The 2 argument style can only be used when dst is an array with a + * known size. + */ +#define __strscpy0(dst, src, ...) \ + sized_strscpy(dst, src, sizeof(dst) + __must_be_array(dst)) +#define __strscpy1(dst, src, size) sized_strscpy(dst, src, size) + +/** + * strscpy - Copy a C-string into a sized buffer + * @dst: Where to copy the string to + * @src: Where to copy the string from + * @...: Size of destination buffer (optional) + * + * Copy the source string @src, or as much of it as fits, into the + * destination @dst buffer. The behavior is undefined if the string + * buffers overlap. The destination @dst buffer is always NUL terminated, + * unless it's zero-sized. + * + * The size argument @... is only required when @dst is not an array, or + * when the copy needs to be smaller than sizeof(@dst). + * + * Preferred to strncpy() since it always returns a valid string, and + * doesn't unnecessarily force the tail of the destination buffer to be + * zero padded. If padding is desired please use strscpy_pad(). + * + * Returns the number of characters copied in @dst (not including the + * trailing %NUL) or -E2BIG if @size is 0 or the copy from @src was + * truncated. + */ +#define strscpy(dst, src, ...) \ + CONCATENATE(__strscpy, COUNT_ARGS(__VA_ARGS__))(dst, src, __VA_ARGS__) /** * strscpy_pad() - Copy a C-string into a sized buffer diff --git a/lib/string.c b/lib/string.c index f791559102f69e..966da44bfc8693 100644 --- a/lib/string.c +++ b/lib/string.c @@ -104,8 +104,7 @@ char *strncpy(char *dest, const char *src, size_t count) EXPORT_SYMBOL(strncpy); #endif -#ifndef __HAVE_ARCH_STRSCPY -ssize_t strscpy(char *dest, const char *src, size_t count) +ssize_t sized_strscpy(char *dest, const char *src, size_t count) { const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; size_t max = count; @@ -171,8 +170,7 @@ ssize_t strscpy(char *dest, const char *src, size_t count) return -E2BIG; } -EXPORT_SYMBOL(strscpy); -#endif +EXPORT_SYMBOL(sized_strscpy); /** * stpcpy - copy a string from src to dest returning a pointer to the new end From 854c513be130aa96622d915d86b9ba9e741b990b Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 7 Feb 2024 14:42:11 +0100 Subject: [PATCH 0176/1406] Bluetooth: hci_event: Remove code to removed CONFIG_BT_HS Commit cec9f3c5561d ("Bluetooth: Remove BT_HS") removes config BT_HS, but misses two "ifdef BT_HS" blocks in hci_event.c. Remove this dead code from this removed config option. 
Signed-off-by: Lukas Bulwahn Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 163 -------------------------------------- 1 file changed, 163 deletions(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index e7887bae334b28..6071a1226e1b4c 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5663,150 +5663,6 @@ static void hci_remote_oob_data_request_evt(struct hci_dev *hdev, void *edata, hci_dev_unlock(hdev); } -#if IS_ENABLED(CONFIG_BT_HS) -static void hci_chan_selected_evt(struct hci_dev *hdev, void *data, - struct sk_buff *skb) -{ - struct hci_ev_channel_selected *ev = data; - struct hci_conn *hcon; - - bt_dev_dbg(hdev, "handle 0x%2.2x", ev->phy_handle); - - hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle); - if (!hcon) - return; - - amp_read_loc_assoc_final_data(hdev, hcon); -} - -static void hci_phy_link_complete_evt(struct hci_dev *hdev, void *data, - struct sk_buff *skb) -{ - struct hci_ev_phy_link_complete *ev = data; - struct hci_conn *hcon, *bredr_hcon; - - bt_dev_dbg(hdev, "handle 0x%2.2x status 0x%2.2x", ev->phy_handle, - ev->status); - - hci_dev_lock(hdev); - - hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle); - if (!hcon) - goto unlock; - - if (!hcon->amp_mgr) - goto unlock; - - if (ev->status) { - hci_conn_del(hcon); - goto unlock; - } - - bredr_hcon = hcon->amp_mgr->l2cap_conn->hcon; - - hcon->state = BT_CONNECTED; - bacpy(&hcon->dst, &bredr_hcon->dst); - - hci_conn_hold(hcon); - hcon->disc_timeout = HCI_DISCONN_TIMEOUT; - hci_conn_drop(hcon); - - hci_debugfs_create_conn(hcon); - hci_conn_add_sysfs(hcon); - - amp_physical_cfm(bredr_hcon, hcon); - -unlock: - hci_dev_unlock(hdev); -} - -static void hci_loglink_complete_evt(struct hci_dev *hdev, void *data, - struct sk_buff *skb) -{ - struct hci_ev_logical_link_complete *ev = data; - struct hci_conn *hcon; - struct hci_chan *hchan; - struct amp_mgr *mgr; - - bt_dev_dbg(hdev, "log_handle 0x%4.4x phy_handle 0x%2.2x status 0x%2.2x", - le16_to_cpu(ev->handle), ev->phy_handle, ev->status); - - hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle); - if (!hcon) - return; - - /* Create AMP hchan */ - hchan = hci_chan_create(hcon); - if (!hchan) - return; - - hchan->handle = le16_to_cpu(ev->handle); - hchan->amp = true; - - BT_DBG("hcon %p mgr %p hchan %p", hcon, hcon->amp_mgr, hchan); - - mgr = hcon->amp_mgr; - if (mgr && mgr->bredr_chan) { - struct l2cap_chan *bredr_chan = mgr->bredr_chan; - - l2cap_chan_lock(bredr_chan); - - bredr_chan->conn->mtu = hdev->block_mtu; - l2cap_logical_cfm(bredr_chan, hchan, 0); - hci_conn_hold(hcon); - - l2cap_chan_unlock(bredr_chan); - } -} - -static void hci_disconn_loglink_complete_evt(struct hci_dev *hdev, void *data, - struct sk_buff *skb) -{ - struct hci_ev_disconn_logical_link_complete *ev = data; - struct hci_chan *hchan; - - bt_dev_dbg(hdev, "handle 0x%4.4x status 0x%2.2x", - le16_to_cpu(ev->handle), ev->status); - - if (ev->status) - return; - - hci_dev_lock(hdev); - - hchan = hci_chan_lookup_handle(hdev, le16_to_cpu(ev->handle)); - if (!hchan || !hchan->amp) - goto unlock; - - amp_destroy_logical_link(hchan, ev->reason); - -unlock: - hci_dev_unlock(hdev); -} - -static void hci_disconn_phylink_complete_evt(struct hci_dev *hdev, void *data, - struct sk_buff *skb) -{ - struct hci_ev_disconn_phy_link_complete *ev = data; - struct hci_conn *hcon; - - bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); - - if (ev->status) - return; - - hci_dev_lock(hdev); - - hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle); - if 
(hcon && hcon->type == AMP_LINK) { - hcon->state = BT_CLOSED; - hci_disconn_cfm(hcon, ev->reason); - hci_conn_del(hcon); - } - - hci_dev_unlock(hdev); -} -#endif - static void le_conn_update_addr(struct hci_conn *conn, bdaddr_t *bdaddr, u8 bdaddr_type, bdaddr_t *local_rpa) { @@ -7616,25 +7472,6 @@ static const struct hci_ev { /* [0x3e = HCI_EV_LE_META] */ HCI_EV_REQ_VL(HCI_EV_LE_META, hci_le_meta_evt, sizeof(struct hci_ev_le_meta), HCI_MAX_EVENT_SIZE), -#if IS_ENABLED(CONFIG_BT_HS) - /* [0x40 = HCI_EV_PHY_LINK_COMPLETE] */ - HCI_EV(HCI_EV_PHY_LINK_COMPLETE, hci_phy_link_complete_evt, - sizeof(struct hci_ev_phy_link_complete)), - /* [0x41 = HCI_EV_CHANNEL_SELECTED] */ - HCI_EV(HCI_EV_CHANNEL_SELECTED, hci_chan_selected_evt, - sizeof(struct hci_ev_channel_selected)), - /* [0x42 = HCI_EV_DISCONN_PHY_LINK_COMPLETE] */ - HCI_EV(HCI_EV_DISCONN_LOGICAL_LINK_COMPLETE, - hci_disconn_loglink_complete_evt, - sizeof(struct hci_ev_disconn_logical_link_complete)), - /* [0x45 = HCI_EV_LOGICAL_LINK_COMPLETE] */ - HCI_EV(HCI_EV_LOGICAL_LINK_COMPLETE, hci_loglink_complete_evt, - sizeof(struct hci_ev_logical_link_complete)), - /* [0x46 = HCI_EV_DISCONN_LOGICAL_LINK_COMPLETE] */ - HCI_EV(HCI_EV_DISCONN_PHY_LINK_COMPLETE, - hci_disconn_phylink_complete_evt, - sizeof(struct hci_ev_disconn_phy_link_complete)), -#endif /* [0x48 = HCI_EV_NUM_COMP_BLOCKS] */ HCI_EV(HCI_EV_NUM_COMP_BLOCKS, hci_num_comp_blocks_evt, sizeof(struct hci_ev_num_comp_blocks)), From c8d2bfabba89b4f31ea5d666ea0e61d8a5f498bc Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Wed, 7 Feb 2024 00:35:18 +0800 Subject: [PATCH 0177/1406] eventfd: strictly check the count parameter of eventfd_write to avoid inputting illegal strings The eventfd documentation clearly states: A write(2) call adds the 8-byte integer value supplied in its buffer to the counter. However, in the current implementation, the following code snippet did not cause an error: char str[16] = "hello world"; uint64_t value; ssize_t size; int fd; fd = eventfd(0, 0); size = write(fd, &str, strlen(str)); printf("eventfd: test writing a string, size=%ld\n", size); size = read(fd, &value, sizeof(value)); printf("eventfd: test reading as uint64, size=%ld, value=0x%lX\n", size, value); close(fd); And its output is: eventfd: test writing a string, size=8 eventfd: test reading as uint64, size=8, value=0x6F77206F6C6C6568 By checking whether count is equal to sizeof(ucnt), such errors can be detected. It also follows the requirements of the manual.
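For illustration, a small userspace probe of the new behavior (not part of this patch; it assumes a kernel carrying the stricter check) might look like:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	char str[16] = "hello world";
	uint64_t one = 1;
	int fd = eventfd(0, 0);

	if (fd < 0)
		return 1;

	/* 11-byte write: previously consumed as 8 bytes, now rejected */
	if (write(fd, str, strlen(str)) < 0)
		printf("short write rejected: %s\n", strerror(errno)); /* EINVAL */

	/* writes of exactly sizeof(uint64_t) bytes still succeed */
	if (write(fd, &one, sizeof(one)) == sizeof(one))
		printf("8-byte write accepted\n");

	close(fd);
	return 0;
}

Run against a patched kernel, the 11-byte write should fail with EINVAL while the exact 8-byte write still succeeds.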
Signed-off-by: Wen Yang Link: https://lore.kernel.org/r/tencent_10AAA44731FFFA493F9F5501521F07DD4D0A@qq.com Cc: Alexander Viro Cc: Jens Axboe Cc: Christian Brauner Cc: Jan Kara Cc: David Woodhouse Cc: Matthew Wilcox Cc: Eric Biggers Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/eventfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/eventfd.c b/fs/eventfd.c index fc4d8109076392..9afdb722fa9257 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -251,7 +251,7 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c ssize_t res; __u64 ucnt; - if (count < sizeof(ucnt)) + if (count != sizeof(ucnt)) return -EINVAL; if (copy_from_user(&ucnt, buf, sizeof(ucnt))) return -EFAULT; From b4291c7fd9e550b91b10c3d7787b9bf5be38de67 Mon Sep 17 00:00:00 2001 From: Taylor Jackson Date: Thu, 8 Feb 2024 03:02:54 +0000 Subject: [PATCH 0178/1406] fs/mnt_idmapping.c: Return -EINVAL when no map is written MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, it is possible to create an idmapped mount using a user namespace without any mappings. However, this yields an idmapped mount that doesn't actually map the ids. With the following change, it will no longer be possible to create an idmapped mount when using a user namespace with no mappings, and will instead return EINVAL, an “invalid argument” error code. Reviewed-by: Christian Brauner Signed-off-by: Taylor Jackson Link: https://lore.kernel.org/r/20240208-mnt-idmap-inval-v2-1-58ef26d194e0@me.com Signed-off-by: Christian Brauner --- fs/mnt_idmapping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c index 64c5205e2b5e7d..3c60f1eaca615a 100644 --- a/fs/mnt_idmapping.c +++ b/fs/mnt_idmapping.c @@ -214,7 +214,7 @@ static int copy_mnt_idmap(struct uid_gid_map *map_from, * anything at all. */ if (nr_extents == 0) - return 0; + return -EINVAL; /* * Here we know that nr_extents is greater than zero which means From 7c0ff4ae24bf4589b5bb460e4b31817c61fa2bef Mon Sep 17 00:00:00 2001 From: Jianhua Lu Date: Mon, 29 Jan 2024 20:28:29 +0800 Subject: [PATCH 0179/1406] backlight: ktz8866: Correct the check for of_property_read_u32 of_property_read_u32() returns 0 on success, so the return value must be negated so that the property value is used only when it was actually read. Fixes: f8449c8f7355 ("backlight: ktz8866: Add support for Kinetic KTZ8866 backlight") Signed-off-by: Jianhua Lu Reviewed-by: Daniel Thompson Link: https://lore.kernel.org/r/20240129122829.16248-1-lujianhua000@gmail.com Signed-off-by: Lee Jones --- drivers/video/backlight/ktz8866.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/video/backlight/ktz8866.c b/drivers/video/backlight/ktz8866.c index 9c980f2571ee35..014877b5a9848f 100644 --- a/drivers/video/backlight/ktz8866.c +++ b/drivers/video/backlight/ktz8866.c @@ -97,20 +97,20 @@ static void ktz8866_init(struct ktz8866 *ktz) { unsigned int val = 0; - if (of_property_read_u32(ktz->client->dev.of_node, "current-num-sinks", &val)) + if (!of_property_read_u32(ktz->client->dev.of_node, "current-num-sinks", &val)) ktz8866_write(ktz, BL_EN, BIT(val) - 1); else /* Enable all 6 current sinks if the number of current sinks isn't specified.
*/ ktz8866_write(ktz, BL_EN, BIT(6) - 1); - if (of_property_read_u32(ktz->client->dev.of_node, "kinetic,current-ramp-delay-ms", &val)) { + if (!of_property_read_u32(ktz->client->dev.of_node, "kinetic,current-ramp-delay-ms", &val)) { if (val <= 128) ktz8866_write(ktz, BL_CFG2, BIT(7) | (ilog2(val) << 3) | PWM_HYST); else ktz8866_write(ktz, BL_CFG2, BIT(7) | ((5 + val / 64) << 3) | PWM_HYST); } - if (of_property_read_u32(ktz->client->dev.of_node, "kinetic,led-enable-ramp-delay-ms", &val)) { + if (!of_property_read_u32(ktz->client->dev.of_node, "kinetic,led-enable-ramp-delay-ms", &val)) { if (val == 0) ktz8866_write(ktz, BL_DIMMING, 0); else { From 7feb4ec8ec900daf29602bcdf7c04178c63205ac Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 28 Jan 2024 15:49:04 +0000 Subject: [PATCH 0180/1406] backlight: mp3309c: Use pwm_apply_might_sleep() pwm_apply_state() is deprecated since commit c748a6d77c06a ("pwm: Rename pwm_apply_state() to pwm_apply_might_sleep()"). This is the final user in the tree. Signed-off-by: Sean Young Tested-by: Flavio Suligoi Reviewed-by: Daniel Thompson Link: https://lore.kernel.org/r/20240128154905.407302-1-sean@mess.org Signed-off-by: Lee Jones --- drivers/video/backlight/mp3309c.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/video/backlight/mp3309c.c b/drivers/video/backlight/mp3309c.c index 34d71259fac1d7..b0d9aef6942b3b 100644 --- a/drivers/video/backlight/mp3309c.c +++ b/drivers/video/backlight/mp3309c.c @@ -131,7 +131,7 @@ static int mp3309c_bl_update_status(struct backlight_device *bl) chip->pdata->levels[brightness], chip->pdata->levels[chip->pdata->max_brightness]); pwmstate.enabled = true; - ret = pwm_apply_state(chip->pwmd, &pwmstate); + ret = pwm_apply_might_sleep(chip->pwmd, &pwmstate); if (ret) return ret; @@ -393,7 +393,7 @@ static int mp3309c_probe(struct i2c_client *client) chip->pdata->default_brightness, chip->pdata->max_brightness); pwmstate.enabled = true; - ret = pwm_apply_state(chip->pwmd, &pwmstate); + ret = pwm_apply_might_sleep(chip->pwmd, &pwmstate); if (ret) return dev_err_probe(chip->dev, ret, "error setting pwm device\n"); From d965a5ee7c95ce9414259181cbdccb1d2f1c1247 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 1 Feb 2024 16:47:42 +0200 Subject: [PATCH 0181/1406] backlight: hx8357: Make use of device properties Convert the module to be property provider agnostic and allow it to be used on non-OF platforms. Include mod_devicetable.h explicitly to replace the dropped of.h which included mod_devicetable.h indirectly. 
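The overall shape of such a conversion, sketched here with a hypothetical "foo" SPI driver (the names are illustrative, not taken from this patch):

// SPDX-License-Identifier: GPL-2.0
#include <linux/mod_devicetable.h>
#include <linux/module.h>
#include <linux/property.h>
#include <linux/spi/spi.h>

typedef int (*foo_init_fn)(struct spi_device *spi);

static int foo_init_a(struct spi_device *spi) { return 0; }
static int foo_init_b(struct spi_device *spi) { return 0; }

static const struct of_device_id foo_of_match[] = {
	{ .compatible = "vendor,foo-a", .data = foo_init_a },
	{ .compatible = "vendor,foo-b", .data = foo_init_b },
	{ }
};
MODULE_DEVICE_TABLE(of, foo_of_match);

static int foo_probe(struct spi_device *spi)
{
	/*
	 * device_get_match_data() returns the .data of the matching entry
	 * regardless of whether the device was described via OF, ACPI or
	 * software nodes, so the probe path no longer depends on CONFIG_OF.
	 */
	foo_init_fn init_fn = device_get_match_data(&spi->dev);

	if (!init_fn)
		return -EINVAL;

	return init_fn(spi);
}

static struct spi_driver foo_driver = {
	.probe = foo_probe,
	.driver = {
		.name = "foo",
		.of_match_table = foo_of_match,
	},
};
module_spi_driver(foo_driver);

MODULE_LICENSE("GPL");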
Signed-off-by: Andy Shevchenko Reviewed-by: Javier Martinez Canillas Reviewed-by: Daniel Thompson Link: https://lore.kernel.org/r/20240201144951.294215-2-andriy.shevchenko@linux.intel.com Signed-off-by: Lee Jones --- drivers/video/backlight/hx8357.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/video/backlight/hx8357.c b/drivers/video/backlight/hx8357.c index bf18337ff0c2c0..ac65609e5d8474 100644 --- a/drivers/video/backlight/hx8357.c +++ b/drivers/video/backlight/hx8357.c @@ -8,9 +8,9 @@ #include #include #include +#include #include -#include -#include +#include #include #define HX8357_NUM_IM_PINS 3 @@ -564,6 +564,8 @@ static struct lcd_ops hx8357_ops = { .get_power = hx8357_get_power, }; +typedef int (*hx8357_init_fn)(struct lcd_device *); + static const struct of_device_id hx8357_dt_ids[] = { { .compatible = "himax,hx8357", @@ -582,7 +584,7 @@ static int hx8357_probe(struct spi_device *spi) struct device *dev = &spi->dev; struct lcd_device *lcdev; struct hx8357_data *lcd; - const struct of_device_id *match; + hx8357_init_fn init_fn; int i, ret; lcd = devm_kzalloc(&spi->dev, sizeof(*lcd), GFP_KERNEL); @@ -597,8 +599,8 @@ static int hx8357_probe(struct spi_device *spi) lcd->spi = spi; - match = of_match_device(hx8357_dt_ids, &spi->dev); - if (!match || !match->data) + init_fn = device_get_match_data(dev); + if (!init_fn) return -EINVAL; lcd->reset = devm_gpiod_get(dev, "reset", GPIOD_OUT_LOW); @@ -627,7 +629,7 @@ static int hx8357_probe(struct spi_device *spi) hx8357_lcd_reset(lcdev); - ret = ((int (*)(struct lcd_device *))match->data)(lcdev); + ret = init_fn(lcdev); if (ret) { dev_err(&spi->dev, "Couldn't initialize panel\n"); return ret; From 3d226ecdfd83c0d89c1d4a430706e8228022685d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 1 Feb 2024 16:47:43 +0200 Subject: [PATCH 0182/1406] backlight: hx8357: Move OF table closer to its consumer Move OF table near to the user. While at it, drop comma at terminator entry. 
Signed-off-by: Andy Shevchenko Reviewed-by: Daniel Thompson Reviewed-by: Javier Martinez Canillas Link: https://lore.kernel.org/r/20240201144951.294215-3-andriy.shevchenko@linux.intel.com Signed-off-by: Lee Jones --- drivers/video/backlight/hx8357.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/video/backlight/hx8357.c b/drivers/video/backlight/hx8357.c index ac65609e5d8474..81d0984e9d8b66 100644 --- a/drivers/video/backlight/hx8357.c +++ b/drivers/video/backlight/hx8357.c @@ -566,19 +566,6 @@ static struct lcd_ops hx8357_ops = { typedef int (*hx8357_init_fn)(struct lcd_device *); -static const struct of_device_id hx8357_dt_ids[] = { - { - .compatible = "himax,hx8357", - .data = hx8357_lcd_init, - }, - { - .compatible = "himax,hx8369", - .data = hx8369_lcd_init, - }, - {}, -}; -MODULE_DEVICE_TABLE(of, hx8357_dt_ids); - static int hx8357_probe(struct spi_device *spi) { struct device *dev = &spi->dev; @@ -640,6 +627,19 @@ static int hx8357_probe(struct spi_device *spi) return 0; } +static const struct of_device_id hx8357_dt_ids[] = { + { + .compatible = "himax,hx8357", + .data = hx8357_lcd_init, + }, + { + .compatible = "himax,hx8369", + .data = hx8369_lcd_init, + }, + {} +}; +MODULE_DEVICE_TABLE(of, hx8357_dt_ids); + static struct spi_driver hx8357_driver = { .probe = hx8357_probe, .driver = { From f0ed1589885ae933e2b2f9c63e16f5be3fb0324d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 1 Feb 2024 16:47:44 +0200 Subject: [PATCH 0183/1406] backlight: hx8357: Make use of dev_err_probe() Simplify the error handling in probe function by switching from dev_err() to dev_err_probe(). Signed-off-by: Andy Shevchenko Reviewed-by: Daniel Thompson Reviewed-by: Javier Martinez Canillas Link: https://lore.kernel.org/r/20240201144951.294215-4-andriy.shevchenko@linux.intel.com Signed-off-by: Lee Jones --- drivers/video/backlight/hx8357.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/video/backlight/hx8357.c b/drivers/video/backlight/hx8357.c index 81d0984e9d8b66..70a62755805aa5 100644 --- a/drivers/video/backlight/hx8357.c +++ b/drivers/video/backlight/hx8357.c @@ -579,10 +579,8 @@ static int hx8357_probe(struct spi_device *spi) return -ENOMEM; ret = spi_setup(spi); - if (ret < 0) { - dev_err(&spi->dev, "SPI setup failed.\n"); - return ret; - } + if (ret < 0) + return dev_err_probe(dev, ret, "SPI setup failed.\n"); lcd->spi = spi; @@ -617,10 +615,8 @@ static int hx8357_probe(struct spi_device *spi) hx8357_lcd_reset(lcdev); ret = init_fn(lcdev); - if (ret) { - dev_err(&spi->dev, "Couldn't initialize panel\n"); - return ret; - } + if (ret) + return dev_err_probe(dev, ret, "Couldn't initialize panel\n"); dev_info(&spi->dev, "Panel probed\n"); From 27a4701c92250ae0aecb2edea1109f89cf344ba1 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 1 Feb 2024 16:47:45 +0200 Subject: [PATCH 0184/1406] backlight: hx8357: Utilise temporary variable for struct device We have a temporary variable to keep pointer to struct device. Utilise it inside the ->probe() implementation. 
Signed-off-by: Andy Shevchenko Reviewed-by: Daniel Thompson Reviewed-by: Javier Martinez Canillas Link: https://lore.kernel.org/r/20240201144951.294215-5-andriy.shevchenko@linux.intel.com Signed-off-by: Lee Jones --- drivers/video/backlight/hx8357.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/video/backlight/hx8357.c b/drivers/video/backlight/hx8357.c index 70a62755805aa5..339d9128fbdeb1 100644 --- a/drivers/video/backlight/hx8357.c +++ b/drivers/video/backlight/hx8357.c @@ -574,7 +574,7 @@ static int hx8357_probe(struct spi_device *spi) hx8357_init_fn init_fn; int i, ret; - lcd = devm_kzalloc(&spi->dev, sizeof(*lcd), GFP_KERNEL); + lcd = devm_kzalloc(dev, sizeof(*lcd), GFP_KERNEL); if (!lcd) return -ENOMEM; @@ -604,8 +604,7 @@ static int hx8357_probe(struct spi_device *spi) gpiod_set_consumer_name(lcd->im_pins->desc[i], "im_pins"); } - lcdev = devm_lcd_device_register(&spi->dev, "mxsfb", &spi->dev, lcd, - &hx8357_ops); + lcdev = devm_lcd_device_register(dev, "mxsfb", dev, lcd, &hx8357_ops); if (IS_ERR(lcdev)) { ret = PTR_ERR(lcdev); return ret; @@ -618,7 +617,7 @@ static int hx8357_probe(struct spi_device *spi) if (ret) return dev_err_probe(dev, ret, "Couldn't initialize panel\n"); - dev_info(&spi->dev, "Panel probed\n"); + dev_info(dev, "Panel probed\n"); return 0; } From 1405ad2e8a9d0c143dfa0d94e995cdeab88d68dc Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 7 Feb 2024 15:26:20 -0500 Subject: [PATCH 0185/1406] Bluetooth: hci_conn: Always use sk_timeo as conn_timeout Use the socket sk_timeo as conn_timeout when initiating a connection, and then use it when scheduling the resulting HCI command; that way the command is actually aborted synchronously, thus not blocking commands generated by hci_abort_conn_sync to inform the controller that the connection is to be aborted.
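From userspace this means the connection attempt can now be bounded per socket; a hedged sketch against the BlueZ library headers (the PSM and timeout values below are arbitrary examples):

#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <unistd.h>
#include <bluetooth/bluetooth.h>
#include <bluetooth/l2cap.h>

int l2cap_connect_bounded(const bdaddr_t *dst, uint16_t psm)
{
	struct timeval tv = { .tv_sec = 5 };	/* sk_sndtimeo, now also the conn_timeout */
	struct sockaddr_l2 addr;
	int fd;

	fd = socket(AF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP);
	if (fd < 0)
		return -1;

	setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv));

	memset(&addr, 0, sizeof(addr));
	addr.l2_family = AF_BLUETOOTH;
	addr.l2_psm = htobs(psm);
	bacpy(&addr.l2_bdaddr, dst);

	/* the kernel now aborts the HCI connection attempt after ~5s */
	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		close(fd);
		return -1;
	}

	return fd;
}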
Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 5 +++-- include/net/bluetooth/l2cap.h | 2 +- net/bluetooth/6lowpan.c | 2 +- net/bluetooth/hci_conn.c | 8 +++++--- net/bluetooth/hci_sync.c | 2 +- net/bluetooth/l2cap_core.c | 10 ++++------ net/bluetooth/l2cap_sock.c | 3 ++- net/bluetooth/mgmt.c | 3 ++- net/bluetooth/sco.c | 3 ++- 9 files changed, 21 insertions(+), 17 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 34aa9d0290fee6..2bdea85b7c447c 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1495,9 +1495,10 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, u16 conn_timeout, u8 role); struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, u8 sec_level, u8 auth_type, - enum conn_reasons conn_reason); + enum conn_reasons conn_reason, u16 timeout); struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, - __u16 setting, struct bt_codec *codec); + __u16 setting, struct bt_codec *codec, + u16 timeout); struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, struct bt_iso_qos *qos); struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 92d7197f9a5636..a4278aa618ab11 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -939,7 +939,7 @@ int l2cap_add_scid(struct l2cap_chan *chan, __u16 scid); struct l2cap_chan *l2cap_chan_create(void); void l2cap_chan_close(struct l2cap_chan *chan, int reason); int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, - bdaddr_t *dst, u8 dst_type); + bdaddr_t *dst, u8 dst_type, u16 timeout); int l2cap_chan_reconfigure(struct l2cap_chan *chan, __u16 mtu); int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len); void l2cap_chan_busy(struct l2cap_chan *chan, int busy); diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 4eb1b3ced0d27e..715cbafbf6631c 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -892,7 +892,7 @@ static int bt_6lowpan_connect(bdaddr_t *addr, u8 dst_type) chan->ops = &bt_6lowpan_chan_ops; err = l2cap_chan_connect(chan, cpu_to_le16(L2CAP_PSM_IPSP), 0, - addr, dst_type); + addr, dst_type, L2CAP_CONN_TIMEOUT); BT_DBG("chan %p err %d", chan, err); if (err < 0) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index a4beed8587eb76..8164502234c555 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1607,7 +1607,7 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, u8 sec_level, u8 auth_type, - enum conn_reasons conn_reason) + enum conn_reasons conn_reason, u16 timeout) { struct hci_conn *acl; @@ -1643,6 +1643,7 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, acl->sec_level = BT_SECURITY_LOW; acl->pending_sec_level = sec_level; acl->auth_type = auth_type; + acl->conn_timeout = timeout; err = hci_acl_create_connection_sync(hdev, acl); if (err) { @@ -1683,14 +1684,15 @@ static struct hci_link *hci_conn_link(struct hci_conn *parent, } struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, - __u16 setting, struct bt_codec *codec) + __u16 setting, struct bt_codec *codec, + u16 timeout) { struct hci_conn *acl; struct hci_conn *sco; struct hci_link *link; acl = hci_connect_acl(hdev, dst, 
BT_SECURITY_LOW, HCI_AT_NO_BONDING, - CONN_REASON_SCO_CONNECT); + CONN_REASON_SCO_CONNECT, timeout); if (IS_ERR(acl)) return acl; diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 617407b81ffeca..788a889210d868 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -6548,7 +6548,7 @@ static int __hci_acl_create_connection_sync(struct hci_dev *hdev, void *data) err = __hci_cmd_sync_status_sk(hdev, HCI_OP_CREATE_CONN, sizeof(cp), &cp, HCI_EV_CONN_COMPLETE, - HCI_ACL_CONN_TIMEOUT, NULL); + conn->conn_timeout, NULL); if (err == -ETIMEDOUT) hci_abort_conn_sync(hdev, conn, HCI_ERROR_LOCAL_HOST_TERM); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index ab5a9d42fae71a..467b242d8be071 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -6925,7 +6925,7 @@ static void l2cap_chan_by_pid(struct l2cap_chan *chan, void *data) } int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, - bdaddr_t *dst, u8 dst_type) + bdaddr_t *dst, u8 dst_type, u16 timeout) { struct l2cap_conn *conn; struct hci_conn *hcon; @@ -7018,19 +7018,17 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) hcon = hci_connect_le(hdev, dst, dst_type, false, - chan->sec_level, - HCI_LE_CONN_TIMEOUT, + chan->sec_level, timeout, HCI_ROLE_SLAVE); else hcon = hci_connect_le_scan(hdev, dst, dst_type, - chan->sec_level, - HCI_LE_CONN_TIMEOUT, + chan->sec_level, timeout, CONN_REASON_L2CAP_CHAN); } else { u8 auth_type = l2cap_get_auth_type(chan); hcon = hci_connect_acl(hdev, dst, chan->sec_level, auth_type, - CONN_REASON_L2CAP_CHAN); + CONN_REASON_L2CAP_CHAN, timeout); } if (IS_ERR(hcon)) { diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index ee7a41d6994fc2..4287aa6cc988e3 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -254,7 +254,8 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, chan->mode = L2CAP_MODE_LE_FLOWCTL; err = l2cap_chan_connect(chan, la.l2_psm, __le16_to_cpu(la.l2_cid), - &la.l2_bdaddr, la.l2_bdaddr_type); + &la.l2_bdaddr, la.l2_bdaddr_type, + sk->sk_sndtimeo); if (err) return err; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 064a67157d438b..78ab562807d0f1 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3444,7 +3444,8 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, if (cp->addr.type == BDADDR_BREDR) { conn = hci_connect_acl(hdev, &cp->addr.bdaddr, sec_level, - auth_type, CONN_REASON_PAIR_DEVICE); + auth_type, CONN_REASON_PAIR_DEVICE, + HCI_ACL_CONN_TIMEOUT); } else { u8 addr_type = le_addr_type(cp->addr.type); struct hci_conn_params *p; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index c736186aba26be..43daf965a01e4a 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -264,7 +264,8 @@ static int sco_connect(struct sock *sk) } hcon = hci_connect_sco(hdev, type, &sco_pi(sk)->dst, - sco_pi(sk)->setting, &sco_pi(sk)->codec, + sco_pi(sk)->setting, &sco_pi(sk)->codec, + sk->sk_sndtimeo); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; From c2eb366dc5d1e5821c253a928e5d36739ebda055 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 21 Dec 2023 18:31:48 +0100 Subject: [PATCH 0186/1406] leds: trigger: netdev: Skip setting baseline state in activate if hw-controlled The current code uses the sw_control path in set_baseline_state() when called from netdev_trig_activate(), even if we're hw-controlled.
This may result in errors when led_set_brightness() is called because we may not have set_brightness led ops (if hw doesn't support setting a "LED" to ON). In addition this path may schedule trigger_data->work which doesn't make sense when being hw-controlled. Therefore set trigger_data->hw_control = true before calling set_device_name() from netdev_trig_activate(). In this call chain we have to prevent set_baseline_state() from being called, because this would call hw_control_set(). Use led_cdev->trigger_data == NULL as indicator for being called from netdev_trig_activate(). Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/d3f2859c-2673-401c-a4f7-fcaef2167991@gmail.com Signed-off-by: Lee Jones --- drivers/leds/trigger/ledtrig-netdev.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/leds/trigger/ledtrig-netdev.c b/drivers/leds/trigger/ledtrig-netdev.c index 8e5475819590e0..1a0cfbba597638 100644 --- a/drivers/leds/trigger/ledtrig-netdev.c +++ b/drivers/leds/trigger/ledtrig-netdev.c @@ -277,7 +277,10 @@ static int set_device_name(struct led_netdev_data *trigger_data, trigger_data->last_activity = 0; - set_baseline_state(trigger_data); + /* Skip if we're called from netdev_trig_activate() and hw_control is true */ + if (!trigger_data->hw_control || led_get_trigger_data(trigger_data->led_cdev)) + set_baseline_state(trigger_data); + mutex_unlock(&trigger_data->lock); rtnl_unlock(); @@ -617,8 +620,8 @@ static int netdev_trig_activate(struct led_classdev *led_cdev) if (dev) { const char *name = dev_name(dev); - set_device_name(trigger_data, name, strlen(name)); trigger_data->hw_control = true; + set_device_name(trigger_data, name, strlen(name)); rc = led_cdev->hw_control_get(led_cdev, &mode); if (!rc) From cf493b66f5a6ee27a65e9ab66676b3ab42b9ca6f Mon Sep 17 00:00:00 2001 From: Anjelique Melendez Date: Thu, 21 Dec 2023 10:58:32 -0800 Subject: [PATCH 0187/1406] dt-bindings: leds: leds-qcom-lpg: Add support for LPG PPG Update leds-qcom-lpg binding to support LPG PPG. Signed-off-by: Anjelique Melendez Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231221185838.28440-3-quic_amelende@quicinc.com Signed-off-by: Lee Jones --- .../bindings/leds/leds-qcom-lpg.yaml | 82 ++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/leds/leds-qcom-lpg.yaml b/Documentation/devicetree/bindings/leds/leds-qcom-lpg.yaml index ea84ad426df18e..6649ca2ec80502 100644 --- a/Documentation/devicetree/bindings/leds/leds-qcom-lpg.yaml +++ b/Documentation/devicetree/bindings/leds/leds-qcom-lpg.yaml @@ -11,7 +11,7 @@ maintainers: description: > The Qualcomm Light Pulse Generator consists of three different hardware blocks; - a ramp generator with lookup table, the light pulse generator and a three + a ramp generator with lookup table (LUT), the light pulse generator and a three channel current sink. These blocks are found in a wide range of Qualcomm PMICs. properties: @@ -63,6 +63,29 @@ properties: - description: dtest line to attach - description: flags for the attachment + nvmem: + description: > + This property is required for PMICs that supports PPG, which is when a + PMIC stores LPG per-channel data and pattern LUT in SDAM modules instead + of in a LUT peripheral. For PMICs, such as PM8350C, per-channel data + and pattern LUT is separated into 2 SDAM modules. In that case, phandles + to both SDAM modules need to be specified. 
+ minItems: 1 + maxItems: 2 + + nvmem-names: + minItems: 1 + items: + - const: lpg_chan_sdam + - const: lut_sdam + + qcom,pbs: + $ref: /schemas/types.yaml#/definitions/phandle + description: > + Phandle of the Qualcomm Programmable Boot Sequencer node (PBS). + PBS node is used to trigger LPG pattern sequences for PMICs that support + single SDAM PPG. + multi-led: type: object $ref: leds-class-multicolor.yaml# @@ -106,6 +129,32 @@ required: additionalProperties: false +allOf: + - if: + properties: + compatible: + contains: + const: qcom,pmi632-lpg + then: + properties: + nvmem: + maxItems: 1 + nvmem-names: + maxItems: 1 + - if: + properties: + compatible: + contains: + enum: + - qcom,pm8350c-pwm + - qcom,pm8550-pwm + then: + properties: + nvmem: + minItems: 2 + nvmem-names: + minItems: 2 + examples: - | #include @@ -191,4 +240,35 @@ examples: compatible = "qcom,pm8916-pwm"; #pwm-cells = <2>; }; + - | + #include <dt-bindings/leds/common.h> + + led-controller { + compatible = "qcom,pmi632-lpg"; + #address-cells = <1>; + #size-cells = <0>; + #pwm-cells = <2>; + nvmem-names = "lpg_chan_sdam"; + nvmem = <&pmi632_sdam_7>; + qcom,pbs = <&pmi632_pbs_client3>; + + led@1 { + reg = <1>; + color = <LED_COLOR_ID_RED>; + label = "red"; + }; + + led@2 { + reg = <2>; + color = <LED_COLOR_ID_GREEN>; + label = "green"; + }; + + led@3 { + reg = <3>; + color = <LED_COLOR_ID_BLUE>; + label = "blue"; + }; + }; + ... From 214110175679aadf4ccaf5ddccc7e252295b2349 Mon Sep 17 00:00:00 2001 From: Anjelique Melendez Date: Thu, 21 Dec 2023 10:58:34 -0800 Subject: [PATCH 0188/1406] leds: rgb: leds-qcom-lpg: Add support for PPG through single SDAM In some PMICs, like the pmi632, the pattern lookup table (LUT) and the LPG configuration are stored in a single SDAM module instead of in a LUT peripheral. This feature is called PPG. PPG uses the Qualcomm Programmable Boot Sequencer (PBS) in order to trigger pattern sequences for PMICs.
Signed-off-by: Anjelique Melendez Tested-by: Luca Weiss Link: https://lore.kernel.org/r/20231221185838.28440-5-quic_amelende@quicinc.com Signed-off-by: Lee Jones --- drivers/leds/rgb/leds-qcom-lpg.c | 268 ++++++++++++++++++++++++++++--- 1 file changed, 244 insertions(+), 24 deletions(-) diff --git a/drivers/leds/rgb/leds-qcom-lpg.c b/drivers/leds/rgb/leds-qcom-lpg.c index 156b73d1f4a29d..2bdcf17e510727 100644 --- a/drivers/leds/rgb/leds-qcom-lpg.c +++ b/drivers/leds/rgb/leds-qcom-lpg.c @@ -8,11 +8,13 @@ #include #include #include +#include #include #include #include #include #include +#include #define LPG_SUBTYPE_REG 0x05 #define LPG_SUBTYPE_LPG 0x2 @@ -39,6 +41,8 @@ #define PWM_SEC_ACCESS_REG 0xd0 #define PWM_DTEST_REG(x) (0xe2 + (x) - 1) +#define SDAM_REG_PBS_SEQ_EN 0x42 + #define TRI_LED_SRC_SEL 0x45 #define TRI_LED_EN_CTL 0x46 #define TRI_LED_ATC_CTL 0x47 @@ -48,9 +52,25 @@ #define LPG_RESOLUTION_9BIT BIT(9) #define LPG_RESOLUTION_15BIT BIT(15) +#define PPG_MAX_LED_BRIGHTNESS 255 + #define LPG_MAX_M 7 #define LPG_MAX_PREDIV 6 +#define DEFAULT_TICK_DURATION_US 7800 +#define RAMP_STEP_DURATION(x) (((x) * 1000 / DEFAULT_TICK_DURATION_US) & 0xff) + +/* LPG common config settings for PPG */ +#define SDAM_REG_RAMP_STEP_DURATION 0x47 +#define SDAM_LPG_SDAM_LUT_PATTERN_OFFSET 0x80 + +/* LPG per channel config settings for PPG */ +#define SDAM_LUT_EN_OFFSET 0x0 +#define SDAM_PATTERN_CONFIG_OFFSET 0x1 +#define SDAM_END_INDEX_OFFSET 0x3 +#define SDAM_START_INDEX_OFFSET 0x4 +#define SDAM_PBS_SCRATCH_LUT_COUNTER_OFFSET 0x6 + struct lpg_channel; struct lpg_data; @@ -64,6 +84,9 @@ struct lpg_data; * @lut_base: base address of the LUT block (optional) * @lut_size: number of entries in the LUT block * @lut_bitmap: allocation bitmap for LUT entries + * @pbs_dev: PBS device + * @lpg_chan_sdam: LPG SDAM peripheral device + * @pbs_en_bitmap: bitmap for tracking PBS triggers * @triled_base: base address of the TRILED block (optional) * @triled_src: power-source for the TRILED * @triled_has_atc_ctl: true if there is TRI_LED_ATC_CTL register @@ -85,6 +108,10 @@ struct lpg { u32 lut_size; unsigned long *lut_bitmap; + struct pbs_dev *pbs_dev; + struct nvmem_device *lpg_chan_sdam; + unsigned long pbs_en_bitmap; + u32 triled_base; u32 triled_src; bool triled_has_atc_ctl; @@ -101,6 +128,7 @@ struct lpg { * @triled_mask: mask in TRILED to enable this channel * @lut_mask: mask in LUT to start pattern generator for this channel * @subtype: PMIC hardware block subtype + * @sdam_offset: channel offset in LPG SDAM * @in_use: channel is exposed to LED framework * @color: color of the LED attached to this channel * @dtest_line: DTEST line for output, or 0 if disabled @@ -129,6 +157,7 @@ struct lpg_channel { unsigned int triled_mask; unsigned int lut_mask; unsigned int subtype; + u32 sdam_offset; bool in_use; @@ -178,10 +207,12 @@ struct lpg_led { /** * struct lpg_channel_data - per channel initialization data + * @sdam_offset: Channel offset in LPG SDAM * @base: base address for PWM channel registers * @triled_mask: bitmask for controlling this channel in TRILED */ struct lpg_channel_data { + unsigned int sdam_offset; unsigned int base; u8 triled_mask; }; @@ -206,6 +237,52 @@ struct lpg_data { const struct lpg_channel_data *channels; }; +#define PBS_SW_TRIG_BIT BIT(0) + +static int lpg_clear_pbs_trigger(struct lpg *lpg, unsigned int lut_mask) +{ + u8 val = 0; + int rc; + + lpg->pbs_en_bitmap &= (~lut_mask); + if (!lpg->pbs_en_bitmap) { + rc = nvmem_device_write(lpg->lpg_chan_sdam, SDAM_REG_PBS_SEQ_EN, 1, &val); + if (rc < 
0) + return rc; + } + + return 0; +} + +static int lpg_set_pbs_trigger(struct lpg *lpg, unsigned int lut_mask) +{ + u8 val = PBS_SW_TRIG_BIT; + int rc; + + if (!lpg->pbs_en_bitmap) { + rc = nvmem_device_write(lpg->lpg_chan_sdam, SDAM_REG_PBS_SEQ_EN, 1, &val); + if (rc < 0) + return rc; + + rc = qcom_pbs_trigger_event(lpg->pbs_dev, val); + if (rc < 0) + return rc; + } + lpg->pbs_en_bitmap |= lut_mask; + + return 0; +} + +static int lpg_sdam_configure_triggers(struct lpg_channel *chan, u8 set_trig) +{ + u32 addr = SDAM_LUT_EN_OFFSET + chan->sdam_offset; + + if (!chan->lpg->lpg_chan_sdam) + return 0; + + return nvmem_device_write(chan->lpg->lpg_chan_sdam, addr, 1, &set_trig); +} + static int triled_set(struct lpg *lpg, unsigned int mask, unsigned int enable) { /* Skip if we don't have a triled block */ @@ -216,6 +293,40 @@ static int triled_set(struct lpg *lpg, unsigned int mask, unsigned int enable) mask, enable); } +static int lpg_lut_store_sdam(struct lpg *lpg, struct led_pattern *pattern, + size_t len, unsigned int *lo_idx, unsigned int *hi_idx) +{ + unsigned int idx; + u8 brightness; + int i, rc; + u16 addr; + + if (len > lpg->lut_size) { + dev_err(lpg->dev, "Pattern length (%zu) exceeds maximum pattern length (%d)\n", + len, lpg->lut_size); + return -EINVAL; + } + + idx = bitmap_find_next_zero_area(lpg->lut_bitmap, lpg->lut_size, 0, len, 0); + if (idx >= lpg->lut_size) + return -ENOSPC; + + for (i = 0; i < len; i++) { + brightness = pattern[i].brightness; + addr = SDAM_LPG_SDAM_LUT_PATTERN_OFFSET + i + idx; + rc = nvmem_device_write(lpg->lpg_chan_sdam, addr, 1, &brightness); + if (rc < 0) + return rc; + } + + bitmap_set(lpg->lut_bitmap, idx, len); + + *lo_idx = idx; + *hi_idx = idx + len - 1; + + return 0; +} + static int lpg_lut_store(struct lpg *lpg, struct led_pattern *pattern, size_t len, unsigned int *lo_idx, unsigned int *hi_idx) { @@ -256,6 +367,9 @@ static void lpg_lut_free(struct lpg *lpg, unsigned int lo_idx, unsigned int hi_i static int lpg_lut_sync(struct lpg *lpg, unsigned int mask) { + if (!lpg->lut_base) + return 0; + return regmap_write(lpg->map, lpg->lut_base + RAMP_CONTROL_REG, mask); } @@ -462,6 +576,28 @@ static void lpg_apply_pwm_value(struct lpg_channel *chan) #define LPG_PATTERN_CONFIG_PAUSE_HI BIT(1) #define LPG_PATTERN_CONFIG_PAUSE_LO BIT(0) +static void lpg_sdam_apply_lut_control(struct lpg_channel *chan) +{ + struct nvmem_device *lpg_chan_sdam = chan->lpg->lpg_chan_sdam; + unsigned int lo_idx = chan->pattern_lo_idx; + unsigned int hi_idx = chan->pattern_hi_idx; + u8 val = 0, conf = 0; + + if (!chan->ramp_enabled || chan->pattern_lo_idx == chan->pattern_hi_idx) + return; + + if (!chan->ramp_oneshot) + conf |= LPG_PATTERN_CONFIG_REPEAT; + + nvmem_device_write(lpg_chan_sdam, SDAM_PBS_SCRATCH_LUT_COUNTER_OFFSET + chan->sdam_offset, 1, &val); + nvmem_device_write(lpg_chan_sdam, SDAM_PATTERN_CONFIG_OFFSET + chan->sdam_offset, 1, &conf); + nvmem_device_write(lpg_chan_sdam, SDAM_END_INDEX_OFFSET + chan->sdam_offset, 1, &hi_idx); + nvmem_device_write(lpg_chan_sdam, SDAM_START_INDEX_OFFSET + chan->sdam_offset, 1, &lo_idx); + + val = RAMP_STEP_DURATION(chan->ramp_tick_ms); + nvmem_device_write(lpg_chan_sdam, SDAM_REG_RAMP_STEP_DURATION, 1, &val); +} + static void lpg_apply_lut_control(struct lpg_channel *chan) { struct lpg *lpg = chan->lpg; @@ -596,7 +732,10 @@ static void lpg_apply(struct lpg_channel *chan) lpg_apply_pwm_value(chan); lpg_apply_control(chan); lpg_apply_sync(chan); - lpg_apply_lut_control(chan); + if (chan->lpg->lpg_chan_sdam) + 
lpg_sdam_apply_lut_control(chan); + else + lpg_apply_lut_control(chan); lpg_enable_glitch(chan); } @@ -621,6 +760,7 @@ static void lpg_brightness_set(struct lpg_led *led, struct led_classdev *cdev, chan->ramp_enabled = false; } else if (chan->pattern_lo_idx != chan->pattern_hi_idx) { lpg_calc_freq(chan, NSEC_PER_MSEC); + lpg_sdam_configure_triggers(chan, 1); chan->enabled = true; chan->ramp_enabled = true; @@ -648,8 +788,10 @@ static void lpg_brightness_set(struct lpg_led *led, struct led_classdev *cdev, triled_set(lpg, triled_mask, triled_enabled); /* Trigger start of ramp generator(s) */ - if (lut_mask) + if (lut_mask) { lpg_lut_sync(lpg, lut_mask); + lpg_set_pbs_trigger(lpg, lut_mask); + } } static int lpg_brightness_single_set(struct led_classdev *cdev, @@ -766,9 +908,9 @@ static int lpg_pattern_set(struct lpg_led *led, struct led_pattern *led_pattern, struct led_pattern *pattern; unsigned int brightness_a; unsigned int brightness_b; + unsigned int hi_pause = 0; + unsigned int lo_pause = 0; unsigned int actual_len; - unsigned int hi_pause; - unsigned int lo_pause; unsigned int delta_t; unsigned int lo_idx; unsigned int hi_idx; @@ -835,18 +977,23 @@ static int lpg_pattern_set(struct lpg_led *led, struct led_pattern *led_pattern, * If the specified pattern is a palindrome the ping pong mode is * enabled. In this scenario the delta_t of the middle entry (i.e. the * last in the programmed pattern) determines the "high pause". + * + * SDAM-based devices do not support "ping-pong", "low pause" or "high pause" */ /* Detect palindromes and use "ping pong" to reduce LUT usage */ - for (i = 0; i < len / 2; i++) { - brightness_a = pattern[i].brightness; - brightness_b = pattern[len - i - 1].brightness; - - if (brightness_a != brightness_b) { - ping_pong = false; - break; + if (lpg->lut_base) { + for (i = 0; i < len / 2; i++) { + brightness_a = pattern[i].brightness; + brightness_b = pattern[len - i - 1].brightness; + + if (brightness_a != brightness_b) { + ping_pong = false; + break; + } } - } + } else + ping_pong = false; /* The pattern length to be written to the LUT */ if (ping_pong) @@ -874,12 +1021,26 @@ static int lpg_pattern_set(struct lpg_led *led, struct led_pattern *led_pattern, if (delta_t >= BIT(9)) goto out_free_pattern; - /* Find "low pause" and "high pause" in the pattern */ - lo_pause = pattern[0].delta_t; - hi_pause = pattern[actual_len - 1].delta_t; + /* + * Find "low pause" and "high pause" in the pattern in the LUT case. 
+ * SDAM-based devices require equal duration of all steps + */ + if (lpg->lut_base) { + lo_pause = pattern[0].delta_t; + hi_pause = pattern[actual_len - 1].delta_t; + } else { + if (delta_t != pattern[0].delta_t || delta_t != pattern[actual_len - 1].delta_t) + goto out_free_pattern; + } + mutex_lock(&lpg->lock); - ret = lpg_lut_store(lpg, pattern, actual_len, &lo_idx, &hi_idx); + + if (lpg->lut_base) + ret = lpg_lut_store(lpg, pattern, actual_len, &lo_idx, &hi_idx); + else + ret = lpg_lut_store_sdam(lpg, pattern, actual_len, &lo_idx, &hi_idx); + if (ret < 0) goto out_unlock; @@ -927,7 +1088,12 @@ static int lpg_pattern_mc_set(struct led_classdev *cdev, { struct led_classdev_mc *mc = lcdev_to_mccdev(cdev); struct lpg_led *led = container_of(mc, struct lpg_led, mcdev); - int ret; + unsigned int triled_mask = 0; + int ret, i; + + for (i = 0; i < led->num_channels; i++) + triled_mask |= led->channels[i]->triled_mask; + triled_set(led->lpg, triled_mask, 0); ret = lpg_pattern_set(led, pattern, len, repeat); if (ret < 0) @@ -952,6 +1118,8 @@ static int lpg_pattern_clear(struct lpg_led *led) for (i = 0; i < led->num_channels; i++) { chan = led->channels[i]; + lpg_sdam_configure_triggers(chan, 0); + lpg_clear_pbs_trigger(chan->lpg, chan->lut_mask); chan->pattern_lo_idx = 0; chan->pattern_hi_idx = 0; } @@ -1187,8 +1355,8 @@ static int lpg_add_led(struct lpg *lpg, struct device_node *np) cdev->brightness_set_blocking = lpg_brightness_mc_set; cdev->blink_set = lpg_blink_mc_set; - /* Register pattern accessors only if we have a LUT block */ - if (lpg->lut_base) { + /* Register pattern accessors if we have a LUT block or when using PPG */ + if (lpg->lut_base || lpg->lpg_chan_sdam) { cdev->pattern_set = lpg_pattern_mc_set; cdev->pattern_clear = lpg_pattern_mc_clear; } @@ -1201,15 +1369,19 @@ static int lpg_add_led(struct lpg *lpg, struct device_node *np) cdev->brightness_set_blocking = lpg_brightness_single_set; cdev->blink_set = lpg_blink_single_set; - /* Register pattern accessors only if we have a LUT block */ - if (lpg->lut_base) { + /* Register pattern accessors if we have a LUT block or when using PPG */ + if (lpg->lut_base || lpg->lpg_chan_sdam) { cdev->pattern_set = lpg_pattern_single_set; cdev->pattern_clear = lpg_pattern_single_clear; } } cdev->default_trigger = of_get_property(np, "linux,default-trigger", NULL); - cdev->max_brightness = LPG_RESOLUTION_9BIT - 1; + + if (lpg->lpg_chan_sdam) + cdev->max_brightness = PPG_MAX_LED_BRIGHTNESS; + else + cdev->max_brightness = LPG_RESOLUTION_9BIT - 1; if (!of_property_read_string(np, "default-state", &state) && !strcmp(state, "on")) @@ -1250,6 +1422,7 @@ static int lpg_init_channels(struct lpg *lpg) chan->base = data->channels[i].base; chan->triled_mask = data->channels[i].triled_mask; chan->lut_mask = BIT(i); + chan->sdam_offset = data->channels[i].sdam_offset; regmap_read(lpg->map, chan->base + LPG_SUBTYPE_REG, &chan->subtype); } @@ -1295,11 +1468,12 @@ static int lpg_init_lut(struct lpg *lpg) { const struct lpg_data *data = lpg->data; - if (!data->lut_base) + if (!data->lut_size) return 0; - lpg->lut_base = data->lut_base; lpg->lut_size = data->lut_size; + if (data->lut_base) + lpg->lut_base = data->lut_base; lpg->lut_bitmap = devm_bitmap_zalloc(lpg->dev, lpg->lut_size, GFP_KERNEL); if (!lpg->lut_bitmap) @@ -1308,6 +1482,48 @@ static int lpg_init_lut(struct lpg *lpg) return 0; } +static int lpg_init_sdam(struct lpg *lpg) +{ + int i, sdam_count, rc; + u8 val = 0; + + sdam_count = of_property_count_strings(lpg->dev->of_node, "nvmem-names"); + if 
(sdam_count <= 0) + return 0; + + /* Get the SDAM device for LPG/LUT config */ + lpg->lpg_chan_sdam = devm_nvmem_device_get(lpg->dev, "lpg_chan_sdam"); + if (IS_ERR(lpg->lpg_chan_sdam)) + return dev_err_probe(lpg->dev, PTR_ERR(lpg->lpg_chan_sdam), + "Failed to get LPG chan SDAM device\n"); + + lpg->pbs_dev = get_pbs_client_device(lpg->dev); + if (IS_ERR(lpg->pbs_dev)) + return dev_err_probe(lpg->dev, PTR_ERR(lpg->pbs_dev), + "Failed to get PBS client device\n"); + + for (i = 0; i < lpg->num_channels; i++) { + struct lpg_channel *chan = &lpg->channels[i]; + + if (chan->sdam_offset) { + rc = nvmem_device_write(lpg->lpg_chan_sdam, + SDAM_PBS_SCRATCH_LUT_COUNTER_OFFSET + chan->sdam_offset, 1, &val); + if (rc < 0) + return rc; + + rc = lpg_sdam_configure_triggers(chan, 0); + if (rc < 0) + return rc; + + rc = lpg_clear_pbs_trigger(chan->lpg, chan->lut_mask); + if (rc < 0) + return rc; + } + } + + return 0; +} + static int lpg_probe(struct platform_device *pdev) { struct device_node *np; @@ -1342,6 +1558,10 @@ static int lpg_probe(struct platform_device *pdev) if (ret < 0) return ret; + ret = lpg_init_sdam(lpg); + if (ret < 0) + return ret; + ret = lpg_init_lut(lpg); if (ret < 0) return ret; From cabf7243e82ebd68c2a61ec746718991d2ba77de Mon Sep 17 00:00:00 2001 From: Anjelique Melendez Date: Thu, 21 Dec 2023 10:58:35 -0800 Subject: [PATCH 0189/1406] leds: rgb: leds-qcom-lpg: Update PMI632 lpg_data to support PPG Update the pmi632 lpg_data struct so that pmi632 devices use PPG for LUT pattern. Signed-off-by: Anjelique Melendez Tested-by: Luca Weiss Link: https://lore.kernel.org/r/20231221185838.28440-6-quic_amelende@quicinc.com Signed-off-by: Lee Jones --- drivers/leds/rgb/leds-qcom-lpg.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/leds/rgb/leds-qcom-lpg.c b/drivers/leds/rgb/leds-qcom-lpg.c index 2bdcf17e510727..d1b82dfcbb9986 100644 --- a/drivers/leds/rgb/leds-qcom-lpg.c +++ b/drivers/leds/rgb/leds-qcom-lpg.c @@ -1627,11 +1627,13 @@ static const struct lpg_data pm8994_lpg_data = { static const struct lpg_data pmi632_lpg_data = { .triled_base = 0xd000, + .lut_size = 64, + .num_channels = 5, .channels = (const struct lpg_channel_data[]) { - { .base = 0xb300, .triled_mask = BIT(7) }, - { .base = 0xb400, .triled_mask = BIT(6) }, - { .base = 0xb500, .triled_mask = BIT(5) }, + { .base = 0xb300, .triled_mask = BIT(7), .sdam_offset = 0x48 }, + { .base = 0xb400, .triled_mask = BIT(6), .sdam_offset = 0x56 }, + { .base = 0xb500, .triled_mask = BIT(5), .sdam_offset = 0x64 }, { .base = 0xb600 }, { .base = 0xb700 }, }, From aa78768e95e62b24a9c05a46bea3b96763748e10 Mon Sep 17 00:00:00 2001 From: Anjelique Melendez Date: Thu, 21 Dec 2023 10:58:36 -0800 Subject: [PATCH 0190/1406] leds: rgb: leds-qcom-lpg: Include support for PPG with dedicated LUT SDAM On PMICs such as PM8350C, the pattern lookup table (LUT) is stored in a separate SDAM from the one where the lpg per-channel data is stored. Add support for PPG with a dedicated LUT SDAM while maintaining backward compatibility for those targets that use only a single SDAM. 
Co-developed-by: Guru Das Srinagesh Signed-off-by: Guru Das Srinagesh Signed-off-by: Anjelique Melendez Link: https://lore.kernel.org/r/20231221185838.28440-7-quic_amelende@quicinc.com Signed-off-by: Lee Jones --- drivers/leds/rgb/leds-qcom-lpg.c | 92 +++++++++++++++++++++++++++----- 1 file changed, 78 insertions(+), 14 deletions(-) diff --git a/drivers/leds/rgb/leds-qcom-lpg.c b/drivers/leds/rgb/leds-qcom-lpg.c index d1b82dfcbb9986..9d0717f770bacd 100644 --- a/drivers/leds/rgb/leds-qcom-lpg.c +++ b/drivers/leds/rgb/leds-qcom-lpg.c @@ -42,6 +42,8 @@ #define PWM_DTEST_REG(x) (0xe2 + (x) - 1) #define SDAM_REG_PBS_SEQ_EN 0x42 +#define SDAM_PBS_TRIG_SET 0xe5 +#define SDAM_PBS_TRIG_CLR 0xe6 #define TRI_LED_SRC_SEL 0x45 #define TRI_LED_EN_CTL 0x46 @@ -60,8 +62,12 @@ #define DEFAULT_TICK_DURATION_US 7800 #define RAMP_STEP_DURATION(x) (((x) * 1000 / DEFAULT_TICK_DURATION_US) & 0xff) +#define SDAM_MAX_DEVICES 2 /* LPG common config settings for PPG */ +#define SDAM_START_BASE 0x40 #define SDAM_REG_RAMP_STEP_DURATION 0x47 + +#define SDAM_LUT_SDAM_LUT_PATTERN_OFFSET 0x45 #define SDAM_LPG_SDAM_LUT_PATTERN_OFFSET 0x80 /* LPG per channel config settings for PPG */ @@ -70,6 +76,8 @@ #define SDAM_END_INDEX_OFFSET 0x3 #define SDAM_START_INDEX_OFFSET 0x4 #define SDAM_PBS_SCRATCH_LUT_COUNTER_OFFSET 0x6 +#define SDAM_PAUSE_HI_MULTIPLIER_OFFSET 0x8 +#define SDAM_PAUSE_LO_MULTIPLIER_OFFSET 0x9 struct lpg_channel; struct lpg_data; @@ -86,6 +94,7 @@ struct lpg_data; * @lut_bitmap: allocation bitmap for LUT entries * @pbs_dev: PBS device * @lpg_chan_sdam: LPG SDAM peripheral device + * @lut_sdam: LUT SDAM peripheral device * @pbs_en_bitmap: bitmap for tracking PBS triggers * @triled_base: base address of the TRILED block (optional) * @triled_src: power-source for the TRILED @@ -110,6 +119,7 @@ struct lpg { struct pbs_dev *pbs_dev; struct nvmem_device *lpg_chan_sdam; + struct nvmem_device *lut_sdam; unsigned long pbs_en_bitmap; u32 triled_base; @@ -249,6 +259,13 @@ static int lpg_clear_pbs_trigger(struct lpg *lpg, unsigned int lut_mask) rc = nvmem_device_write(lpg->lpg_chan_sdam, SDAM_REG_PBS_SEQ_EN, 1, &val); if (rc < 0) return rc; + + if (lpg->lut_sdam) { + val = PBS_SW_TRIG_BIT; + rc = nvmem_device_write(lpg->lpg_chan_sdam, SDAM_PBS_TRIG_CLR, 1, &val); + if (rc < 0) + return rc; + } } return 0; @@ -264,9 +281,15 @@ static int lpg_set_pbs_trigger(struct lpg *lpg, unsigned int lut_mask) if (rc < 0) return rc; - rc = qcom_pbs_trigger_event(lpg->pbs_dev, val); - if (rc < 0) - return rc; + if (lpg->lut_sdam) { + rc = nvmem_device_write(lpg->lpg_chan_sdam, SDAM_PBS_TRIG_SET, 1, &val); + if (rc < 0) + return rc; + } else { + rc = qcom_pbs_trigger_event(lpg->pbs_dev, val); + if (rc < 0) + return rc; + } } lpg->pbs_en_bitmap |= lut_mask; @@ -313,8 +336,15 @@ static int lpg_lut_store_sdam(struct lpg *lpg, struct led_pattern *pattern, for (i = 0; i < len; i++) { brightness = pattern[i].brightness; - addr = SDAM_LPG_SDAM_LUT_PATTERN_OFFSET + i + idx; - rc = nvmem_device_write(lpg->lpg_chan_sdam, addr, 1, &brightness); + + if (lpg->lut_sdam) { + addr = SDAM_LUT_SDAM_LUT_PATTERN_OFFSET + i + idx; + rc = nvmem_device_write(lpg->lut_sdam, addr, 1, &brightness); + } else { + addr = SDAM_LPG_SDAM_LUT_PATTERN_OFFSET + i + idx; + rc = nvmem_device_write(lpg->lpg_chan_sdam, addr, 1, &brightness); + } + if (rc < 0) return rc; } @@ -581,13 +611,28 @@ static void lpg_sdam_apply_lut_control(struct lpg_channel *chan) struct nvmem_device *lpg_chan_sdam = chan->lpg->lpg_chan_sdam; unsigned int lo_idx = chan->pattern_lo_idx; unsigned int 
hi_idx = chan->pattern_hi_idx; - u8 val = 0, conf = 0; + u8 val = 0, conf = 0, lut_offset = 0; + unsigned int hi_pause, lo_pause; + struct lpg *lpg = chan->lpg; if (!chan->ramp_enabled || chan->pattern_lo_idx == chan->pattern_hi_idx) return; + hi_pause = DIV_ROUND_UP(chan->ramp_hi_pause_ms, chan->ramp_tick_ms); + lo_pause = DIV_ROUND_UP(chan->ramp_lo_pause_ms, chan->ramp_tick_ms); + if (!chan->ramp_oneshot) conf |= LPG_PATTERN_CONFIG_REPEAT; + if (chan->ramp_hi_pause_ms && lpg->lut_sdam) + conf |= LPG_PATTERN_CONFIG_PAUSE_HI; + if (chan->ramp_lo_pause_ms && lpg->lut_sdam) + conf |= LPG_PATTERN_CONFIG_PAUSE_LO; + + if (lpg->lut_sdam) { + lut_offset = SDAM_LUT_SDAM_LUT_PATTERN_OFFSET - SDAM_START_BASE; + hi_idx += lut_offset; + lo_idx += lut_offset; + } nvmem_device_write(lpg_chan_sdam, SDAM_PBS_SCRATCH_LUT_COUNTER_OFFSET + chan->sdam_offset, 1, &val); nvmem_device_write(lpg_chan_sdam, SDAM_PATTERN_CONFIG_OFFSET + chan->sdam_offset, 1, &conf); @@ -596,6 +641,12 @@ static void lpg_sdam_apply_lut_control(struct lpg_channel *chan) val = RAMP_STEP_DURATION(chan->ramp_tick_ms); nvmem_device_write(lpg_chan_sdam, SDAM_REG_RAMP_STEP_DURATION, 1, &val); + + if (lpg->lut_sdam) { + nvmem_device_write(lpg_chan_sdam, SDAM_PAUSE_HI_MULTIPLIER_OFFSET + chan->sdam_offset, 1, &hi_pause); + nvmem_device_write(lpg_chan_sdam, SDAM_PAUSE_LO_MULTIPLIER_OFFSET + chan->sdam_offset, 1, &lo_pause); + } + } static void lpg_apply_lut_control(struct lpg_channel *chan) @@ -978,7 +1029,8 @@ static int lpg_pattern_set(struct lpg_led *led, struct led_pattern *led_pattern, * enabled. In this scenario the delta_t of the middle entry (i.e. the * last in the programmed pattern) determines the "high pause". * - * SDAM-based devices do not support "ping-pong", "low pause" or "high pause" + * SDAM-based devices do not support "ping pong", and only supports + * "low pause" and "high pause" with a dedicated SDAM LUT. */ /* Detect palindromes and use "ping pong" to reduce LUT usage */ @@ -1023,9 +1075,10 @@ static int lpg_pattern_set(struct lpg_led *led, struct led_pattern *led_pattern, /* * Find "low pause" and "high pause" in the pattern in the LUT case. - * SDAM-based devices require equal duration of all steps + * SDAM-based devices without dedicated LUT SDAM require equal + * duration of all steps. 
*/ - if (lpg->lut_base) { + if (lpg->lut_base || lpg->lut_sdam) { lo_pause = pattern[0].delta_t; hi_pause = pattern[actual_len - 1].delta_t; } else { @@ -1490,17 +1543,28 @@ static int lpg_init_sdam(struct lpg *lpg) sdam_count = of_property_count_strings(lpg->dev->of_node, "nvmem-names"); if (sdam_count <= 0) return 0; + if (sdam_count > SDAM_MAX_DEVICES) + return -EINVAL; - /* Get the SDAM device for LPG/LUT config */ + /* Get the 1st SDAM device for LPG/LUT config */ lpg->lpg_chan_sdam = devm_nvmem_device_get(lpg->dev, "lpg_chan_sdam"); if (IS_ERR(lpg->lpg_chan_sdam)) return dev_err_probe(lpg->dev, PTR_ERR(lpg->lpg_chan_sdam), "Failed to get LPG chan SDAM device\n"); - lpg->pbs_dev = get_pbs_client_device(lpg->dev); - if (IS_ERR(lpg->pbs_dev)) - return dev_err_probe(lpg->dev, PTR_ERR(lpg->pbs_dev), - "Failed to get PBS client device\n"); + if (sdam_count == 1) { + /* Get PBS device node if single SDAM device */ + lpg->pbs_dev = get_pbs_client_device(lpg->dev); + if (IS_ERR(lpg->pbs_dev)) + return dev_err_probe(lpg->dev, PTR_ERR(lpg->pbs_dev), + "Failed to get PBS client device\n"); + } else if (sdam_count == 2) { + /* Get the 2nd SDAM device for LUT pattern */ + lpg->lut_sdam = devm_nvmem_device_get(lpg->dev, "lut_sdam"); + if (IS_ERR(lpg->lut_sdam)) + return dev_err_probe(lpg->dev, PTR_ERR(lpg->lut_sdam), + "Failed to get LPG LUT SDAM device\n"); + } for (i = 0; i < lpg->num_channels; i++) { struct lpg_channel *chan = &lpg->channels[i]; From 261603923fc99de24cc7b58434397933e7211190 Mon Sep 17 00:00:00 2001 From: Anjelique Melendez Date: Thu, 21 Dec 2023 10:58:37 -0800 Subject: [PATCH 0191/1406] leds: rgb: Update PM8350C lpg_data to support two-nvmem PPG Scheme Update the pm8350c lpg_data struct so that pm8350c devices are treated as PWM devices that support the two-nvmem PPG scheme. Signed-off-by: Anjelique Melendez Link: https://lore.kernel.org/r/20231221185838.28440-8-quic_amelende@quicinc.com Signed-off-by: Lee Jones --- drivers/leds/rgb/leds-qcom-lpg.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/leds/rgb/leds-qcom-lpg.c b/drivers/leds/rgb/leds-qcom-lpg.c index 9d0717f770bacd..6226864145a653 100644 --- a/drivers/leds/rgb/leds-qcom-lpg.c +++ b/drivers/leds/rgb/leds-qcom-lpg.c @@ -1770,11 +1770,13 @@ static const struct lpg_data pm8150l_lpg_data = { static const struct lpg_data pm8350c_pwm_data = { .triled_base = 0xef00, + .lut_size = 122, + .num_channels = 4, .channels = (const struct lpg_channel_data[]) { - { .base = 0xe800, .triled_mask = BIT(7) }, - { .base = 0xe900, .triled_mask = BIT(6) }, - { .base = 0xea00, .triled_mask = BIT(5) }, + { .base = 0xe800, .triled_mask = BIT(7), .sdam_offset = 0x48 }, + { .base = 0xe900, .triled_mask = BIT(6), .sdam_offset = 0x56 }, + { .base = 0xea00, .triled_mask = BIT(5), .sdam_offset = 0x64 }, { .base = 0xeb00 }, }, }; From e34aab02c63adcb57e378d54e3a1b9783ac742c0 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 21 Dec 2023 23:19:17 +0100 Subject: [PATCH 0192/1406] leds: trigger: Load trigger modules on-demand if used as default trigger Even if a trigger is set as default trigger for a LED device, the respective trigger module (if built as module) isn't automatically loaded by the kernel when the LED device is registered. I think we can do better. Try to load the module asynchronously by the alias ledtrig:<trigger-name>. This requires that such an alias is added to relevant triggers.
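A minimal sketch of a trigger module that opts into this mechanism (the trigger name "mytrig" is hypothetical, not from this patch):

// SPDX-License-Identifier: GPL-2.0
#include <linux/leds.h>
#include <linux/module.h>

static int mytrig_activate(struct led_classdev *led_cdev)
{
	/* trivial behavior for the sketch: turn the LED fully on */
	led_set_brightness(led_cdev, led_cdev->max_brightness);
	return 0;
}

static struct led_trigger mytrig = {
	.name     = "mytrig",
	.activate = mytrig_activate,
};
module_led_trigger(mytrig);

/* matched by request_module_nowait("ledtrig:%s", ...) in led_trigger_set_default() */
MODULE_ALIAS("ledtrig:mytrig");
MODULE_LICENSE("GPL");

The alias is what lets the request_module_nowait() call in the diff below resolve the module when a LED device names "mytrig" as its default trigger before the module has been loaded.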
Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/79adb260-06ad-443a-a68e-abe4498c3298@gmail.com Signed-off-by: Lee Jones --- drivers/leds/led-triggers.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/leds/led-triggers.c b/drivers/leds/led-triggers.c index bd59a14a4a90c2..71cb0aee528c81 100644 --- a/drivers/leds/led-triggers.c +++ b/drivers/leds/led-triggers.c @@ -250,6 +250,7 @@ EXPORT_SYMBOL_GPL(led_trigger_remove); void led_trigger_set_default(struct led_classdev *led_cdev) { struct led_trigger *trig; + bool found = false; if (!led_cdev->default_trigger) return; @@ -259,6 +260,7 @@ void led_trigger_set_default(struct led_classdev *led_cdev) list_for_each_entry(trig, &trigger_list, next_trig) { if (!strcmp(led_cdev->default_trigger, trig->name) && trigger_relevant(led_cdev, trig)) { + found = true; led_cdev->flags |= LED_INIT_DEFAULT_TRIGGER; led_trigger_set(led_cdev, trig); break; @@ -266,6 +268,13 @@ void led_trigger_set_default(struct led_classdev *led_cdev) } up_write(&led_cdev->trigger_lock); up_read(&triggers_list_lock); + + /* + * If default trigger wasn't found, maybe trigger module isn't loaded yet. + * Once loaded it will re-probe with all led_cdev's. + */ + if (!found) + request_module_nowait("ledtrig:%s", led_cdev->default_trigger); } EXPORT_SYMBOL_GPL(led_trigger_set_default); From 3c56893060dfc5a6cd70852859e0029607e9c530 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 21 Dec 2023 23:20:26 +0100 Subject: [PATCH 0193/1406] leds: trigger: netdev: Add module alias ledtrig:netdev Add module alias ledtrig:netdev to enable auto-loading of the module. Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/84a1bbd3-1ac7-4f37-849a-7f4d31698f76@gmail.com Signed-off-by: Lee Jones --- drivers/leds/trigger/ledtrig-netdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/leds/trigger/ledtrig-netdev.c b/drivers/leds/trigger/ledtrig-netdev.c index 1a0cfbba597638..e22a7b08279be0 100644 --- a/drivers/leds/trigger/ledtrig-netdev.c +++ b/drivers/leds/trigger/ledtrig-netdev.c @@ -666,3 +666,4 @@ MODULE_AUTHOR("Ben Whitten "); MODULE_AUTHOR("Oliver Jowett "); MODULE_DESCRIPTION("Netdev LED trigger"); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("ledtrig:netdev"); From 8e78980832eb0e07c9d4cc7efdc9477046573724 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 22 Dec 2023 22:32:28 +0100 Subject: [PATCH 0194/1406] leds: class: If no default trigger is given, make hw_control trigger the default trigger If a hw_control_trigger is defined, it's usually desirable to make it the default trigger. Therefore make it the default trigger, except the driver explicitly set a default trigger. Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/f33543de-3800-488f-a779-1fa282614462@gmail.com Signed-off-by: Lee Jones --- drivers/leds/led-class.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index ba1be15cfd8ea3..24fcff682b24ad 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -552,6 +552,12 @@ int led_classdev_register_ext(struct device *parent, led_init_core(led_cdev); #ifdef CONFIG_LEDS_TRIGGERS + /* + * If no default trigger was given and hw_control_trigger is set, + * make it the default trigger. 
+ */ + if (!led_cdev->default_trigger && led_cdev->hw_control_trigger) + led_cdev->default_trigger = led_cdev->hw_control_trigger; led_trigger_set_default(led_cdev); #endif From e01e2225511d2d0ff8b7f6bd008ee3e92bb15d09 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Thu, 11 Jan 2024 17:04:54 +0100 Subject: [PATCH 0195/1406] leds: trigger: netdev: Display only supported link speed attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the addition of more link speed mode to the netdev trigger, it was pointed out that there may be a problem with bloating the attribute list with modes that won't ever be supported by the trigger as the attached device name doesn't support them. To clear and address this problem, change the logic where these additional trigger modes are listed. Since the netdev trigger REQUIRE a device name to be set, attach to the device name change function additional logic to parse the supported link speed modes using ethtool APIs and show only the supported link speed modes attribute. Link speed attribute are refreshed on device_name set and on NETDEV_CHANGE events. This only apply to the link speed modes and every other mode is still provided by default. Signed-off-by: Christian Marangi Reviewed-by: Marek Behún Link: https://lore.kernel.org/r/20240111160501.1774-1-ansuelsmth@gmail.com Signed-off-by: Lee Jones --- drivers/leds/trigger/ledtrig-netdev.c | 90 +++++++++++++++++++++++++-- 1 file changed, 84 insertions(+), 6 deletions(-) diff --git a/drivers/leds/trigger/ledtrig-netdev.c b/drivers/leds/trigger/ledtrig-netdev.c index e22a7b08279be0..f0eb5820c48ccb 100644 --- a/drivers/leds/trigger/ledtrig-netdev.c +++ b/drivers/leds/trigger/ledtrig-netdev.c @@ -18,10 +18,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include "../leds.h" @@ -65,12 +67,15 @@ struct led_netdev_data { unsigned long mode; int link_speed; + __ETHTOOL_DECLARE_LINK_MODE_MASK(supported_link_modes); u8 duplex; bool carrier_link_up; bool hw_control; }; +static const struct attribute_group netdev_trig_link_speed_attrs_group; + static void set_baseline_state(struct led_netdev_data *trigger_data) { int current_brightness; @@ -218,13 +223,20 @@ static void get_device_state(struct led_netdev_data *trigger_data) struct ethtool_link_ksettings cmd; trigger_data->carrier_link_up = netif_carrier_ok(trigger_data->net_dev); - if (!trigger_data->carrier_link_up) + + if (__ethtool_get_link_ksettings(trigger_data->net_dev, &cmd)) return; - if (!__ethtool_get_link_ksettings(trigger_data->net_dev, &cmd)) { + if (trigger_data->carrier_link_up) { trigger_data->link_speed = cmd.base.speed; trigger_data->duplex = cmd.base.duplex; } + + /* + * Have a local copy of the link speed supported to avoid rtnl lock every time + * modes are refreshed on any change event + */ + linkmode_copy(trigger_data->supported_link_modes, cmd.link_modes.supported); } static ssize_t device_name_show(struct device *dev, @@ -298,6 +310,10 @@ static ssize_t device_name_store(struct device *dev, if (ret < 0) return ret; + + /* Refresh link_speed visibility */ + sysfs_update_group(&dev->kobj, &netdev_trig_link_speed_attrs_group); + return size; } @@ -461,15 +477,63 @@ static ssize_t offloaded_show(struct device *dev, static DEVICE_ATTR_RO(offloaded); -static struct attribute *netdev_trig_attrs[] = { - &dev_attr_device_name.attr, - &dev_attr_link.attr, +#define CHECK_LINK_MODE_ATTR(link_speed) \ + do { \ + if (attr == 
&dev_attr_link_##link_speed.attr && \ + link_ksettings.base.speed == SPEED_##link_speed) \ + return attr->mode; \ + } while (0) + +static umode_t netdev_trig_link_speed_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct led_netdev_data *trigger_data; + unsigned long *supported_link_modes; + u32 mode; + + trigger_data = led_trigger_get_drvdata(dev); + supported_link_modes = trigger_data->supported_link_modes; + + /* + * Search in the supported link mode mask a matching supported mode. + * Stop at the first matching entry as we care only to check if a particular + * speed is supported and not the kind. + */ + for_each_set_bit(mode, supported_link_modes, __ETHTOOL_LINK_MODE_MASK_NBITS) { + struct ethtool_link_ksettings link_ksettings; + + ethtool_params_from_link_mode(&link_ksettings, mode); + + CHECK_LINK_MODE_ATTR(10); + CHECK_LINK_MODE_ATTR(100); + CHECK_LINK_MODE_ATTR(1000); + CHECK_LINK_MODE_ATTR(2500); + CHECK_LINK_MODE_ATTR(5000); + CHECK_LINK_MODE_ATTR(10000); + } + + return 0; +} + +static struct attribute *netdev_trig_link_speed_attrs[] = { &dev_attr_link_10.attr, &dev_attr_link_100.attr, &dev_attr_link_1000.attr, &dev_attr_link_2500.attr, &dev_attr_link_5000.attr, &dev_attr_link_10000.attr, + NULL +}; + +static const struct attribute_group netdev_trig_link_speed_attrs_group = { + .attrs = netdev_trig_link_speed_attrs, + .is_visible = netdev_trig_link_speed_visible, +}; + +static struct attribute *netdev_trig_attrs[] = { + &dev_attr_device_name.attr, + &dev_attr_link.attr, &dev_attr_full_duplex.attr, &dev_attr_half_duplex.attr, &dev_attr_rx.attr, @@ -478,7 +542,16 @@ static struct attribute *netdev_trig_attrs[] = { &dev_attr_offloaded.attr, NULL }; -ATTRIBUTE_GROUPS(netdev_trig); + +static const struct attribute_group netdev_trig_attrs_group = { + .attrs = netdev_trig_attrs, +}; + +static const struct attribute_group *netdev_trig_groups[] = { + &netdev_trig_attrs_group, + &netdev_trig_link_speed_attrs_group, + NULL, +}; static int netdev_trig_notify(struct notifier_block *nb, unsigned long evt, void *dv) @@ -487,6 +560,7 @@ static int netdev_trig_notify(struct notifier_block *nb, netdev_notifier_info_to_dev((struct netdev_notifier_info *)dv); struct led_netdev_data *trigger_data = container_of(nb, struct led_netdev_data, notifier); + struct led_classdev *led_cdev = trigger_data->led_cdev; if (evt != NETDEV_UP && evt != NETDEV_DOWN && evt != NETDEV_CHANGE && evt != NETDEV_REGISTER && evt != NETDEV_UNREGISTER @@ -521,6 +595,10 @@ static int netdev_trig_notify(struct notifier_block *nb, case NETDEV_UP: case NETDEV_CHANGE: get_device_state(trigger_data); + /* Refresh link_speed visibility */ + if (evt == NETDEV_CHANGE) + sysfs_update_group(&led_cdev->dev->kobj, + &netdev_trig_link_speed_attrs_group); break; } From 1d8cc1ff2dfa675d3473725df4dd910872d96126 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Thu, 11 Jan 2024 17:04:55 +0100 Subject: [PATCH 0196/1406] docs: ABI: sysfs-class-led-trigger-netdev: Document now hidable link_* MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document now hidable link speed modes for the LED netdev trigger. Link speed modes are now showed only if the named network device supports them and are hidden if not. 
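The mechanism that makes these ABI entries conditional is the attribute group's .is_visible() hook from the previous patch: it returns 0 to hide a file or the attribute's mode to expose it, and sysfs_update_group() re-evaluates it whenever the device (and hence its supported modes) changes. Reduced to a skeleton, with speed_is_supported() as a hypothetical stand-in for the real link-mode scan:

    #include <linux/sysfs.h>
    #include <linux/kobject.h>

    /* Hypothetical predicate standing in for the link-mode scan */
    static bool speed_is_supported(struct kobject *kobj, struct attribute *attr);

    static umode_t example_link_visible(struct kobject *kobj,
    				    struct attribute *attr, int n)
    {
    	/* 0 hides the sysfs file; attr->mode exposes it unchanged */
    	return speed_is_supported(kobj, attr) ? attr->mode : 0;
    }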
Signed-off-by: Christian Marangi Reviewed-by: Marek Behún Link: https://lore.kernel.org/r/20240111160501.1774-2-ansuelsmth@gmail.com Signed-off-by: Lee Jones --- .../ABI/testing/sysfs-class-led-trigger-netdev | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-class-led-trigger-netdev b/Documentation/ABI/testing/sysfs-class-led-trigger-netdev index a6c307c4befa09..ed46b37ab8a284 100644 --- a/Documentation/ABI/testing/sysfs-class-led-trigger-netdev +++ b/Documentation/ABI/testing/sysfs-class-led-trigger-netdev @@ -88,6 +88,8 @@ Description: speed of 10MBps of the named network device. Setting this value also immediately changes the LED state. + Present only if the named network device supports 10Mbps link speed. + What: /sys/class/leds/<led>/link_100 Date: Jun 2023 KernelVersion: 6.5 @@ -101,6 +103,8 @@ Description: speed of 100Mbps of the named network device. Setting this value also immediately changes the LED state. + Present only if the named network device supports 100Mbps link speed. + What: /sys/class/leds/<led>/link_1000 Date: Jun 2023 KernelVersion: 6.5 @@ -114,6 +118,8 @@ Description: speed of 1000Mbps of the named network device. Setting this value also immediately changes the LED state. + Present only if the named network device supports 1000Mbps link speed. + What: /sys/class/leds/<led>/link_2500 Date: Nov 2023 KernelVersion: 6.8 @@ -127,6 +133,8 @@ Description: speed of 2500Mbps of the named network device. Setting this value also immediately changes the LED state. + Present only if the named network device supports 2500Mbps link speed. + What: /sys/class/leds/<led>/link_5000 Date: Nov 2023 KernelVersion: 6.8 @@ -140,6 +148,8 @@ Description: speed of 5000Mbps of the named network device. Setting this value also immediately changes the LED state. + Present only if the named network device supports 5000Mbps link speed. + What: /sys/class/leds/<led>/link_10000 Date: Nov 2023 KernelVersion: 6.8 @@ -153,6 +163,8 @@ Description: speed of 10000Mbps of the named network device. Setting this value also immediately changes the LED state. + Present only if the named network device supports 10000Mbps link speed. + What: /sys/class/leds/<led>/half_duplex Date: Jun 2023 KernelVersion: 6.5 From 4694dcab92cf0e78ff65978888ae14a6373f1ceb Mon Sep 17 00:00:00 2001 From: Florian Eckert Date: Wed, 10 Jan 2024 14:34:10 +0100 Subject: [PATCH 0197/1406] Documentation: leds: Update led-trigger-tty ABI description The 'led-trigger-tty' uses the same naming in the ABI documentation as the 'led-trigger-netdev', which leads to the following warning when building the documentation. Warning: /sys/class/leds/<led>/rx is defined 2 times: Documentation/ABI/testing/sysfs-class-led-trigger-tty:7 Documentation/ABI/testing/sysfs-class-led-trigger-netdev:49 Warning: /sys/class/leds/<led>/tx is defined 2 times: Documentation/ABI/testing/sysfs-class-led-trigger-tty:15 Documentation/ABI/testing/sysfs-class-led-trigger-netdev:34 Renaming the 'What' path by prefixing it with 'tty_' solves this problem.
Fixes: 6dec659896b4 ("leds: ledtrig-tty: Add additional line state evaluation") Reported-by: Stephen Rothwell Signed-off-by: Florian Eckert Link: https://lore.kernel.org/r/20240110133410.81645-1-fe@dev.tdt.de Signed-off-by: Lee Jones (cherry picked from commit ea411a8422c1d7f8193d726fb76ba09534b6a5fe) Signed-off-by: Lee Jones --- .../ABI/testing/sysfs-class-led-trigger-tty | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-class-led-trigger-tty b/Documentation/ABI/testing/sysfs-class-led-trigger-tty index 30cef9ac0f493a..308fbc3627cd1a 100644 --- a/Documentation/ABI/testing/sysfs-class-led-trigger-tty +++ b/Documentation/ABI/testing/sysfs-class-led-trigger-tty @@ -1,11 +1,11 @@ -What: /sys/class/leds/<led>/ttyname +What: /sys/class/leds/<tty_led>/ttyname Date: Dec 2020 KernelVersion: 5.10 Contact: linux-leds@vger.kernel.org Description: Specifies the tty device name of the triggering tty -What: /sys/class/leds/<led>/rx +What: /sys/class/leds/<tty_led>/rx Date: February 2024 KernelVersion: 6.8 Description: @@ -13,7 +13,7 @@ Description: If set to 0, the LED will not blink on reception. If set to 1 (default), the LED will blink on reception. -What: /sys/class/leds/<led>/tx +What: /sys/class/leds/<tty_led>/tx Date: February 2024 KernelVersion: 6.8 Description: @@ -21,7 +21,7 @@ Description: If set to 0, the LED will not blink on transmission. If set to 1 (default), the LED will blink on transmission. -What: /sys/class/leds/<led>/cts +What: /sys/class/leds/<tty_led>/cts Date: February 2024 KernelVersion: 6.8 Description: @@ -31,7 +31,7 @@ Description: If set to 0 (default), the LED will not evaluate CTS. If set to 1, the LED will evaluate CTS. -What: /sys/class/leds/<led>/dsr +What: /sys/class/leds/<tty_led>/dsr Date: February 2024 KernelVersion: 6.8 Description: @@ -41,7 +41,7 @@ Description: If set to 0 (default), the LED will not evaluate DSR. If set to 1, the LED will evaluate DSR. -What: /sys/class/leds/<led>/dcd +What: /sys/class/leds/<tty_led>/dcd Date: February 2024 KernelVersion: 6.8 Description: @@ -51,7 +51,7 @@ Description: If set to 0 (default), the LED will not evaluate CAR (DCD). If set to 1, the LED will evaluate CAR (DCD). -What: /sys/class/leds/<led>/rng +What: /sys/class/leds/<tty_led>/rng Date: February 2024 KernelVersion: 6.8 Description: From 83323575228082de61a1278cc5babc090ddb2b54 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 13 Jan 2024 17:00:11 +0100 Subject: [PATCH 0198/1406] leds: trigger: audio: Set module alias for module auto-loading This is a follow-up to 5edf7f11313d ("leds: trigger: Load trigger modules on-demand if used as default trigger") and sets an alias for the audio triggers.
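These alias names line up with what LED consumers put in default_trigger; the consumer side, sketched under the assumption that a classdev is being prepared for registration elsewhere:

    #include <linux/leds.h>

    static void example_setup_mute_led(struct led_classdev *led_cdev)
    {
    	/*
    	 * With MODULE_ALIAS("ledtrig:audio-mute") in place (see the hunk
    	 * below), led_trigger_set_default() can request the trigger
    	 * module on demand when it is not loaded yet.
    	 */
    	led_cdev->default_trigger = "audio-mute";
    }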
Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/4663d2d8-660d-4af2-9f65-d95e95263923@gmail.com Signed-off-by: Lee Jones --- drivers/leds/trigger/ledtrig-audio.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/leds/trigger/ledtrig-audio.c b/drivers/leds/trigger/ledtrig-audio.c index c6b437e6369b8d..2ecd4b760fc36a 100644 --- a/drivers/leds/trigger/ledtrig-audio.c +++ b/drivers/leds/trigger/ledtrig-audio.c @@ -63,3 +63,5 @@ module_exit(ledtrig_audio_exit); MODULE_DESCRIPTION("LED trigger for audio mute control"); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("ledtrig:audio-mute"); +MODULE_ALIAS("ledtrig:audio-micmute"); From 1e147c37ed09dda38fe738e73a8a3d64c22b1751 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 15 Jan 2024 22:46:23 +0100 Subject: [PATCH 0199/1406] leds: triggers: default-on: Add module alias for module auto-loading A bigger number of board device tree files, plus few drivers, set default-on as default trigger for LED's. Therefore add an alias for module auto-loading. Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/7e94d26b-d772-4a07-b0f6-bb3111b9ff75@gmail.com Signed-off-by: Lee Jones --- drivers/leds/trigger/ledtrig-default-on.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/leds/trigger/ledtrig-default-on.c b/drivers/leds/trigger/ledtrig-default-on.c index 8207f85eceb168..8678e64a5c3376 100644 --- a/drivers/leds/trigger/ledtrig-default-on.c +++ b/drivers/leds/trigger/ledtrig-default-on.c @@ -28,3 +28,4 @@ module_led_trigger(defon_led_trigger); MODULE_AUTHOR("Nick Forbes "); MODULE_DESCRIPTION("Default-ON LED trigger"); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("ledtrig:default-on"); From cf177262ac22eee478a339b1c7a5c4d2f5da904a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Wed, 17 Jan 2024 16:17:36 +0100 Subject: [PATCH 0200/1406] dt-bindings: leds: Add FUNCTION defines for per-band WLANs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most wireless routers and access points can operate in multiple bands simultaneously. Vendors often equip their devices with per-band LEDs. Add defines for those very common functions to allow cleaner & clearer bindings. Signed-off-by: Rafał Miłecki Acked-by: Rob Herring Link: https://lore.kernel.org/r/20240117151736.27440-1-zajec5@gmail.com Signed-off-by: Lee Jones --- include/dt-bindings/leds/common.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/dt-bindings/leds/common.h b/include/dt-bindings/leds/common.h index 9a0d33d027fff4..c56785bb9c9c49 100644 --- a/include/dt-bindings/leds/common.h +++ b/include/dt-bindings/leds/common.h @@ -101,6 +101,9 @@ #define LED_FUNCTION_USB "usb" #define LED_FUNCTION_WAN "wan" #define LED_FUNCTION_WLAN "wlan" +#define LED_FUNCTION_WLAN_2GHZ "wlan-2ghz" +#define LED_FUNCTION_WLAN_5GHZ "wlan-5ghz" +#define LED_FUNCTION_WLAN_6GHZ "wlan-6ghz" #define LED_FUNCTION_WPS "wps" #endif /* __DT_BINDINGS_LEDS_H */ From a7f3597e61b845d1c30b598ef7258edc08267258 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 9 Dec 2023 23:54:51 +0100 Subject: [PATCH 0201/1406] leds: trigger: panic: Simplify led_trigger_set_panic I don't see why we iterate over all triggers to find the panic trigger. We *are* the panic trigger. Therefore we also know that the panic trigger doesn't have an activate() hook. So we can simplify the code significantly. 
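The "no activate() hook" reasoning follows from how this trigger is created: ledtrig-panic registers through the simple-trigger helper, which allocates a bare struct led_trigger with only the name filled in, so ->activate stays NULL here. A sketch of the init path, paraphrased rather than quoted from the file:

    #include <linux/init.h>
    #include <linux/leds.h>

    static struct led_trigger *trigger;

    static int __init ledtrig_panic_init(void)
    {
    	/*
    	 * led_trigger_register_simple() kzallocs the trigger and sets
    	 * only ->name, which is why the rewritten function below can
    	 * drop the trig->activate check entirely.
    	 */
    	led_trigger_register_simple("panic", &trigger);
    	/* panic notifier registration omitted in this sketch */
    	return 0;
    }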
Signed-off-by: Heiner Kallweit Reviewed-by: Jacek Anaszewski Link: https://lore.kernel.org/r/84c0fa67-2f03-4474-aa75-914d65d88dd0@gmail.com Signed-off-by: Lee Jones --- drivers/leds/trigger/ledtrig-panic.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/drivers/leds/trigger/ledtrig-panic.c b/drivers/leds/trigger/ledtrig-panic.c index 5a6b21bfeb9af4..1d49c10780910c 100644 --- a/drivers/leds/trigger/ledtrig-panic.c +++ b/drivers/leds/trigger/ledtrig-panic.c @@ -21,24 +21,15 @@ static struct led_trigger *trigger; */ static void led_trigger_set_panic(struct led_classdev *led_cdev) { - struct led_trigger *trig; + if (led_cdev->trigger) + list_del(&led_cdev->trig_list); + list_add_tail(&led_cdev->trig_list, &trigger->led_cdevs); - list_for_each_entry(trig, &trigger_list, next_trig) { - if (strcmp("panic", trig->name)) - continue; - if (led_cdev->trigger) - list_del(&led_cdev->trig_list); - list_add_tail(&led_cdev->trig_list, &trig->led_cdevs); + /* Avoid the delayed blink path */ + led_cdev->blink_delay_on = 0; + led_cdev->blink_delay_off = 0; - /* Avoid the delayed blink path */ - led_cdev->blink_delay_on = 0; - led_cdev->blink_delay_off = 0; - - led_cdev->trigger = trig; - if (trig->activate) - trig->activate(led_cdev); - break; - } + led_cdev->trigger = trigger; } static int led_trigger_panic_notifier(struct notifier_block *nb, From 11173194833f4cad1fb657274140d637ff378b38 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 19 Jan 2024 23:26:30 +0000 Subject: [PATCH 0202/1406] leds: aw200xx: Make read-only array coeff_table static const Don't populate the read-only array coeff_table on the stack at run time, instead make it static const. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20240119232630.2752239-1-colin.i.king@gmail.com Signed-off-by: Lee Jones --- drivers/leds/leds-aw200xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/leds/leds-aw200xx.c b/drivers/leds/leds-aw200xx.c index f584a7f98fc5b8..6c8c9f2c19e339 100644 --- a/drivers/leds/leds-aw200xx.c +++ b/drivers/leds/leds-aw200xx.c @@ -282,7 +282,7 @@ static int aw200xx_set_imax(const struct aw200xx *const chip, u32 led_imax_uA) { u32 g_imax_uA = aw200xx_imax_to_global(chip, led_imax_uA); - u32 coeff_table[] = {1, 2, 3, 4, 6, 8, 12, 16}; + static const u32 coeff_table[] = {1, 2, 3, 4, 6, 8, 12, 16}; u32 gccr_imax = UINT_MAX; u32 cur_imax = 0; int i; From ac3bd9ed1f55da7901c374c42b7c55efc552b578 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 9 Jan 2024 10:06:39 +0100 Subject: [PATCH 0203/1406] leds: Remove led_init_default_state_get() and devm_led_classdev_register_ext() stubs These two functions have stub implementations that are called when NEW_LEDS and/or LEDS_CLASS are disabled, theoretically allowing drivers to optionally use the LED subsystem. However, this has never really worked because a built-in driver is unable to link against these functions if the LED class is in a loadable module. Heiner ran into this problem with a driver that newly gained a LEDS_CLASS dependency and suggested using an IS_REACHABLE() check. This is the reverse approach, removing the stub entirely to acknowledge that it is pointless in its current form, and that not having it avoids misleading developers into thinking that they can rely on it. This survived around 1000 randconfig builds to validate that any callers of the interface already have the correct Kconfig dependency, with the exception of the one that Heiner just added.
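For reference, the IS_REACHABLE() guard that Heiner suggested (and that this patch deliberately avoids) would have looked roughly like this at a call site; a sketch only, not what was merged:

    #include <linux/kconfig.h>
    #include <linux/leds.h>

    static int example_register_led(struct device *parent,
    				struct led_classdev *led_cdev,
    				struct led_init_data *init_data)
    {
    /*
     * IS_ENABLED(CONFIG_LEDS_CLASS) is true for =y and =m alike, so a
     * built-in caller still emits a call it cannot link when the class
     * is modular; IS_REACHABLE() additionally checks that the symbol is
     * linkable from the current context.
     */
    #if IS_REACHABLE(CONFIG_LEDS_CLASS)
    	return devm_led_classdev_register_ext(parent, led_cdev, init_data);
    #else
    	return 0;	/* LED class unreachable from this caller: skip */
    #endif
    }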
Cc: Heiner Kallweit Link: https://lore.kernel.org/linux-leds/0f6f432b-c650-4bb8-a1b5-fe3372804d52@gmail.com/T/#u Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240109090715.982332-1-arnd@kernel.org Signed-off-by: Lee Jones --- include/linux/leds.h | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/include/linux/leds.h b/include/linux/leds.h index 4754b02d3a2c58..7598d472903a76 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -82,15 +82,7 @@ struct led_init_data { bool devname_mandatory; }; -#if IS_ENABLED(CONFIG_NEW_LEDS) enum led_default_state led_init_default_state_get(struct fwnode_handle *fwnode); -#else -static inline enum led_default_state -led_init_default_state_get(struct fwnode_handle *fwnode) -{ - return LEDS_DEFSTATE_OFF; -} -#endif struct led_hw_trigger_type { int dummy; @@ -279,20 +271,9 @@ static inline int led_classdev_register(struct device *parent, return led_classdev_register_ext(parent, led_cdev, NULL); } -#if IS_ENABLED(CONFIG_LEDS_CLASS) int devm_led_classdev_register_ext(struct device *parent, struct led_classdev *led_cdev, struct led_init_data *init_data); -#else -static inline int -devm_led_classdev_register_ext(struct device *parent, - struct led_classdev *led_cdev, - struct led_init_data *init_data) -{ - return 0; -} -#endif - static inline int devm_led_classdev_register(struct device *parent, struct led_classdev *led_cdev) { From ff7d3c763f63502b627575b02bee8ef46068233a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 9 Jan 2024 10:06:40 +0100 Subject: [PATCH 0204/1406] leds: Make flash and multicolor dependencies unconditional Along the same lines as making devm_led_classdev_register() declared extern unconditional, do the same thing for the two sub-classes that have similar stubs. The users of these interfaces go to great lengths to allow building with both the generic leds API and the extended version, but realistically there is not much use in this, so just simplify it to always rely on it and remove the confusing fallback logic. Signed-off-by: Arnd Bergmann Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240109090715.982332-2-arnd@kernel.org Signed-off-by: Lee Jones --- drivers/leds/Kconfig | 4 ++-- drivers/leds/flash/Kconfig | 4 ++-- drivers/staging/greybus/Kconfig | 2 +- drivers/staging/greybus/light.c | 21 -------------------- include/linux/led-class-flash.h | 24 ----------------------- include/linux/led-class-multicolor.h | 29 ---------------------------- 6 files changed, 5 insertions(+), 79 deletions(-) diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index d721b254e1e450..9613a45a35bd15 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -395,7 +395,7 @@ config LEDS_LP3952 config LEDS_LP50XX tristate "LED Support for TI LP5036/30/24/18/12/09 LED driver chip" depends on LEDS_CLASS && REGMAP_I2C - depends on LEDS_CLASS_MULTICOLOR || !LEDS_CLASS_MULTICOLOR + depends on LEDS_CLASS_MULTICOLOR help If you say yes here you get support for the Texas Instruments LP5036, LP5030, LP5024, LP5018, LP5012 and LP5009 LED driver. 
@@ -406,7 +406,7 @@ config LEDS_LP50XX config LEDS_LP55XX_COMMON tristate "Common Driver for TI/National LP5521/5523/55231/5562/8501" depends on LEDS_CLASS - depends on LEDS_CLASS_MULTICOLOR || !LEDS_CLASS_MULTICOLOR + depends on LEDS_CLASS_MULTICOLOR depends on OF depends on I2C select FW_LOADER diff --git a/drivers/leds/flash/Kconfig b/drivers/leds/flash/Kconfig index 4e08dbc057096f..b95f90cd57499b 100644 --- a/drivers/leds/flash/Kconfig +++ b/drivers/leds/flash/Kconfig @@ -51,8 +51,8 @@ config LEDS_MAX77693 config LEDS_MT6360 tristate "LED Support for Mediatek MT6360 PMIC" depends on LEDS_CLASS && OF - depends on LEDS_CLASS_FLASH || !LEDS_CLASS_FLASH - depends on LEDS_CLASS_MULTICOLOR || !LEDS_CLASS_MULTICOLOR + depends on LEDS_CLASS_FLASH + depends on LEDS_CLASS_MULTICOLOR depends on V4L2_FLASH_LED_CLASS || !V4L2_FLASH_LED_CLASS depends on MFD_MT6360 help diff --git a/drivers/staging/greybus/Kconfig b/drivers/staging/greybus/Kconfig index 927cfa4bc9898c..1e745a8d439c89 100644 --- a/drivers/staging/greybus/Kconfig +++ b/drivers/staging/greybus/Kconfig @@ -64,7 +64,7 @@ config GREYBUS_HID config GREYBUS_LIGHT tristate "Greybus LED Class driver" - depends on LEDS_CLASS + depends on LEDS_CLASS_FLASH help Select this option if you have a device that follows the Greybus LED Class specification. diff --git a/drivers/staging/greybus/light.c b/drivers/staging/greybus/light.c index 87d36948c61067..d62f97249aca67 100644 --- a/drivers/staging/greybus/light.c +++ b/drivers/staging/greybus/light.c @@ -29,13 +29,9 @@ struct gb_channel { struct attribute_group *attr_group; const struct attribute_group **attr_groups; struct led_classdev *led; -#if IS_REACHABLE(CONFIG_LEDS_CLASS_FLASH) struct led_classdev_flash fled; struct led_flash_setting intensity_uA; struct led_flash_setting timeout_us; -#else - struct led_classdev cled; -#endif struct gb_light *light; bool is_registered; bool releasing; @@ -84,7 +80,6 @@ static bool is_channel_flash(struct gb_channel *channel) | GB_CHANNEL_MODE_INDICATOR)); } -#if IS_REACHABLE(CONFIG_LEDS_CLASS_FLASH) static struct gb_channel *get_channel_from_cdev(struct led_classdev *cdev) { struct led_classdev_flash *fled_cdev = lcdev_to_flcdev(cdev); @@ -153,22 +148,6 @@ static int __gb_lights_flash_brightness_set(struct gb_channel *channel) return __gb_lights_flash_intensity_set(channel, intensity); } -#else -static struct gb_channel *get_channel_from_cdev(struct led_classdev *cdev) -{ - return container_of(cdev, struct gb_channel, cled); -} - -static struct led_classdev *get_channel_cdev(struct gb_channel *channel) -{ - return &channel->cled; -} - -static int __gb_lights_flash_brightness_set(struct gb_channel *channel) -{ - return 0; -} -#endif static int gb_lights_color_set(struct gb_channel *channel, u32 color); static int gb_lights_fade_set(struct gb_channel *channel); diff --git a/include/linux/led-class-flash.h b/include/linux/led-class-flash.h index 612b4cab3819ec..36df927ec4b7dc 100644 --- a/include/linux/led-class-flash.h +++ b/include/linux/led-class-flash.h @@ -85,7 +85,6 @@ static inline struct led_classdev_flash *lcdev_to_flcdev( return container_of(lcdev, struct led_classdev_flash, led_cdev); } -#if IS_ENABLED(CONFIG_LEDS_CLASS_FLASH) /** * led_classdev_flash_register_ext - register a new object of LED class with * init data and with support for flash LEDs @@ -116,29 +115,6 @@ int devm_led_classdev_flash_register_ext(struct device *parent, void devm_led_classdev_flash_unregister(struct device *parent, struct led_classdev_flash *fled_cdev); -#else - -static 
inline int led_classdev_flash_register_ext(struct device *parent, - struct led_classdev_flash *fled_cdev, - struct led_init_data *init_data) -{ - return 0; -} - -static inline void led_classdev_flash_unregister(struct led_classdev_flash *fled_cdev) {}; -static inline int devm_led_classdev_flash_register_ext(struct device *parent, - struct led_classdev_flash *fled_cdev, - struct led_init_data *init_data) -{ - return 0; -} - -static inline void devm_led_classdev_flash_unregister(struct device *parent, - struct led_classdev_flash *fled_cdev) -{}; - -#endif /* IS_ENABLED(CONFIG_LEDS_CLASS_FLASH) */ - static inline int led_classdev_flash_register(struct device *parent, struct led_classdev_flash *fled_cdev) { diff --git a/include/linux/led-class-multicolor.h b/include/linux/led-class-multicolor.h index 210d57bcd767aa..db9f34c6736e9a 100644 --- a/include/linux/led-class-multicolor.h +++ b/include/linux/led-class-multicolor.h @@ -30,7 +30,6 @@ static inline struct led_classdev_mc *lcdev_to_mccdev( return container_of(led_cdev, struct led_classdev_mc, led_cdev); } -#if IS_ENABLED(CONFIG_LEDS_CLASS_MULTICOLOR) /** * led_classdev_multicolor_register_ext - register a new object of led_classdev * class with support for multicolor LEDs @@ -64,34 +63,6 @@ int devm_led_classdev_multicolor_register_ext(struct device *parent, void devm_led_classdev_multicolor_unregister(struct device *parent, struct led_classdev_mc *mcled_cdev); -#else - -static inline int led_classdev_multicolor_register_ext(struct device *parent, - struct led_classdev_mc *mcled_cdev, - struct led_init_data *init_data) -{ - return 0; -} - -static inline void led_classdev_multicolor_unregister(struct led_classdev_mc *mcled_cdev) {}; -static inline int led_mc_calc_color_components(struct led_classdev_mc *mcled_cdev, - enum led_brightness brightness) -{ - return 0; -} - -static inline int devm_led_classdev_multicolor_register_ext(struct device *parent, - struct led_classdev_mc *mcled_cdev, - struct led_init_data *init_data) -{ - return 0; -} - -static inline void devm_led_classdev_multicolor_unregister(struct device *parent, - struct led_classdev_mc *mcled_cdev) -{}; - -#endif /* IS_ENABLED(CONFIG_LEDS_CLASS_MULTICOLOR) */ static inline int led_classdev_multicolor_register(struct device *parent, struct led_classdev_mc *mcled_cdev) From 87ce6eaa089dc246aeb1b6bb56944c5f89b0fbe1 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 31 Jan 2024 15:30:53 +0100 Subject: [PATCH 0205/1406] leds: trigger: Stop exporting trigger_list Commit 682e98564ffb ("leds: trigger: panic: Simplify led_trigger_set_panic") removed the last external user of variable trigger_list. So stop exporting it. If in future a need should arise again to access this variable, I think we better add some accessor instead of exporting the variable directly. 
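If such an accessor were ever needed, it would presumably sit inside drivers/leds/led-triggers.c, next to the lock and the list it protects; a hypothetical shape, with name and signature illustrative only and no part of this series:

    /* Hypothetical iterator-style accessor, as mooted above */
    void led_trigger_for_each(void (*fn)(struct led_trigger *trig, void *data),
    			  void *data)
    {
    	struct led_trigger *trig;

    	down_read(&triggers_list_lock);
    	list_for_each_entry(trig, &trigger_list, next_trig)
    		fn(trig, data);
    	up_read(&triggers_list_lock);
    }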
Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/ca185fb1-3a66-46b9-920e-bfecbe39c6bf@gmail.com Signed-off-by: Lee Jones --- drivers/leds/led-triggers.c | 2 +- drivers/leds/leds.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/leds/led-triggers.c b/drivers/leds/led-triggers.c index 71cb0aee528c81..371000770d75bc 100644 --- a/drivers/leds/led-triggers.c +++ b/drivers/leds/led-triggers.c @@ -23,7 +23,7 @@ * Nests outside led_cdev->trigger_lock */ static DECLARE_RWSEM(triggers_list_lock); -LIST_HEAD(trigger_list); +static LIST_HEAD(trigger_list); /* Used by LED Class */ diff --git a/drivers/leds/leds.h b/drivers/leds/leds.h index 345062ccabdaa7..1138e2ab82e55a 100644 --- a/drivers/leds/leds.h +++ b/drivers/leds/leds.h @@ -30,7 +30,6 @@ ssize_t led_trigger_write(struct file *filp, struct kobject *kobj, extern struct rw_semaphore leds_list_lock; extern struct list_head leds_list; -extern struct list_head trigger_list; extern const char * const led_colors[LED_COLOR_ID_MAX]; #endif /* __LEDS_H_INCLUDED */ From 27c110ccae2b12375f8275ebb15dd941d20ad250 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 31 Jan 2024 15:33:08 +0100 Subject: [PATCH 0206/1406] leds: triggers: Add helper led_match_default_trigger Avoid code duplication and factor out common functionality to new helper led_match_default_trigger(). Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/d78eef6f-c18c-4546-b83e-6d1890849154@gmail.com Signed-off-by: Lee Jones --- drivers/leds/led-triggers.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/drivers/leds/led-triggers.c b/drivers/leds/led-triggers.c index 371000770d75bc..0f5ac30053ad26 100644 --- a/drivers/leds/led-triggers.c +++ b/drivers/leds/led-triggers.c @@ -247,6 +247,19 @@ void led_trigger_remove(struct led_classdev *led_cdev) } EXPORT_SYMBOL_GPL(led_trigger_remove); +static bool led_match_default_trigger(struct led_classdev *led_cdev, + struct led_trigger *trig) +{ + if (!strcmp(led_cdev->default_trigger, trig->name) && + trigger_relevant(led_cdev, trig)) { + led_cdev->flags |= LED_INIT_DEFAULT_TRIGGER; + led_trigger_set(led_cdev, trig); + return true; + } + + return false; +} + void led_trigger_set_default(struct led_classdev *led_cdev) { struct led_trigger *trig; @@ -258,13 +271,9 @@ void led_trigger_set_default(struct led_classdev *led_cdev) down_read(&triggers_list_lock); down_write(&led_cdev->trigger_lock); list_for_each_entry(trig, &trigger_list, next_trig) { - if (!strcmp(led_cdev->default_trigger, trig->name) && - trigger_relevant(led_cdev, trig)) { - found = true; - led_cdev->flags |= LED_INIT_DEFAULT_TRIGGER; - led_trigger_set(led_cdev, trig); + found = led_match_default_trigger(led_cdev, trig); + if (found) break; - } } up_write(&led_cdev->trigger_lock); up_read(&triggers_list_lock); @@ -306,12 +315,8 @@ int led_trigger_register(struct led_trigger *trig) down_read(&leds_list_lock); list_for_each_entry(led_cdev, &leds_list, node) { down_write(&led_cdev->trigger_lock); - if (!led_cdev->trigger && led_cdev->default_trigger && - !strcmp(led_cdev->default_trigger, trig->name) && - trigger_relevant(led_cdev, trig)) { - led_cdev->flags |= LED_INIT_DEFAULT_TRIGGER; - led_trigger_set(led_cdev, trig); - } + if (!led_cdev->trigger && led_cdev->default_trigger) + led_match_default_trigger(led_cdev, trig); up_write(&led_cdev->trigger_lock); } up_read(&leds_list_lock); From 3e3c40d6b1fe42afbc28b7653c852c2e6561ad05 Mon Sep 17 00:00:00 2001 From: Amitesh Singh Date: Sat, 3 
Feb 2024 21:55:24 +0530 Subject: [PATCH 0207/1406] leds: pca963x: Add support for suspend and resume This implements power management for pca9633 which enables device sleep and resume on system-wide sleep/hibernation Signed-off-by: Amitesh Singh Link: https://lore.kernel.org/r/20240203162524.343936-1-singh.amitesh@gmail.com Signed-off-by: Lee Jones --- drivers/leds/leds-pca963x.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/drivers/leds/leds-pca963x.c b/drivers/leds/leds-pca963x.c index 47223c850e4b4d..b53905da359201 100644 --- a/drivers/leds/leds-pca963x.c +++ b/drivers/leds/leds-pca963x.c @@ -39,6 +39,7 @@ #define PCA963X_LED_PWM 0x2 /* Controlled through PWM */ #define PCA963X_LED_GRP_PWM 0x3 /* Controlled through PWM/GRPPWM */ +#define PCA963X_MODE1_SLEEP 0x04 /* Normal mode or Low Power mode, oscillator off */ #define PCA963X_MODE2_OUTDRV 0x04 /* Open-drain or totem pole */ #define PCA963X_MODE2_INVRT 0x10 /* Normal or inverted direction */ #define PCA963X_MODE2_DMBLNK 0x20 /* Enable blinking */ @@ -380,6 +381,32 @@ static int pca963x_register_leds(struct i2c_client *client, return ret; } +static int pca963x_suspend(struct device *dev) +{ + struct pca963x *chip = dev_get_drvdata(dev); + u8 reg; + + reg = i2c_smbus_read_byte_data(chip->client, PCA963X_MODE1); + reg = reg | BIT(PCA963X_MODE1_SLEEP); + i2c_smbus_write_byte_data(chip->client, PCA963X_MODE1, reg); + + return 0; +} + +static int pca963x_resume(struct device *dev) +{ + struct pca963x *chip = dev_get_drvdata(dev); + u8 reg; + + reg = i2c_smbus_read_byte_data(chip->client, PCA963X_MODE1); + reg = reg & ~BIT(PCA963X_MODE1_SLEEP); + i2c_smbus_write_byte_data(chip->client, PCA963X_MODE1, reg); + + return 0; +} + +static DEFINE_SIMPLE_DEV_PM_OPS(pca963x_pm, pca963x_suspend, pca963x_resume); + static const struct of_device_id of_pca963x_match[] = { { .compatible = "nxp,pca9632", }, { .compatible = "nxp,pca9633", }, @@ -430,6 +457,7 @@ static struct i2c_driver pca963x_driver = { .driver = { .name = "leds-pca963x", .of_match_table = of_pca963x_match, + .pm = pm_sleep_ptr(&pca963x_pm) }, .probe = pca963x_probe, .id_table = pca963x_id, From af8d42cae2941cb87087b8e6ee314935ac8adc3a Mon Sep 17 00:00:00 2001 From: Stefan Kalscheuer Date: Sun, 4 Feb 2024 16:07:26 +0100 Subject: [PATCH 0208/1406] leds: spi-byte: Use devm_led_classdev_register_ext() Use extended classdev registration to generate generic device names from color and function enums instead of reading only the label from the device tree. 
Signed-off-by: Stefan Kalscheuer Link: https://lore.kernel.org/r/20240204150726.29783-1-stefan@stklcode.de Signed-off-by: Lee Jones --- drivers/leds/leds-spi-byte.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/leds/leds-spi-byte.c b/drivers/leds/leds-spi-byte.c index 9d91f21842f2b5..96296db5f410de 100644 --- a/drivers/leds/leds-spi-byte.c +++ b/drivers/leds/leds-spi-byte.c @@ -83,7 +83,7 @@ static int spi_byte_probe(struct spi_device *spi) struct device_node *child; struct device *dev = &spi->dev; struct spi_byte_led *led; - const char *name = "leds-spi-byte::"; + struct led_init_data init_data = {}; const char *state; int ret; @@ -97,12 +97,9 @@ static int spi_byte_probe(struct spi_device *spi) if (!led) return -ENOMEM; - of_property_read_string(child, "label", &name); - strscpy(led->name, name, sizeof(led->name)); led->spi = spi; mutex_init(&led->mutex); led->cdef = device_get_match_data(dev); - led->ldev.name = led->name; led->ldev.brightness = LED_OFF; led->ldev.max_brightness = led->cdef->max_value - led->cdef->off_value; led->ldev.brightness_set_blocking = spi_byte_brightness_set_blocking; @@ -120,7 +117,11 @@ static int spi_byte_probe(struct spi_device *spi) spi_byte_brightness_set_blocking(&led->ldev, led->ldev.brightness); - ret = devm_led_classdev_register(&spi->dev, &led->ldev); + init_data.fwnode = of_fwnode_handle(child); + init_data.devicename = "leds-spi-byte"; + init_data.default_label = ":"; + + ret = devm_led_classdev_register_ext(&spi->dev, &led->ldev, &init_data); if (ret) { mutex_destroy(&led->mutex); return ret; From f63b9c34b9dd18dedf55d6e14dd47d015b38b838 Mon Sep 17 00:00:00 2001 From: Marijn Suijten Date: Sun, 4 Feb 2024 18:24:20 +0100 Subject: [PATCH 0209/1406] leds: qcom-lpg: Add PM660L configuration and compatible Inherit PM660L PMIC LPG/triled block configuration from downstream drivers and DT sources, consisting of a triled block with automatic trickle charge control and source selection, three colored led channels belonging to the synchronized triled block and one loose PWM channel. 
Signed-off-by: Marijn Suijten Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20240204-pm660l-lpg-v5-1-2f54d1a0894b@somainline.org Signed-off-by: Lee Jones --- drivers/leds/rgb/leds-qcom-lpg.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/leds/rgb/leds-qcom-lpg.c b/drivers/leds/rgb/leds-qcom-lpg.c index 6226864145a653..0c7d8388667033 100644 --- a/drivers/leds/rgb/leds-qcom-lpg.c +++ b/drivers/leds/rgb/leds-qcom-lpg.c @@ -1644,6 +1644,23 @@ static int lpg_probe(struct platform_device *pdev) return lpg_add_pwm(lpg); } +static const struct lpg_data pm660l_lpg_data = { + .lut_base = 0xb000, + .lut_size = 49, + + .triled_base = 0xd000, + .triled_has_atc_ctl = true, + .triled_has_src_sel = true, + + .num_channels = 4, + .channels = (const struct lpg_channel_data[]) { + { .base = 0xb100, .triled_mask = BIT(5) }, + { .base = 0xb200, .triled_mask = BIT(6) }, + { .base = 0xb300, .triled_mask = BIT(7) }, + { .base = 0xb400 }, + }, +}; + static const struct lpg_data pm8916_pwm_data = { .num_channels = 1, .channels = (const struct lpg_channel_data[]) { @@ -1790,6 +1807,7 @@ static const struct lpg_data pmk8550_pwm_data = { }; static const struct of_device_id lpg_of_table[] = { + { .compatible = "qcom,pm660l-lpg", .data = &pm660l_lpg_data }, { .compatible = "qcom,pm8150b-lpg", .data = &pm8150b_lpg_data }, { .compatible = "qcom,pm8150l-lpg", .data = &pm8150l_lpg_data }, { .compatible = "qcom,pm8350c-pwm", .data = &pm8350c_pwm_data }, From 12ce20e02e532f101b725d71c52a36c5cc8ad1e6 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Sun, 4 Feb 2024 00:54:01 +0100 Subject: [PATCH 0210/1406] leds: trigger: netdev: Fix kernel panic on interface rename trig notify Commit d5e01266e7f5 ("leds: trigger: netdev: add additional specific link speed mode"), among its various changes, reworked the way the LINKUP mode is set by commit cee4bd16c319 ("leds: trigger: netdev: Recheck NETDEV_LED_MODE_LINKUP on dev rename") and moved it to a generic function. This changed the logic: in the previous implementation the dev from the trigger event was used to check whether the carrier was OK, but in the new implementation with the generic function, the dev in trigger_data is used instead. This is problematic and causes a possible kernel panic, because the dev in trigger_data still references the old one; the new one (passed from the trigger event) has yet to be held and saved in the trigger_data struct (done in the NETDEV_REGISTER case). When get_device_state() is called, an invalid net_dev is used and this causes a kernel panic. To handle this correctly, move the call to get_device_state() after the new net_dev is correctly set in trigger_data (in the NETDEV_REGISTER case) and correctly parse the new dev.
Fixes: d5e01266e7f5 ("leds: trigger: netdev: add additional specific link speed mode") Cc: stable@vger.kernel.org Signed-off-by: Christian Marangi Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20240203235413.1146-1-ansuelsmth@gmail.com Signed-off-by: Lee Jones --- drivers/leds/trigger/ledtrig-netdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/leds/trigger/ledtrig-netdev.c b/drivers/leds/trigger/ledtrig-netdev.c index f0eb5820c48ccb..ea00f6c708826b 100644 --- a/drivers/leds/trigger/ledtrig-netdev.c +++ b/drivers/leds/trigger/ledtrig-netdev.c @@ -581,12 +581,12 @@ static int netdev_trig_notify(struct notifier_block *nb, trigger_data->duplex = DUPLEX_UNKNOWN; switch (evt) { case NETDEV_CHANGENAME: - get_device_state(trigger_data); - fallthrough; case NETDEV_REGISTER: dev_put(trigger_data->net_dev); dev_hold(dev); trigger_data->net_dev = dev; + if (evt == NETDEV_CHANGENAME) + get_device_state(trigger_data); break; case NETDEV_UNREGISTER: dev_put(trigger_data->net_dev); From 10c8ef5621acae78857d7680fb93597be2d83ed6 Mon Sep 17 00:00:00 2001 From: Aleksandrs Vinarskis Date: Thu, 21 Dec 2023 19:51:41 +0100 Subject: [PATCH 0211/1406] mfd: intel-lpss: Switch to generalized quirk table Introduce generic quirk table, and port existing walkaround for select Microsoft devices to it. This is a preparation for QUIRK_CLOCK_DIVIDER_UNITY. Signed-off-by: Aleksandrs Vinarskis Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20231221185142.9224-2-alex.vinarskis@gmail.com Signed-off-by: Lee Jones --- drivers/mfd/intel-lpss-pci.c | 23 +++++++++++++++-------- drivers/mfd/intel-lpss.c | 2 +- drivers/mfd/intel-lpss.h | 9 ++++++++- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/drivers/mfd/intel-lpss-pci.c b/drivers/mfd/intel-lpss-pci.c index 4621d3950b8f9f..07713a2f694f32 100644 --- a/drivers/mfd/intel-lpss-pci.c +++ b/drivers/mfd/intel-lpss-pci.c @@ -23,12 +23,17 @@ #include "intel-lpss.h" -/* Some DSDTs have an unused GEXP ACPI device conflicting with I2C4 resources */ -static const struct pci_device_id ignore_resource_conflicts_ids[] = { - /* Microsoft Surface Go (version 1) I2C4 */ - { PCI_DEVICE_SUB(PCI_VENDOR_ID_INTEL, 0x9d64, 0x152d, 0x1182), }, - /* Microsoft Surface Go 2 I2C4 */ - { PCI_DEVICE_SUB(PCI_VENDOR_ID_INTEL, 0x9d64, 0x152d, 0x1237), }, +static const struct pci_device_id quirk_ids[] = { + { + /* Microsoft Surface Go (version 1) I2C4 */ + PCI_DEVICE_SUB(PCI_VENDOR_ID_INTEL, 0x9d64, 0x152d, 0x1182), + .driver_data = QUIRK_IGNORE_RESOURCE_CONFLICTS, + }, + { + /* Microsoft Surface Go 2 I2C4 */ + PCI_DEVICE_SUB(PCI_VENDOR_ID_INTEL, 0x9d64, 0x152d, 0x1237), + .driver_data = QUIRK_IGNORE_RESOURCE_CONFLICTS, + }, { } }; @@ -36,6 +41,7 @@ static int intel_lpss_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { const struct intel_lpss_platform_info *data = (void *)id->driver_data; + const struct pci_device_id *quirk_pci_info; struct intel_lpss_platform_info *info; int ret; @@ -55,8 +61,9 @@ static int intel_lpss_pci_probe(struct pci_dev *pdev, info->mem = pci_resource_n(pdev, 0); info->irq = pci_irq_vector(pdev, 0); - if (pci_match_id(ignore_resource_conflicts_ids, pdev)) - info->ignore_resource_conflicts = true; + quirk_pci_info = pci_match_id(quirk_ids, pdev); + if (quirk_pci_info) + info->quirks = quirk_pci_info->driver_data; pdev->d3cold_delay = 0; diff --git a/drivers/mfd/intel-lpss.c b/drivers/mfd/intel-lpss.c index eff423f7dd2847..aafa0da5f8dbfd 100644 --- a/drivers/mfd/intel-lpss.c +++ 
b/drivers/mfd/intel-lpss.c @@ -412,7 +412,7 @@ int intel_lpss_probe(struct device *dev, return ret; lpss->cell->swnode = info->swnode; - lpss->cell->ignore_resource_conflicts = info->ignore_resource_conflicts; + lpss->cell->ignore_resource_conflicts = info->quirks & QUIRK_IGNORE_RESOURCE_CONFLICTS; intel_lpss_init_dev(lpss); diff --git a/drivers/mfd/intel-lpss.h b/drivers/mfd/intel-lpss.h index c1d72b117ed5e6..2fa9ef9162580e 100644 --- a/drivers/mfd/intel-lpss.h +++ b/drivers/mfd/intel-lpss.h @@ -11,16 +11,23 @@ #ifndef __MFD_INTEL_LPSS_H #define __MFD_INTEL_LPSS_H +#include #include +/* + * Some DSDTs have an unused GEXP ACPI device conflicting with I2C4 resources. + * Set to ignore resource conflicts with ACPI declared SystemMemory regions. + */ +#define QUIRK_IGNORE_RESOURCE_CONFLICTS BIT(0) + struct device; struct resource; struct software_node; struct intel_lpss_platform_info { struct resource *mem; - bool ignore_resource_conflicts; int irq; + unsigned int quirks; unsigned long clk_rate; const char *clk_con_id; const struct software_node *swnode; From acad8f56520be4259d1e74e05b02f7a9ef98f838 Mon Sep 17 00:00:00 2001 From: Aleksandrs Vinarskis Date: Thu, 21 Dec 2023 19:51:42 +0100 Subject: [PATCH 0212/1406] mfd: intel-lpss: Introduce QUIRK_CLOCK_DIVIDER_UNITY for XPS 9530 Some devices (eg. Dell XPS 9530, 2023) due to a firmware bug have a misconfigured clock divider, which should've been 1:1. This introduces quirk which conditionally re-configures the clock divider to 1:1. Signed-off-by: Aleksandrs Vinarskis Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20231221185142.9224-3-alex.vinarskis@gmail.com Signed-off-by: Lee Jones --- drivers/mfd/intel-lpss-pci.c | 5 +++++ drivers/mfd/intel-lpss.c | 7 +++++++ drivers/mfd/intel-lpss.h | 5 +++++ 3 files changed, 17 insertions(+) diff --git a/drivers/mfd/intel-lpss-pci.c b/drivers/mfd/intel-lpss-pci.c index 07713a2f694f32..8c00e0c695c5b9 100644 --- a/drivers/mfd/intel-lpss-pci.c +++ b/drivers/mfd/intel-lpss-pci.c @@ -34,6 +34,11 @@ static const struct pci_device_id quirk_ids[] = { PCI_DEVICE_SUB(PCI_VENDOR_ID_INTEL, 0x9d64, 0x152d, 0x1237), .driver_data = QUIRK_IGNORE_RESOURCE_CONFLICTS, }, + { + /* Dell XPS 9530 (2023) */ + PCI_DEVICE_SUB(PCI_VENDOR_ID_INTEL, 0x51fb, 0x1028, 0x0beb), + .driver_data = QUIRK_CLOCK_DIVIDER_UNITY, + }, { } }; diff --git a/drivers/mfd/intel-lpss.c b/drivers/mfd/intel-lpss.c index aafa0da5f8dbfd..2a9018112dfc86 100644 --- a/drivers/mfd/intel-lpss.c +++ b/drivers/mfd/intel-lpss.c @@ -300,6 +300,7 @@ static int intel_lpss_register_clock_divider(struct intel_lpss *lpss, { char name[32]; struct clk *tmp = *clk; + int ret; snprintf(name, sizeof(name), "%s-enable", devname); tmp = clk_register_gate(NULL, name, __clk_get_name(tmp), 0, @@ -316,6 +317,12 @@ static int intel_lpss_register_clock_divider(struct intel_lpss *lpss, return PTR_ERR(tmp); *clk = tmp; + if (lpss->info->quirks & QUIRK_CLOCK_DIVIDER_UNITY) { + ret = clk_set_rate(tmp, lpss->info->clk_rate); + if (ret) + return ret; + } + snprintf(name, sizeof(name), "%s-update", devname); tmp = clk_register_gate(NULL, name, __clk_get_name(tmp), CLK_SET_RATE_PARENT, lpss->priv, 31, 0, NULL); diff --git a/drivers/mfd/intel-lpss.h b/drivers/mfd/intel-lpss.h index 2fa9ef9162580e..6f8f668f4c6f08 100644 --- a/drivers/mfd/intel-lpss.h +++ b/drivers/mfd/intel-lpss.h @@ -19,6 +19,11 @@ * Set to ignore resource conflicts with ACPI declared SystemMemory regions. 
*/ #define QUIRK_IGNORE_RESOURCE_CONFLICTS BIT(0) +/* + * Some devices have misconfigured clock divider due to a firmware bug. + * Set this to force the clock divider to 1:1 ratio. + */ +#define QUIRK_CLOCK_DIVIDER_UNITY BIT(1) struct device; struct resource; From 012a97942c5380bd38cf77a0b619f0522895b669 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 29 Dec 2023 16:50:59 +0200 Subject: [PATCH 0213/1406] mfd: lpc_ich: Use ALIGN_DOWN() to obtain the start of the SPI base range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of open coding, use ALIGN_DOWN() for alignment. Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231229145059.6138-1-ilpo.jarvinen@linux.intel.com Signed-off-by: Lee Jones --- drivers/mfd/lpc_ich.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mfd/lpc_ich.c b/drivers/mfd/lpc_ich.c index 73a0e7f9bd3116..f14901660147f5 100644 --- a/drivers/mfd/lpc_ich.c +++ b/drivers/mfd/lpc_ich.c @@ -38,6 +38,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -1321,7 +1322,7 @@ static int lpc_ich_init_spi(struct pci_dev *dev) case INTEL_SPI_BYT: pci_read_config_dword(dev, SPIBASE_BYT, &spi_base); if (spi_base & SPIBASE_BYT_EN) { - res->start = spi_base & ~(SPIBASE_BYT_SZ - 1); + res->start = ALIGN_DOWN(spi_base, SPIBASE_BYT_SZ); res->end = res->start + SPIBASE_BYT_SZ - 1; info->set_writeable = lpc_ich_byt_set_writeable; From 0d036630ad9a6d8b34e4a09dfc851b3b80b3a15a Mon Sep 17 00:00:00 2001 From: Fuyao Kashizuku Date: Wed, 27 Dec 2023 10:01:17 +0800 Subject: [PATCH 0214/1406] mfd: sun4i-gpadc: Correct specified GPADC interrupt numbers The identifiers are used as IRQ resource numbers, where 0 is treated specially. This fixes the sun4i-gpadc-iio probe failure when requesting its IRQs.
The backstack: WARNING: CPU: 3 PID: 1 at drivers/base/platform.c:451 __platform_get_irq_byname+0xb8/0xc4 0 is an invalid IRQ number Modules linked in: CPU: 3 PID: 1 Comm: swapper/0 Not tainted 6.7.0-rc6 #9 Hardware name: Allwinner sun8i Family unwind_backtrace show_stack dump_stack_lvl __warn warn_slowpath_fmt __platform_get_irq_byname platform_get_irq_byname sun4i_irq_init sun4i_gpadc_probe platform_probe really_probe __driver_probe_device driver_probe_device __driver_attach bus_for_each_dev bus_add_driver driver_register do_one_initcall do_initcall_level do_initcalls kernel_init_freeable kernel_init Log reports: sun4i-gpadc-iio sun6i-a31-gpadc-iio.0: error -EINVAL: IRQ FIFO_DATA_PENDING not found sun4i-gpadc-iio: probe of sun6i-a31-gpadc-iio.0 failed with error -22 Signed-off-by: Fuyao Kashizuku Acked-by: Jernej Skrabec Link: https://lore.kernel.org/r/ZYuFbUUus9apiCpq@debian.cyg Signed-off-by: Lee Jones --- include/linux/mfd/sun4i-gpadc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mfd/sun4i-gpadc.h b/include/linux/mfd/sun4i-gpadc.h index ea0ccf33a459ef..021f820f9d52bd 100644 --- a/include/linux/mfd/sun4i-gpadc.h +++ b/include/linux/mfd/sun4i-gpadc.h @@ -81,8 +81,8 @@ #define SUN4I_GPADC_TEMP_DATA 0x20 #define SUN4I_GPADC_DATA 0x24 -#define SUN4I_GPADC_IRQ_FIFO_DATA 0 -#define SUN4I_GPADC_IRQ_TEMP_DATA 1 +#define SUN4I_GPADC_IRQ_FIFO_DATA 1 +#define SUN4I_GPADC_IRQ_TEMP_DATA 2 /* 10s delay before suspending the IP */ #define SUN4I_GPADC_AUTOSUSPEND_DELAY 10000 From 9703ab5697fe0376a22cba108b56b8307d5e0362 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 11 Jan 2024 16:21:13 +0000 Subject: [PATCH 0215/1406] mfd: omap-usb-host: Increase size of buffer to include all possible values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid these nasty W=1 errors: drivers/mfd/omap-usb-host.c: In function ‘usbhs_omap_probe’: drivers/mfd/omap-usb-host.c:706:54: error: ‘_clk’ directive output may be truncated writing 4 bytes into a region of size between 1 and 11 [-Werror=format-truncation=] drivers/mfd/omap-usb-host.c:705:17: note: ‘snprintf’ output between 24 and 34 bytes into a destination of size 30 drivers/mfd/omap-usb-host.c:721:56: error: ‘%d’ directive output may be truncated writing between 1 and 11 bytes into a region of size 8 [-Werror=format-truncation=] drivers/mfd/omap-usb-host.c:721:33: note: directive argument in the range [-2147483640, 2147483647] drivers/mfd/omap-usb-host.c:720:17: note: ‘snprintf’ output between 28 and 38 bytes into a destination of size 30 drivers/mfd/omap-usb-host.c:731:55: error: ‘%d’ directive output may be truncated writing between 1 and 11 bytes into a region of size 9 [-Werror=format-truncation=] drivers/mfd/omap-usb-host.c:731:33: note: directive argument in the range [-2147483640, 2147483647] drivers/mfd/omap-usb-host.c:730:17: note: ‘snprintf’ output between 27 and 37 bytes into a destination of size 30 Cc: Tony Lindgren Cc: Keshava Munegowda Cc: Roger Quadros Cc: linux-omap@vger.kernel.org Signed-off-by: Lee Jones --- drivers/mfd/omap-usb-host.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/omap-usb-host.c b/drivers/mfd/omap-usb-host.c index ebc62033db169d..949feb03d4f8d8 100644 --- a/drivers/mfd/omap-usb-host.c +++ b/drivers/mfd/omap-usb-host.c @@ -699,7 +699,7 @@ static int usbhs_omap_probe(struct platform_device *pdev) } for (i = 0; i < omap->nports; i++) { - char clkname[30]; + char clkname[40]; /* clock names are indexed from 1*/ 
snprintf(clkname, sizeof(clkname), From 5963996d6ffd7fcb5adf48f03c71fdfa00752b7e Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 15 Jan 2024 19:20:42 +0100 Subject: [PATCH 0216/1406] dt-bindings: mfd: iqs62x: Do not override firmware-name $ref dtschema package defines firmware-name as string-array, so individual bindings should not make it a string but instead just narrow the number of expected firmware file names. Signed-off-by: Krzysztof Kozlowski Acked-by: Conor Dooley Acked-by: Jeff LaBundy Link: https://lore.kernel.org/r/20240115182042.1610134-1-krzysztof.kozlowski@linaro.org Signed-off-by: Lee Jones --- Documentation/devicetree/bindings/mfd/iqs62x.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/mfd/iqs62x.yaml b/Documentation/devicetree/bindings/mfd/iqs62x.yaml index 044cd7542c2bcf..f438c237496639 100644 --- a/Documentation/devicetree/bindings/mfd/iqs62x.yaml +++ b/Documentation/devicetree/bindings/mfd/iqs62x.yaml @@ -31,7 +31,7 @@ properties: maxItems: 1 firmware-name: - $ref: /schemas/types.yaml#/definitions/string + maxItems: 1 description: Specifies the name of the calibration and configuration file selected by the driver. If this property is omitted, the name is chosen based on the From 0ec33a13d75d36591ead0e7413c68251d08549bf Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 16 Jan 2024 03:08:27 +0200 Subject: [PATCH 0217/1406] dt-bindings: mfd: qcom,tcsr: Add compatibles for QCM2290 and SM6115 Add qcom,qcm2290-tcsr and qcom,sm6115-tcsr, compatibles for TCSR blocks on the corresponding platforms. Signed-off-by: Dmitry Baryshkov Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240116-usbc-phy-vls-clamp-v1-1-73b2da7691c5@linaro.org Signed-off-by: Lee Jones --- Documentation/devicetree/bindings/mfd/qcom,tcsr.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/mfd/qcom,tcsr.yaml b/Documentation/devicetree/bindings/mfd/qcom,tcsr.yaml index 798705ab6a4601..b97d77015335f1 100644 --- a/Documentation/devicetree/bindings/mfd/qcom,tcsr.yaml +++ b/Documentation/devicetree/bindings/mfd/qcom,tcsr.yaml @@ -19,6 +19,7 @@ properties: - enum: - qcom,msm8976-tcsr - qcom,msm8998-tcsr + - qcom,qcm2290-tcsr - qcom,qcs404-tcsr - qcom,sc7180-tcsr - qcom,sc7280-tcsr @@ -28,6 +29,7 @@ properties: - qcom,sdx55-tcsr - qcom,sdx65-tcsr - qcom,sm4450-tcsr + - qcom,sm6115-tcsr - qcom,sm8150-tcsr - qcom,sm8250-tcsr - qcom,sm8350-tcsr From 117baa75f5a4c3ab72c9e76cf344b79eb601b2b3 Mon Sep 17 00:00:00 2001 From: Lukasz Majczak Date: Fri, 19 Jan 2024 08:43:27 +0000 Subject: [PATCH 0218/1406] mfd: cros_ec: Register EC-based watchdog subdevice Add ChromeOS EC-based watchdog as EC subdevice. 
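Once the EC advertises EC_FEATURE_HANG_DETECT, the cell added in the hunk below materialises as a platform device named "cros-ec-wdt", and the watchdog driver binds purely by that name. A sketch of the consuming driver's skeleton; the probe body is omitted and only the name matching is the point:

    #include <linux/module.h>
    #include <linux/platform_device.h>

    static int cros_ec_wdt_probe(struct platform_device *pdev)
    {
    	return 0;	/* watchdog registration omitted in this sketch */
    }

    static struct platform_driver cros_ec_wdt_driver = {
    	.driver	= {
    		.name = "cros-ec-wdt",	/* must match the mfd_cell name */
    	},
    	.probe	= cros_ec_wdt_probe,
    };
    module_platform_driver(cros_ec_wdt_driver);

    MODULE_DESCRIPTION("Illustrative skeleton only");
    MODULE_LICENSE("GPL");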
Signed-off-by: Lukasz Majczak Link: https://lore.kernel.org/r/20240119084328.3135503-4-lma@chromium.org Signed-off-by: Lee Jones --- drivers/mfd/cros_ec_dev.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/mfd/cros_ec_dev.c b/drivers/mfd/cros_ec_dev.c index 603b1cd5278507..4996220ce64b7b 100644 --- a/drivers/mfd/cros_ec_dev.c +++ b/drivers/mfd/cros_ec_dev.c @@ -91,6 +91,10 @@ static const struct mfd_cell cros_usbpd_notify_cells[] = { { .name = "cros-usbpd-notify", }, }; +static const struct mfd_cell cros_ec_wdt_cells[] = { + { .name = "cros-ec-wdt", } +}; + static const struct cros_feature_to_cells cros_subdevices[] = { { .id = EC_FEATURE_CEC, @@ -107,6 +111,11 @@ static const struct cros_feature_to_cells cros_subdevices[] = { .mfd_cells = cros_usbpd_charger_cells, .num_cells = ARRAY_SIZE(cros_usbpd_charger_cells), }, + { + .id = EC_FEATURE_HANG_DETECT, + .mfd_cells = cros_ec_wdt_cells, + .num_cells = ARRAY_SIZE(cros_ec_wdt_cells), + }, }; static const struct mfd_cell cros_ec_platform_cells[] = { From 39b2f9c1556ee8d86a404bcdfd7509b77a7fcbfe Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 23 Jan 2024 09:59:48 +0000 Subject: [PATCH 0219/1406] mfd: rave-sp: Avoid unnecessary use of comma operator Although it does not seem to have any untoward side-effects, the use of ';' to separate two assignments seems more appropriate than ','. Flagged by clang-17 -Wcomma No functional change intended. Compile tested only. Signed-off-by: Simon Horman Link: https://lore.kernel.org/r/20240123-rave-sp-comma-v1-1-84e9b15ba205@kernel.org Signed-off-by: Lee Jones --- drivers/mfd/rave-sp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/rave-sp.c b/drivers/mfd/rave-sp.c index 6ff84b2600c543..ea5fbcbbe4a56f 100644 --- a/drivers/mfd/rave-sp.c +++ b/drivers/mfd/rave-sp.c @@ -358,7 +358,7 @@ int rave_sp_exec(struct rave_sp *sp, ackid = atomic_inc_return(&sp->ackid); reply.ackid = ackid; - reply.code = rave_sp_reply_code((u8)command), + reply.code = rave_sp_reply_code((u8)command); mutex_lock(&sp->bus_lock); From 7245b9a0e69a44ddd6f285fe2f2003b05834c4d2 Mon Sep 17 00:00:00 2001 From: Maciej Strozek Date: Tue, 23 Jan 2024 15:42:59 +0000 Subject: [PATCH 0220/1406] mfd: wm831x: Remove redundant forever while loop Current code executes only once despite the while loop, so remove the loop. Also msleep(1) will likely result in a larger sleep, so increase its value for clarity while keeping the same behaviour.
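An editorial note on that last point: msleep() is jiffy-based and rounds the requested delay up to the timer tick, so a sketch of the effective behaviour (assuming HZ=100) is:

/*
 * Illustrative only: msleep() is built on the jiffies timer, so the
 * actual delay is rounded up to at least one tick plus scheduling
 * latency; it cannot deliver a true 1 ms sleep.
 */
msleep(1);	/* with HZ=100, sleeps at least 10 ms, typically close to 20 ms */
msleep(20);	/* what the patch uses: the stated delay now matches reality */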
Signed-off-by: Maciej Strozek Reviewed-by: Charles Keepax Link: https://lore.kernel.org/r/20240123154259.81258-1-mstrozek@opensource.cirrus.com Signed-off-by: Lee Jones --- drivers/mfd/wm831x-auxadc.c | 43 ++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/drivers/mfd/wm831x-auxadc.c b/drivers/mfd/wm831x-auxadc.c index 65b98f3fbd9291..18618a8f92062e 100644 --- a/drivers/mfd/wm831x-auxadc.c +++ b/drivers/mfd/wm831x-auxadc.c @@ -152,7 +152,7 @@ static irqreturn_t wm831x_auxadc_irq(int irq, void *irq_data) static int wm831x_auxadc_read_polled(struct wm831x *wm831x, enum wm831x_auxadc input) { - int ret, src, timeout; + int ret, src; mutex_lock(&wm831x->auxadc_lock); @@ -179,32 +179,25 @@ static int wm831x_auxadc_read_polled(struct wm831x *wm831x, goto disable; } - /* If we're not using interrupts then poll the - * interrupt status register */ - timeout = 5; - while (timeout) { - msleep(1); + /* If we're not using interrupts then read the interrupt status register */ + msleep(20); - ret = wm831x_reg_read(wm831x, - WM831X_INTERRUPT_STATUS_1); - if (ret < 0) { - dev_err(wm831x->dev, - "ISR 1 read failed: %d\n", ret); - goto disable; - } + ret = wm831x_reg_read(wm831x, WM831X_INTERRUPT_STATUS_1); + if (ret < 0) { + dev_err(wm831x->dev, + "ISR 1 read failed: %d\n", ret); + goto disable; + } - /* Did it complete? */ - if (ret & WM831X_AUXADC_DATA_EINT) { - wm831x_reg_write(wm831x, - WM831X_INTERRUPT_STATUS_1, - WM831X_AUXADC_DATA_EINT); - break; - } else { - dev_err(wm831x->dev, - "AUXADC conversion timeout\n"); - ret = -EBUSY; - goto disable; - } + /* Did it complete? */ + if (ret & WM831X_AUXADC_DATA_EINT) { + wm831x_reg_write(wm831x, WM831X_INTERRUPT_STATUS_1, + WM831X_AUXADC_DATA_EINT); + } else { + dev_err(wm831x->dev, + "AUXADC conversion timeout\n"); + ret = -EBUSY; + goto disable; } ret = wm831x_reg_read(wm831x, WM831X_AUXADC_DATA); From 6632d701513dab29ed6e21d3a88717552a9db015 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 29 Jan 2024 15:25:53 +0000 Subject: [PATCH 0221/1406] mfd: cs42l43: Tidy up header includes Use more forward declarations, move header guards to cover other includes, and rely less on including headers through other headers. Suggested-by: Andy Shevchenko Signed-off-by: Charles Keepax Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240129152557.3221212-2-ckeepax@opensource.cirrus.com Signed-off-by: Lee Jones --- drivers/mfd/cs42l43-i2c.c | 6 +++++- drivers/mfd/cs42l43-sdw.c | 6 +++++- drivers/mfd/cs42l43.c | 8 +++++++- drivers/mfd/cs42l43.h | 10 ++++++---- include/linux/mfd/cs42l43.h | 13 +++++++------ 5 files changed, 30 insertions(+), 13 deletions(-) diff --git a/drivers/mfd/cs42l43-i2c.c b/drivers/mfd/cs42l43-i2c.c index 4922211680c964..7162274a0b5510 100644 --- a/drivers/mfd/cs42l43-i2c.c +++ b/drivers/mfd/cs42l43-i2c.c @@ -6,11 +6,15 @@ * Cirrus Logic International Semiconductor Ltd. */ +#include #include -#include #include +#include #include +#include #include +#include +#include #include "cs42l43.h" diff --git a/drivers/mfd/cs42l43-sdw.c b/drivers/mfd/cs42l43-sdw.c index 1d85bbf8cdd5d1..d6962a5a35f65e 100644 --- a/drivers/mfd/cs42l43-sdw.c +++ b/drivers/mfd/cs42l43-sdw.c @@ -6,11 +6,15 @@ * Cirrus Logic International Semiconductor Ltd. 
*/ +#include #include #include -#include +#include #include +#include #include +#include +#include #include #include #include diff --git a/drivers/mfd/cs42l43.c b/drivers/mfd/cs42l43.c index 7b6d07cbe6fc6f..4e2bc5ad244a56 100644 --- a/drivers/mfd/cs42l43.c +++ b/drivers/mfd/cs42l43.c @@ -6,18 +6,24 @@ * Cirrus Logic International Semiconductor Ltd. */ +#include #include #include #include +#include #include -#include #include +#include #include #include +#include #include #include +#include #include +#include #include +#include #include "cs42l43.h" diff --git a/drivers/mfd/cs42l43.h b/drivers/mfd/cs42l43.h index eb4caf3938332f..8d1b1b0f5a4732 100644 --- a/drivers/mfd/cs42l43.h +++ b/drivers/mfd/cs42l43.h @@ -6,15 +6,17 @@ * Cirrus Logic International Semiconductor Ltd. */ -#include -#include -#include - #ifndef CS42L43_CORE_INT_H #define CS42L43_CORE_INT_H #define CS42L43_N_DEFAULTS 176 +struct dev_pm_ops; +struct device; +struct reg_default; + +struct cs42l43; + extern const struct dev_pm_ops cs42l43_pm_ops; extern const struct reg_default cs42l43_reg_default[CS42L43_N_DEFAULTS]; diff --git a/include/linux/mfd/cs42l43.h b/include/linux/mfd/cs42l43.h index cf8263aab41bd2..2239d8585e7856 100644 --- a/include/linux/mfd/cs42l43.h +++ b/include/linux/mfd/cs42l43.h @@ -6,20 +6,21 @@ * Cirrus Logic International Semiconductor Ltd. */ +#ifndef CS42L43_CORE_EXT_H +#define CS42L43_CORE_EXT_H + #include -#include -#include #include #include #include -#include #include -#ifndef CS42L43_CORE_EXT_H -#define CS42L43_CORE_EXT_H - #define CS42L43_N_SUPPLIES 3 +struct device; +struct gpio_desc; +struct sdw_slave; + enum cs42l43_irq_numbers { CS42L43_PLL_LOST_LOCK, CS42L43_PLL_READY, From b88b7f517ce47de2d179a176014a94efa8fe90ec Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 29 Jan 2024 15:25:54 +0000 Subject: [PATCH 0222/1406] mfd: cs42l43: Use __u8 type rather than u8 for firmware interface __xxx is the preferred type for firmware interfaces. Suggested-by: Andy Shevchenko Signed-off-by: Charles Keepax Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240129152557.3221212-3-ckeepax@opensource.cirrus.com Signed-off-by: Lee Jones --- drivers/mfd/cs42l43.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mfd/cs42l43.c b/drivers/mfd/cs42l43.c index 4e2bc5ad244a56..65a331481d975d 100644 --- a/drivers/mfd/cs42l43.c +++ b/drivers/mfd/cs42l43.c @@ -55,8 +55,8 @@ struct cs42l43_patch_header { __le16 version; __le16 size; - u8 reserved; - u8 secure; + __u8 reserved; + __u8 secure; __le16 bss_size; __le32 apply_addr; __le32 checksum; From b4550eb9806bcec117c91eb59024e52a1efbe078 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 29 Jan 2024 15:25:55 +0000 Subject: [PATCH 0223/1406] mfd: cs42l43: Add time postfixes on defines Make the defines a little clearer by adding time based postfixes. 
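As an editorial illustration of the payoff, using two callers that appear in the diff below: the suffix makes the unit each sleep API expects visible at the call site:

msleep(CS42L43_RESET_DELAY_MS);			/* milliseconds */
usleep_range(CS42L43_VDDD_DELAY_US,		/* microseconds */
	     2 * CS42L43_VDDD_DELAY_US);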
Suggested-by: Andy Shevchenko Signed-off-by: Charles Keepax Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240129152557.3221212-4-ckeepax@opensource.cirrus.com Signed-off-by: Lee Jones --- drivers/mfd/cs42l43.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/drivers/mfd/cs42l43.c b/drivers/mfd/cs42l43.c index 65a331481d975d..aea0f8f4857856 100644 --- a/drivers/mfd/cs42l43.c +++ b/drivers/mfd/cs42l43.c @@ -27,30 +27,30 @@ #include "cs42l43.h" -#define CS42L43_RESET_DELAY 20 +#define CS42L43_RESET_DELAY_MS 20 -#define CS42L43_SDW_ATTACH_TIMEOUT 500 -#define CS42L43_SDW_DETACH_TIMEOUT 100 +#define CS42L43_SDW_ATTACH_TIMEOUT_MS 500 +#define CS42L43_SDW_DETACH_TIMEOUT_MS 100 #define CS42L43_MCU_BOOT_STAGE1 1 #define CS42L43_MCU_BOOT_STAGE2 2 #define CS42L43_MCU_BOOT_STAGE3 3 #define CS42L43_MCU_BOOT_STAGE4 4 -#define CS42L43_MCU_POLL 5000 -#define CS42L43_MCU_CMD_TIMEOUT 20000 +#define CS42L43_MCU_POLL_US 5000 +#define CS42L43_MCU_CMD_TIMEOUT_US 20000 #define CS42L43_MCU_UPDATE_FORMAT 3 #define CS42L43_MCU_UPDATE_OFFSET 0x100000 -#define CS42L43_MCU_UPDATE_TIMEOUT 500000 +#define CS42L43_MCU_UPDATE_TIMEOUT_US 500000 #define CS42L43_MCU_UPDATE_RETRIES 5 #define CS42L43_MCU_SUPPORTED_REV 0x2105 #define CS42L43_MCU_SHADOW_REGS_REQUIRED_REV 0x2200 #define CS42L43_MCU_SUPPORTED_BIOS_REV 0x0001 -#define CS42L43_VDDP_DELAY 50 -#define CS42L43_VDDD_DELAY 1000 +#define CS42L43_VDDP_DELAY_US 50 +#define CS42L43_VDDD_DELAY_US 1000 -#define CS42L43_AUTOSUSPEND_TIME 250 +#define CS42L43_AUTOSUSPEND_TIME_MS 250 struct cs42l43_patch_header { __le16 version; @@ -538,10 +538,10 @@ static int cs42l43_soft_reset(struct cs42l43 *cs42l43) regcache_cache_only(cs42l43->regmap, true); regmap_multi_reg_write_bypassed(cs42l43->regmap, reset, ARRAY_SIZE(reset)); - msleep(CS42L43_RESET_DELAY); + msleep(CS42L43_RESET_DELAY_MS); if (cs42l43->sdw) { - unsigned long timeout = msecs_to_jiffies(CS42L43_SDW_DETACH_TIMEOUT); + unsigned long timeout = msecs_to_jiffies(CS42L43_SDW_DETACH_TIMEOUT_MS); unsigned long time; time = wait_for_completion_timeout(&cs42l43->device_detach, timeout); @@ -561,7 +561,7 @@ static int cs42l43_soft_reset(struct cs42l43 *cs42l43) static int cs42l43_wait_for_attach(struct cs42l43 *cs42l43) { if (!cs42l43->attached) { - unsigned long timeout = msecs_to_jiffies(CS42L43_SDW_ATTACH_TIMEOUT); + unsigned long timeout = msecs_to_jiffies(CS42L43_SDW_ATTACH_TIMEOUT_MS); unsigned long time; time = wait_for_completion_timeout(&cs42l43->device_attach, timeout); @@ -603,7 +603,7 @@ static int cs42l43_mcu_stage_2_3(struct cs42l43 *cs42l43, bool shadow) ret = regmap_read_poll_timeout(cs42l43->regmap, CS42L43_BOOT_STATUS, val, (val == CS42L43_MCU_BOOT_STAGE3), - CS42L43_MCU_POLL, CS42L43_MCU_CMD_TIMEOUT); + CS42L43_MCU_POLL_US, CS42L43_MCU_CMD_TIMEOUT_US); if (ret) { dev_err(cs42l43->dev, "Failed to move to stage 3: %d, 0x%x\n", ret, val); return ret; @@ -652,7 +652,7 @@ static int cs42l43_mcu_disable(struct cs42l43 *cs42l43) ret = regmap_read_poll_timeout(cs42l43->regmap, CS42L43_SOFT_INT_SHADOW, val, (val & CS42L43_CONTROL_APPLIED_INT_MASK), - CS42L43_MCU_POLL, CS42L43_MCU_CMD_TIMEOUT); + CS42L43_MCU_POLL_US, CS42L43_MCU_CMD_TIMEOUT_US); if (ret) { dev_err(cs42l43->dev, "Failed to disable firmware: %d, 0x%x\n", ret, val); return ret; @@ -696,7 +696,7 @@ static void cs42l43_mcu_load_firmware(const struct firmware *firmware, void *con ret = regmap_read_poll_timeout(cs42l43->regmap, CS42L43_SOFT_INT_SHADOW, val, (val & 
CS42L43_PATCH_APPLIED_INT_MASK), - CS42L43_MCU_POLL, CS42L43_MCU_UPDATE_TIMEOUT); + CS42L43_MCU_POLL_US, CS42L43_MCU_UPDATE_TIMEOUT_US); if (ret) { dev_err(cs42l43->dev, "Failed to update firmware: %d, 0x%x\n", ret, val); cs42l43->firmware_error = ret; @@ -957,7 +957,7 @@ static int cs42l43_power_up(struct cs42l43 *cs42l43) } /* vdd-p must be on for 50uS before any other supply */ - usleep_range(CS42L43_VDDP_DELAY, 2 * CS42L43_VDDP_DELAY); + usleep_range(CS42L43_VDDP_DELAY_US, 2 * CS42L43_VDDP_DELAY_US); gpiod_set_value_cansleep(cs42l43->reset, 1); @@ -973,7 +973,7 @@ static int cs42l43_power_up(struct cs42l43 *cs42l43) goto err_core_supplies; } - usleep_range(CS42L43_VDDD_DELAY, 2 * CS42L43_VDDD_DELAY); + usleep_range(CS42L43_VDDD_DELAY_US, 2 * CS42L43_VDDD_DELAY_US); return 0; @@ -1057,7 +1057,7 @@ int cs42l43_dev_probe(struct cs42l43 *cs42l43) if (ret) return ret; - pm_runtime_set_autosuspend_delay(cs42l43->dev, CS42L43_AUTOSUSPEND_TIME); + pm_runtime_set_autosuspend_delay(cs42l43->dev, CS42L43_AUTOSUSPEND_TIME_MS); pm_runtime_use_autosuspend(cs42l43->dev); pm_runtime_set_active(cs42l43->dev); /* From 08b58577784176aeb04b5305b6c491d42b5328ac Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 29 Jan 2024 15:25:56 +0000 Subject: [PATCH 0224/1406] mfd: cs42l43: Add some missing dev_err_probe()s Use of dev_err_probe() was missed in the i2c and sdw parts of the code, update the missing parts. Suggested-by: Andy Shevchenko Signed-off-by: Charles Keepax Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240129152557.3221212-5-ckeepax@opensource.cirrus.com Signed-off-by: Lee Jones --- drivers/mfd/cs42l43-i2c.c | 9 +++------ drivers/mfd/cs42l43-sdw.c | 9 +++------ 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/mfd/cs42l43-i2c.c b/drivers/mfd/cs42l43-i2c.c index 7162274a0b5510..c9e4ea76149a88 100644 --- a/drivers/mfd/cs42l43-i2c.c +++ b/drivers/mfd/cs42l43-i2c.c @@ -38,7 +38,6 @@ static const struct regmap_config cs42l43_i2c_regmap = { static int cs42l43_i2c_probe(struct i2c_client *i2c) { struct cs42l43 *cs42l43; - int ret; cs42l43 = devm_kzalloc(&i2c->dev, sizeof(*cs42l43), GFP_KERNEL); if (!cs42l43) @@ -50,11 +49,9 @@ static int cs42l43_i2c_probe(struct i2c_client *i2c) cs42l43->attached = true; cs42l43->regmap = devm_regmap_init_i2c(i2c, &cs42l43_i2c_regmap); - if (IS_ERR(cs42l43->regmap)) { - ret = PTR_ERR(cs42l43->regmap); - dev_err(cs42l43->dev, "Failed to allocate regmap: %d\n", ret); - return ret; - } + if (IS_ERR(cs42l43->regmap)) + return dev_err_probe(cs42l43->dev, PTR_ERR(cs42l43->regmap), + "Failed to allocate regmap\n"); return cs42l43_dev_probe(cs42l43); } diff --git a/drivers/mfd/cs42l43-sdw.c b/drivers/mfd/cs42l43-sdw.c index d6962a5a35f65e..65f7b1d7824861 100644 --- a/drivers/mfd/cs42l43-sdw.c +++ b/drivers/mfd/cs42l43-sdw.c @@ -171,7 +171,6 @@ static int cs42l43_sdw_probe(struct sdw_slave *sdw, const struct sdw_device_id * { struct cs42l43 *cs42l43; struct device *dev = &sdw->dev; - int ret; cs42l43 = devm_kzalloc(dev, sizeof(*cs42l43), GFP_KERNEL); if (!cs42l43) @@ -181,11 +180,9 @@ static int cs42l43_sdw_probe(struct sdw_slave *sdw, const struct sdw_device_id * cs42l43->sdw = sdw; cs42l43->regmap = devm_regmap_init_sdw(sdw, &cs42l43_sdw_regmap); - if (IS_ERR(cs42l43->regmap)) { - ret = PTR_ERR(cs42l43->regmap); - dev_err(cs42l43->dev, "Failed to allocate regmap: %d\n", ret); - return ret; - } + if (IS_ERR(cs42l43->regmap)) + return dev_err_probe(cs42l43->dev, PTR_ERR(cs42l43->regmap), + "Failed to allocate regmap\n"); return 
cs42l43_dev_probe(cs42l43); } From 4b9e19f6898b06cfc30fca6e7d5aec779e381f45 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 29 Jan 2024 15:25:57 +0000 Subject: [PATCH 0225/1406] mfd: cs42l43: Handle error from devm_pm_runtime_enable() As devm_pm_runtime_enable() can fail due to memory allocations, it is best to handle the error. Suggested-by: Andy Shevchenko Signed-off-by: Charles Keepax Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240129152557.3221212-6-ckeepax@opensource.cirrus.com Signed-off-by: Lee Jones --- drivers/mfd/cs42l43.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/mfd/cs42l43.c b/drivers/mfd/cs42l43.c index aea0f8f4857856..56bd9dbbe10b0c 100644 --- a/drivers/mfd/cs42l43.c +++ b/drivers/mfd/cs42l43.c @@ -1065,7 +1065,9 @@ int cs42l43_dev_probe(struct cs42l43 *cs42l43) * the boot work runs. */ pm_runtime_get_noresume(cs42l43->dev); - devm_pm_runtime_enable(cs42l43->dev); + ret = devm_pm_runtime_enable(cs42l43->dev); + if (ret) + return ret; queue_work(system_long_wq, &cs42l43->boot_work); From c2144fb52a6eb2b09e37b3e12aa84036fee589ca Mon Sep 17 00:00:00 2001 From: Dharma Balasubiramani Date: Fri, 2 Feb 2024 05:47:33 +0530 Subject: [PATCH 0226/1406] dt-bindings: mfd: atmel,hlcdc: Convert to DT schema format Convert the atmel,hlcdc binding to DT schema format. Align clocks and clock-names properties to clearly indicate that the LCD controller expects lvds_pll_clk when interfaced with the lvds display. This alignment with the specific hardware requirements ensures accurate device tree configuration for systems utilizing the HLCDC IP. Signed-off-by: Dharma Balasubiramani Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20240202001733.91455-4-dharma.b@microchip.com Signed-off-by: Lee Jones --- .../devicetree/bindings/mfd/atmel,hlcdc.yaml | 99 +++++++++++++++++++ .../devicetree/bindings/mfd/atmel-hlcdc.txt | 56 ----------- 2 files changed, 99 insertions(+), 56 deletions(-) create mode 100644 Documentation/devicetree/bindings/mfd/atmel,hlcdc.yaml delete mode 100644 Documentation/devicetree/bindings/mfd/atmel-hlcdc.txt diff --git a/Documentation/devicetree/bindings/mfd/atmel,hlcdc.yaml b/Documentation/devicetree/bindings/mfd/atmel,hlcdc.yaml new file mode 100644 index 00000000000000..4aa36903e755bb --- /dev/null +++ b/Documentation/devicetree/bindings/mfd/atmel,hlcdc.yaml @@ -0,0 +1,99 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/mfd/atmel,hlcdc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Atmel's HLCD Controller + +maintainers: + - Nicolas Ferre + - Alexandre Belloni + - Claudiu Beznea + +description: + The Atmel HLCDC (HLCD Controller) IP available on Atmel SoCs exposes two + subdevices, a PWM chip and a Display Controller.
+ +properties: + compatible: + enum: + - atmel,at91sam9n12-hlcdc + - atmel,at91sam9x5-hlcdc + - atmel,sama5d2-hlcdc + - atmel,sama5d3-hlcdc + - atmel,sama5d4-hlcdc + - microchip,sam9x60-hlcdc + - microchip,sam9x75-xlcdc + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + minItems: 3 + + clock-names: + items: + - const: periph_clk + - const: sys_clk + - const: slow_clk + - const: lvds_pll_clk + minItems: 3 + + display-controller: + $ref: /schemas/display/atmel/atmel,hlcdc-display-controller.yaml + + pwm: + $ref: /schemas/pwm/atmel,hlcdc-pwm.yaml + +required: + - compatible + - reg + - clocks + - clock-names + - interrupts + +additionalProperties: false + +examples: + - | + #include + #include + #include + + lcd_controller: lcd-controller@f0030000 { + compatible = "atmel,sama5d3-hlcdc"; + reg = <0xf0030000 0x2000>; + clocks = <&lcdc_clk>, <&lcdck>, <&clk32k>; + clock-names = "periph_clk", "sys_clk", "slow_clk"; + interrupts = <36 IRQ_TYPE_LEVEL_HIGH 0>; + + display-controller { + compatible = "atmel,hlcdc-display-controller"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_lcd_base &pinctrl_lcd_rgb888>; + #address-cells = <1>; + #size-cells = <0>; + + port@0 { + #address-cells = <1>; + #size-cells = <0>; + reg = <0>; + + hlcdc_panel_output: endpoint@0 { + reg = <0>; + remote-endpoint = <&panel_input>; + }; + }; + }; + + pwm { + compatible = "atmel,hlcdc-pwm"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_lcd_pwm>; + #pwm-cells = <3>; + }; + }; diff --git a/Documentation/devicetree/bindings/mfd/atmel-hlcdc.txt b/Documentation/devicetree/bindings/mfd/atmel-hlcdc.txt deleted file mode 100644 index 7de696eefaed4b..00000000000000 --- a/Documentation/devicetree/bindings/mfd/atmel-hlcdc.txt +++ /dev/null @@ -1,56 +0,0 @@ -Device-Tree bindings for Atmel's HLCDC (High LCD Controller) MFD driver - -Required properties: - - compatible: value should be one of the following: - "atmel,at91sam9n12-hlcdc" - "atmel,at91sam9x5-hlcdc" - "atmel,sama5d2-hlcdc" - "atmel,sama5d3-hlcdc" - "atmel,sama5d4-hlcdc" - "microchip,sam9x60-hlcdc" - "microchip,sam9x75-xlcdc" - - reg: base address and size of the HLCDC device registers. - - clock-names: the name of the 3 clocks requested by the HLCDC device. - Should contain "periph_clk", "sys_clk" and "slow_clk". - - clocks: should contain the 3 clocks requested by the HLCDC device. 
- - interrupts: should contain the description of the HLCDC interrupt line - -The HLCDC IP exposes two subdevices: - - a PWM chip: see ../pwm/atmel-hlcdc-pwm.txt - - a Display Controller: see ../display/atmel/hlcdc-dc.txt - -Example: - - hlcdc: hlcdc@f0030000 { - compatible = "atmel,sama5d3-hlcdc"; - reg = <0xf0030000 0x2000>; - clocks = <&lcdc_clk>, <&lcdck>, <&clk32k>; - clock-names = "periph_clk","sys_clk", "slow_clk"; - interrupts = <36 IRQ_TYPE_LEVEL_HIGH 0>; - - hlcdc-display-controller { - compatible = "atmel,hlcdc-display-controller"; - pinctrl-names = "default"; - pinctrl-0 = <&pinctrl_lcd_base &pinctrl_lcd_rgb888>; - #address-cells = <1>; - #size-cells = <0>; - - port@0 { - #address-cells = <1>; - #size-cells = <0>; - reg = <0>; - - hlcdc_panel_output: endpoint@0 { - reg = <0>; - remote-endpoint = <&panel_input>; - }; - }; - }; - - hlcdc_pwm: hlcdc-pwm { - compatible = "atmel,hlcdc-pwm"; - pinctrl-names = "default"; - pinctrl-0 = <&pinctrl_lcd_pwm>; - #pwm-cells = <3>; - }; - }; From a5528ee9b0e4af1a7c87b92ddbbc853ec33a0099 Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Sun, 4 Feb 2024 14:33:36 +0530 Subject: [PATCH 0227/1406] dt-bindings: mfd: syscon: Add ti,j784s4-pcie-ctrl compatible The PCIE_CTRL registers within the CTRL_MMR space of TI's J784S4 SoC are used to configure the link speed, lane count and mode of operation of the respective PCIe instance. Add compatible for allowing the PCIe driver to obtain a regmap for the PCIE_CTRL register within the System Controller device-tree node in order to configure the PCIe instance accordingly. The Technical Reference Manual for J784S4 SoC with details of the PCIE_CTRL registers is available at: https://www.ti.com/lit/zip/spruj52 Signed-off-by: Siddharth Vadapalli Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240204090336.3209063-1-s-vadapalli@ti.com Signed-off-by: Lee Jones --- Documentation/devicetree/bindings/mfd/syscon.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/mfd/syscon.yaml b/Documentation/devicetree/bindings/mfd/syscon.yaml index 084b5c2a2a3c28..2376b612f94ef2 100644 --- a/Documentation/devicetree/bindings/mfd/syscon.yaml +++ b/Documentation/devicetree/bindings/mfd/syscon.yaml @@ -73,6 +73,7 @@ properties: - rockchip,rv1126-qos - starfive,jh7100-sysmain - ti,am654-dss-oldi-io-ctrl + - ti,j784s4-pcie-ctrl - const: syscon From 2f7f325ead3f02972f2d38315c76f739c01196dd Mon Sep 17 00:00:00 2001 From: Andrew Davis Date: Mon, 5 Feb 2024 11:47:36 -0600 Subject: [PATCH 0228/1406] dt-bindings: mfd: syscon: Add ti,am654-serdes-ctrl compatible Add TI SERDES control registers compatible. This is a region found in the TI AM65 CTRL_MMR0 register space[0]. Each instance is used to control a SERDES clock and lane select mux. 
[0] https://www.ti.com/lit/pdf/spruid7 Signed-off-by: Andrew Davis Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240205174736.27749-1-afd@ti.com Signed-off-by: Lee Jones --- Documentation/devicetree/bindings/mfd/syscon.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/mfd/syscon.yaml b/Documentation/devicetree/bindings/mfd/syscon.yaml index 2376b612f94ef2..820c364c01f224 100644 --- a/Documentation/devicetree/bindings/mfd/syscon.yaml +++ b/Documentation/devicetree/bindings/mfd/syscon.yaml @@ -73,6 +73,7 @@ properties: - rockchip,rv1126-qos - starfive,jh7100-sysmain - ti,am654-dss-oldi-io-ctrl + - ti,am654-serdes-ctrl - ti,j784s4-pcie-ctrl - const: syscon From c1d6347bd5258593e3445e2985b89d66ad41de05 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 29 Jan 2024 15:25:52 +0000 Subject: [PATCH 0229/1406] spi: cs42l43: Tidy up header includes Including some missing headers. Suggested-by: Andy Shevchenko Signed-off-by: Charles Keepax Reviewed-by: Andy Shevchenko Acked-by: Mark Brown Link: https://lore.kernel.org/r/20240129152557.3221212-1-ckeepax@opensource.cirrus.com Signed-off-by: Lee Jones --- drivers/spi/spi-cs42l43.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/spi/spi-cs42l43.c b/drivers/spi/spi-cs42l43.c index f13073e1259364..16b2c6c2e006a0 100644 --- a/drivers/spi/spi-cs42l43.c +++ b/drivers/spi/spi-cs42l43.c @@ -11,7 +11,9 @@ #include #include #include +#include #include +#include #include #include #include From eb0b64130e016814eed1e60e7f203413448da280 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Sun, 4 Feb 2024 17:10:32 -0300 Subject: [PATCH 0230/1406] mfd: mcp-core: Make mcp_bus_type const Now that the driver core can properly handle constant struct bus_type, move the mcp_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Suggested-by: Greg Kroah-Hartman Signed-off-by: "Ricardo B. Marliere" Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240204-bus_cleanup-mfd-v1-1-07335ebc034f@marliere.net Signed-off-by: Lee Jones --- drivers/mfd/mcp-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/mcp-core.c b/drivers/mfd/mcp-core.c index 2fa592c37c6f04..16ca23311cab49 100644 --- a/drivers/mfd/mcp-core.c +++ b/drivers/mfd/mcp-core.c @@ -41,7 +41,7 @@ static void mcp_bus_remove(struct device *dev) drv->remove(mcp); } -static struct bus_type mcp_bus_type = { +static const struct bus_type mcp_bus_type = { .name = "mcp", .match = mcp_bus_match, .probe = mcp_bus_probe, From 8745a81ac22f42c7233fbddc6aa5922b231aac2a Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 6 Feb 2024 02:12:57 -0500 Subject: [PATCH 0231/1406] mfd: ac100: Convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. 
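Since this rationale repeats verbatim for each driver converted in the patches that follow, one editorial sketch up front: the conversion is a one-line change to the driver's regmap_config, shown here for a hypothetical 8-bit I2C device (names invented for illustration):

#include <linux/regmap.h>

/* Hypothetical driver: only .cache_type changes in such a conversion. */
static const struct regmap_config example_regmap_config = {
	.reg_bits	= 8,
	.val_bits	= 8,
	.max_register	= 0x7f,
	.cache_type	= REGCACHE_MAPLE,	/* was REGCACHE_RBTREE */
};

Cache users are unaffected: reads are still served from the cache and regcache_sync() still replays dirty registers, so no other driver logic needs to change.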
Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240206071314.8721-2-liubo03@inspur.com Signed-off-by: Lee Jones --- drivers/mfd/ac100.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/ac100.c b/drivers/mfd/ac100.c index 6d49d7fb5f14ed..8f47c392cbd127 100644 --- a/drivers/mfd/ac100.c +++ b/drivers/mfd/ac100.c @@ -72,7 +72,7 @@ static const struct regmap_config ac100_regmap_config = { .wr_table = &ac100_writeable_table, .volatile_table = &ac100_volatile_table, .max_register = AC100_RTC_GP(15), - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, }; static struct mfd_cell ac100_cells[] = { From 780b1aa441a3d64e400cb2ea96bc3e162322eb31 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 6 Feb 2024 02:12:58 -0500 Subject: [PATCH 0232/1406] mfd: as3711: Convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240206071314.8721-3-liubo03@inspur.com Signed-off-by: Lee Jones --- drivers/mfd/as3711.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/as3711.c b/drivers/mfd/as3711.c index c7e85ff3801327..9741977031df07 100644 --- a/drivers/mfd/as3711.c +++ b/drivers/mfd/as3711.c @@ -106,7 +106,7 @@ static const struct regmap_config as3711_regmap_config = { .precious_reg = as3711_precious_reg, .max_register = AS3711_MAX_REG, .num_reg_defaults_raw = AS3711_NUM_REGS, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, }; #ifdef CONFIG_OF From aea5c3cbc51eee3ac25ff8f936252901c4549011 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 6 Feb 2024 02:12:59 -0500 Subject: [PATCH 0233/1406] mfd: as3722: Convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240206071314.8721-4-liubo03@inspur.com Signed-off-by: Lee Jones --- drivers/mfd/as3722.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/as3722.c b/drivers/mfd/as3722.c index a2bf68afc131d3..bec047bdd08848 100644 --- a/drivers/mfd/as3722.c +++ b/drivers/mfd/as3722.c @@ -299,7 +299,7 @@ static const struct regmap_config as3722_regmap_config = { .reg_bits = 8, .val_bits = 8, .max_register = AS3722_MAX_REGISTER, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .rd_table = &as3722_readable_table, .wr_table = &as3722_writable_table, .volatile_table = &as3722_volatile_table, From 009073d504f67146d936cc45f21cc27c1bc15490 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 6 Feb 2024 02:13:00 -0500 Subject: [PATCH 0234/1406] mfd: axp20x: Convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. 
Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240206071314.8721-5-liubo03@inspur.com Signed-off-by: Lee Jones --- drivers/mfd/axp20x.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mfd/axp20x.c b/drivers/mfd/axp20x.c index deaa969bab4e19..d8daa593ebd539 100644 --- a/drivers/mfd/axp20x.c +++ b/drivers/mfd/axp20x.c @@ -352,7 +352,7 @@ static const struct regmap_config axp192_regmap_config = { .wr_table = &axp192_writeable_table, .volatile_table = &axp192_volatile_table, .max_register = AXP20X_CC_CTRL, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, }; static const struct regmap_config axp20x_regmap_config = { @@ -388,7 +388,7 @@ static const struct regmap_config axp313a_regmap_config = { .wr_table = &axp313a_writeable_table, .volatile_table = &axp313a_volatile_table, .max_register = AXP313A_IRQ_STATE, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, }; static const struct regmap_config axp806_regmap_config = { From 684c1f0b79417cf08810dfd57c5a549a9741ad7a Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 6 Feb 2024 02:13:01 -0500 Subject: [PATCH 0235/1406] mfd: bcm590xx: Convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240206071314.8721-6-liubo03@inspur.com Signed-off-by: Lee Jones --- drivers/mfd/bcm590xx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mfd/bcm590xx.c b/drivers/mfd/bcm590xx.c index 92eede9a5e61b7..8b56786d85d018 100644 --- a/drivers/mfd/bcm590xx.c +++ b/drivers/mfd/bcm590xx.c @@ -27,14 +27,14 @@ static const struct regmap_config bcm590xx_regmap_config_pri = { .reg_bits = 8, .val_bits = 8, .max_register = BCM590XX_MAX_REGISTER_PRI, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, }; static const struct regmap_config bcm590xx_regmap_config_sec = { .reg_bits = 8, .val_bits = 8, .max_register = BCM590XX_MAX_REGISTER_SEC, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, }; static int bcm590xx_i2c_probe(struct i2c_client *i2c_pri) From 35b1b8622ca3172b3c6b2a377e5913bac0367634 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 6 Feb 2024 02:13:02 -0500 Subject: [PATCH 0236/1406] mfd: bd9571mwv: Convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. 
Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240206071314.8721-7-liubo03@inspur.com Signed-off-by: Lee Jones --- drivers/mfd/bd9571mwv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mfd/bd9571mwv.c b/drivers/mfd/bd9571mwv.c index 819d09e4d10077..0a955178d46974 100644 --- a/drivers/mfd/bd9571mwv.c +++ b/drivers/mfd/bd9571mwv.c @@ -67,7 +67,7 @@ static const struct regmap_access_table bd9571mwv_volatile_table = { static const struct regmap_config bd9571mwv_regmap_config = { .reg_bits = 8, .val_bits = 8, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .rd_table = &bd9571mwv_readable_table, .wr_table = &bd9571mwv_writable_table, .volatile_table = &bd9571mwv_volatile_table, @@ -152,7 +152,7 @@ static const struct regmap_access_table bd9574mwf_volatile_table = { static const struct regmap_config bd9574mwf_regmap_config = { .reg_bits = 8, .val_bits = 8, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .rd_table = &bd9574mwf_readable_table, .wr_table = &bd9574mwf_writable_table, .volatile_table = &bd9574mwf_volatile_table, From 1cb7a3bca56c124ca9e5c13fcac7de884c195c5c Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 6 Feb 2024 02:13:03 -0500 Subject: [PATCH 0237/1406] mfd: dialog: Convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240206071314.8721-8-liubo03@inspur.com Signed-off-by: Lee Jones --- drivers/mfd/da9052-core.c | 2 +- drivers/mfd/da9055-core.c | 2 +- drivers/mfd/da9062-core.c | 4 ++-- drivers/mfd/da9063-i2c.c | 2 +- drivers/mfd/da9150-core.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/mfd/da9052-core.c b/drivers/mfd/da9052-core.c index 150448cd2eb08e..dc85801b9fa085 100644 --- a/drivers/mfd/da9052-core.c +++ b/drivers/mfd/da9052-core.c @@ -533,7 +533,7 @@ const struct regmap_config da9052_regmap_config = { .reg_bits = 8, .val_bits = 8, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .max_register = DA9052_PAGE1_CON_REG, .readable_reg = da9052_reg_readable, diff --git a/drivers/mfd/da9055-core.c b/drivers/mfd/da9055-core.c index 768302e05baa18..1f727ef60d6387 100644 --- a/drivers/mfd/da9055-core.c +++ b/drivers/mfd/da9055-core.c @@ -245,7 +245,7 @@ const struct regmap_config da9055_regmap_config = { .reg_bits = 8, .val_bits = 8, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .max_register = DA9055_MAX_REGISTER_CNT, .readable_reg = da9055_register_readable, diff --git a/drivers/mfd/da9062-core.c b/drivers/mfd/da9062-core.c index 73a22107900c8c..dbbc4779170a0f 100644 --- a/drivers/mfd/da9062-core.c +++ b/drivers/mfd/da9062-core.c @@ -476,7 +476,7 @@ static struct regmap_config da9061_regmap_config = { .ranges = da9061_range_cfg, .num_ranges = ARRAY_SIZE(da9061_range_cfg), .max_register = DA9062AA_CONFIG_ID, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .rd_table = &da9061_aa_readable_table, .wr_table = &da9061_aa_writeable_table, .volatile_table = &da9061_aa_volatile_table, @@ -582,7 +582,7 @@ static struct regmap_config da9062_regmap_config = { .ranges = da9062_range_cfg, .num_ranges = ARRAY_SIZE(da9062_range_cfg), .max_register = DA9062AA_CONFIG_ID, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .rd_table = &da9062_aa_readable_table, .wr_table = 
&da9062_aa_writeable_table, .volatile_table = &da9062_aa_volatile_table, diff --git a/drivers/mfd/da9063-i2c.c b/drivers/mfd/da9063-i2c.c index d715cf9a9e6883..c6235cd0dbdc40 100644 --- a/drivers/mfd/da9063-i2c.c +++ b/drivers/mfd/da9063-i2c.c @@ -342,7 +342,7 @@ static struct regmap_config da9063_regmap_config = { .num_ranges = ARRAY_SIZE(da9063_range_cfg), .max_register = DA9063_REG_CONFIG_ID, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, }; static const struct of_device_id da9063_dt_ids[] = { diff --git a/drivers/mfd/da9150-core.c b/drivers/mfd/da9150-core.c index 94d621e20635dd..5c59cc869fb3e2 100644 --- a/drivers/mfd/da9150-core.c +++ b/drivers/mfd/da9150-core.c @@ -169,7 +169,7 @@ static const struct regmap_config da9150_regmap_config = { .num_ranges = ARRAY_SIZE(da9150_range_cfg), .max_register = DA9150_TBAT_RES_B, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .volatile_reg = da9150_volatile_reg, }; From 5f734f5f374f68960afd2130a5ce982912c14c15 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 6 Feb 2024 02:13:04 -0500 Subject: [PATCH 0238/1406] mfd: khadas-mcu: Convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. Signed-off-by: Bo Liu Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20240206071314.8721-9-liubo03@inspur.com Signed-off-by: Lee Jones --- drivers/mfd/khadas-mcu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/khadas-mcu.c b/drivers/mfd/khadas-mcu.c index 61396d824f16df..ba981a78869215 100644 --- a/drivers/mfd/khadas-mcu.c +++ b/drivers/mfd/khadas-mcu.c @@ -72,7 +72,7 @@ static const struct regmap_config khadas_mcu_regmap_config = { .max_register = KHADAS_MCU_CMD_FAN_STATUS_CTRL_REG, .volatile_reg = khadas_mcu_reg_volatile, .writeable_reg = khadas_mcu_reg_writeable, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, }; static struct mfd_cell khadas_mcu_fan_cells[] = { From 8881896e05b6a91251dde0ce21ce79c2c201888d Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 6 Feb 2024 02:13:05 -0500 Subject: [PATCH 0239/1406] mfd: lochnagar-i2c: Convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. 
Signed-off-by: Bo Liu Tested-by: Charles Keepax Reviewed-by: Charles Keepax Link: https://lore.kernel.org/r/20240206071314.8721-10-liubo03@inspur.com Signed-off-by: Lee Jones --- drivers/mfd/lochnagar-i2c.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mfd/lochnagar-i2c.c b/drivers/mfd/lochnagar-i2c.c index 0b76fcccd0bda7..6c930c57f2e23a 100644 --- a/drivers/mfd/lochnagar-i2c.c +++ b/drivers/mfd/lochnagar-i2c.c @@ -70,7 +70,7 @@ static const struct regmap_config lochnagar1_i2c_regmap = { .use_single_read = true, .use_single_write = true, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, }; static const struct reg_sequence lochnagar1_patch[] = { @@ -163,7 +163,7 @@ static const struct regmap_config lochnagar2_i2c_regmap = { .readable_reg = lochnagar2_readable_register, .volatile_reg = lochnagar2_volatile_register, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, }; static const struct reg_sequence lochnagar2_patch[] = { From f353b2c639e4f4ac9831d94c8237026cebaa8e73 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 6 Feb 2024 02:13:06 -0500 Subject: [PATCH 0240/1406] mfd: wolfson: Convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. Signed-off-by: Bo Liu Reviewed-by: Charles Keepax Link: https://lore.kernel.org/r/20240206071314.8721-11-liubo03@inspur.com Signed-off-by: Lee Jones --- drivers/mfd/wm5102-tables.c | 2 +- drivers/mfd/wm5110-tables.c | 2 +- drivers/mfd/wm8350-regmap.c | 2 +- drivers/mfd/wm8400-core.c | 2 +- drivers/mfd/wm97xx-core.c | 6 +++--- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/mfd/wm5102-tables.c b/drivers/mfd/wm5102-tables.c index f77ecc635b6f10..6a8602c1c4ee74 100644 --- a/drivers/mfd/wm5102-tables.c +++ b/drivers/mfd/wm5102-tables.c @@ -1922,7 +1922,7 @@ const struct regmap_config wm5102_spi_regmap = { .readable_reg = wm5102_readable_register, .volatile_reg = wm5102_volatile_register, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .reg_defaults = wm5102_reg_default, .num_reg_defaults = ARRAY_SIZE(wm5102_reg_default), }; diff --git a/drivers/mfd/wm5110-tables.c b/drivers/mfd/wm5110-tables.c index eba324875afdb5..6ff33a54a068a6 100644 --- a/drivers/mfd/wm5110-tables.c +++ b/drivers/mfd/wm5110-tables.c @@ -3202,7 +3202,7 @@ const struct regmap_config wm5110_spi_regmap = { .readable_reg = wm5110_readable_register, .volatile_reg = wm5110_volatile_register, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .reg_defaults = wm5110_reg_default, .num_reg_defaults = ARRAY_SIZE(wm5110_reg_default), }; diff --git a/drivers/mfd/wm8350-regmap.c b/drivers/mfd/wm8350-regmap.c index 5663b8b0b3ad5b..3d0ebb004dbf1c 100644 --- a/drivers/mfd/wm8350-regmap.c +++ b/drivers/mfd/wm8350-regmap.c @@ -325,7 +325,7 @@ const struct regmap_config wm8350_regmap = { .reg_bits = 8, .val_bits = 16, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .max_register = WM8350_MAX_REGISTER, .readable_reg = wm8350_readable, diff --git a/drivers/mfd/wm8400-core.c b/drivers/mfd/wm8400-core.c index 75483c9be0c4da..ddfb234849dd11 100644 --- a/drivers/mfd/wm8400-core.c +++ b/drivers/mfd/wm8400-core.c @@ -100,7 +100,7 @@ static const struct regmap_config wm8400_regmap_config = { .volatile_reg = wm8400_volatile, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, }; 
/** diff --git a/drivers/mfd/wm97xx-core.c b/drivers/mfd/wm97xx-core.c index 663acbb1854c93..1566a9b04b6a09 100644 --- a/drivers/mfd/wm97xx-core.c +++ b/drivers/mfd/wm97xx-core.c @@ -95,7 +95,7 @@ static const struct regmap_config wm9705_regmap_config = { .reg_stride = 2, .val_bits = 16, .max_register = 0x7e, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .reg_defaults = wm9705_reg_defaults, .num_reg_defaults = ARRAY_SIZE(wm9705_reg_defaults), @@ -163,7 +163,7 @@ static const struct regmap_config wm9712_regmap_config = { .reg_stride = 2, .val_bits = 16, .max_register = 0x7e, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .reg_defaults = wm9712_reg_defaults, .num_reg_defaults = ARRAY_SIZE(wm9712_reg_defaults), @@ -234,7 +234,7 @@ static const struct regmap_config wm9713_regmap_config = { .reg_stride = 2, .val_bits = 16, .max_register = 0x7e, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .reg_defaults = wm9713_reg_defaults, .num_reg_defaults = ARRAY_SIZE(wm9713_reg_defaults), From aabd38f3779b8108b65cc7ae8b5c20a6c352b0a2 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 6 Feb 2024 02:13:07 -0500 Subject: [PATCH 0241/1406] mfd: rohm: Convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. Signed-off-by: Bo Liu Acked-by: Matti Vaittinen Link: https://lore.kernel.org/r/20240206071314.8721-12-liubo03@inspur.com Signed-off-by: Lee Jones --- drivers/mfd/rohm-bd71828.c | 4 ++-- drivers/mfd/rohm-bd718x7.c | 2 +- drivers/mfd/rohm-bd9576.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/mfd/rohm-bd71828.c b/drivers/mfd/rohm-bd71828.c index 594718f7e8e18e..2f3826c7eef49b 100644 --- a/drivers/mfd/rohm-bd71828.c +++ b/drivers/mfd/rohm-bd71828.c @@ -197,7 +197,7 @@ static const struct regmap_config bd71815_regmap = { .val_bits = 8, .volatile_table = &bd71815_volatile_regs, .max_register = BD71815_MAX_REGISTER - 1, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, }; static const struct regmap_config bd71828_regmap = { @@ -205,7 +205,7 @@ static const struct regmap_config bd71828_regmap = { .val_bits = 8, .volatile_table = &bd71828_volatile_regs, .max_register = BD71828_MAX_REGISTER, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, }; /* diff --git a/drivers/mfd/rohm-bd718x7.c b/drivers/mfd/rohm-bd718x7.c index 4798bdf27afb6c..7755a4c073bfeb 100644 --- a/drivers/mfd/rohm-bd718x7.c +++ b/drivers/mfd/rohm-bd718x7.c @@ -87,7 +87,7 @@ static const struct regmap_config bd718xx_regmap_config = { .val_bits = 8, .volatile_table = &volatile_regs, .max_register = BD718XX_MAX_REGISTER - 1, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, }; static int bd718xx_init_press_duration(struct regmap *regmap, diff --git a/drivers/mfd/rohm-bd9576.c b/drivers/mfd/rohm-bd9576.c index bceac7016740db..3a9f61961721b0 100644 --- a/drivers/mfd/rohm-bd9576.c +++ b/drivers/mfd/rohm-bd9576.c @@ -62,7 +62,7 @@ static struct regmap_config bd957x_regmap = { .val_bits = 8, .volatile_table = &volatile_regs, .max_register = BD957X_MAX_REGISTER, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, }; static struct regmap_irq bd9576_irqs[] = { From 08aa1e797a5cc97d2106476263f938342e72015e Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 6 Feb 2024 02:13:08 -0500 Subject: [PATCH 0242/1406] mfd: rk8xx: 
Convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240206071314.8721-13-liubo03@inspur.com Signed-off-by: Lee Jones --- drivers/mfd/rk8xx-spi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/rk8xx-spi.c b/drivers/mfd/rk8xx-spi.c index fd137f38c2c4a7..3405fb82ff9fbc 100644 --- a/drivers/mfd/rk8xx-spi.c +++ b/drivers/mfd/rk8xx-spi.c @@ -34,7 +34,7 @@ static const struct regmap_config rk806_regmap_config_spi = { .reg_bits = 16, .val_bits = 8, .max_register = RK806_BUCK_RSERVE_REG5, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .volatile_table = &rk806_volatile_table, }; From a011cacb63c3ff6562bd86afae8cdd49d0f075c9 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 6 Feb 2024 02:13:09 -0500 Subject: [PATCH 0243/1406] mfd: rn5t618: Convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240206071314.8721-14-liubo03@inspur.com Signed-off-by: Lee Jones --- drivers/mfd/rn5t618.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/rn5t618.c b/drivers/mfd/rn5t618.c index 7336e6d8a00131..23ca00d2c624d9 100644 --- a/drivers/mfd/rn5t618.c +++ b/drivers/mfd/rn5t618.c @@ -62,7 +62,7 @@ static const struct regmap_config rn5t618_regmap_config = { .val_bits = 8, .volatile_reg = rn5t618_volatile_reg, .max_register = RN5T618_MAX_REG, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, }; static const struct regmap_irq rc5t619_irqs[] = { From b85821cecdf933420a6ba93658e1e4710644f5c1 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 6 Feb 2024 02:13:10 -0500 Subject: [PATCH 0244/1406] mfd: rsmu_i2c: Convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240206071314.8721-15-liubo03@inspur.com Signed-off-by: Lee Jones --- drivers/mfd/rsmu_i2c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/rsmu_i2c.c b/drivers/mfd/rsmu_i2c.c index 06d78a1cf1ccbb..5711e512b6a2f3 100644 --- a/drivers/mfd/rsmu_i2c.c +++ b/drivers/mfd/rsmu_i2c.c @@ -188,7 +188,7 @@ static const struct regmap_config rsmu_sabre_regmap_config = { .ranges = rsmu_sabre_range_cfg, .num_ranges = ARRAY_SIZE(rsmu_sabre_range_cfg), .volatile_reg = rsmu_sabre_volatile_reg, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .can_multi_write = true, }; From 030473871c3f4f126bd43018517abfcb070f6b95 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 6 Feb 2024 02:13:11 -0500 Subject: [PATCH 0245/1406] mfd: si476x: Convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. 
Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240206071314.8721-16-liubo03@inspur.com Signed-off-by: Lee Jones --- drivers/mfd/si476x-prop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/si476x-prop.c b/drivers/mfd/si476x-prop.c index f0608d138f02e1..3d5c118888b262 100644 --- a/drivers/mfd/si476x-prop.c +++ b/drivers/mfd/si476x-prop.c @@ -222,7 +222,7 @@ static const struct regmap_config si476x_regmap_config = { .reg_read = si476x_core_regmap_read, .reg_write = si476x_core_regmap_write, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, }; struct regmap *devm_regmap_init_si476x(struct si476x_core *core) From 87a48e35ce2ef992287b828b1268bedc7415d2de Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 6 Feb 2024 02:13:12 -0500 Subject: [PATCH 0246/1406] mfd: stmfx: Convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240206071314.8721-17-liubo03@inspur.com Signed-off-by: Lee Jones --- drivers/mfd/stmfx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/stmfx.c b/drivers/mfd/stmfx.c index c02cbd9c2f5d79..f391c2ccaa72a7 100644 --- a/drivers/mfd/stmfx.c +++ b/drivers/mfd/stmfx.c @@ -53,7 +53,7 @@ static const struct regmap_config stmfx_regmap_config = { .max_register = STMFX_REG_MAX, .volatile_reg = stmfx_reg_volatile, .writeable_reg = stmfx_reg_writeable, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, }; static const struct resource stmfx_pinctrl_resources[] = { From ab993d2d68951db3ebfeaa44567a790ec566526d Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 6 Feb 2024 02:13:13 -0500 Subject: [PATCH 0247/1406] mfd: stpmic1: Convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240206071314.8721-18-liubo03@inspur.com Signed-off-by: Lee Jones --- drivers/mfd/stpmic1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/stpmic1.c b/drivers/mfd/stpmic1.c index c5128fe96cc788..d8a603d95aa66e 100644 --- a/drivers/mfd/stpmic1.c +++ b/drivers/mfd/stpmic1.c @@ -63,7 +63,7 @@ static const struct regmap_access_table stpmic1_volatile_table = { static const struct regmap_config stpmic1_regmap_config = { .reg_bits = 8, .val_bits = 8, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .max_register = PMIC_MAX_REGISTER_ADDRESS, .rd_table = &stpmic1_readable_table, .wr_table = &stpmic1_writeable_table, From d5132d176d6f21742ac67fd311ccc61fe830e999 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 6 Feb 2024 02:13:14 -0500 Subject: [PATCH 0248/1406] mfd: rc5t583: Convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. 
Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240206071314.8721-19-liubo03@inspur.com Signed-off-by: Lee Jones --- drivers/mfd/rc5t583.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/rc5t583.c b/drivers/mfd/rc5t583.c index 5e81f011363ffd..2c0e8e9630f743 100644 --- a/drivers/mfd/rc5t583.c +++ b/drivers/mfd/rc5t583.c @@ -230,7 +230,7 @@ static const struct regmap_config rc5t583_regmap_config = { .volatile_reg = volatile_reg, .max_register = RC5T583_MAX_REG, .num_reg_defaults_raw = RC5T583_NUM_REGS, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, }; static int rc5t583_i2c_probe(struct i2c_client *i2c) From 1e54bddadec40ee28cf5fba850a6df9296ee12ab Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Sun, 28 Jan 2024 15:32:08 -0800 Subject: [PATCH 0249/1406] PCI/ASPM: Always build aspm.c Some ASPM-related tasks, such as save and restore of LTR and L1SS capabilities, still need to be performed when CONFIG_PCIEASPM is not enabled. To prepare for these changes, wrap the current code in aspm.c with an #ifdef and always build the file. Also move pci_configure_ltr() and pci_bridge_reconfigure_ltr() into aspm.c since they only build when CONFIG_PCIEASPM is set. Suggested-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20240128233212.1139663-2-david.e.box@linux.intel.com Signed-off-by: David E. Box Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 18 --------- drivers/pci/pci.h | 5 ++- drivers/pci/pcie/Makefile | 2 +- drivers/pci/pcie/aspm.c | 78 +++++++++++++++++++++++++++++++++++++++ drivers/pci/probe.c | 61 ------------------------------ 5 files changed, 83 insertions(+), 81 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index d8f11a078924c1..c783e0f1f2a971 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1626,24 +1626,6 @@ static int pci_save_pcie_state(struct pci_dev *dev) return 0; } -void pci_bridge_reconfigure_ltr(struct pci_dev *dev) -{ -#ifdef CONFIG_PCIEASPM - struct pci_dev *bridge; - u32 ctl; - - bridge = pci_upstream_bridge(dev); - if (bridge && bridge->ltr_path) { - pcie_capability_read_dword(bridge, PCI_EXP_DEVCTL2, &ctl); - if (!(ctl & PCI_EXP_DEVCTL2_LTR_EN)) { - pci_dbg(bridge, "re-enabling LTR\n"); - pcie_capability_set_word(bridge, PCI_EXP_DEVCTL2, - PCI_EXP_DEVCTL2_LTR_EN); - } - } -#endif -} - static void pci_restore_pcie_state(struct pci_dev *dev) { int i = 0; diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 2336a8d1edab27..9aeba82facc41d 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -97,7 +97,6 @@ void pci_msi_init(struct pci_dev *dev); void pci_msix_init(struct pci_dev *dev); bool pci_bridge_d3_possible(struct pci_dev *dev); void pci_bridge_d3_update(struct pci_dev *dev); -void pci_bridge_reconfigure_ltr(struct pci_dev *dev); int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type); static inline void pci_wakeup_event(struct pci_dev *dev) @@ -573,11 +572,15 @@ void pcie_aspm_init_link_state(struct pci_dev *pdev); void pcie_aspm_exit_link_state(struct pci_dev *pdev); void pcie_aspm_pm_state_change(struct pci_dev *pdev); void pcie_aspm_powersave_config_link(struct pci_dev *pdev); +void pci_configure_ltr(struct pci_dev *pdev); +void pci_bridge_reconfigure_ltr(struct pci_dev *pdev); #else static inline void pcie_aspm_init_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_exit_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_pm_state_change(struct pci_dev *pdev) { } static inline void 
pcie_aspm_powersave_config_link(struct pci_dev *pdev) { } +static inline void pci_configure_ltr(struct pci_dev *pdev) { } +static inline void pci_bridge_reconfigure_ltr(struct pci_dev *pdev) { } #endif #ifdef CONFIG_PCIE_ECRC diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile index 8de4ed5f98f145..6461aa93fe76ec 100644 --- a/drivers/pci/pcie/Makefile +++ b/drivers/pci/pcie/Makefile @@ -6,7 +6,7 @@ pcieportdrv-y := portdrv.o rcec.o obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o -obj-$(CONFIG_PCIEASPM) += aspm.o +obj-y += aspm.o obj-$(CONFIG_PCIEAER) += aer.o err.o obj-$(CONFIG_PCIEAER_INJECT) += aer_inject.o obj-$(CONFIG_PCIE_PME) += pme.o diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 5a0066ecc3c5ad..3c9b186683705f 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -24,6 +24,8 @@ #include "../pci.h" +#ifdef CONFIG_PCIEASPM + #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif @@ -938,6 +940,81 @@ void pcie_aspm_init_link_state(struct pci_dev *pdev) up_read(&pci_bus_sem); } +void pci_bridge_reconfigure_ltr(struct pci_dev *pdev) +{ + struct pci_dev *bridge; + u32 ctl; + + bridge = pci_upstream_bridge(pdev); + if (bridge && bridge->ltr_path) { + pcie_capability_read_dword(bridge, PCI_EXP_DEVCTL2, &ctl); + if (!(ctl & PCI_EXP_DEVCTL2_LTR_EN)) { + pci_dbg(bridge, "re-enabling LTR\n"); + pcie_capability_set_word(bridge, PCI_EXP_DEVCTL2, + PCI_EXP_DEVCTL2_LTR_EN); + } + } +} + +void pci_configure_ltr(struct pci_dev *pdev) +{ + struct pci_host_bridge *host = pci_find_host_bridge(pdev->bus); + struct pci_dev *bridge; + u32 cap, ctl; + + if (!pci_is_pcie(pdev)) + return; + + /* Read L1 PM substate capabilities */ + pdev->l1ss = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_L1SS); + + pcie_capability_read_dword(pdev, PCI_EXP_DEVCAP2, &cap); + if (!(cap & PCI_EXP_DEVCAP2_LTR)) + return; + + pcie_capability_read_dword(pdev, PCI_EXP_DEVCTL2, &ctl); + if (ctl & PCI_EXP_DEVCTL2_LTR_EN) { + if (pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT) { + pdev->ltr_path = 1; + return; + } + + bridge = pci_upstream_bridge(pdev); + if (bridge && bridge->ltr_path) + pdev->ltr_path = 1; + + return; + } + + if (!host->native_ltr) + return; + + /* + * Software must not enable LTR in an Endpoint unless the Root + * Complex and all intermediate Switches indicate support for LTR. + * PCIe r4.0, sec 6.18. + */ + if (pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT) { + pcie_capability_set_word(pdev, PCI_EXP_DEVCTL2, + PCI_EXP_DEVCTL2_LTR_EN); + pdev->ltr_path = 1; + return; + } + + /* + * If we're configuring a hot-added device, LTR was likely + * disabled in the upstream bridge, so re-enable it before enabling + * it in the new device. 
+ */ + bridge = pci_upstream_bridge(pdev); + if (bridge && bridge->ltr_path) { + pci_bridge_reconfigure_ltr(pdev); + pcie_capability_set_word(pdev, PCI_EXP_DEVCTL2, + PCI_EXP_DEVCTL2_LTR_EN); + pdev->ltr_path = 1; + } +} + /* Recheck latencies and update aspm_capable for links under the root */ static void pcie_update_aspm_capable(struct pcie_link_state *root) { @@ -1442,3 +1519,4 @@ bool pcie_aspm_support_enabled(void) { return aspm_support_enabled; } +#endif /* CONFIG_PCIEASPM */ diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index b7335be56008f7..b809c0b0e0e57e 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2209,67 +2209,6 @@ static void pci_configure_relaxed_ordering(struct pci_dev *dev) } } -static void pci_configure_ltr(struct pci_dev *dev) -{ -#ifdef CONFIG_PCIEASPM - struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); - struct pci_dev *bridge; - u32 cap, ctl; - - if (!pci_is_pcie(dev)) - return; - - /* Read L1 PM substate capabilities */ - dev->l1ss = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_L1SS); - - pcie_capability_read_dword(dev, PCI_EXP_DEVCAP2, &cap); - if (!(cap & PCI_EXP_DEVCAP2_LTR)) - return; - - pcie_capability_read_dword(dev, PCI_EXP_DEVCTL2, &ctl); - if (ctl & PCI_EXP_DEVCTL2_LTR_EN) { - if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) { - dev->ltr_path = 1; - return; - } - - bridge = pci_upstream_bridge(dev); - if (bridge && bridge->ltr_path) - dev->ltr_path = 1; - - return; - } - - if (!host->native_ltr) - return; - - /* - * Software must not enable LTR in an Endpoint unless the Root - * Complex and all intermediate Switches indicate support for LTR. - * PCIe r4.0, sec 6.18. - */ - if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) { - pcie_capability_set_word(dev, PCI_EXP_DEVCTL2, - PCI_EXP_DEVCTL2_LTR_EN); - dev->ltr_path = 1; - return; - } - - /* - * If we're configuring a hot-added device, LTR was likely - * disabled in the upstream bridge, so re-enable it before enabling - * it in the new device. - */ - bridge = pci_upstream_bridge(dev); - if (bridge && bridge->ltr_path) { - pci_bridge_reconfigure_ltr(dev); - pcie_capability_set_word(dev, PCI_EXP_DEVCTL2, - PCI_EXP_DEVCTL2_LTR_EN); - dev->ltr_path = 1; - } -#endif -} - static void pci_configure_eetlp_prefix(struct pci_dev *dev) { #ifdef CONFIG_PCI_PASID From 9295f2bde9eb7ae70ef969b075126f2830d6c2d4 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Sun, 28 Jan 2024 15:32:09 -0800 Subject: [PATCH 0250/1406] PCI/ASPM: Capture L1SS Capability offset always Previously we only captured the L1SS Capability offset when CONFIG_PCIEASPM=y. Capture it always so a future change can save and restore the L1SS registers even when CONFIG_PCIEASPM is not enabled. Link: https://lore.kernel.org/r/20240128233212.1139663-3-david.e.box@linux.intel.com Signed-off-by: David E. 
Box [bhelgaas: commit log] Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 1 + drivers/pci/pcie/aspm.c | 9 ++++++--- drivers/pci/probe.c | 1 + include/linux/pci.h | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 9aeba82facc41d..bf2559c5b82787 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -97,6 +97,7 @@ void pci_msi_init(struct pci_dev *dev); void pci_msix_init(struct pci_dev *dev); bool pci_bridge_d3_possible(struct pci_dev *dev); void pci_bridge_d3_update(struct pci_dev *dev); +void pci_aspm_get_l1ss(struct pci_dev *pdev); int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type); static inline void pci_wakeup_event(struct pci_dev *dev) diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 3c9b186683705f..62a8243629664f 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -24,6 +24,12 @@ #include "../pci.h" +void pci_aspm_get_l1ss(struct pci_dev *pdev) +{ + /* Read L1 PM substate capabilities */ + pdev->l1ss = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_L1SS); +} + #ifdef CONFIG_PCIEASPM #ifdef MODULE_PARAM_PREFIX @@ -965,9 +971,6 @@ void pci_configure_ltr(struct pci_dev *pdev) if (!pci_is_pcie(pdev)) return; - /* Read L1 PM substate capabilities */ - pdev->l1ss = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_L1SS); - pcie_capability_read_dword(pdev, PCI_EXP_DEVCAP2, &cap); if (!(cap & PCI_EXP_DEVCAP2_LTR)) return; diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index b809c0b0e0e57e..b5ccf5a16dc153 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2258,6 +2258,7 @@ static void pci_configure_device(struct pci_dev *dev) pci_configure_mps(dev); pci_configure_extended_tags(dev, NULL); pci_configure_relaxed_ordering(dev); + pci_aspm_get_l1ss(dev); pci_configure_ltr(dev); pci_configure_eetlp_prefix(dev); pci_configure_serr(dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index add9368e6314b9..6967ae7b41154e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -390,9 +390,9 @@ struct pci_dev { unsigned int d3hot_delay; /* D3hot->D0 transition time in ms */ unsigned int d3cold_delay; /* D3cold->D0 transition time in ms */ + u16 l1ss; /* L1SS Capability pointer */ #ifdef CONFIG_PCIEASPM struct pcie_link_state *link_state; /* ASPM link state */ - u16 l1ss; /* L1SS Capability pointer */ unsigned int ltr_path:1; /* Latency Tolerance Reporting supported from root to here */ #endif From a96022196f007b1f248769ec50f1ca3cd0c8d25b Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Sun, 28 Jan 2024 15:32:10 -0800 Subject: [PATCH 0251/1406] PCI/ASPM: Save L1 PM Substates Capability for suspend/resume 4ff116d0d5fd ("PCI/ASPM: Save L1 PM Substates Capability for suspend/resume") restored the L1 PM Substates Capability after resume, which reduced power consumption by making the ASPM L1.x states work after resume. a7152be79b62 ("Revert "PCI/ASPM: Save L1 PM Substates Capability for suspend/resume"") reverted 4ff116d0d5fd because resume failed on some systems, so power consumption after resume increased again. a7152be79b62 mentioned that we restore L1 PM substate configuration even though ASPM L1 may already be enabled. This is due to the fact that pci_restore_aspm_l1ss_state() was called before pci_restore_pcie_state().
Save and restore the L1 PM Substates Capability, following PCIe r6.1, sec 5.5.4 more closely by: 1) Do not restore ASPM configuration in pci_restore_pcie_state() but do that after PCIe capability is restored in pci_restore_aspm_state() following PCIe r6.1, sec 5.5.4. 2) If BIOS reenables L1SS, particularly L1.2, we need to clear the enables in the right order, downstream before upstream. Defer restoring the L1SS config until we are at the downstream component. Then update the config for both ends of the link in the prescribed order. 3) Program ASPM L1 PM substate configuration before L1 enables. 4) Program ASPM L1 PM substate enables last, after rest of the fields in the capability are programmed. Link: https://lore.kernel.org/r/20240128233212.1139663-4-david.e.box@linux.intel.com Reported-by: Koba Ko Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217321 Link: https://bugzilla.kernel.org/show_bug.cgi?id=216782 Link: https://bugzilla.kernel.org/show_bug.cgi?id=216877 Co-developed-by: Mika Westerberg Signed-off-by: Mika Westerberg Co-developed-by: David E. Box Signed-off-by: David E. Box [bhelgaas: commit log] Signed-off-by: Bjorn Helgaas Cc: Tasev Nikola Cc: Mark Enriquez Cc: Thomas Witt Cc: Werner Sembach Cc: Vidya Sagar --- drivers/pci/pcie/aspm.c | 114 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 62a8243629664f..ebf81f6afc8204 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -30,6 +30,120 @@ void pci_aspm_get_l1ss(struct pci_dev *pdev) pdev->l1ss = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_L1SS); } +void pci_save_aspm_state(struct pci_dev *pdev) +{ + struct pci_cap_saved_state *save_state; + u16 l1ss = pdev->l1ss; + u32 *cap; + + /* + * Save L1 substate configuration. The ASPM L0s/L1 configuration + * is already saved in pci_save_pcie_state(). + */ + if (!l1ss) + return; + + save_state = pci_find_saved_ext_cap(pdev, PCI_EXT_CAP_ID_L1SS); + if (!save_state) + return; + + cap = &save_state->cap.data[0]; + pci_read_config_dword(pdev, l1ss + PCI_L1SS_CTL2, cap++); + pci_read_config_dword(pdev, l1ss + PCI_L1SS_CTL1, cap++); +} + +static void pcie_restore_aspm_l1ss(struct pci_dev *pdev) +{ + struct pci_cap_saved_state *pl_save_state, *cl_save_state; + struct pci_dev *parent = pdev->bus->self; + u32 *cap, pl_ctl1, pl_ctl2, pl_l1_2_enable; + u32 cl_ctl1, cl_ctl2, cl_l1_2_enable; + + /* + * In case BIOS enabled L1.2 after resume, we need to disable it first + * on the downstream component before the upstream. So, don't attempt to + * restore either until we are at the downstream component. 
+ */ + if (pcie_downstream_port(pdev) || !parent) + return; + + if (!pdev->l1ss || !parent->l1ss) + return; + + cl_save_state = pci_find_saved_ext_cap(pdev, PCI_EXT_CAP_ID_L1SS); + pl_save_state = pci_find_saved_ext_cap(parent, PCI_EXT_CAP_ID_L1SS); + if (!cl_save_state || !pl_save_state) + return; + + cap = &cl_save_state->cap.data[0]; + cl_ctl2 = *cap++; + cl_ctl1 = *cap; + cap = &pl_save_state->cap.data[0]; + pl_ctl2 = *cap++; + pl_ctl1 = *cap; + + + /* + * Disable L1.2 on this downstream endpoint device first, followed + * by the upstream + */ + pci_clear_and_set_config_dword(pdev, pdev->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_L1_2_MASK, 0); + pci_clear_and_set_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_L1_2_MASK, 0); + + /* + * In addition, Common_Mode_Restore_Time and LTR_L1.2_THRESHOLD + * in PCI_L1SS_CTL1 must be programmed *before* setting the L1.2 + * enable bits, even though they're all in PCI_L1SS_CTL1. + */ + pl_l1_2_enable = pl_ctl1 & PCI_L1SS_CTL1_L1_2_MASK; + pl_ctl1 &= ~PCI_L1SS_CTL1_L1_2_MASK; + cl_l1_2_enable = cl_ctl1 & PCI_L1SS_CTL1_L1_2_MASK; + cl_ctl1 &= ~PCI_L1SS_CTL1_L1_2_MASK; + + /* Write back without enables first (above we cleared them in ctl1) */ + pci_write_config_dword(parent, parent->l1ss + PCI_L1SS_CTL2, pl_ctl2); + pci_write_config_dword(pdev, pdev->l1ss + PCI_L1SS_CTL2, cl_ctl2); + pci_write_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1, pl_ctl1); + pci_write_config_dword(pdev, pdev->l1ss + PCI_L1SS_CTL1, cl_ctl1); + + + /* Then write back the enables */ + if (pl_l1_2_enable || cl_l1_2_enable) { + pci_write_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1, + pl_ctl1 | pl_l1_2_enable); + pci_write_config_dword(pdev, pdev->l1ss + PCI_L1SS_CTL1, + cl_ctl1 | cl_l1_2_enable); + } +} + +void pci_restore_aspm_state(struct pci_dev *pdev) +{ + struct pci_cap_saved_state *save_state; + u16 *cap, val; + + save_state = pci_find_saved_cap(pdev, PCI_CAP_ID_EXP); + + if (!save_state) + return; + + cap = (u16 *)&save_state->cap.data[0]; + /* Must match the ordering in pci_save/restore_pcie_state() */ + val = cap[1] & PCI_EXP_LNKCTL_ASPMC; + if (!val) + return; + + /* + * We restore L1 substate configuration first before enabling L1 + * as the PCIe spec 6.1 sec 5.5.4 suggests. + */ + pcie_restore_aspm_l1ss(pdev); + + /* Re-enable L0s/L1 */ + pcie_capability_set_word(pdev, PCI_EXP_LNKCTL, val); +} + #ifdef CONFIG_PCIEASPM #ifdef MODULE_PARAM_PREFIX From b2059b2e54cde83492cf6e7d562dcd5bddaa6db4 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Sun, 28 Jan 2024 15:32:11 -0800 Subject: [PATCH 0252/1406] PCI/ASPM: Move pci_save/restore_ltr_state() to aspm.c Since the LTR Capability is linked with ASPM and only enabled when CONFIG_PCIEASPM is set, move the save/restore code to aspm.c Suggested-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20240128233212.1139663-5-david.e.box@linux.intel.com Signed-off-by: David E. 
Box Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 59 ++++++++++++----------------------------- drivers/pci/pci.h | 4 +++ drivers/pci/pcie/aspm.c | 40 ++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 42 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index c783e0f1f2a971..b523cfed5b49f7 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1623,6 +1623,8 @@ static int pci_save_pcie_state(struct pci_dev *dev) pcie_capability_read_word(dev, PCI_EXP_LNKCTL2, &cap[i++]); pcie_capability_read_word(dev, PCI_EXP_SLTCTL2, &cap[i++]); + pci_save_aspm_state(dev); + return 0; } @@ -1630,7 +1632,7 @@ static void pci_restore_pcie_state(struct pci_dev *dev) { int i = 0; struct pci_cap_saved_state *save_state; - u16 *cap; + u16 *cap, val; save_state = pci_find_saved_cap(dev, PCI_CAP_ID_EXP); if (!save_state) @@ -1645,12 +1647,20 @@ static void pci_restore_pcie_state(struct pci_dev *dev) cap = (u16 *)&save_state->cap.data[0]; pcie_capability_write_word(dev, PCI_EXP_DEVCTL, cap[i++]); - pcie_capability_write_word(dev, PCI_EXP_LNKCTL, cap[i++]); + + /* + * Restore only the LNKCTL register with the ASPM control field + * clear. ASPM will be restored in pci_restore_aspm_state(). + */ + val = cap[i++] & ~PCI_EXP_LNKCTL_ASPMC; + pcie_capability_write_word(dev, PCI_EXP_LNKCTL, val); pcie_capability_write_word(dev, PCI_EXP_SLTCTL, cap[i++]); pcie_capability_write_word(dev, PCI_EXP_RTCTL, cap[i++]); pcie_capability_write_word(dev, PCI_EXP_DEVCTL2, cap[i++]); pcie_capability_write_word(dev, PCI_EXP_LNKCTL2, cap[i++]); pcie_capability_write_word(dev, PCI_EXP_SLTCTL2, cap[i++]); + + pci_restore_aspm_state(dev); } static int pci_save_pcix_state(struct pci_dev *dev) @@ -1689,46 +1699,6 @@ static void pci_restore_pcix_state(struct pci_dev *dev) pci_write_config_word(dev, pos + PCI_X_CMD, cap[i++]); } -static void pci_save_ltr_state(struct pci_dev *dev) -{ - int ltr; - struct pci_cap_saved_state *save_state; - u32 *cap; - - if (!pci_is_pcie(dev)) - return; - - ltr = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_LTR); - if (!ltr) - return; - - save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_LTR); - if (!save_state) { - pci_err(dev, "no suspend buffer for LTR; ASPM issues possible after resume\n"); - return; - } - - /* Some broken devices only support dword access to LTR */ - cap = &save_state->cap.data[0]; - pci_read_config_dword(dev, ltr + PCI_LTR_MAX_SNOOP_LAT, cap); -} - -static void pci_restore_ltr_state(struct pci_dev *dev) -{ - struct pci_cap_saved_state *save_state; - int ltr; - u32 *cap; - - save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_LTR); - ltr = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_LTR); - if (!save_state || !ltr) - return; - - /* Some broken devices only support dword access to LTR */ - cap = &save_state->cap.data[0]; - pci_write_config_dword(dev, ltr + PCI_LTR_MAX_SNOOP_LAT, *cap); -} - /** * pci_save_state - save the PCI configuration space of a device before * suspending @@ -3562,6 +3532,11 @@ void pci_allocate_cap_save_buffers(struct pci_dev *dev) if (error) pci_err(dev, "unable to allocate suspend buffer for LTR\n"); + error = pci_add_ext_cap_save_buffer(dev, PCI_EXT_CAP_ID_L1SS, + 2 * sizeof(u32)); + if (error) + pci_err(dev, "unable to allocate suspend buffer for ASPM-L1SS\n"); + pci_allocate_vc_save_buffers(dev); } diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index bf2559c5b82787..ecceb690fbbb56 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -98,6 +98,10 @@ void pci_msix_init(struct pci_dev *dev); bool 
pci_bridge_d3_possible(struct pci_dev *dev); void pci_bridge_d3_update(struct pci_dev *dev); void pci_aspm_get_l1ss(struct pci_dev *pdev); +void pci_save_aspm_state(struct pci_dev *pdev); +void pci_restore_aspm_state(struct pci_dev *pdev); +void pci_save_ltr_state(struct pci_dev *dev); +void pci_restore_ltr_state(struct pci_dev *dev); int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type); static inline void pci_wakeup_event(struct pci_dev *dev) diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index ebf81f6afc8204..60716fbf40a910 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -144,6 +144,46 @@ void pci_restore_aspm_state(struct pci_dev *pdev) pcie_capability_set_word(pdev, PCI_EXP_LNKCTL, val); } +void pci_save_ltr_state(struct pci_dev *dev) +{ + int ltr; + struct pci_cap_saved_state *save_state; + u32 *cap; + + if (!pci_is_pcie(dev)) + return; + + ltr = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_LTR); + if (!ltr) + return; + + save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_LTR); + if (!save_state) { + pci_err(dev, "no suspend buffer for LTR; ASPM issues possible after resume\n"); + return; + } + + /* Some broken devices only support dword access to LTR */ + cap = &save_state->cap.data[0]; + pci_read_config_dword(dev, ltr + PCI_LTR_MAX_SNOOP_LAT, cap); +} + +void pci_restore_ltr_state(struct pci_dev *dev) +{ + struct pci_cap_saved_state *save_state; + int ltr; + u32 *cap; + + save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_LTR); + ltr = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_LTR); + if (!save_state || !ltr) + return; + + /* Some broken devices only support dword access to LTR */ + cap = &save_state->cap.data[0]; + pci_write_config_dword(dev, ltr + PCI_LTR_MAX_SNOOP_LAT, *cap); +} + #ifdef CONFIG_PCIEASPM #ifdef MODULE_PARAM_PREFIX From be00f078ad2af958c0cd391ac9ead869be66e765 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Sun, 28 Jan 2024 15:32:12 -0800 Subject: [PATCH 0253/1406] PCI/ASPM: Save and restore LTR state from pci_save/restore_pcie_state() ASPM state is saved and restored from pci_save/restore_pcie_state(). Since the LTR Capability is linked with ASPM, move the LTR save and restore calls there as well. Suggested-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20240128233212.1139663-6-david.e.box@linux.intel.com Signed-off-by: David E. Box Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index b523cfed5b49f7..00139fad182780 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1624,6 +1624,7 @@ static int pci_save_pcie_state(struct pci_dev *dev) pcie_capability_read_word(dev, PCI_EXP_SLTCTL2, &cap[i++]); pci_save_aspm_state(dev); + pci_save_ltr_state(dev); return 0; } @@ -1634,6 +1635,12 @@ static void pci_restore_pcie_state(struct pci_dev *dev) struct pci_cap_saved_state *save_state; u16 *cap, val; + /* + * Restore max latencies (in the LTR capability) before enabling + * LTR itself (in the PCIe capability). 
+ */ + pci_restore_ltr_state(dev); + save_state = pci_find_saved_cap(dev, PCI_CAP_ID_EXP); if (!save_state) return; @@ -1723,7 +1730,6 @@ int pci_save_state(struct pci_dev *dev) if (i != 0) return i; - pci_save_ltr_state(dev); pci_save_dpc_state(dev); pci_save_aer_state(dev); pci_save_ptm_state(dev); @@ -1824,12 +1830,6 @@ void pci_restore_state(struct pci_dev *dev) if (!dev->state_saved) return; - /* - * Restore max latencies (in the LTR capability) before enabling - * LTR itself (in the PCIe capability). - */ - pci_restore_ltr_state(dev); - pci_restore_pcie_state(dev); pci_restore_pasid_state(dev); pci_restore_pri_state(dev); From 770c0f4975fd7b4bb68ca7cf150d3b1c9c864a99 Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Fri, 2 Feb 2024 11:01:51 -0700 Subject: [PATCH 0254/1406] dt-bindings: backlight: qcom-wled: Fix bouncing email addresses Bjorn is no longer at Linaro. Update his email address to @kernel to match the .mailmap entry. The servers for @codeaurora are long retired and messages sent there will bounce. Update Kiran's email address to match the .mailmap entry. This will help anyone that is looking to reach out about this binding and is not using .mailmap to pre-process their message. Signed-off-by: Jeffrey Hugo Reviewed-by: Daniel Thompson Acked-by: Rob Herring Link: https://lore.kernel.org/r/20240202180151.4116329-1-quic_jhugo@quicinc.com Signed-off-by: Lee Jones --- .../devicetree/bindings/leds/backlight/qcom-wled.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/leds/backlight/qcom-wled.yaml b/Documentation/devicetree/bindings/leds/backlight/qcom-wled.yaml index 5f1849bdabba2f..a8490781011d1b 100644 --- a/Documentation/devicetree/bindings/leds/backlight/qcom-wled.yaml +++ b/Documentation/devicetree/bindings/leds/backlight/qcom-wled.yaml @@ -7,8 +7,8 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Technologies, Inc. WLED driver maintainers: - - Bjorn Andersson - - Kiran Gunda + - Bjorn Andersson + - Kiran Gunda description: | WLED (White Light Emitting Diode) driver is used for controlling display From 012e20909c25b42b13f6de8f5014d031a9e5ac7a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 8 Feb 2024 19:08:37 +0100 Subject: [PATCH 0255/1406] file: prepare for new helper In order to add a helper to open files that aren't accounted, split alloc_file() and parts of alloc_file_pseudo() into helpers. One to prepare a path, another one to set up the file. Suggested-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240129160241.GA2793@lst.de Signed-off-by: Christian Brauner --- fs/file_table.c | 59 +++++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index b991f90571b4d3..e9feed48ad9275 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -276,21 +276,16 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred) } /** - * alloc_file - allocate and initialize a 'struct file' + * file_init - initialize a 'struct file' * + * @file: the file to set up * @path: the (dentry, vfsmount) pair for the new file * @flags: O_...
flags with which the new file will be opened * @fop: the 'struct file_operations' for the new file */ -static struct file *alloc_file(const struct path *path, int flags, - const struct file_operations *fop) +static void file_init(struct file *file, const struct path *path, int flags, + const struct file_operations *fop) { - struct file *file; - - file = alloc_empty_file(flags, current_cred()); - if (IS_ERR(file)) - return file; - file->f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; @@ -309,27 +304,40 @@ static struct file *alloc_file(const struct path *path, int flags, file->f_op = fop; if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(path->dentry->d_inode); - return file; } -struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, - const char *name, int flags, - const struct file_operations *fops) +static inline int alloc_path_pseudo(const char *name, struct inode *inode, + struct vfsmount *mnt, struct path *path) { struct qstr this = QSTR_INIT(name, strlen(name)); + + path->dentry = d_alloc_pseudo(mnt->mnt_sb, &this); + if (!path->dentry) + return -ENOMEM; + path->mnt = mntget(mnt); + d_instantiate(path->dentry, inode); + return 0; +} + +struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, + const char *name, int flags, + const struct file_operations *fops) +{ + int ret; struct path path; struct file *file; - path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this); - if (!path.dentry) - return ERR_PTR(-ENOMEM); - path.mnt = mntget(mnt); - d_instantiate(path.dentry, inode); - file = alloc_file(&path, flags, fops); + ret = alloc_path_pseudo(name, inode, mnt, &path); + if (ret) + return ERR_PTR(ret); + + file = alloc_empty_file(flags, current_cred()); if (IS_ERR(file)) { ihold(inode); path_put(&path); + return file; } + file_init(file, &path, flags, fops); return file; } EXPORT_SYMBOL(alloc_file_pseudo); @@ -337,11 +345,14 @@ EXPORT_SYMBOL(alloc_file_pseudo); struct file *alloc_file_clone(struct file *base, int flags, const struct file_operations *fops) { - struct file *f = alloc_file(&base->f_path, flags, fops); - if (!IS_ERR(f)) { - path_get(&f->f_path); - f->f_mapping = base->f_mapping; - } + struct file *f; + + f = alloc_empty_file(flags, current_cred()); + if (IS_ERR(f)) + return f; + file_init(f, &base->f_path, flags, fops); + path_get(&f->f_path); + f->f_mapping = base->f_mapping; return f; } From 0d19e760478b65463043d0e06d98f5264eb3d765 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 8 Feb 2024 19:10:45 +0100 Subject: [PATCH 0256/1406] file: add alloc_file_pseudo_noaccount() When we open block devices as files we want to make sure to not charge them against the open file limit of the caller as that can cause spurious failures. 
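For illustration only, a minimal sketch of a possible caller. Only alloc_file_pseudo_noaccount() itself comes from this series; the function, mount, and file_operations names below are made-up placeholders, not code from any patch:

    /*
     * Hypothetical caller: open an internal, unaccounted pseudo file for
     * an inode the caller already holds a reference to. On failure the
     * helper compensates with ihold() so the caller keeps its inode
     * reference; on success the new file owns it. Such a file is never
     * installed into an fd table and is released with fput().
     */
    static struct file *example_open_unaccounted(struct inode *inode,
                                                 struct vfsmount *example_mnt,
                                                 const struct file_operations *example_fops)
    {
            return alloc_file_pseudo_noaccount(inode, example_mnt, "",
                                               O_RDWR | O_LARGEFILE,
                                               example_fops);
    }
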
Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-1-adbd023e19cc@kernel.org Signed-off-by: Christian Brauner --- fs/file_table.c | 24 ++++++++++++++++++++++++ include/linux/file.h | 2 ++ 2 files changed, 26 insertions(+) diff --git a/fs/file_table.c b/fs/file_table.c index e9feed48ad9275..4d0edf20b96683 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -342,6 +342,30 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, } EXPORT_SYMBOL(alloc_file_pseudo); +struct file *alloc_file_pseudo_noaccount(struct inode *inode, + struct vfsmount *mnt, const char *name, + int flags, + const struct file_operations *fops) +{ + int ret; + struct path path; + struct file *file; + + ret = alloc_path_pseudo(name, inode, mnt, &path); + if (ret) + return ERR_PTR(ret); + + file = alloc_empty_file_noaccount(flags, current_cred()); + if (IS_ERR(file)) { + ihold(inode); + path_put(&path); + return file; + } + file_init(file, &path, flags, fops); + return file; +} +EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount); + struct file *alloc_file_clone(struct file *base, int flags, const struct file_operations *fops) { diff --git a/include/linux/file.h b/include/linux/file.h index 6834a29338c43c..169692cb1906d8 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -24,6 +24,8 @@ struct inode; struct path; extern struct file *alloc_file_pseudo(struct inode *, struct vfsmount *, const char *, int flags, const struct file_operations *); +extern struct file *alloc_file_pseudo_noaccount(struct inode *, struct vfsmount *, + const char *, int flags, const struct file_operations *); extern struct file *alloc_file_clone(struct file *, int flags, const struct file_operations *); From 295afc57e51c666fffc50d6b25c0ba542dbc6854 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Sun, 4 Feb 2024 13:02:30 -0300 Subject: [PATCH 0257/1406] counter: make counter_bus_type const Now that the driver core can properly handle constant struct bus_type, move the counter_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Suggested-by: Greg Kroah-Hartman Signed-off-by: "Ricardo B. Marliere" Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240204-bus_cleanup-counter-v1-1-cef9dd719bdc@marliere.net Signed-off-by: William Breathitt Gray --- drivers/counter/counter-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/counter/counter-core.c b/drivers/counter/counter-core.c index 09c77afb33ca84..f6a939d5117176 100644 --- a/drivers/counter/counter-core.c +++ b/drivers/counter/counter-core.c @@ -53,7 +53,7 @@ static struct device_type counter_device_type = { .release = counter_device_release, }; -static struct bus_type counter_bus_type = { +static const struct bus_type counter_bus_type = { .name = "counter", .dev_name = "counter", }; From 899d84013f77fab43bbfa9a960d6ffe99274e358 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Mon, 30 Oct 2023 13:32:12 +0100 Subject: [PATCH 0258/1406] PCI: Compile pci-sysfs.c only if CONFIG_SYSFS=y It is possible to enable CONFIG_PCI but disable CONFIG_SYSFS and for space-constrained devices such as routers, such a configuration may actually make sense. However pci-sysfs.c is compiled even if CONFIG_SYSFS is disabled, unnecessarily increasing the kernel's size. To rectify that: * Move pci_mmap_fits() to mmap.c. It is not only needed by pci-sysfs.c, but also proc.c. * Move pci_dev_type to probe.c and make it private. 
It references pci_dev_attr_groups in pci-sysfs.c. Make that public instead for consistency with pci_dev_groups, pcibus_groups and pci_bus_groups, which are likewise public and referenced by struct definitions in pci-driver.c and probe.c. * Define pci_dev_groups, pci_dev_attr_groups, pcibus_groups and pci_bus_groups to NULL if CONFIG_SYSFS is disabled. Provide empty static inlines for pci_{create,remove}_legacy_files() and pci_{create,remove}_sysfs_dev_files(). Result: vmlinux size is reduced by 122996 bytes in my arm 32-bit test build. Link: https://lore.kernel.org/r/85ca95ae8e4d57ccf082c5c069b8b21eb141846e.1698668982.git.lukas@wunner.de Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas Reviewed-by: Alistair Francis --- drivers/pci/Makefile | 4 ++-- drivers/pci/mmap.c | 29 +++++++++++++++++++++++++++++ drivers/pci/pci-sysfs.c | 29 +---------------------------- drivers/pci/pci.h | 18 ++++++++++++++---- drivers/pci/probe.c | 4 ++++ 5 files changed, 50 insertions(+), 34 deletions(-) diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index cc8b4e01e29de5..96f4759f2bd314 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_PCI) += access.o bus.o probe.o host-bridge.o \ remove.o pci.o pci-driver.o search.o \ - pci-sysfs.o rom.o setup-res.o irq.o vpd.o \ + rom.o setup-res.o irq.o vpd.o \ setup-bus.o vc.o mmap.o setup-irq.o obj-$(CONFIG_PCI) += msi/ @@ -12,7 +12,7 @@ obj-$(CONFIG_PCI) += pcie/ ifdef CONFIG_PCI obj-$(CONFIG_PROC_FS) += proc.o -obj-$(CONFIG_SYSFS) += slot.o +obj-$(CONFIG_SYSFS) += pci-sysfs.o slot.o obj-$(CONFIG_ACPI) += pci-acpi.o endif diff --git a/drivers/pci/mmap.c b/drivers/pci/mmap.c index 4504039056d1b5..8da3347a95c47a 100644 --- a/drivers/pci/mmap.c +++ b/drivers/pci/mmap.c @@ -11,6 +11,8 @@ #include #include +#include "pci.h" + #ifdef ARCH_GENERIC_PCI_MMAP_RESOURCE static const struct vm_operations_struct pci_phys_vm_ops = { @@ -50,3 +52,30 @@ int pci_mmap_resource_range(struct pci_dev *pdev, int bar, } #endif + +#if (defined(CONFIG_SYSFS) || defined(CONFIG_PROC_FS)) && \ + (defined(HAVE_PCI_MMAP) || defined(ARCH_GENERIC_PCI_MMAP_RESOURCE)) + +int pci_mmap_fits(struct pci_dev *pdev, int resno, struct vm_area_struct *vma, + enum pci_mmap_api mmap_api) +{ + resource_size_t pci_start = 0, pci_end; + unsigned long nr, start, size; + + if (pci_resource_len(pdev, resno) == 0) + return 0; + nr = vma_pages(vma); + start = vma->vm_pgoff; + size = ((pci_resource_len(pdev, resno) - 1) >> PAGE_SHIFT) + 1; + if (mmap_api == PCI_MMAP_PROCFS) { + pci_resource_to_user(pdev, resno, &pdev->resource[resno], + &pci_start, &pci_end); + pci_start >>= PAGE_SHIFT; + } + if (start >= pci_start && start < pci_start + size && + start + nr <= pci_start + size) + return 1; + return 0; +} + +#endif diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 2321fdfefd7db2..44ed30df08c321 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1022,29 +1022,6 @@ void pci_remove_legacy_files(struct pci_bus *b) #endif /* HAVE_PCI_LEGACY */ #if defined(HAVE_PCI_MMAP) || defined(ARCH_GENERIC_PCI_MMAP_RESOURCE) - -int pci_mmap_fits(struct pci_dev *pdev, int resno, struct vm_area_struct *vma, - enum pci_mmap_api mmap_api) -{ - unsigned long nr, start, size; - resource_size_t pci_start = 0, pci_end; - - if (pci_resource_len(pdev, resno) == 0) - return 0; - nr = vma_pages(vma); - start = vma->vm_pgoff; - size = ((pci_resource_len(pdev, resno) - 1) >> PAGE_SHIFT) + 1; - if (mmap_api == PCI_MMAP_PROCFS) { - pci_resource_to_user(pdev, resno, 
&pdev->resource[resno], - &pci_start, &pci_end); - pci_start >>= PAGE_SHIFT; - } - if (start >= pci_start && start < pci_start + size && - start + nr <= pci_start + size) - return 1; - return 0; -} - /** * pci_mmap_resource - map a PCI resource into user memory space * @kobj: kobject for mapping @@ -1660,7 +1637,7 @@ static const struct attribute_group pcie_dev_attr_group = { .is_visible = pcie_dev_attrs_are_visible, }; -static const struct attribute_group *pci_dev_attr_groups[] = { +const struct attribute_group *pci_dev_attr_groups[] = { &pci_dev_attr_group, &pci_dev_hp_attr_group, #ifdef CONFIG_PCI_IOV @@ -1677,7 +1654,3 @@ static const struct attribute_group *pci_dev_attr_groups[] = { #endif NULL, }; - -const struct device_type pci_dev_type = { - .groups = pci_dev_attr_groups, -}; diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 2336a8d1edab27..74d7f66b64a875 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -31,8 +31,6 @@ bool pcie_cap_has_rtctl(const struct pci_dev *dev); /* Functions internal to the PCI core code */ -int pci_create_sysfs_dev_files(struct pci_dev *pdev); -void pci_remove_sysfs_dev_files(struct pci_dev *pdev); void pci_cleanup_rom(struct pci_dev *dev); #ifdef CONFIG_DMI extern const struct attribute_group pci_dev_smbios_attr_group; @@ -152,7 +150,7 @@ static inline int pci_proc_detach_bus(struct pci_bus *bus) { return 0; } /* Functions for PCI Hotplug drivers to use */ int pci_hp_add_bridge(struct pci_dev *dev); -#ifdef HAVE_PCI_LEGACY +#if defined(CONFIG_SYSFS) && defined(HAVE_PCI_LEGACY) void pci_create_legacy_files(struct pci_bus *bus); void pci_remove_legacy_files(struct pci_bus *bus); #else @@ -185,10 +183,22 @@ static inline int pci_no_d1d2(struct pci_dev *dev) return (dev->no_d1d2 || parent_dstates); } + +#ifdef CONFIG_SYSFS +int pci_create_sysfs_dev_files(struct pci_dev *pdev); +void pci_remove_sysfs_dev_files(struct pci_dev *pdev); extern const struct attribute_group *pci_dev_groups[]; +extern const struct attribute_group *pci_dev_attr_groups[]; extern const struct attribute_group *pcibus_groups[]; -extern const struct device_type pci_dev_type; extern const struct attribute_group *pci_bus_groups[]; +#else +static inline int pci_create_sysfs_dev_files(struct pci_dev *pdev) { return 0; } +static inline void pci_remove_sysfs_dev_files(struct pci_dev *pdev) { } +#define pci_dev_groups NULL +#define pci_dev_attr_groups NULL +#define pcibus_groups NULL +#define pci_bus_groups NULL +#endif extern unsigned long pci_hotplug_io_size; extern unsigned long pci_hotplug_mmio_size; diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index b7335be56008f7..c1496e683e7005 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2357,6 +2357,10 @@ static void pci_release_dev(struct device *dev) kfree(pci_dev); } +static const struct device_type pci_dev_type = { + .groups = pci_dev_attr_groups, +}; + struct pci_dev *pci_alloc_dev(struct pci_bus *bus) { struct pci_dev *dev; From 63d46932ef2167de293ba47f0f520eccf3423335 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Mon, 30 Oct 2023 13:33:13 +0100 Subject: [PATCH 0259/1406] PCI: Remove obsolete pci_cleanup_rom() declaration Commit d9c8bea179a6 ("PCI: Remove unused IORESOURCE_ROM_COPY and IORESOURCE_ROM_BIOS_COPY") removed pci_cleanup_rom(), but retained its declaration in pci.h. Remove it. 
Link: https://lore.kernel.org/r/fc30de5276e21d5a3ebcb7e58a8b43e399f7e6e6.1698668982.git.lukas@wunner.de Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas Reviewed-by: Alistair Francis --- drivers/pci/pci.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 74d7f66b64a875..9e32227008d52c 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -31,7 +31,6 @@ bool pcie_cap_has_rtctl(const struct pci_dev *dev); /* Functions internal to the PCI core code */ -void pci_cleanup_rom(struct pci_dev *dev); #ifdef CONFIG_DMI extern const struct attribute_group pci_dev_smbios_attr_group; #endif From f1d1f00279f6302e8a5bf6fd996a84e91cc9c69e Mon Sep 17 00:00:00 2001 From: Vincenzo Mezzela Date: Thu, 8 Feb 2024 17:20:32 +0100 Subject: [PATCH 0260/1406] docs: filesystems: fix typo in docs This patch resolves a spelling error in the filesystem documentation. It is submitted as part of my application to the "Linux Kernel Bug Fixing Spring Unpaid 2024" mentorship program of the Linux Kernel Foundation. Signed-off-by: Vincenzo Mezzela Link: https://lore.kernel.org/r/20240208162032.109184-1-vincenzo.mezzela@gmail.com Signed-off-by: Christian Brauner --- Documentation/filesystems/files.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/filesystems/files.rst b/Documentation/filesystems/files.rst index 9e38e4c221ca5d..eb770f891b275f 100644 --- a/Documentation/filesystems/files.rst +++ b/Documentation/filesystems/files.rst @@ -116,7 +116,7 @@ before and after the reference count increment. This pattern can be seen in get_file_rcu() and __files_get_rcu(). In addition, it isn't possible to access or check fields in struct file -without first aqcuiring a reference on it under rcu lookup. Not doing +without first acquiring a reference on it under rcu lookup. Not doing that was always very dodgy and it was only usable for non-pointer data in struct file. With SLAB_TYPESAFE_BY_RCU it is necessary that callers either first acquire a reference or they must hold the files_lock of the From f36e87456bf6aa2f1a660c7429bb25f56fbe61bf Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 9 Feb 2024 09:08:06 -0500 Subject: [PATCH 0261/1406] Bluetooth: hci_conn: Fix UAF Write in __hci_acl_create_connection_sync This fixes the UAF on __hci_acl_create_connection_sync caused by connection abortion. It uses the same logic as LE_LINK, which uses hci_cmd_sync_cancel to prevent the callback from running if the connection is aborted prematurely.
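Concretely, the callback no longer captures a raw hci_conn pointer; it is queued with the connection handle and re-validates that handle when the deferred work runs. A simplified sketch of the pattern, for illustration only (names taken from the diff below, body abbreviated):

    /*
     * Look the connection up again by handle at execution time: if the
     * connection was aborted and freed while the request sat on the sync
     * queue, the lookup fails and the callback becomes a no-op instead
     * of writing through a dangling pointer.
     */
    static int hci_acl_create_conn_sync(struct hci_dev *hdev, void *data)
    {
            u16 handle = PTR_UINT(data);
            struct hci_conn *conn;

            conn = hci_conn_hash_lookup_handle(hdev, handle);
            if (!conn)
                    return 0;

            /* ... build and send the HCI Create Connection command ... */
            return 0;
    }
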
Reported-by: syzbot+3f0a39be7a2035700868@syzkaller.appspotmail.com Fixes: 456561ba8e49 ("Bluetooth: hci_conn: Only do ACL connections sequentially") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_sync.h | 3 +-- net/bluetooth/hci_conn.c | 3 ++- net/bluetooth/hci_sync.c | 16 ++++++++++------ 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index 824660f8f30da6..ed334c253ebcd9 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -139,5 +139,4 @@ int hci_le_big_terminate_sync(struct hci_dev *hdev, u8 handle); int hci_le_pa_terminate_sync(struct hci_dev *hdev, u16 handle); -int hci_acl_create_connection_sync(struct hci_dev *hdev, - struct hci_conn *conn); +int hci_connect_acl_sync(struct hci_dev *hdev, struct hci_conn *conn); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 8164502234c555..587eb27f374c98 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1645,7 +1645,7 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, acl->auth_type = auth_type; acl->conn_timeout = timeout; - err = hci_acl_create_connection_sync(hdev, acl); + err = hci_connect_acl_sync(hdev, acl); if (err) { hci_conn_del(acl); return ERR_PTR(err); @@ -2942,6 +2942,7 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason) */ if (conn->state == BT_CONNECT && hdev->req_status == HCI_REQ_PEND) { switch (hci_skb_event(hdev->sent_cmd)) { + case HCI_EV_CONN_COMPLETE: case HCI_EV_LE_CONN_COMPLETE: case HCI_EV_LE_ENHANCED_CONN_COMPLETE: case HCI_EVT_LE_CIS_ESTABLISHED: diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 788a889210d868..e1fdcb3c270625 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -6493,13 +6493,18 @@ int hci_update_adv_data(struct hci_dev *hdev, u8 instance) UINT_PTR(instance), NULL); } -static int __hci_acl_create_connection_sync(struct hci_dev *hdev, void *data) +static int hci_acl_create_conn_sync(struct hci_dev *hdev, void *data) { - struct hci_conn *conn = data; + struct hci_conn *conn; + u16 handle = PTR_UINT(data); struct inquiry_entry *ie; struct hci_cp_create_conn cp; int err; + conn = hci_conn_hash_lookup_handle(hdev, handle); + if (!conn) + return 0; + /* Many controllers disallow HCI Create Connection while it is doing * HCI Inquiry. So we cancel the Inquiry first before issuing HCI Create * Connection. This may cause the MGMT discovering state to become false @@ -6556,9 +6561,8 @@ static int __hci_acl_create_connection_sync(struct hci_dev *hdev, void *data) return err; } -int hci_acl_create_connection_sync(struct hci_dev *hdev, - struct hci_conn *conn) +int hci_connect_acl_sync(struct hci_dev *hdev, struct hci_conn *conn) { - return hci_cmd_sync_queue(hdev, __hci_acl_create_connection_sync, - conn, NULL); + return hci_cmd_sync_queue(hdev, hci_acl_create_conn_sync, + UINT_PTR(conn->handle), NULL); } From b042f70ea0cd647073acb7738b29c21cec1dfbba Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 8 Feb 2024 18:47:35 +0100 Subject: [PATCH 0262/1406] bdev: open block device as files Add two new helpers to allow opening block devices as files. This is not the final infrastructure. This still opens the block device before opening a struct file. Until we have removed all references to struct bdev_handle we can't switch the order: * Introduce blk_to_file_flags() to translate from block-specific flags to flags usable to open a new file.
* Introduce bdev_file_open_by_{dev,path}(). * Introduce temporary sb_bdev_handle() helper to retrieve a struct bdev_handle from a block device file and update places that directly reference struct bdev_handle to rely on it. * Don't count block device opens against the number of open files. A bdev_file_open_by_{dev,path}() file is never installed into any file descriptor table. One idea that came to mind was to use kernel_tmpfile_open(), which would require us to pass a path and it would then call do_dentry_open() going through the regular fops->open::blkdev_open() path. But then we're back to the problem of routing block-specific flags such as BLK_OPEN_RESTRICT_WRITES through the open path and would have to waste FMODE_* flags every time we add a new one. With this we can avoid using a flag bit and we have more leeway in how we open block devices from bdev_open_by_{dev,path}(). Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-1-adbd023e19cc@kernel.org Signed-off-by: Christian Brauner --- block/bdev.c | 101 +++++++++++++++++++++++++++++++++++++++-- fs/cramfs/inode.c | 2 +- fs/f2fs/super.c | 2 +- fs/jfs/jfs_logmgr.c | 2 +- fs/romfs/super.c | 2 +- fs/super.c | 18 ++++---- fs/xfs/xfs_super.c | 2 +- include/linux/blkdev.h | 7 +++ include/linux/fs.h | 10 +++- 9 files changed, 126 insertions(+), 20 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index e9f1b12bd75c7b..e1149652c53285 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -49,6 +49,13 @@ struct block_device *I_BDEV(struct inode *inode) } EXPORT_SYMBOL(I_BDEV); +struct block_device *file_bdev(struct file *bdev_file) +{ + struct bdev_handle *handle = bdev_file->private_data; + return handle->bdev; +} +EXPORT_SYMBOL(file_bdev); + static void bdev_write_inode(struct block_device *bdev) { struct inode *inode = bdev->bd_inode; @@ -368,12 +375,12 @@ static struct file_system_type bd_type = { }; struct super_block *blockdev_superblock __ro_after_init; +struct vfsmount *blockdev_mnt __ro_after_init; EXPORT_SYMBOL_GPL(blockdev_superblock); void __init bdev_cache_init(void) { int err; - static struct vfsmount *bd_mnt __ro_after_init; bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| @@ -382,10 +389,10 @@ void __init bdev_cache_init(void) err = register_filesystem(&bd_type); if (err) panic("Cannot register bdev pseudo-fs"); - bd_mnt = kern_mount(&bd_type); - if (IS_ERR(bd_mnt)) + blockdev_mnt = kern_mount(&bd_type); + if (IS_ERR(blockdev_mnt)) panic("Cannot create bdev pseudo-fs"); - blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ + blockdev_superblock = blockdev_mnt->mnt_sb; /* For writeback */ } struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) @@ -911,6 +918,92 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, } EXPORT_SYMBOL(bdev_open_by_dev); +/* + * If BLK_OPEN_WRITE_IOCTL is set then this is a historical quirk + * associated with the floppy driver where it has allowed ioctls if the + * file was opened for writing, but does not allow reads or writes. + * Make sure that this quirk is reflected in @f_flags. + * + * It can also happen if a block device is opened as O_RDWR | O_WRONLY.
+ */ +static unsigned blk_to_file_flags(blk_mode_t mode) +{ + unsigned int flags = 0; + + if ((mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) == + (BLK_OPEN_READ | BLK_OPEN_WRITE)) + flags |= O_RDWR; + else if (mode & BLK_OPEN_WRITE_IOCTL) + flags |= O_RDWR | O_WRONLY; + else if (mode & BLK_OPEN_WRITE) + flags |= O_WRONLY; + else if (mode & BLK_OPEN_READ) + flags |= O_RDONLY; /* homeopathic, because O_RDONLY is 0 */ + else + WARN_ON_ONCE(true); + + if (mode & BLK_OPEN_NDELAY) + flags |= O_NDELAY; + + return flags; +} + +struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, + const struct blk_holder_ops *hops) +{ + struct file *bdev_file; + struct bdev_handle *handle; + unsigned int flags; + + handle = bdev_open_by_dev(dev, mode, holder, hops); + if (IS_ERR(handle)) + return ERR_CAST(handle); + + flags = blk_to_file_flags(mode); + bdev_file = alloc_file_pseudo_noaccount(handle->bdev->bd_inode, + blockdev_mnt, "", flags | O_LARGEFILE, &def_blk_fops); + if (IS_ERR(bdev_file)) { + bdev_release(handle); + return bdev_file; + } + ihold(handle->bdev->bd_inode); + + bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; + if (bdev_nowait(handle->bdev)) + bdev_file->f_mode |= FMODE_NOWAIT; + + bdev_file->f_mapping = handle->bdev->bd_inode->i_mapping; + bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping); + bdev_file->private_data = handle; + return bdev_file; +} +EXPORT_SYMBOL(bdev_file_open_by_dev); + +struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, + void *holder, + const struct blk_holder_ops *hops) +{ + struct file *bdev_file; + dev_t dev; + int error; + + error = lookup_bdev(path, &dev); + if (error) + return ERR_PTR(error); + + bdev_file = bdev_file_open_by_dev(dev, mode, holder, hops); + if (!IS_ERR(bdev_file) && (mode & BLK_OPEN_WRITE)) { + struct bdev_handle *handle = bdev_file->private_data; + if (bdev_read_only(handle->bdev)) { + fput(bdev_file); + bdev_file = ERR_PTR(-EACCES); + } + } + + return bdev_file; +} +EXPORT_SYMBOL(bdev_file_open_by_path); + /** * bdev_open_by_path - open a block device by name * @path: path to the block device to open diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 60dbfa0f880514..39e75131fd5aa0 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -495,7 +495,7 @@ static void cramfs_kill_sb(struct super_block *sb) sb->s_mtd = NULL; } else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) { sync_blockdev(sb->s_bdev); - bdev_release(sb->s_bdev_handle); + fput(sb->s_bdev_file); } kfree(sbi); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d45ab0992ae594..ea94c148fee566 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4247,7 +4247,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) for (i = 0; i < max_devices; i++) { if (i == 0) - FDEV(0).bdev_handle = sbi->sb->s_bdev_handle; + FDEV(0).bdev_handle = sb_bdev_handle(sbi->sb); else if (!RDEV(i).path[0]) break; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index cb6d1fda66a702..8691463956d17a 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1162,7 +1162,7 @@ static int open_inline_log(struct super_block *sb) init_waitqueue_head(&log->syncwait); set_bit(log_INLINELOG, &log->flag); - log->bdev_handle = sb->s_bdev_handle; + log->bdev_handle = sb_bdev_handle(sb); log->base = addressPXD(&JFS_SBI(sb)->logpxd); log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >> (L2LOGPSIZE - sb->s_blocksize_bits); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 545ad44f96b891..1ed468c035579e 100644 --- a/fs/romfs/super.c 
+++ b/fs/romfs/super.c @@ -594,7 +594,7 @@ static void romfs_kill_sb(struct super_block *sb) #ifdef CONFIG_ROMFS_ON_BLOCK if (sb->s_bdev) { sync_blockdev(sb->s_bdev); - bdev_release(sb->s_bdev_handle); + fput(sb->s_bdev_file); } #endif } diff --git a/fs/super.c b/fs/super.c index d35e852954892d..08dcc3371aa09e 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1532,16 +1532,16 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, struct fs_context *fc) { blk_mode_t mode = sb_open_mode(sb_flags); - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct block_device *bdev; - bdev_handle = bdev_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); - if (IS_ERR(bdev_handle)) { + bdev_file = bdev_file_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); + if (IS_ERR(bdev_file)) { if (fc) errorf(fc, "%s: Can't open blockdev", fc->source); - return PTR_ERR(bdev_handle); + return PTR_ERR(bdev_file); } - bdev = bdev_handle->bdev; + bdev = file_bdev(bdev_file); /* * This really should be in blkdev_get_by_dev, but right now can't due @@ -1549,7 +1549,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, * writable from userspace even for a read-only block device. */ if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { - bdev_release(bdev_handle); + fput(bdev_file); return -EACCES; } @@ -1560,11 +1560,11 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, if (atomic_read(&bdev->bd_fsfreeze_count) > 0) { if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); - bdev_release(bdev_handle); + fput(bdev_file); return -EBUSY; } spin_lock(&sb_lock); - sb->s_bdev_handle = bdev_handle; + sb->s_bdev_file = bdev_file; sb->s_bdev = bdev; sb->s_bdi = bdi_get(bdev->bd_disk->bdi); if (bdev_stable_writes(bdev)) @@ -1680,7 +1680,7 @@ void kill_block_super(struct super_block *sb) generic_shutdown_super(sb); if (bdev) { sync_blockdev(bdev); - bdev_release(sb->s_bdev_handle); + fput(sb->s_bdev_file); } } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index aff20ddd4a9f9c..e5ac0e59ede9de 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -467,7 +467,7 @@ xfs_open_devices( * Setup xfs_mount buffer target pointers */ error = -ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_handle); + mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb_bdev_handle(sb)); if (!mp->m_ddev_targp) goto out_close_rtdev; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99e4f5e722132c..76706aa473163d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -24,6 +24,7 @@ #include #include #include +#include struct module; struct request_queue; @@ -1474,6 +1475,7 @@ extern const struct blk_holder_ops fs_holder_ops; (BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES | \ (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE)) +/* @bdev_handle will be removed soon. 
*/ struct bdev_handle { struct block_device *bdev; void *holder; @@ -1484,6 +1486,10 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); +struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, + const struct blk_holder_ops *hops); +struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, + void *holder, const struct blk_holder_ops *hops); int bd_prepare_to_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops); void bd_abort_claiming(struct block_device *bdev, void *holder); @@ -1494,6 +1500,7 @@ struct block_device *blkdev_get_no_open(dev_t dev); void blkdev_put_no_open(struct block_device *bdev); struct block_device *I_BDEV(struct inode *inode); +struct block_device *file_bdev(struct file *bdev_file); #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a7049512..e9291e27cc47f3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1228,8 +1228,8 @@ struct super_block { #endif struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ - struct block_device *s_bdev; - struct bdev_handle *s_bdev_handle; + struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ + struct file *s_bdev_file; struct backing_dev_info *s_bdi; struct mtd_info *s_mtd; struct hlist_node s_instances; @@ -1327,6 +1327,12 @@ struct super_block { struct list_head s_inodes_wb; /* writeback inodes */ } __randomize_layout; +/* Temporary helper that will go away. */ +static inline struct bdev_handle *sb_bdev_handle(struct super_block *sb) +{ + return sb->s_bdev_file->private_data; +} + static inline struct user_namespace *i_user_ns(const struct inode *inode) { return inode->i_sb->s_user_ns; From a79d8b14fb0a922fa8489aa9e2beea4253461030 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:19 +0100 Subject: [PATCH 0263/1406] block/ioctl: port blkdev_bszset() to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-2-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- block/ioctl.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index 9c73a763ef8838..5d0619e02e4ca9 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -471,7 +471,7 @@ static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode, int __user *argp) { int ret, n; - struct bdev_handle *handle; + struct file *file; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -483,12 +483,11 @@ static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode, if (mode & BLK_OPEN_EXCL) return set_blocksize(bdev, n); - handle = bdev_open_by_dev(bdev->bd_dev, mode, &bdev, NULL); - if (IS_ERR(handle)) + file = bdev_file_open_by_dev(bdev->bd_dev, mode, &bdev, NULL); + if (IS_ERR(file)) return -EBUSY; ret = set_blocksize(bdev, n); - bdev_release(handle); - + fput(file); return ret; } From 1fdcada5524609e916cf37f7a0c9a8cdc8670ddf Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:20 +0100 Subject: [PATCH 0264/1406] block/genhd: port disk_scan_partitions() to file This may run from a kernel thread via device_add_disk(). 
So this could also use __fput_sync() if we were worried about EBUSY. But when it is called from a kernel thread it's always BLK_OPEN_READ so EBUSY can't really happen even if we do BLK_OPEN_RESTRICT_WRITES or BLK_OPEN_EXCL. Otherwise it's called from an ioctl on the block device which is only called from userspace and can rely on task work. Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-3-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- block/genhd.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index d74fb5b4ae6818..a911d2969c0700 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -342,7 +342,7 @@ EXPORT_SYMBOL_GPL(disk_uevent); int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) { - struct bdev_handle *handle; + struct file *file; int ret = 0; if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) @@ -366,12 +366,12 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) } set_bit(GD_NEED_PART_SCAN, &disk->state); - handle = bdev_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL, - NULL); - if (IS_ERR(handle)) - ret = PTR_ERR(handle); + file = bdev_file_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, + NULL, NULL); + if (IS_ERR(file)) + ret = PTR_ERR(file); else - bdev_release(handle); + fput(file); /* * If blkdev_get_by_dev() failed early, GD_NEED_PART_SCAN is still set, From 696a597871ee22d4b8fad9a737b2652f737cfec6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:21 +0100 Subject: [PATCH 0265/1406] md: port block device access to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-4-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- drivers/md/dm.c | 23 +++++++++++++---------- drivers/md/md.c | 12 ++++++------ drivers/md/md.h | 2 +- include/linux/device-mapper.h | 2 +- 4 files changed, 21 insertions(+), 18 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8dcabf84d866e6..87de5b5682ade5 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -726,7 +726,8 @@ static struct table_device *open_table_device(struct mapped_device *md, dev_t dev, blk_mode_t mode) { struct table_device *td; - struct bdev_handle *bdev_handle; + struct file *bdev_file; + struct block_device *bdev; u64 part_off; int r; @@ -735,34 +736,36 @@ static struct table_device *open_table_device(struct mapped_device *md, return ERR_PTR(-ENOMEM); refcount_set(&td->count, 1); - bdev_handle = bdev_open_by_dev(dev, mode, _dm_claim_ptr, NULL); - if (IS_ERR(bdev_handle)) { - r = PTR_ERR(bdev_handle); + bdev_file = bdev_file_open_by_dev(dev, mode, _dm_claim_ptr, NULL); + if (IS_ERR(bdev_file)) { + r = PTR_ERR(bdev_file); goto out_free_td; } + bdev = file_bdev(bdev_file); + /* * We can be called before the dm disk is added. In that case we can't * register the holder relation here. It will be done once add_disk was * called. 
*/ if (md->disk->slave_dir) { - r = bd_link_disk_holder(bdev_handle->bdev, md->disk); + r = bd_link_disk_holder(bdev, md->disk); if (r) goto out_blkdev_put; } td->dm_dev.mode = mode; - td->dm_dev.bdev = bdev_handle->bdev; - td->dm_dev.bdev_handle = bdev_handle; - td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev, &part_off, + td->dm_dev.bdev = bdev; + td->dm_dev.bdev_file = bdev_file; + td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL); format_dev_t(td->dm_dev.name, dev); list_add(&td->list, &md->table_devices); return td; out_blkdev_put: - bdev_release(bdev_handle); + fput(bdev_file); out_free_td: kfree(td); return ERR_PTR(r); @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md { if (md->disk->slave_dir) bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); - bdev_release(td->dm_dev.bdev_handle); + fput(td->dm_dev.bdev_file); put_dax(td->dm_dev.dax_dev); list_del(&td->list); kfree(td); diff --git a/drivers/md/md.c b/drivers/md/md.c index 2266358d807466..0653584db63b38 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2578,7 +2578,7 @@ static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) if (test_bit(AutoDetected, &rdev->flags)) md_autodetect_dev(rdev->bdev->bd_dev); #endif - bdev_release(rdev->bdev_handle); + fput(rdev->bdev_file); rdev->bdev = NULL; kobject_put(&rdev->kobj); } @@ -3773,16 +3773,16 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe if (err) goto out_clear_rdev; - rdev->bdev_handle = bdev_open_by_dev(newdev, + rdev->bdev_file = bdev_file_open_by_dev(newdev, BLK_OPEN_READ | BLK_OPEN_WRITE, super_format == -2 ? &claim_rdev : rdev, NULL); - if (IS_ERR(rdev->bdev_handle)) { + if (IS_ERR(rdev->bdev_file)) { pr_warn("md: could not open device unknown-block(%u,%u).\n", MAJOR(newdev), MINOR(newdev)); - err = PTR_ERR(rdev->bdev_handle); + err = PTR_ERR(rdev->bdev_file); goto out_clear_rdev; } - rdev->bdev = rdev->bdev_handle->bdev; + rdev->bdev = file_bdev(rdev->bdev_file); kobject_init(&rdev->kobj, &rdev_ktype); @@ -3813,7 +3813,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe return rdev; out_blkdev_put: - bdev_release(rdev->bdev_handle); + fput(rdev->bdev_file); out_clear_rdev: md_rdev_clear(rdev); out_free_rdev: diff --git a/drivers/md/md.h b/drivers/md/md.h index 8d881cc597992f..a079ee9b619044 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -59,7 +59,7 @@ struct md_rdev { */ struct block_device *meta_bdev; struct block_device *bdev; /* block device handle */ - struct bdev_handle *bdev_handle; /* Handle from open for bdev */ + struct file *bdev_file; /* Handle from open for bdev */ struct page *sb_page, *bb_page; int sb_loaded; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 772ab4d74d944b..82b2195efaca78 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -165,7 +165,7 @@ void dm_error(const char *message); struct dm_dev { struct block_device *bdev; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct dax_device *dax_dev; blk_mode_t mode; char name[16]; From fcac756d5d4c4fa59030093ced4c7508686415d3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:22 +0100 Subject: [PATCH 0266/1406] swap: port block device usage to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-5-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- 
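The conversions in this patch and the rest of the series follow the same mechanical pattern, so a short sketch of that pattern may help review. It is illustrative only and not part of the patch: example_attach() and example_detach() are made-up names, while bdev_file_open_by_dev(), file_bdev() and fput() are the helpers the series builds on.

#include <linux/blkdev.h>
#include <linux/err.h>
#include <linux/file.h>

/*
 * Open a block device and return the struct file representing the open.
 * Callers keep only the file and derive the block_device from it,
 * instead of storing a separate bdev_handle.
 */
static struct file *example_attach(dev_t devt, void *holder)
{
	struct file *bdev_file;

	/* Returns a struct file or an ERR_PTR(), never NULL. */
	bdev_file = bdev_file_open_by_dev(devt, BLK_OPEN_READ | BLK_OPEN_WRITE,
					  holder, NULL);
	if (IS_ERR(bdev_file))
		return bdev_file;

	/* file_bdev() replaces the old bdev_handle->bdev dereference. */
	pr_info("attached %pg\n", file_bdev(bdev_file));
	return bdev_file;
}

/* The matching release is a plain fput() instead of bdev_release(). */
static void example_detach(struct file *bdev_file)
{
	fput(bdev_file);
}

A side benefit of holding a struct file is that the device's page cache is reachable as bdev_file->f_mapping rather than through bdev->bd_inode->i_mapping, which the xen and block2mtd conversions later in the series make use of.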
include/linux/swap.h | 2 +- mm/swapfile.c | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 4db00ddad26169..e5b82bc05e60db 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -298,7 +298,7 @@ struct swap_info_struct { unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocation */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ - struct bdev_handle *bdev_handle;/* open handle of the bdev */ + struct file *bdev_file; /* open handle of the bdev */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ unsigned int old_block_size; /* seldom referenced */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 556ff7347d5f04..73edd6fed6a2ed 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2532,10 +2532,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) exit_swap_address_space(p->type); inode = mapping->host; - if (p->bdev_handle) { + if (p->bdev_file) { set_blocksize(p->bdev, old_block_size); - bdev_release(p->bdev_handle); - p->bdev_handle = NULL; + fput(p->bdev_file); + p->bdev_file = NULL; } inode_lock(inode); @@ -2765,14 +2765,14 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) int error; if (S_ISBLK(inode->i_mode)) { - p->bdev_handle = bdev_open_by_dev(inode->i_rdev, + p->bdev_file = bdev_file_open_by_dev(inode->i_rdev, BLK_OPEN_READ | BLK_OPEN_WRITE, p, NULL); - if (IS_ERR(p->bdev_handle)) { - error = PTR_ERR(p->bdev_handle); - p->bdev_handle = NULL; + if (IS_ERR(p->bdev_file)) { + error = PTR_ERR(p->bdev_file); + p->bdev_file = NULL; return error; } - p->bdev = p->bdev_handle->bdev; + p->bdev = file_bdev(p->bdev_file); p->old_block_size = block_size(p->bdev); error = set_blocksize(p->bdev, PAGE_SIZE); if (error < 0) @@ -3208,10 +3208,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->percpu_cluster = NULL; free_percpu(p->cluster_next_cpu); p->cluster_next_cpu = NULL; - if (p->bdev_handle) { + if (p->bdev_file) { set_blocksize(p->bdev, p->old_block_size); - bdev_release(p->bdev_handle); - p->bdev_handle = NULL; + fput(p->bdev_file); + p->bdev_file = NULL; } inode = NULL; destroy_swap_extents(p); From 24f1ef2ea9da41b99f4b62f75ac48bc3344dbf11 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:23 +0100 Subject: [PATCH 0267/1406] power: port block device access to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-6-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- kernel/power/swap.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 6053ddddaf6540..692f12fe60c130 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -222,7 +222,7 @@ int swsusp_swap_in_use(void) */ static unsigned short root_swap = 0xffff; -static struct bdev_handle *hib_resume_bdev_handle; +static struct file *hib_resume_bdev_file; struct hib_bio_batch { atomic_t count; @@ -276,7 +276,7 @@ static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr, struct bio *bio; int error = 0; - bio = bio_alloc(hib_resume_bdev_handle->bdev, 1, opf, + bio = bio_alloc(file_bdev(hib_resume_bdev_file), 1, opf, GFP_NOIO | __GFP_HIGH); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); 
@@ -357,14 +357,14 @@ static int swsusp_swap_check(void) return res; root_swap = res; - hib_resume_bdev_handle = bdev_open_by_dev(swsusp_resume_device, + hib_resume_bdev_file = bdev_file_open_by_dev(swsusp_resume_device, BLK_OPEN_WRITE, NULL, NULL); - if (IS_ERR(hib_resume_bdev_handle)) - return PTR_ERR(hib_resume_bdev_handle); + if (IS_ERR(hib_resume_bdev_file)) + return PTR_ERR(hib_resume_bdev_file); - res = set_blocksize(hib_resume_bdev_handle->bdev, PAGE_SIZE); + res = set_blocksize(file_bdev(hib_resume_bdev_file), PAGE_SIZE); if (res < 0) - bdev_release(hib_resume_bdev_handle); + fput(hib_resume_bdev_file); return res; } @@ -1523,10 +1523,10 @@ int swsusp_check(bool exclusive) void *holder = exclusive ? &swsusp_holder : NULL; int error; - hib_resume_bdev_handle = bdev_open_by_dev(swsusp_resume_device, + hib_resume_bdev_file = bdev_file_open_by_dev(swsusp_resume_device, BLK_OPEN_READ, holder, NULL); - if (!IS_ERR(hib_resume_bdev_handle)) { - set_blocksize(hib_resume_bdev_handle->bdev, PAGE_SIZE); + if (!IS_ERR(hib_resume_bdev_file)) { + set_blocksize(file_bdev(hib_resume_bdev_file), PAGE_SIZE); clear_page(swsusp_header); error = hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL); @@ -1551,11 +1551,11 @@ int swsusp_check(bool exclusive) put: if (error) - bdev_release(hib_resume_bdev_handle); + fput(hib_resume_bdev_file); else pr_debug("Image signature found, resuming\n"); } else { - error = PTR_ERR(hib_resume_bdev_handle); + error = PTR_ERR(hib_resume_bdev_file); } if (error) @@ -1570,12 +1570,12 @@ int swsusp_check(bool exclusive) void swsusp_close(void) { - if (IS_ERR(hib_resume_bdev_handle)) { + if (IS_ERR(hib_resume_bdev_file)) { pr_debug("Image device not initialised\n"); return; } - bdev_release(hib_resume_bdev_handle); + fput(hib_resume_bdev_file); } /** From fcac7a8246812f5f709cb52ba449926b7ec21ccc Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:24 +0100 Subject: [PATCH 0268/1406] xfs: port block device access to files Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-7-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/xfs/xfs_buf.c | 10 +++++----- fs/xfs/xfs_buf.h | 4 ++-- fs/xfs/xfs_super.c | 44 ++++++++++++++++++++++---------------------- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8e5bd50d29feb3..01b41fabbe3c7b 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1951,7 +1951,7 @@ xfs_free_buftarg( fs_put_dax(btp->bt_daxdev, btp->bt_mount); /* the main block device is closed by kill_block_super */ if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) - bdev_release(btp->bt_bdev_handle); + fput(btp->bt_bdev_file); kmem_free(btp); } @@ -1994,7 +1994,7 @@ xfs_setsize_buftarg_early( struct xfs_buftarg * xfs_alloc_buftarg( struct xfs_mount *mp, - struct bdev_handle *bdev_handle) + struct file *bdev_file) { xfs_buftarg_t *btp; const struct dax_holder_operations *ops = NULL; @@ -2005,9 +2005,9 @@ xfs_alloc_buftarg( btp = kmem_zalloc(sizeof(*btp), KM_NOFS); btp->bt_mount = mp; - btp->bt_bdev_handle = bdev_handle; - btp->bt_dev = bdev_handle->bdev->bd_dev; - btp->bt_bdev = bdev_handle->bdev; + btp->bt_bdev_file = bdev_file; + btp->bt_bdev = file_bdev(bdev_file); + btp->bt_dev = btp->bt_bdev->bd_dev; btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, mp, ops); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index b470de08a46ca8..304e858d04fb3c 100644 --- a/fs/xfs/xfs_buf.h +++ 
b/fs/xfs/xfs_buf.h @@ -98,7 +98,7 @@ typedef unsigned int xfs_buf_flags_t; */ typedef struct xfs_buftarg { dev_t bt_dev; - struct bdev_handle *bt_bdev_handle; + struct file *bt_bdev_file; struct block_device *bt_bdev; struct dax_device *bt_daxdev; u64 bt_dax_part_off; @@ -366,7 +366,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) * Handling of buftargs. */ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, - struct bdev_handle *bdev_handle); + struct file *bdev_file); extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index e5ac0e59ede9de..3814d737116914 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -362,16 +362,16 @@ STATIC int xfs_blkdev_get( xfs_mount_t *mp, const char *name, - struct bdev_handle **handlep) + struct file **bdev_filep) { int error = 0; - *handlep = bdev_open_by_path(name, + *bdev_filep = bdev_file_open_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, mp->m_super, &fs_holder_ops); - if (IS_ERR(*handlep)) { - error = PTR_ERR(*handlep); - *handlep = NULL; + if (IS_ERR(*bdev_filep)) { + error = PTR_ERR(*bdev_filep); + *bdev_filep = NULL; xfs_warn(mp, "Invalid device [%s], error=%d", name, error); } @@ -436,26 +436,26 @@ xfs_open_devices( { struct super_block *sb = mp->m_super; struct block_device *ddev = sb->s_bdev; - struct bdev_handle *logdev_handle = NULL, *rtdev_handle = NULL; + struct file *logdev_file = NULL, *rtdev_file = NULL; int error; /* * Open real time and log devices - order is important. */ if (mp->m_logname) { - error = xfs_blkdev_get(mp, mp->m_logname, &logdev_handle); + error = xfs_blkdev_get(mp, mp->m_logname, &logdev_file); if (error) return error; } if (mp->m_rtname) { - error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_handle); + error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_file); if (error) goto out_close_logdev; - if (rtdev_handle->bdev == ddev || - (logdev_handle && - rtdev_handle->bdev == logdev_handle->bdev)) { + if (file_bdev(rtdev_file) == ddev || + (logdev_file && + file_bdev(rtdev_file) == file_bdev(logdev_file))) { xfs_warn(mp, "Cannot mount filesystem with identical rtdev and ddev/logdev."); error = -EINVAL; @@ -467,25 +467,25 @@ xfs_open_devices( * Setup xfs_mount buffer target pointers */ error = -ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb_bdev_handle(sb)); + mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_file); if (!mp->m_ddev_targp) goto out_close_rtdev; - if (rtdev_handle) { - mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_handle); + if (rtdev_file) { + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_file); if (!mp->m_rtdev_targp) goto out_free_ddev_targ; } - if (logdev_handle && logdev_handle->bdev != ddev) { - mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_handle); + if (logdev_file && file_bdev(logdev_file) != ddev) { + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_file); if (!mp->m_logdev_targp) goto out_free_rtdev_targ; } else { mp->m_logdev_targp = mp->m_ddev_targp; /* Handle won't be used, drop it */ - if (logdev_handle) - bdev_release(logdev_handle); + if (logdev_file) + fput(logdev_file); } return 0; @@ -496,11 +496,11 @@ xfs_open_devices( out_free_ddev_targ: xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: - if (rtdev_handle) - bdev_release(rtdev_handle); + if (rtdev_file) + fput(rtdev_file); out_close_logdev: - if (logdev_handle) - bdev_release(logdev_handle); + if 
(logdev_file) + fput(logdev_file); return error; } From 5de06dae18c7f3c70b7b6ea108bd3e04ab68cdf5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:25 +0100 Subject: [PATCH 0269/1406] drbd: port block device access to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-8-adbd023e19cc@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- drivers/block/drbd/drbd_int.h | 4 +-- drivers/block/drbd/drbd_nl.c | 58 +++++++++++++++++------------------ 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index c21e3732759ec2..94dc0a235919d7 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -524,9 +524,9 @@ struct drbd_md { struct drbd_backing_dev { struct block_device *backing_bdev; - struct bdev_handle *backing_bdev_handle; + struct file *backing_bdev_file; struct block_device *md_bdev; - struct bdev_handle *md_bdev_handle; + struct file *f_md_bdev; struct drbd_md md; struct disk_conf *disk_conf; /* RCU, for updates: resource->conf_update */ sector_t known_size; /* last known size of that backing device */ diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 43747a1aae4353..6aed67278e8b2a 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1635,45 +1635,45 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) return 0; } -static struct bdev_handle *open_backing_dev(struct drbd_device *device, +static struct file *open_backing_dev(struct drbd_device *device, const char *bdev_path, void *claim_ptr, bool do_bd_link) { - struct bdev_handle *handle; + struct file *file; int err = 0; - handle = bdev_open_by_path(bdev_path, BLK_OPEN_READ | BLK_OPEN_WRITE, - claim_ptr, NULL); - if (IS_ERR(handle)) { + file = bdev_file_open_by_path(bdev_path, BLK_OPEN_READ | BLK_OPEN_WRITE, + claim_ptr, NULL); + if (IS_ERR(file)) { drbd_err(device, "open(\"%s\") failed with %ld\n", - bdev_path, PTR_ERR(handle)); - return handle; + bdev_path, PTR_ERR(file)); + return file; } if (!do_bd_link) - return handle; + return file; - err = bd_link_disk_holder(handle->bdev, device->vdisk); + err = bd_link_disk_holder(file_bdev(file), device->vdisk); if (err) { - bdev_release(handle); + fput(file); drbd_err(device, "bd_link_disk_holder(\"%s\", ...) failed with %d\n", bdev_path, err); - handle = ERR_PTR(err); + file = ERR_PTR(err); } - return handle; + return file; } static int open_backing_devices(struct drbd_device *device, struct disk_conf *new_disk_conf, struct drbd_backing_dev *nbc) { - struct bdev_handle *handle; + struct file *file; - handle = open_backing_dev(device, new_disk_conf->backing_dev, device, + file = open_backing_dev(device, new_disk_conf->backing_dev, device, true); - if (IS_ERR(handle)) + if (IS_ERR(file)) return ERR_OPEN_DISK; - nbc->backing_bdev = handle->bdev; - nbc->backing_bdev_handle = handle; + nbc->backing_bdev = file_bdev(file); + nbc->backing_bdev_file = file; /* * meta_dev_idx >= 0: external fixed size, possibly multiple @@ -1683,7 +1683,7 @@ static int open_backing_devices(struct drbd_device *device, * should check it for you already; but if you don't, or * someone fooled it, we need to double check here) */ - handle = open_backing_dev(device, new_disk_conf->meta_dev, + file = open_backing_dev(device, new_disk_conf->meta_dev, /* claim ptr: device, if claimed exclusively; shared drbd_m_holder, * if potentially shared with other drbd minors */ (new_disk_conf->meta_dev_idx < 0) ? 
(void*)device : (void*)drbd_m_holder, @@ -1691,21 +1691,21 @@ static int open_backing_devices(struct drbd_device *device, * as would happen with internal metadata. */ (new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_FLEX_INT && new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_INTERNAL)); - if (IS_ERR(handle)) + if (IS_ERR(file)) return ERR_OPEN_MD_DISK; - nbc->md_bdev = handle->bdev; - nbc->md_bdev_handle = handle; + nbc->md_bdev = file_bdev(file); + nbc->f_md_bdev = file; return NO_ERROR; } static void close_backing_dev(struct drbd_device *device, - struct bdev_handle *handle, bool do_bd_unlink) + struct file *bdev_file, bool do_bd_unlink) { - if (!handle) + if (!bdev_file) return; if (do_bd_unlink) - bd_unlink_disk_holder(handle->bdev, device->vdisk); - bdev_release(handle); + bd_unlink_disk_holder(file_bdev(bdev_file), device->vdisk); + fput(bdev_file); } void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev) @@ -1713,9 +1713,9 @@ void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev * if (ldev == NULL) return; - close_backing_dev(device, ldev->md_bdev_handle, + close_backing_dev(device, ldev->f_md_bdev, ldev->md_bdev != ldev->backing_bdev); - close_backing_dev(device, ldev->backing_bdev_handle, true); + close_backing_dev(device, ldev->backing_bdev_file, true); kfree(ldev->disk_conf); kfree(ldev); @@ -2131,9 +2131,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) fail: conn_reconfig_done(connection); if (nbc) { - close_backing_dev(device, nbc->md_bdev_handle, + close_backing_dev(device, nbc->f_md_bdev, nbc->md_bdev != nbc->backing_bdev); - close_backing_dev(device, nbc->backing_bdev_handle, true); + close_backing_dev(device, nbc->backing_bdev_file, true); kfree(nbc); } kfree(new_disk_conf); From d3afd6375cddd2236ac4c97a85bdec4c4d7022df Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:26 +0100 Subject: [PATCH 0270/1406] pktcdvd: port block device access to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-9-adbd023e19cc@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- drivers/block/pktcdvd.c | 68 ++++++++++++++++++++--------------------- include/linux/pktcdvd.h | 4 +-- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index d56d972aadb36f..c21444716e4382 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -340,8 +340,8 @@ static ssize_t device_map_show(const struct class *c, const struct class_attribu n += sysfs_emit_at(data, n, "%s %u:%u %u:%u\n", pd->disk->disk_name, MAJOR(pd->pkt_dev), MINOR(pd->pkt_dev), - MAJOR(pd->bdev_handle->bdev->bd_dev), - MINOR(pd->bdev_handle->bdev->bd_dev)); + MAJOR(file_bdev(pd->bdev_file)->bd_dev), + MINOR(file_bdev(pd->bdev_file)->bd_dev)); } mutex_unlock(&ctl_mutex); return n; @@ -438,7 +438,7 @@ static int pkt_seq_show(struct seq_file *m, void *p) int states[PACKET_NUM_STATES]; seq_printf(m, "Writer %s mapped to %pg:\n", pd->disk->disk_name, - pd->bdev_handle->bdev); + file_bdev(pd->bdev_file)); seq_printf(m, "\nSettings:\n"); seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2); @@ -715,7 +715,7 @@ static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *nod */ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *cgc) { - struct request_queue *q = bdev_get_queue(pd->bdev_handle->bdev); + struct request_queue *q = bdev_get_queue(file_bdev(pd->bdev_file)); struct scsi_cmnd *scmd; struct 
request *rq; int ret = 0; @@ -1048,7 +1048,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) continue; bio = pkt->r_bios[f]; - bio_init(bio, pd->bdev_handle->bdev, bio->bi_inline_vecs, 1, + bio_init(bio, file_bdev(pd->bdev_file), bio->bi_inline_vecs, 1, REQ_OP_READ); bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9); bio->bi_end_io = pkt_end_io_read; @@ -1264,7 +1264,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) struct device *ddev = disk_to_dev(pd->disk); int f; - bio_init(pkt->w_bio, pd->bdev_handle->bdev, pkt->w_bio->bi_inline_vecs, + bio_init(pkt->w_bio, file_bdev(pd->bdev_file), pkt->w_bio->bi_inline_vecs, pkt->frames, REQ_OP_WRITE); pkt->w_bio->bi_iter.bi_sector = pkt->sector; pkt->w_bio->bi_end_io = pkt_end_io_packet_write; @@ -2162,20 +2162,20 @@ static int pkt_open_dev(struct pktcdvd_device *pd, bool write) int ret; long lba; struct request_queue *q; - struct bdev_handle *bdev_handle; + struct file *bdev_file; /* * We need to re-open the cdrom device without O_NONBLOCK to be able * to read/write from/to it. It is already opened in O_NONBLOCK mode * so open should not fail. */ - bdev_handle = bdev_open_by_dev(pd->bdev_handle->bdev->bd_dev, + bdev_file = bdev_file_open_by_dev(file_bdev(pd->bdev_file)->bd_dev, BLK_OPEN_READ, pd, NULL); - if (IS_ERR(bdev_handle)) { - ret = PTR_ERR(bdev_handle); + if (IS_ERR(bdev_file)) { + ret = PTR_ERR(bdev_file); goto out; } - pd->open_bdev_handle = bdev_handle; + pd->f_open_bdev = bdev_file; ret = pkt_get_last_written(pd, &lba); if (ret) { @@ -2184,9 +2184,9 @@ static int pkt_open_dev(struct pktcdvd_device *pd, bool write) } set_capacity(pd->disk, lba << 2); - set_capacity_and_notify(pd->bdev_handle->bdev->bd_disk, lba << 2); + set_capacity_and_notify(file_bdev(pd->bdev_file)->bd_disk, lba << 2); - q = bdev_get_queue(pd->bdev_handle->bdev); + q = bdev_get_queue(file_bdev(pd->bdev_file)); if (write) { ret = pkt_open_write(pd); if (ret) @@ -2218,7 +2218,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, bool write) return 0; out_putdev: - bdev_release(bdev_handle); + fput(bdev_file); out: return ret; } @@ -2237,8 +2237,8 @@ static void pkt_release_dev(struct pktcdvd_device *pd, int flush) pkt_lock_door(pd, 0); pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); - bdev_release(pd->open_bdev_handle); - pd->open_bdev_handle = NULL; + fput(pd->f_open_bdev); + pd->f_open_bdev = NULL; pkt_shrink_pktlist(pd); } @@ -2326,7 +2326,7 @@ static void pkt_end_io_read_cloned(struct bio *bio) static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio) { - struct bio *cloned_bio = bio_alloc_clone(pd->bdev_handle->bdev, bio, + struct bio *cloned_bio = bio_alloc_clone(file_bdev(pd->bdev_file), bio, GFP_NOIO, &pkt_bio_set); struct packet_stacked_data *psd = mempool_alloc(&psd_pool, GFP_NOIO); @@ -2497,7 +2497,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) { struct device *ddev = disk_to_dev(pd->disk); int i; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct scsi_device *sdev; if (pd->pkt_dev == dev) { @@ -2508,9 +2508,9 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) struct pktcdvd_device *pd2 = pkt_devs[i]; if (!pd2) continue; - if (pd2->bdev_handle->bdev->bd_dev == dev) { + if (file_bdev(pd2->bdev_file)->bd_dev == dev) { dev_err(ddev, "%pg already setup\n", - pd2->bdev_handle->bdev); + file_bdev(pd2->bdev_file)); return -EBUSY; } if (pd2->pkt_dev == dev) { @@ -2519,13 +2519,13 @@ static int pkt_new_dev(struct 
pktcdvd_device *pd, dev_t dev) } } - bdev_handle = bdev_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_NDELAY, + bdev_file = bdev_file_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_NDELAY, NULL, NULL); - if (IS_ERR(bdev_handle)) - return PTR_ERR(bdev_handle); - sdev = scsi_device_from_queue(bdev_handle->bdev->bd_disk->queue); + if (IS_ERR(bdev_file)) + return PTR_ERR(bdev_file); + sdev = scsi_device_from_queue(file_bdev(bdev_file)->bd_disk->queue); if (!sdev) { - bdev_release(bdev_handle); + fput(bdev_file); return -EINVAL; } put_device(&sdev->sdev_gendev); @@ -2533,8 +2533,8 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) /* This is safe, since we have a reference from open(). */ __module_get(THIS_MODULE); - pd->bdev_handle = bdev_handle; - set_blocksize(bdev_handle->bdev, CD_FRAMESIZE); + pd->bdev_file = bdev_file; + set_blocksize(file_bdev(bdev_file), CD_FRAMESIZE); pkt_init_queue(pd); @@ -2546,11 +2546,11 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) } proc_create_single_data(pd->disk->disk_name, 0, pkt_proc, pkt_seq_show, pd); - dev_notice(ddev, "writer mapped to %pg\n", bdev_handle->bdev); + dev_notice(ddev, "writer mapped to %pg\n", file_bdev(bdev_file)); return 0; out_mem: - bdev_release(bdev_handle); + fput(bdev_file); /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); return -ENOMEM; @@ -2605,9 +2605,9 @@ static unsigned int pkt_check_events(struct gendisk *disk, if (!pd) return 0; - if (!pd->bdev_handle) + if (!pd->bdev_file) return 0; - attached_disk = pd->bdev_handle->bdev->bd_disk; + attached_disk = file_bdev(pd->bdev_file)->bd_disk; if (!attached_disk || !attached_disk->fops->check_events) return 0; return attached_disk->fops->check_events(attached_disk, clearing); @@ -2692,7 +2692,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) goto out_mem2; /* inherit events of the host device */ - disk->events = pd->bdev_handle->bdev->bd_disk->events; + disk->events = file_bdev(pd->bdev_file)->bd_disk->events; ret = add_disk(disk); if (ret) @@ -2757,7 +2757,7 @@ static int pkt_remove_dev(dev_t pkt_dev) pkt_debugfs_dev_remove(pd); pkt_sysfs_dev_remove(pd); - bdev_release(pd->bdev_handle); + fput(pd->bdev_file); remove_proc_entry(pd->disk->disk_name, pkt_proc); dev_notice(ddev, "writer unmapped\n"); @@ -2784,7 +2784,7 @@ static void pkt_get_status(struct pkt_ctrl_command *ctrl_cmd) pd = pkt_find_dev_from_minor(ctrl_cmd->dev_index); if (pd) { - ctrl_cmd->dev = new_encode_dev(pd->bdev_handle->bdev->bd_dev); + ctrl_cmd->dev = new_encode_dev(file_bdev(pd->bdev_file)->bd_dev); ctrl_cmd->pkt_dev = new_encode_dev(pd->pkt_dev); } else { ctrl_cmd->dev = 0; diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h index 79594aeb160daf..2f1b952d596aa4 100644 --- a/include/linux/pktcdvd.h +++ b/include/linux/pktcdvd.h @@ -154,9 +154,9 @@ struct packet_stacked_data struct pktcdvd_device { - struct bdev_handle *bdev_handle; /* dev attached */ + struct file *bdev_file; /* dev attached */ /* handle acquired for bdev during pkt_open_dev() */ - struct bdev_handle *open_bdev_handle; + struct file *f_open_bdev; dev_t pkt_dev; /* our dev */ struct packet_settings settings; struct packet_stats stats; From 734451e1cc5ecf577ec96f50923557c9497a841c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:27 +0100 Subject: [PATCH 0271/1406] rnbd: port block device access to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-10-adbd023e19cc@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- 
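As in the previous conversions, the part worth checking closely is the error path: bdev_file_open_by_path() returns a struct file or an ERR_PTR() and never NULL, and the ERR_PTR() value can be handed straight to %pe in the error message, as the hunks below do. A minimal sketch under those assumptions (srv_example_open() is a made-up name):

#include <linux/blkdev.h>
#include <linux/err.h>
#include <linux/file.h>

static struct file *srv_example_open(const char *path, void *holder)
{
	struct file *bdev_file;

	bdev_file = bdev_file_open_by_path(path,
					   BLK_OPEN_READ | BLK_OPEN_WRITE,
					   holder, NULL);
	if (IS_ERR(bdev_file)) {
		/* %pe prints the symbolic error of an ERR_PTR(), e.g. -ENOENT. */
		pr_err("open of '%s' failed, err: %pe\n", path, bdev_file);
		return bdev_file;
	}

	/* %pg prints the name of the underlying block device. */
	pr_info("opened %pg\n", file_bdev(bdev_file));
	return bdev_file;
}

The matching release is again just fput(bdev_file), which the rnbd_destroy_sess_dev() hunk below switches to.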
drivers/block/rnbd/rnbd-srv.c | 28 ++++++++++++++-------------- drivers/block/rnbd/rnbd-srv.h | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 3a0d5dcec6f255..f6e3a3c4b76cc4 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -145,7 +145,7 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, priv->sess_dev = sess_dev; priv->id = id; - bio = bio_alloc(sess_dev->bdev_handle->bdev, 1, + bio = bio_alloc(file_bdev(sess_dev->bdev_file), 1, rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL); if (bio_add_page(bio, virt_to_page(data), datalen, offset_in_page(data)) != datalen) { @@ -219,7 +219,7 @@ void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id) rnbd_put_sess_dev(sess_dev); wait_for_completion(&dc); /* wait for inflights to drop to zero */ - bdev_release(sess_dev->bdev_handle); + fput(sess_dev->bdev_file); mutex_lock(&sess_dev->dev->lock); list_del(&sess_dev->dev_list); if (!sess_dev->readonly) @@ -534,7 +534,7 @@ rnbd_srv_get_or_create_srv_dev(struct block_device *bdev, static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp, struct rnbd_srv_sess_dev *sess_dev) { - struct block_device *bdev = sess_dev->bdev_handle->bdev; + struct block_device *bdev = file_bdev(sess_dev->bdev_file); rsp->hdr.type = cpu_to_le16(RNBD_MSG_OPEN_RSP); rsp->device_id = cpu_to_le32(sess_dev->device_id); @@ -560,7 +560,7 @@ static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp, static struct rnbd_srv_sess_dev * rnbd_srv_create_set_sess_dev(struct rnbd_srv_session *srv_sess, const struct rnbd_msg_open *open_msg, - struct bdev_handle *handle, bool readonly, + struct file *bdev_file, bool readonly, struct rnbd_srv_dev *srv_dev) { struct rnbd_srv_sess_dev *sdev = rnbd_sess_dev_alloc(srv_sess); @@ -572,7 +572,7 @@ rnbd_srv_create_set_sess_dev(struct rnbd_srv_session *srv_sess, strscpy(sdev->pathname, open_msg->dev_name, sizeof(sdev->pathname)); - sdev->bdev_handle = handle; + sdev->bdev_file = bdev_file; sdev->sess = srv_sess; sdev->dev = srv_dev; sdev->readonly = readonly; @@ -678,7 +678,7 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, struct rnbd_srv_dev *srv_dev; struct rnbd_srv_sess_dev *srv_sess_dev; const struct rnbd_msg_open *open_msg = msg; - struct bdev_handle *bdev_handle; + struct file *bdev_file; blk_mode_t open_flags = BLK_OPEN_READ; char *full_path; struct rnbd_msg_open_rsp *rsp = data; @@ -716,15 +716,15 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, goto reject; } - bdev_handle = bdev_open_by_path(full_path, open_flags, NULL, NULL); - if (IS_ERR(bdev_handle)) { - ret = PTR_ERR(bdev_handle); + bdev_file = bdev_file_open_by_path(full_path, open_flags, NULL, NULL); + if (IS_ERR(bdev_file)) { + ret = PTR_ERR(bdev_file); pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %pe\n", - full_path, srv_sess->sessname, bdev_handle); + full_path, srv_sess->sessname, bdev_file); goto free_path; } - srv_dev = rnbd_srv_get_or_create_srv_dev(bdev_handle->bdev, srv_sess, + srv_dev = rnbd_srv_get_or_create_srv_dev(file_bdev(bdev_file), srv_sess, open_msg->access_mode); if (IS_ERR(srv_dev)) { pr_err("Opening device '%s' on session %s failed, creating srv_dev failed, err: %pe\n", @@ -734,7 +734,7 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, } srv_sess_dev = rnbd_srv_create_set_sess_dev(srv_sess, open_msg, - bdev_handle, + bdev_file, open_msg->access_mode 
== RNBD_ACCESS_RO, srv_dev); if (IS_ERR(srv_sess_dev)) { @@ -750,7 +750,7 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, */ mutex_lock(&srv_dev->lock); if (!srv_dev->dev_kobj.state_in_sysfs) { - ret = rnbd_srv_create_dev_sysfs(srv_dev, bdev_handle->bdev); + ret = rnbd_srv_create_dev_sysfs(srv_dev, file_bdev(bdev_file)); if (ret) { mutex_unlock(&srv_dev->lock); rnbd_srv_err(srv_sess_dev, @@ -793,7 +793,7 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, } rnbd_put_srv_dev(srv_dev); blkdev_put: - bdev_release(bdev_handle); + fput(bdev_file); free_path: kfree(full_path); reject: diff --git a/drivers/block/rnbd/rnbd-srv.h b/drivers/block/rnbd/rnbd-srv.h index 343cc682b617b4..18d873808b8d83 100644 --- a/drivers/block/rnbd/rnbd-srv.h +++ b/drivers/block/rnbd/rnbd-srv.h @@ -46,7 +46,7 @@ struct rnbd_srv_dev { struct rnbd_srv_sess_dev { /* Entry inside rnbd_srv_dev struct */ struct list_head dev_list; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct rnbd_srv_session *sess; struct rnbd_srv_dev *dev; struct kobject kobj; From 54f5bdc2b980e4ff943bb33e124f77e56ef9ba39 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:28 +0100 Subject: [PATCH 0272/1406] xen: port block device access to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-11-adbd023e19cc@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- drivers/block/xen-blkback/blkback.c | 4 ++-- drivers/block/xen-blkback/common.h | 4 ++-- drivers/block/xen-blkback/xenbus.c | 37 ++++++++++++++--------------- 3 files changed, 22 insertions(+), 23 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 4defd7f387c786..944576d582fb14 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -465,7 +465,7 @@ static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif, } req->dev = vbd->pdevice; - req->bdev = vbd->bdev_handle->bdev; + req->bdev = file_bdev(vbd->bdev_file); rc = 0; out: @@ -969,7 +969,7 @@ static int dispatch_discard_io(struct xen_blkif_ring *ring, int err = 0; int status = BLKIF_RSP_OKAY; struct xen_blkif *blkif = ring->blkif; - struct block_device *bdev = blkif->vbd.bdev_handle->bdev; + struct block_device *bdev = file_bdev(blkif->vbd.bdev_file); struct phys_req preq; xen_blkif_get(blkif); diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 1432c83183d098..b427d54bc1205e 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -221,7 +221,7 @@ struct xen_vbd { unsigned char type; /* phys device that this vbd maps to. */ u32 pdevice; - struct bdev_handle *bdev_handle; + struct file *bdev_file; /* Cached size parameter. */ sector_t size; unsigned int flush_support:1; @@ -360,7 +360,7 @@ struct pending_req { }; -#define vbd_sz(_v) bdev_nr_sectors((_v)->bdev_handle->bdev) +#define vbd_sz(_v) bdev_nr_sectors(file_bdev((_v)->bdev_file)) #define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt)) #define xen_blkif_put(_b) \ diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index e34219ea2b058c..0621878940ae57 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -81,7 +81,7 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) int i; /* Not ready to connect? 
*/ - if (!blkif->rings || !blkif->rings[0].irq || !blkif->vbd.bdev_handle) + if (!blkif->rings || !blkif->rings[0].irq || !blkif->vbd.bdev_file) return; /* Already connected? */ @@ -99,13 +99,12 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) return; } - err = sync_blockdev(blkif->vbd.bdev_handle->bdev); + err = sync_blockdev(file_bdev(blkif->vbd.bdev_file)); if (err) { xenbus_dev_error(blkif->be->dev, err, "block flush"); return; } - invalidate_inode_pages2( - blkif->vbd.bdev_handle->bdev->bd_inode->i_mapping); + invalidate_inode_pages2(blkif->vbd.bdev_file->f_mapping); for (i = 0; i < blkif->nr_rings; i++) { ring = &blkif->rings[i]; @@ -473,9 +472,9 @@ static void xenvbd_sysfs_delif(struct xenbus_device *dev) static void xen_vbd_free(struct xen_vbd *vbd) { - if (vbd->bdev_handle) - bdev_release(vbd->bdev_handle); - vbd->bdev_handle = NULL; + if (vbd->bdev_file) + fput(vbd->bdev_file); + vbd->bdev_file = NULL; } static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, @@ -483,7 +482,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, int cdrom) { struct xen_vbd *vbd; - struct bdev_handle *bdev_handle; + struct file *bdev_file; vbd = &blkif->vbd; vbd->handle = handle; @@ -492,17 +491,17 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, vbd->pdevice = MKDEV(major, minor); - bdev_handle = bdev_open_by_dev(vbd->pdevice, vbd->readonly ? + bdev_file = bdev_file_open_by_dev(vbd->pdevice, vbd->readonly ? BLK_OPEN_READ : BLK_OPEN_WRITE, NULL, NULL); - if (IS_ERR(bdev_handle)) { + if (IS_ERR(bdev_file)) { pr_warn("xen_vbd_create: device %08x could not be opened\n", vbd->pdevice); return -ENOENT; } - vbd->bdev_handle = bdev_handle; - if (vbd->bdev_handle->bdev->bd_disk == NULL) { + vbd->bdev_file = bdev_file; + if (file_bdev(vbd->bdev_file)->bd_disk == NULL) { pr_warn("xen_vbd_create: device %08x doesn't exist\n", vbd->pdevice); xen_vbd_free(vbd); @@ -510,14 +509,14 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, } vbd->size = vbd_sz(vbd); - if (cdrom || disk_to_cdi(vbd->bdev_handle->bdev->bd_disk)) + if (cdrom || disk_to_cdi(file_bdev(vbd->bdev_file)->bd_disk)) vbd->type |= VDISK_CDROM; - if (vbd->bdev_handle->bdev->bd_disk->flags & GENHD_FL_REMOVABLE) + if (file_bdev(vbd->bdev_file)->bd_disk->flags & GENHD_FL_REMOVABLE) vbd->type |= VDISK_REMOVABLE; - if (bdev_write_cache(bdev_handle->bdev)) + if (bdev_write_cache(file_bdev(bdev_file))) vbd->flush_support = true; - if (bdev_max_secure_erase_sectors(bdev_handle->bdev)) + if (bdev_max_secure_erase_sectors(file_bdev(bdev_file))) vbd->discard_secure = true; pr_debug("Successful creation of handle=%04x (dom=%u)\n", @@ -570,7 +569,7 @@ static void xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info struct xen_blkif *blkif = be->blkif; int err; int state = 0; - struct block_device *bdev = be->blkif->vbd.bdev_handle->bdev; + struct block_device *bdev = file_bdev(be->blkif->vbd.bdev_file); if (!xenbus_read_unsigned(dev->nodename, "discard-enable", 1)) return; @@ -932,7 +931,7 @@ static void connect(struct backend_info *be) } err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu", (unsigned long)bdev_logical_block_size( - be->blkif->vbd.bdev_handle->bdev)); + file_bdev(be->blkif->vbd.bdev_file))); if (err) { xenbus_dev_fatal(dev, err, "writing %s/sector-size", dev->nodename); @@ -940,7 +939,7 @@ static void connect(struct backend_info *be) } err = xenbus_printf(xbt, dev->nodename, "physical-sector-size", "%u", 
bdev_physical_block_size( - be->blkif->vbd.bdev_handle->bdev)); + file_bdev(be->blkif->vbd.bdev_file))); if (err) xenbus_dev_error(dev, err, "writing %s/physical-sector-size", dev->nodename); From 942a53d9b8f4daf7bed3307c01c6f9c07e72d8f9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:29 +0100 Subject: [PATCH 0273/1406] zram: port block device access to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-12-adbd023e19cc@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- drivers/block/zram/zram_drv.c | 26 +++++++++++++------------- drivers/block/zram/zram_drv.h | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 6772e0c654fa7f..d96b3851b5d314 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -426,11 +426,11 @@ static void reset_bdev(struct zram *zram) if (!zram->backing_dev) return; - bdev_release(zram->bdev_handle); + fput(zram->bdev_file); /* hope filp_close flush all of IO */ filp_close(zram->backing_dev, NULL); zram->backing_dev = NULL; - zram->bdev_handle = NULL; + zram->bdev_file = NULL; zram->disk->fops = &zram_devops; kvfree(zram->bitmap); zram->bitmap = NULL; @@ -476,7 +476,7 @@ static ssize_t backing_dev_store(struct device *dev, struct address_space *mapping; unsigned int bitmap_sz; unsigned long nr_pages, *bitmap = NULL; - struct bdev_handle *bdev_handle = NULL; + struct file *bdev_file = NULL; int err; struct zram *zram = dev_to_zram(dev); @@ -513,11 +513,11 @@ static ssize_t backing_dev_store(struct device *dev, goto out; } - bdev_handle = bdev_open_by_dev(inode->i_rdev, + bdev_file = bdev_file_open_by_dev(inode->i_rdev, BLK_OPEN_READ | BLK_OPEN_WRITE, zram, NULL); - if (IS_ERR(bdev_handle)) { - err = PTR_ERR(bdev_handle); - bdev_handle = NULL; + if (IS_ERR(bdev_file)) { + err = PTR_ERR(bdev_file); + bdev_file = NULL; goto out; } @@ -531,7 +531,7 @@ static ssize_t backing_dev_store(struct device *dev, reset_bdev(zram); - zram->bdev_handle = bdev_handle; + zram->bdev_file = bdev_file; zram->backing_dev = backing_dev; zram->bitmap = bitmap; zram->nr_pages = nr_pages; @@ -544,8 +544,8 @@ static ssize_t backing_dev_store(struct device *dev, out: kvfree(bitmap); - if (bdev_handle) - bdev_release(bdev_handle); + if (bdev_file) + fput(bdev_file); if (backing_dev) filp_close(backing_dev, NULL); @@ -587,7 +587,7 @@ static void read_from_bdev_async(struct zram *zram, struct page *page, { struct bio *bio; - bio = bio_alloc(zram->bdev_handle->bdev, 1, parent->bi_opf, GFP_NOIO); + bio = bio_alloc(file_bdev(zram->bdev_file), 1, parent->bi_opf, GFP_NOIO); bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9); __bio_add_page(bio, page, PAGE_SIZE, 0); bio_chain(bio, parent); @@ -703,7 +703,7 @@ static ssize_t writeback_store(struct device *dev, continue; } - bio_init(&bio, zram->bdev_handle->bdev, &bio_vec, 1, + bio_init(&bio, file_bdev(zram->bdev_file), &bio_vec, 1, REQ_OP_WRITE | REQ_SYNC); bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9); __bio_add_page(&bio, page, PAGE_SIZE, 0); @@ -785,7 +785,7 @@ static void zram_sync_read(struct work_struct *work) struct bio_vec bv; struct bio bio; - bio_init(&bio, zw->zram->bdev_handle->bdev, &bv, 1, REQ_OP_READ); + bio_init(&bio, file_bdev(zw->zram->bdev_file), &bv, 1, REQ_OP_READ); bio.bi_iter.bi_sector = zw->entry * (PAGE_SIZE >> 9); __bio_add_page(&bio, zw->page, PAGE_SIZE, 0); zw->error = submit_bio_wait(&bio); diff --git a/drivers/block/zram/zram_drv.h 
b/drivers/block/zram/zram_drv.h index 3b94d12f41b406..37bf29f34d26f0 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -132,7 +132,7 @@ struct zram { spinlock_t wb_limit_lock; bool wb_limit_enable; u64 bd_wb_limit; - struct bdev_handle *bdev_handle; + struct file *bdev_file; unsigned long *bitmap; unsigned long nr_pages; #endif From 301ef2f5b409e1f298bed5b51f09bb0b00935de2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:30 +0100 Subject: [PATCH 0274/1406] bcache: port block device access to files Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-13-adbd023e19cc@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- drivers/md/bcache/bcache.h | 4 +-- drivers/md/bcache/super.c | 74 +++++++++++++++++++------------------- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 6ae2329052c92c..4e6afa89921fe0 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -300,7 +300,7 @@ struct cached_dev { struct list_head list; struct bcache_device disk; struct block_device *bdev; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct cache_sb sb; struct cache_sb_disk *sb_disk; @@ -423,7 +423,7 @@ struct cache { struct kobject kobj; struct block_device *bdev; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct task_struct *alloc_thread; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index dc3f50f6971417..d00b3abab1336b 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1369,8 +1369,8 @@ static CLOSURE_CALLBACK(cached_dev_free) if (dc->sb_disk) put_page(virt_to_page(dc->sb_disk)); - if (dc->bdev_handle) - bdev_release(dc->bdev_handle); + if (dc->bdev_file) + fput(dc->bdev_file); wake_up(&unregister_wait); @@ -1445,7 +1445,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) /* Cached device - bcache superblock */ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, - struct bdev_handle *bdev_handle, + struct file *bdev_file, struct cached_dev *dc) { const char *err = "cannot allocate memory"; @@ -1453,8 +1453,8 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, int ret = -ENOMEM; memcpy(&dc->sb, sb, sizeof(struct cache_sb)); - dc->bdev_handle = bdev_handle; - dc->bdev = bdev_handle->bdev; + dc->bdev_file = bdev_file; + dc->bdev = file_bdev(bdev_file); dc->sb_disk = sb_disk; if (cached_dev_init(dc, sb->block_size << 9)) @@ -2218,8 +2218,8 @@ void bch_cache_release(struct kobject *kobj) if (ca->sb_disk) put_page(virt_to_page(ca->sb_disk)); - if (ca->bdev_handle) - bdev_release(ca->bdev_handle); + if (ca->bdev_file) + fput(ca->bdev_file); kfree(ca); module_put(THIS_MODULE); @@ -2339,18 +2339,18 @@ static int cache_alloc(struct cache *ca) } static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, - struct bdev_handle *bdev_handle, + struct file *bdev_file, struct cache *ca) { const char *err = NULL; /* must be set for any error case */ int ret = 0; memcpy(&ca->sb, sb, sizeof(struct cache_sb)); - ca->bdev_handle = bdev_handle; - ca->bdev = bdev_handle->bdev; + ca->bdev_file = bdev_file; + ca->bdev = file_bdev(bdev_file); ca->sb_disk = sb_disk; - if (bdev_max_discard_sectors((bdev_handle->bdev))) + if (bdev_max_discard_sectors(file_bdev(bdev_file))) ca->discard = CACHE_DISCARD(&ca->sb); ret = cache_alloc(ca); @@ -2361,20 +2361,20 @@ static int register_cache(struct 
cache_sb *sb, struct cache_sb_disk *sb_disk, err = "cache_alloc(): cache device is too small"; else err = "cache_alloc(): unknown error"; - pr_notice("error %pg: %s\n", bdev_handle->bdev, err); + pr_notice("error %pg: %s\n", file_bdev(bdev_file), err); /* * If we failed here, it means ca->kobj is not initialized yet, * kobject_put() won't be called and there is no chance to - * call bdev_release() to bdev in bch_cache_release(). So - * we explicitly call bdev_release() here. + * call fput() to bdev in bch_cache_release(). So + * we explicitly call fput() on the block device here. */ - bdev_release(bdev_handle); + fput(bdev_file); return ret; } - if (kobject_add(&ca->kobj, bdev_kobj(bdev_handle->bdev), "bcache")) { + if (kobject_add(&ca->kobj, bdev_kobj(file_bdev(bdev_file)), "bcache")) { pr_notice("error %pg: error calling kobject_add\n", - bdev_handle->bdev); + file_bdev(bdev_file)); ret = -ENOMEM; goto out; } @@ -2388,7 +2388,7 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, goto out; } - pr_info("registered cache device %pg\n", ca->bdev_handle->bdev); + pr_info("registered cache device %pg\n", file_bdev(ca->bdev_file)); out: kobject_put(&ca->kobj); @@ -2446,7 +2446,7 @@ struct async_reg_args { char *path; struct cache_sb *sb; struct cache_sb_disk *sb_disk; - struct bdev_handle *bdev_handle; + struct file *bdev_file; void *holder; }; @@ -2457,7 +2457,7 @@ static void register_bdev_worker(struct work_struct *work) container_of(work, struct async_reg_args, reg_work.work); mutex_lock(&bch_register_lock); - if (register_bdev(args->sb, args->sb_disk, args->bdev_handle, + if (register_bdev(args->sb, args->sb_disk, args->bdev_file, args->holder) < 0) fail = true; mutex_unlock(&bch_register_lock); @@ -2478,7 +2478,7 @@ static void register_cache_worker(struct work_struct *work) container_of(work, struct async_reg_args, reg_work.work); /* blkdev_put() will be called in bch_cache_release() */ - if (register_cache(args->sb, args->sb_disk, args->bdev_handle, + if (register_cache(args->sb, args->sb_disk, args->bdev_file, args->holder)) fail = true; @@ -2516,7 +2516,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, char *path = NULL; struct cache_sb *sb; struct cache_sb_disk *sb_disk; - struct bdev_handle *bdev_handle, *bdev_handle2; + struct file *bdev_file, *bdev_file2; void *holder = NULL; ssize_t ret; bool async_registration = false; @@ -2549,15 +2549,15 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, ret = -EINVAL; err = "failed to open device"; - bdev_handle = bdev_open_by_path(strim(path), BLK_OPEN_READ, NULL, NULL); - if (IS_ERR(bdev_handle)) + bdev_file = bdev_file_open_by_path(strim(path), BLK_OPEN_READ, NULL, NULL); + if (IS_ERR(bdev_file)) goto out_free_sb; err = "failed to set blocksize"; - if (set_blocksize(bdev_handle->bdev, 4096)) + if (set_blocksize(file_bdev(bdev_file), 4096)) goto out_blkdev_put; - err = read_super(sb, bdev_handle->bdev, &sb_disk); + err = read_super(sb, file_bdev(bdev_file), &sb_disk); if (err) goto out_blkdev_put; @@ -2569,13 +2569,13 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, } /* Now reopen in exclusive mode with proper holder */ - bdev_handle2 = bdev_open_by_dev(bdev_handle->bdev->bd_dev, + bdev_file2 = bdev_file_open_by_dev(file_bdev(bdev_file)->bd_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, holder, NULL); - bdev_release(bdev_handle); - bdev_handle = bdev_handle2; - if (IS_ERR(bdev_handle)) { - ret = PTR_ERR(bdev_handle); - 
bdev_handle = NULL; + fput(bdev_file); + bdev_file = bdev_file2; + if (IS_ERR(bdev_file)) { + ret = PTR_ERR(bdev_file); + bdev_file = NULL; if (ret == -EBUSY) { dev_t dev; @@ -2610,7 +2610,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, args->path = path; args->sb = sb; args->sb_disk = sb_disk; - args->bdev_handle = bdev_handle; + args->bdev_file = bdev_file; args->holder = holder; register_device_async(args); /* No wait and returns to user space */ @@ -2619,14 +2619,14 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (SB_IS_BDEV(sb)) { mutex_lock(&bch_register_lock); - ret = register_bdev(sb, sb_disk, bdev_handle, holder); + ret = register_bdev(sb, sb_disk, bdev_file, holder); mutex_unlock(&bch_register_lock); /* blkdev_put() will be called in cached_dev_free() */ if (ret < 0) goto out_free_sb; } else { /* blkdev_put() will be called in bch_cache_release() */ - ret = register_cache(sb, sb_disk, bdev_handle, holder); + ret = register_cache(sb, sb_disk, bdev_file, holder); if (ret) goto out_free_sb; } @@ -2642,8 +2642,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, out_put_sb_page: put_page(virt_to_page(sb_disk)); out_blkdev_put: - if (bdev_handle) - bdev_release(bdev_handle); + if (bdev_file) + fput(bdev_file); out_free_sb: kfree(sb); out_free_path: From d112daae100aaaa088741cc163008e0f0ccfeafc Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:31 +0100 Subject: [PATCH 0275/1406] block2mtd: port device access to files Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-14-adbd023e19cc@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- drivers/mtd/devices/block2mtd.c | 46 +++++++++++++++------------------ 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c index aa44a23ec0451e..97a00ec9a4d489 100644 --- a/drivers/mtd/devices/block2mtd.c +++ b/drivers/mtd/devices/block2mtd.c @@ -37,7 +37,7 @@ /* Info for the block device */ struct block2mtd_dev { struct list_head list; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct mtd_info mtd; struct mutex write_mutex; }; @@ -55,8 +55,7 @@ static struct page *page_read(struct address_space *mapping, pgoff_t index) /* erase a specified part of the device */ static int _block2mtd_erase(struct block2mtd_dev *dev, loff_t to, size_t len) { - struct address_space *mapping = - dev->bdev_handle->bdev->bd_inode->i_mapping; + struct address_space *mapping = dev->bdev_file->f_mapping; struct page *page; pgoff_t index = to >> PAGE_SHIFT; // page index int pages = len >> PAGE_SHIFT; @@ -106,8 +105,7 @@ static int block2mtd_read(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf) { struct block2mtd_dev *dev = mtd->priv; - struct address_space *mapping = - dev->bdev_handle->bdev->bd_inode->i_mapping; + struct address_space *mapping = dev->bdev_file->f_mapping; struct page *page; pgoff_t index = from >> PAGE_SHIFT; int offset = from & (PAGE_SIZE-1); @@ -142,8 +140,7 @@ static int _block2mtd_write(struct block2mtd_dev *dev, const u_char *buf, loff_t to, size_t len, size_t *retlen) { struct page *page; - struct address_space *mapping = - dev->bdev_handle->bdev->bd_inode->i_mapping; + struct address_space *mapping = dev->bdev_file->f_mapping; pgoff_t index = to >> PAGE_SHIFT; // page index int offset = to & ~PAGE_MASK; // page offset int cpylen; @@ -198,7 +195,7 @@ static int block2mtd_write(struct 
mtd_info *mtd, loff_t to, size_t len, static void block2mtd_sync(struct mtd_info *mtd) { struct block2mtd_dev *dev = mtd->priv; - sync_blockdev(dev->bdev_handle->bdev); + sync_blockdev(file_bdev(dev->bdev_file)); return; } @@ -210,10 +207,9 @@ static void block2mtd_free_device(struct block2mtd_dev *dev) kfree(dev->mtd.name); - if (dev->bdev_handle) { - invalidate_mapping_pages( - dev->bdev_handle->bdev->bd_inode->i_mapping, 0, -1); - bdev_release(dev->bdev_handle); + if (dev->bdev_file) { + invalidate_mapping_pages(dev->bdev_file->f_mapping, 0, -1); + fput(dev->bdev_file); } kfree(dev); @@ -223,10 +219,10 @@ static void block2mtd_free_device(struct block2mtd_dev *dev) * This function is marked __ref because it calls the __init marked * early_lookup_bdev when called from the early boot code. */ -static struct bdev_handle __ref *mdtblock_early_get_bdev(const char *devname, +static struct file __ref *mdtblock_early_get_bdev(const char *devname, blk_mode_t mode, int timeout, struct block2mtd_dev *dev) { - struct bdev_handle *bdev_handle = ERR_PTR(-ENODEV); + struct file *bdev_file = ERR_PTR(-ENODEV); #ifndef MODULE int i; @@ -234,7 +230,7 @@ static struct bdev_handle __ref *mdtblock_early_get_bdev(const char *devname, * We can't use early_lookup_bdev from a running system. */ if (system_state >= SYSTEM_RUNNING) - return bdev_handle; + return bdev_file; /* * We might not have the root device mounted at this point. @@ -253,20 +249,20 @@ static struct bdev_handle __ref *mdtblock_early_get_bdev(const char *devname, wait_for_device_probe(); if (!early_lookup_bdev(devname, &devt)) { - bdev_handle = bdev_open_by_dev(devt, mode, dev, NULL); - if (!IS_ERR(bdev_handle)) + bdev_file = bdev_file_open_by_dev(devt, mode, dev, NULL); + if (!IS_ERR(bdev_file)) break; } } #endif - return bdev_handle; + return bdev_file; } static struct block2mtd_dev *add_device(char *devname, int erase_size, char *label, int timeout) { const blk_mode_t mode = BLK_OPEN_READ | BLK_OPEN_WRITE; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct block_device *bdev; struct block2mtd_dev *dev; char *name; @@ -279,16 +275,16 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size, return NULL; /* Get a handle on the device */ - bdev_handle = bdev_open_by_path(devname, mode, dev, NULL); - if (IS_ERR(bdev_handle)) - bdev_handle = mdtblock_early_get_bdev(devname, mode, timeout, + bdev_file = bdev_file_open_by_path(devname, mode, dev, NULL); + if (IS_ERR(bdev_file)) + bdev_file = mdtblock_early_get_bdev(devname, mode, timeout, dev); - if (IS_ERR(bdev_handle)) { + if (IS_ERR(bdev_file)) { pr_err("error: cannot open device %s\n", devname); goto err_free_block2mtd; } - dev->bdev_handle = bdev_handle; - bdev = bdev_handle->bdev; + dev->bdev_file = bdev_file; + bdev = file_bdev(bdev_file); if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { pr_err("attempting to use an MTD device as a block device\n"); From b039169cf8bae15917175715eed7e33e6d1e3592 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:32 +0100 Subject: [PATCH 0276/1406] nvme: port block device access to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-15-adbd023e19cc@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- drivers/nvme/target/io-cmd-bdev.c | 16 ++++++++-------- drivers/nvme/target/nvmet.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index f11400a908f269..6426aac2634aeb 100644 --- 
a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -50,10 +50,10 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) void nvmet_bdev_ns_disable(struct nvmet_ns *ns) { - if (ns->bdev_handle) { - bdev_release(ns->bdev_handle); + if (ns->bdev_file) { + fput(ns->bdev_file); ns->bdev = NULL; - ns->bdev_handle = NULL; + ns->bdev_file = NULL; } } @@ -85,18 +85,18 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns) if (ns->buffered_io) return -ENOTBLK; - ns->bdev_handle = bdev_open_by_path(ns->device_path, + ns->bdev_file = bdev_file_open_by_path(ns->device_path, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, NULL); - if (IS_ERR(ns->bdev_handle)) { - ret = PTR_ERR(ns->bdev_handle); + if (IS_ERR(ns->bdev_file)) { + ret = PTR_ERR(ns->bdev_file); if (ret != -ENOTBLK) { pr_err("failed to open block device %s: (%d)\n", ns->device_path, ret); } - ns->bdev_handle = NULL; + ns->bdev_file = NULL; return ret; } - ns->bdev = ns->bdev_handle->bdev; + ns->bdev = file_bdev(ns->bdev_file); ns->size = bdev_nr_bytes(ns->bdev); ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev)); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 6c8acebe1a1a61..33e61b4f478b9a 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -58,7 +58,7 @@ struct nvmet_ns { struct percpu_ref ref; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct block_device *bdev; struct file *file; bool readonly; From b707fcd7894a775874068ea0cd43abb4b190ee97 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:33 +0100 Subject: [PATCH 0277/1406] s390: port block device access to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-16-adbd023e19cc@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- drivers/s390/block/dasd.c | 10 ++++----- drivers/s390/block/dasd_genhd.c | 36 ++++++++++++++++----------------- drivers/s390/block/dasd_int.h | 2 +- drivers/s390/block/dasd_ioctl.c | 2 +- 4 files changed, 25 insertions(+), 25 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 7327e81352e9c7..c833a7c7d7b26b 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -412,7 +412,7 @@ dasd_state_ready_to_online(struct dasd_device * device) KOBJ_CHANGE); return 0; } - disk_uevent(device->block->bdev_handle->bdev->bd_disk, + disk_uevent(file_bdev(device->block->bdev_file)->bd_disk, KOBJ_CHANGE); } return 0; @@ -433,7 +433,7 @@ static int dasd_state_online_to_ready(struct dasd_device *device) device->state = DASD_STATE_READY; if (device->block && !(device->features & DASD_FEATURE_USERAW)) - disk_uevent(device->block->bdev_handle->bdev->bd_disk, + disk_uevent(file_bdev(device->block->bdev_file)->bd_disk, KOBJ_CHANGE); return 0; } @@ -3588,7 +3588,7 @@ int dasd_generic_set_offline(struct ccw_device *cdev) * in the other openers. */ if (device->block) { - max_count = device->block->bdev_handle ? 0 : -1; + max_count = device->block->bdev_file ? 
0 : -1; open_count = atomic_read(&device->block->open_count); if (open_count > max_count) { if (open_count > 0) @@ -3634,8 +3634,8 @@ int dasd_generic_set_offline(struct ccw_device *cdev) * so sync bdev first and then wait for our queues to become * empty */ - if (device->block && device->block->bdev_handle) - bdev_mark_dead(device->block->bdev_handle->bdev, false); + if (device->block && device->block->bdev_file) + bdev_mark_dead(file_bdev(device->block->bdev_file), false); dasd_schedule_device_bh(device); rc = wait_event_interruptible(shutdown_waitq, _wait_for_empty_queues(device)); diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 55e3abe94cde2f..8bf2cf0ccc15c9 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -127,15 +127,15 @@ void dasd_gendisk_free(struct dasd_block *block) */ int dasd_scan_partitions(struct dasd_block *block) { - struct bdev_handle *bdev_handle; + struct file *bdev_file; int rc; - bdev_handle = bdev_open_by_dev(disk_devt(block->gdp), BLK_OPEN_READ, + bdev_file = bdev_file_open_by_dev(disk_devt(block->gdp), BLK_OPEN_READ, NULL, NULL); - if (IS_ERR(bdev_handle)) { + if (IS_ERR(bdev_file)) { DBF_DEV_EVENT(DBF_ERR, block->base, "scan partitions error, blkdev_get returned %ld", - PTR_ERR(bdev_handle)); + PTR_ERR(bdev_file)); return -ENODEV; } @@ -147,15 +147,15 @@ int dasd_scan_partitions(struct dasd_block *block) "scan partitions error, rc %d", rc); /* - * Since the matching bdev_release() call to the - * bdev_open_by_path() in this function is not called before + * Since the matching fput() call to the + * bdev_file_open_by_path() in this function is not called before * dasd_destroy_partitions the offline open_count limit needs to be - * increased from 0 to 1. This is done by setting device->bdev_handle + * increased from 0 to 1. This is done by setting device->bdev_file * (see dasd_generic_set_offline). As long as the partition detection * is running no offline should be allowed. That is why the assignment - * to block->bdev_handle is done AFTER the BLKRRPART ioctl. + * to block->bdev_file is done AFTER the BLKRRPART ioctl. */ - block->bdev_handle = bdev_handle; + block->bdev_file = bdev_file; return 0; } @@ -165,21 +165,21 @@ int dasd_scan_partitions(struct dasd_block *block) */ void dasd_destroy_partitions(struct dasd_block *block) { - struct bdev_handle *bdev_handle; + struct file *bdev_file; /* - * Get the bdev_handle pointer from the device structure and clear - * device->bdev_handle to lower the offline open_count limit again. + * Get the bdev_file pointer from the device structure and clear + * device->bdev_file to lower the offline open_count limit again. */ - bdev_handle = block->bdev_handle; - block->bdev_handle = NULL; + bdev_file = block->bdev_file; + block->bdev_file = NULL; - mutex_lock(&bdev_handle->bdev->bd_disk->open_mutex); - bdev_disk_changed(bdev_handle->bdev->bd_disk, true); - mutex_unlock(&bdev_handle->bdev->bd_disk->open_mutex); + mutex_lock(&file_bdev(bdev_file)->bd_disk->open_mutex); + bdev_disk_changed(file_bdev(bdev_file)->bd_disk, true); + mutex_unlock(&file_bdev(bdev_file)->bd_disk->open_mutex); /* Matching blkdev_put to the blkdev_get in dasd_scan_partitions. 
*/ - bdev_release(bdev_handle); + fput(bdev_file); } int dasd_gendisk_init(void) diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 1b1b8a41c4d42e..aecd502aec5114 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -650,7 +650,7 @@ struct dasd_block { struct gendisk *gdp; spinlock_t request_queue_lock; struct blk_mq_tag_set tag_set; - struct bdev_handle *bdev_handle; + struct file *bdev_file; atomic_t open_count; unsigned long blocks; /* size of volume in blocks */ diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 61b9675e2a675e..de85a5e4e21bd0 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -537,7 +537,7 @@ static int __dasd_ioctl_information(struct dasd_block *block, * This must be hidden from user-space. */ dasd_info->open_count = atomic_read(&block->open_count); - if (!block->bdev_handle) + if (!block->bdev_file) dasd_info->open_count++; /* From 1001b94a488c69f7a20cc90877c1fe88ff8ed19a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:34 +0100 Subject: [PATCH 0278/1406] target: port block device access to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-17-adbd023e19cc@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- drivers/target/target_core_iblock.c | 18 +++++++++--------- drivers/target/target_core_iblock.h | 2 +- drivers/target/target_core_pscsi.c | 22 +++++++++++----------- drivers/target/target_core_pscsi.h | 2 +- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 8eb9eb7ce5df52..7f6ca81778453b 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -91,7 +91,7 @@ static int iblock_configure_device(struct se_device *dev) { struct iblock_dev *ib_dev = IBLOCK_DEV(dev); struct request_queue *q; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct block_device *bd; struct blk_integrity *bi; blk_mode_t mode = BLK_OPEN_READ; @@ -117,14 +117,14 @@ static int iblock_configure_device(struct se_device *dev) else dev->dev_flags |= DF_READ_ONLY; - bdev_handle = bdev_open_by_path(ib_dev->ibd_udev_path, mode, ib_dev, + bdev_file = bdev_file_open_by_path(ib_dev->ibd_udev_path, mode, ib_dev, NULL); - if (IS_ERR(bdev_handle)) { - ret = PTR_ERR(bdev_handle); + if (IS_ERR(bdev_file)) { + ret = PTR_ERR(bdev_file); goto out_free_bioset; } - ib_dev->ibd_bdev_handle = bdev_handle; - ib_dev->ibd_bd = bd = bdev_handle->bdev; + ib_dev->ibd_bdev_file = bdev_file; + ib_dev->ibd_bd = bd = file_bdev(bdev_file); q = bdev_get_queue(bd); @@ -180,7 +180,7 @@ static int iblock_configure_device(struct se_device *dev) return 0; out_blkdev_put: - bdev_release(ib_dev->ibd_bdev_handle); + fput(ib_dev->ibd_bdev_file); out_free_bioset: bioset_exit(&ib_dev->ibd_bio_set); out: @@ -205,8 +205,8 @@ static void iblock_destroy_device(struct se_device *dev) { struct iblock_dev *ib_dev = IBLOCK_DEV(dev); - if (ib_dev->ibd_bdev_handle) - bdev_release(ib_dev->ibd_bdev_handle); + if (ib_dev->ibd_bdev_file) + fput(ib_dev->ibd_bdev_file); bioset_exit(&ib_dev->ibd_bio_set); } diff --git a/drivers/target/target_core_iblock.h b/drivers/target/target_core_iblock.h index 683f9a55945bb2..91f6f4280666cb 100644 --- a/drivers/target/target_core_iblock.h +++ b/drivers/target/target_core_iblock.h @@ -32,7 +32,7 @@ struct iblock_dev { u32 ibd_flags; struct bio_set ibd_bio_set; struct block_device *ibd_bd; - 
struct bdev_handle *ibd_bdev_handle; + struct file *ibd_bdev_file; bool ibd_readonly; struct iblock_dev_plug *ibd_plug; } ____cacheline_aligned; diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 41b7489d37ce95..9aedd682d10c41 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -352,7 +352,7 @@ static int pscsi_create_type_disk(struct se_device *dev, struct scsi_device *sd) struct pscsi_hba_virt *phv = dev->se_hba->hba_ptr; struct pscsi_dev_virt *pdv = PSCSI_DEV(dev); struct Scsi_Host *sh = sd->host; - struct bdev_handle *bdev_handle; + struct file *bdev_file; int ret; if (scsi_device_get(sd)) { @@ -366,18 +366,18 @@ static int pscsi_create_type_disk(struct se_device *dev, struct scsi_device *sd) * Claim exclusive struct block_device access to struct scsi_device * for TYPE_DISK and TYPE_ZBC using supplied udev_path */ - bdev_handle = bdev_open_by_path(dev->udev_path, + bdev_file = bdev_file_open_by_path(dev->udev_path, BLK_OPEN_WRITE | BLK_OPEN_READ, pdv, NULL); - if (IS_ERR(bdev_handle)) { + if (IS_ERR(bdev_file)) { pr_err("pSCSI: bdev_open_by_path() failed\n"); scsi_device_put(sd); - return PTR_ERR(bdev_handle); + return PTR_ERR(bdev_file); } - pdv->pdv_bdev_handle = bdev_handle; + pdv->pdv_bdev_file = bdev_file; ret = pscsi_add_device_to_list(dev, sd); if (ret) { - bdev_release(bdev_handle); + fput(bdev_file); scsi_device_put(sd); return ret; } @@ -564,9 +564,9 @@ static void pscsi_destroy_device(struct se_device *dev) * from pscsi_create_type_disk() */ if ((sd->type == TYPE_DISK || sd->type == TYPE_ZBC) && - pdv->pdv_bdev_handle) { - bdev_release(pdv->pdv_bdev_handle); - pdv->pdv_bdev_handle = NULL; + pdv->pdv_bdev_file) { + fput(pdv->pdv_bdev_file); + pdv->pdv_bdev_file = NULL; } /* * For HBA mode PHV_LLD_SCSI_HOST_NO, release the reference @@ -994,8 +994,8 @@ static sector_t pscsi_get_blocks(struct se_device *dev) { struct pscsi_dev_virt *pdv = PSCSI_DEV(dev); - if (pdv->pdv_bdev_handle) - return bdev_nr_sectors(pdv->pdv_bdev_handle->bdev); + if (pdv->pdv_bdev_file) + return bdev_nr_sectors(file_bdev(pdv->pdv_bdev_file)); return 0; } diff --git a/drivers/target/target_core_pscsi.h b/drivers/target/target_core_pscsi.h index b0a3ef136592a9..9acaa21e4c78a4 100644 --- a/drivers/target/target_core_pscsi.h +++ b/drivers/target/target_core_pscsi.h @@ -37,7 +37,7 @@ struct pscsi_dev_virt { int pdv_channel_id; int pdv_target_id; int pdv_lun_id; - struct bdev_handle *pdv_bdev_handle; + struct file *pdv_bdev_file; struct scsi_device *pdv_sd; struct Scsi_Host *pdv_lld_host; } ____cacheline_aligned; From 1df39a40e912326c5ccde1090b096b642acc6556 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:35 +0100 Subject: [PATCH 0279/1406] bcachefs: port block device access to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-18-adbd023e19cc@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/bcachefs/super-io.c | 20 ++++++++++---------- fs/bcachefs/super_types.h | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index d60c7d27a0477c..ce8cf2d91f8444 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -142,8 +142,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb, void bch2_free_super(struct bch_sb_handle *sb) { kfree(sb->bio); - if (!IS_ERR_OR_NULL(sb->bdev_handle)) - bdev_release(sb->bdev_handle); + if (!IS_ERR_OR_NULL(sb->s_bdev_file)) + fput(sb->s_bdev_file); kfree(sb->holder); 
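	/*
	 * Aside (editorial sketch, not part of this hunk): s_bdev_file may
	 * still hold the ERR_PTR() left behind by a failed
	 * bdev_file_open_by_path(), which is why the IS_ERR_OR_NULL() check
	 * above guards the fput() rather than a plain NULL test.
	 */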
kfree(sb->sb_name); @@ -704,22 +704,22 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts, if (!opt_get(*opts, nochanges)) sb->mode |= BLK_OPEN_WRITE; - sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (IS_ERR(sb->bdev_handle) && - PTR_ERR(sb->bdev_handle) == -EACCES && + sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (IS_ERR(sb->s_bdev_file) && + PTR_ERR(sb->s_bdev_file) == -EACCES && opt_get(*opts, read_only)) { sb->mode &= ~BLK_OPEN_WRITE; - sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (!IS_ERR(sb->bdev_handle)) + sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (!IS_ERR(sb->s_bdev_file)) opt_set(*opts, nochanges, true); } - if (IS_ERR(sb->bdev_handle)) { - ret = PTR_ERR(sb->bdev_handle); + if (IS_ERR(sb->s_bdev_file)) { + ret = PTR_ERR(sb->s_bdev_file); goto out; } - sb->bdev = sb->bdev_handle->bdev; + sb->bdev = file_bdev(sb->s_bdev_file); ret = bch2_sb_realloc(sb, 0); if (ret) { diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 0e5a14fc8e7fbf..ec784d975f6655 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -4,7 +4,7 @@ struct bch_sb_handle { struct bch_sb *sb; - struct bdev_handle *bdev_handle; + struct file *s_bdev_file; struct block_device *bdev; char *sb_name; struct bio *bio; From cf419c25837316ca9aa54bb901be6f0a0f33287e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:36 +0100 Subject: [PATCH 0280/1406] btrfs: port device access to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-19-adbd023e19cc@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/btrfs/dev-replace.c | 14 +++---- fs/btrfs/ioctl.c | 16 ++++---- fs/btrfs/volumes.c | 92 +++++++++++++++++++++--------------------- fs/btrfs/volumes.h | 4 +- 4 files changed, 63 insertions(+), 63 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 1502d664c89273..2eb11fe4bd0542 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -246,7 +246,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct block_device *bdev; u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; @@ -257,13 +257,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return -EINVAL; } - bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE, + bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, fs_info->bdev_holder, NULL); - if (IS_ERR(bdev_handle)) { + if (IS_ERR(bdev_file)) { btrfs_err(fs_info, "target device %s is invalid!", device_path); - return PTR_ERR(bdev_handle); + return PTR_ERR(bdev_file); } - bdev = bdev_handle->bdev; + bdev = file_bdev(bdev_file); if (!btrfs_check_device_zone_type(fs_info, bdev)) { btrfs_err(fs_info, @@ -314,7 +314,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->commit_bytes_used = device->bytes_used; device->fs_info = fs_info; device->bdev = bdev; - device->bdev_handle = bdev_handle; + device->bdev_file = bdev_file; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); device->dev_stats_valid = 1; @@ -335,7 +335,7 @@ static int 
btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return 0; error: - bdev_release(bdev_handle); + fput(bdev_file); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 41b479861b3c76..9e0b3932d90c22 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2691,7 +2691,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args_v2 *vol_args; - struct bdev_handle *bdev_handle = NULL; + struct file *bdev_file = NULL; int ret; bool cancel = false; @@ -2728,7 +2728,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto err_drop; /* Exclusive operation is now claimed */ - ret = btrfs_rm_device(fs_info, &args, &bdev_handle); + ret = btrfs_rm_device(fs_info, &args, &bdev_file); btrfs_exclop_finish(fs_info); @@ -2742,8 +2742,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) } err_drop: mnt_drop_write_file(file); - if (bdev_handle) - bdev_release(bdev_handle); + if (bdev_file) + fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -2756,7 +2756,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args *vol_args; - struct bdev_handle *bdev_handle = NULL; + struct file *bdev_file = NULL; int ret; bool cancel = false; @@ -2783,15 +2783,15 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret == 0) { - ret = btrfs_rm_device(fs_info, &args, &bdev_handle); + ret = btrfs_rm_device(fs_info, &args, &bdev_file); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); btrfs_exclop_finish(fs_info); } mnt_drop_write_file(file); - if (bdev_handle) - bdev_release(bdev_handle); + if (bdev_file) + fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4c32497311d2ff..769a1dc4b756cd 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -468,39 +468,39 @@ static noinline struct btrfs_fs_devices *find_fsid( static int btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, - int flush, struct bdev_handle **bdev_handle, + int flush, struct file **bdev_file, struct btrfs_super_block **disk_super) { struct block_device *bdev; int ret; - *bdev_handle = bdev_open_by_path(device_path, flags, holder, NULL); + *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL); - if (IS_ERR(*bdev_handle)) { - ret = PTR_ERR(*bdev_handle); + if (IS_ERR(*bdev_file)) { + ret = PTR_ERR(*bdev_file); goto error; } - bdev = (*bdev_handle)->bdev; + bdev = file_bdev(*bdev_file); if (flush) sync_blockdev(bdev); ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE); if (ret) { - bdev_release(*bdev_handle); + fput(*bdev_file); goto error; } invalidate_bdev(bdev); *disk_super = btrfs_read_dev_super(bdev); if (IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); - bdev_release(*bdev_handle); + fput(*bdev_file); goto error; } return 0; error: - *bdev_handle = NULL; + *bdev_file = NULL; return ret; } @@ -643,7 +643,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device, blk_mode_t flags, void *holder) { - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct btrfs_super_block *disk_super; u64 
devid; int ret; @@ -654,7 +654,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, return -EINVAL; ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, - &bdev_handle, &disk_super); + &bdev_file, &disk_super); if (ret) return ret; @@ -678,20 +678,20 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); fs_devices->seeding = true; } else { - if (bdev_read_only(bdev_handle->bdev)) + if (bdev_read_only(file_bdev(bdev_file))) clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); else set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } - if (!bdev_nonrot(bdev_handle->bdev)) + if (!bdev_nonrot(file_bdev(bdev_file))) fs_devices->rotating = true; - if (bdev_max_discard_sectors(bdev_handle->bdev)) + if (bdev_max_discard_sectors(file_bdev(bdev_file))) fs_devices->discardable = true; - device->bdev_handle = bdev_handle; - device->bdev = bdev_handle->bdev; + device->bdev_file = bdev_file; + device->bdev = file_bdev(bdev_file); clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); fs_devices->open_devices++; @@ -706,7 +706,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, error_free_page: btrfs_release_disk_super(disk_super); - bdev_release(bdev_handle); + fput(bdev_file); return -EINVAL; } @@ -1015,10 +1015,10 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, if (device->devid == BTRFS_DEV_REPLACE_DEVID) continue; - if (device->bdev_handle) { - bdev_release(device->bdev_handle); + if (device->bdev_file) { + fput(device->bdev_file); device->bdev = NULL; - device->bdev_handle = NULL; + device->bdev_file = NULL; fs_devices->open_devices--; } if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { @@ -1063,7 +1063,7 @@ static void btrfs_close_bdev(struct btrfs_device *device) invalidate_bdev(device->bdev); } - bdev_release(device->bdev_handle); + fput(device->bdev_file); } static void btrfs_close_one_device(struct btrfs_device *device) @@ -1316,7 +1316,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, struct btrfs_super_block *disk_super; bool new_device_added = false; struct btrfs_device *device = NULL; - struct bdev_handle *bdev_handle; + struct file *bdev_file; u64 bytenr, bytenr_orig; int ret; @@ -1339,18 +1339,18 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, * values temporarily, as the device paths of the fsid are the only * required information for assembling the volume. 
*/ - bdev_handle = bdev_open_by_path(path, flags, NULL, NULL); - if (IS_ERR(bdev_handle)) - return ERR_CAST(bdev_handle); + bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL); + if (IS_ERR(bdev_file)) + return ERR_CAST(bdev_file); bytenr_orig = btrfs_sb_offset(0); - ret = btrfs_sb_log_location_bdev(bdev_handle->bdev, 0, READ, &bytenr); + ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr); if (ret) { device = ERR_PTR(ret); goto error_bdev_put; } - disk_super = btrfs_read_disk_super(bdev_handle->bdev, bytenr, + disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr, bytenr_orig); if (IS_ERR(disk_super)) { device = ERR_CAST(disk_super); @@ -1381,7 +1381,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, btrfs_release_disk_super(disk_super); error_bdev_put: - bdev_release(bdev_handle); + fput(bdev_file); return device; } @@ -2057,7 +2057,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct bdev_handle **bdev_handle) + struct file **bdev_file) { struct btrfs_trans_handle *trans; struct btrfs_device *device; @@ -2166,7 +2166,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, btrfs_assign_next_active_device(device, NULL); - if (device->bdev_handle) { + if (device->bdev_file) { cur_devices->open_devices--; /* remove sysfs entry */ btrfs_sysfs_remove_device(device); @@ -2182,9 +2182,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, * free the device. * * We cannot call btrfs_close_bdev() here because we're holding the sb - * write lock, and bdev_release() will pull in the ->open_mutex on - * the block device and it's dependencies. Instead just flush the - * device and let the caller do the final bdev_release. + * write lock, and fput() on the block device will pull in the + * ->open_mutex on the block device and it's dependencies. Instead + * just flush the device and let the caller do the final bdev_release. 
*/ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { btrfs_scratch_superblocks(fs_info, device->bdev, @@ -2195,7 +2195,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, } } - *bdev_handle = device->bdev_handle; + *bdev_file = device->bdev_file; synchronize_rcu(); btrfs_free_device(device); @@ -2332,7 +2332,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, const char *path) { struct btrfs_super_block *disk_super; - struct bdev_handle *bdev_handle; + struct file *bdev_file; int ret; if (!path || !path[0]) @@ -2350,7 +2350,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, } ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0, - &bdev_handle, &disk_super); + &bdev_file, &disk_super); if (ret) { btrfs_put_dev_args_from_path(args); return ret; @@ -2363,7 +2363,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, else memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); btrfs_release_disk_super(disk_super); - bdev_release(bdev_handle); + fput(bdev_file); return 0; } @@ -2583,7 +2583,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct btrfs_root *root = fs_info->dev_root; struct btrfs_trans_handle *trans; struct btrfs_device *device; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct super_block *sb = fs_info->sb; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *seed_devices = NULL; @@ -2596,12 +2596,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; - bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE, + bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, fs_info->bdev_holder, NULL); - if (IS_ERR(bdev_handle)) - return PTR_ERR(bdev_handle); + if (IS_ERR(bdev_file)) + return PTR_ERR(bdev_file); - if (!btrfs_check_device_zone_type(fs_info, bdev_handle->bdev)) { + if (!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) { ret = -EINVAL; goto error; } @@ -2613,11 +2613,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path locked = true; } - sync_blockdev(bdev_handle->bdev); + sync_blockdev(file_bdev(bdev_file)); rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { - if (device->bdev == bdev_handle->bdev) { + if (device->bdev == file_bdev(bdev_file)) { ret = -EEXIST; rcu_read_unlock(); goto error; @@ -2633,8 +2633,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } device->fs_info = fs_info; - device->bdev_handle = bdev_handle; - device->bdev = bdev_handle->bdev; + device->bdev_file = bdev_file; + device->bdev = file_bdev(bdev_file); ret = lookup_bdev(device_path, &device->devt); if (ret) goto error_free_device; @@ -2817,7 +2817,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path error_free_device: btrfs_free_device(device); error: - bdev_release(bdev_handle); + fput(bdev_file); if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 53f87f398da779..a11854912d535f 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -90,7 +90,7 @@ struct btrfs_device { u64 generation; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct block_device *bdev; struct btrfs_zoned_device_info *zone_info; @@ -661,7 +661,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, void 
btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct bdev_handle **bdev_handle); + struct file **bdev_file); void __exit btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, From 9e6e6e8d1c88a29b3463cbe3bf6ee3c73d8f02ff Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:37 +0100 Subject: [PATCH 0281/1406] erofs: port device access to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-20-adbd023e19cc@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/erofs/data.c | 6 +++--- fs/erofs/internal.h | 2 +- fs/erofs/super.c | 16 ++++++++-------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index c98aeda8abb215..433fc39ba4235d 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -220,7 +220,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) up_read(&devs->rwsem); return 0; } - map->m_bdev = dif->bdev_handle ? dif->bdev_handle->bdev : NULL; + map->m_bdev = dif->bdev_file ? file_bdev(dif->bdev_file) : NULL; map->m_daxdev = dif->dax_dev; map->m_dax_part_off = dif->dax_part_off; map->m_fscache = dif->fscache; @@ -238,8 +238,8 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) if (map->m_pa >= startoff && map->m_pa < startoff + length) { map->m_pa -= startoff; - map->m_bdev = dif->bdev_handle ? - dif->bdev_handle->bdev : NULL; + map->m_bdev = dif->bdev_file ? + file_bdev(dif->bdev_file) : NULL; map->m_daxdev = dif->dax_dev; map->m_dax_part_off = dif->dax_part_off; map->m_fscache = dif->fscache; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index b0409badb01723..0f0706325b7b47 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -49,7 +49,7 @@ typedef u32 erofs_blk_t; struct erofs_device_info { char *path; struct erofs_fscache *fscache; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct dax_device *dax_dev; u64 dax_part_off; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 5f60f163bd56e2..9b4b66dcdd4f10 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -177,7 +177,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; struct erofs_deviceslot *dis; - struct bdev_handle *bdev_handle; + struct file *bdev_file; void *ptr; ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP); @@ -201,12 +201,12 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, return PTR_ERR(fscache); dif->fscache = fscache; } else if (!sbi->devs->flatdev) { - bdev_handle = bdev_open_by_path(dif->path, BLK_OPEN_READ, + bdev_file = bdev_file_open_by_path(dif->path, BLK_OPEN_READ, sb->s_type, NULL); - if (IS_ERR(bdev_handle)) - return PTR_ERR(bdev_handle); - dif->bdev_handle = bdev_handle; - dif->dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev, + if (IS_ERR(bdev_file)) + return PTR_ERR(bdev_file); + dif->bdev_file = bdev_file; + dif->dax_dev = fs_dax_get_by_bdev(file_bdev(bdev_file), &dif->dax_part_off, NULL, NULL); } @@ -754,8 +754,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data) struct erofs_device_info *dif = ptr; fs_put_dax(dif->dax_dev, NULL); - if (dif->bdev_handle) - bdev_release(dif->bdev_handle); + if (dif->bdev_file) + fput(dif->bdev_file); 
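	/*
	 * Aside (editorial sketch, not part of this hunk): the struct file
	 * returned by bdev_file_open_by_path() pins the block device, so the
	 * single fput() above replaces the old bdev_release() and drops the
	 * file together with its claim on the underlying bdev.
	 */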
erofs_fscache_unregister_cookie(dif->fscache); dif->fscache = NULL; kfree(dif->path); From bc7c2a49da6f90a4db8a385e929d0ea63c40ddc2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:38 +0100 Subject: [PATCH 0282/1406] ext4: port block device access to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-21-adbd023e19cc@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/ext4/ext4.h | 2 +- fs/ext4/fsmap.c | 8 ++++---- fs/ext4/super.c | 52 ++++++++++++++++++++++++------------------------- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a5d784872303dd..dcdad5da419efd 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1548,7 +1548,7 @@ struct ext4_sb_info { unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; - struct bdev_handle *s_journal_bdev_handle; + struct file *s_journal_bdev_file; #ifdef CONFIG_QUOTA /* Names of quota files with journalled quota */ char __rcu *s_qf_names[EXT4_MAXQUOTAS]; diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index 11e6f33677a2c8..df853c4d3a8c91 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -576,9 +576,9 @@ static bool ext4_getfsmap_is_valid_device(struct super_block *sb, if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev)) return true; - if (EXT4_SB(sb)->s_journal_bdev_handle && + if (EXT4_SB(sb)->s_journal_bdev_file && fm->fmr_device == - new_encode_dev(EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev)) + new_encode_dev(file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev)) return true; return false; } @@ -648,9 +648,9 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, memset(handlers, 0, sizeof(handlers)); handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev); handlers[0].gfd_fn = ext4_getfsmap_datadev; - if (EXT4_SB(sb)->s_journal_bdev_handle) { + if (EXT4_SB(sb)->s_journal_bdev_file) { handlers[1].gfd_dev = new_encode_dev( - EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev); + file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev); handlers[1].gfd_fn = ext4_getfsmap_logdev; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dcba0f85dfe245..aa007710cfc36d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1359,14 +1359,14 @@ static void ext4_put_super(struct super_block *sb) sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); - if (sbi->s_journal_bdev_handle) { + if (sbi->s_journal_bdev_file) { /* * Invalidate the journal device's buffers. We don't want them * floating about in memory - the physical journal device may * hotswapped, and it breaks the `ro-after' testing code. 
*/ - sync_blockdev(sbi->s_journal_bdev_handle->bdev); - invalidate_bdev(sbi->s_journal_bdev_handle->bdev); + sync_blockdev(file_bdev(sbi->s_journal_bdev_file)); + invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); } ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); @@ -4233,7 +4233,7 @@ int ext4_calculate_overhead(struct super_block *sb) * Add the internal journal blocks whether the journal has been * loaded or not */ - if (sbi->s_journal && !sbi->s_journal_bdev_handle) + if (sbi->s_journal && !sbi->s_journal_bdev_file) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ @@ -5670,9 +5670,9 @@ failed_mount9: __maybe_unused #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); brelse(sbi->s_sbh); - if (sbi->s_journal_bdev_handle) { - invalidate_bdev(sbi->s_journal_bdev_handle->bdev); - bdev_release(sbi->s_journal_bdev_handle); + if (sbi->s_journal_bdev_file) { + invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); + fput(sbi->s_journal_bdev_file); } out_fail: invalidate_bdev(sb->s_bdev); @@ -5842,30 +5842,30 @@ static journal_t *ext4_open_inode_journal(struct super_block *sb, return journal; } -static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb, +static struct file *ext4_get_journal_blkdev(struct super_block *sb, dev_t j_dev, ext4_fsblk_t *j_start, ext4_fsblk_t *j_len) { struct buffer_head *bh; struct block_device *bdev; - struct bdev_handle *bdev_handle; + struct file *bdev_file; int hblock, blocksize; ext4_fsblk_t sb_block; unsigned long offset; struct ext4_super_block *es; int errno; - bdev_handle = bdev_open_by_dev(j_dev, + bdev_file = bdev_file_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, sb, &fs_holder_ops); - if (IS_ERR(bdev_handle)) { + if (IS_ERR(bdev_file)) { ext4_msg(sb, KERN_ERR, "failed to open journal device unknown-block(%u,%u) %ld", - MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_handle)); - return bdev_handle; + MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file)); + return bdev_file; } - bdev = bdev_handle->bdev; + bdev = file_bdev(bdev_file); blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { @@ -5912,12 +5912,12 @@ static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb, *j_start = sb_block + 1; *j_len = ext4_blocks_count(es); brelse(bh); - return bdev_handle; + return bdev_file; out_bh: brelse(bh); out_bdev: - bdev_release(bdev_handle); + fput(bdev_file); return ERR_PTR(errno); } @@ -5927,14 +5927,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb, journal_t *journal; ext4_fsblk_t j_start; ext4_fsblk_t j_len; - struct bdev_handle *bdev_handle; + struct file *bdev_file; int errno = 0; - bdev_handle = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); - if (IS_ERR(bdev_handle)) - return ERR_CAST(bdev_handle); + bdev_file = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); + if (IS_ERR(bdev_file)) + return ERR_CAST(bdev_file); - journal = jbd2_journal_init_dev(bdev_handle->bdev, sb->s_bdev, j_start, + journal = jbd2_journal_init_dev(file_bdev(bdev_file), sb->s_bdev, j_start, j_len, sb->s_blocksize); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); @@ -5949,14 +5949,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb, goto out_journal; } journal->j_private = sb; - EXT4_SB(sb)->s_journal_bdev_handle = bdev_handle; + EXT4_SB(sb)->s_journal_bdev_file = bdev_file; 
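	/*
	 * Aside (editorial sketch, not part of this hunk): stashing the
	 * journal's struct file in the sb_info is what allows ext4_kill_sb(),
	 * further down in this patch, to release the external journal device
	 * with a plain fput() once kill_block_super() has run.
	 */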
ext4_init_journal_params(sb, journal); return journal; out_journal: jbd2_journal_destroy(journal); out_bdev: - bdev_release(bdev_handle); + fput(bdev_file); return ERR_PTR(errno); } @@ -7314,12 +7314,12 @@ static inline int ext3_feature_set_ok(struct super_block *sb) static void ext4_kill_sb(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - struct bdev_handle *handle = sbi ? sbi->s_journal_bdev_handle : NULL; + struct file *bdev_file = sbi ? sbi->s_journal_bdev_file : NULL; kill_block_super(sb); - if (handle) - bdev_release(handle); + if (bdev_file) + fput(bdev_file); } static struct file_system_type ext4_fs_type = { From de0040bea70f33dc8a044a65503ed1ae60caccc9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:39 +0100 Subject: [PATCH 0283/1406] f2fs: port block device access to files Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-22-adbd023e19cc@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/super.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 65294e3b0bef88..6fc172c9991520 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1239,7 +1239,7 @@ struct f2fs_bio_info { #define FDEV(i) (sbi->devs[i]) #define RDEV(i) (raw_super->devs[i]) struct f2fs_dev_info { - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct block_device *bdev; char path[MAX_PATH_LEN]; unsigned int total_segments; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ea94c148fee566..557ea5c6c92656 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1605,7 +1605,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) for (i = 0; i < sbi->s_ndevs; i++) { if (i > 0) - bdev_release(FDEV(i).bdev_handle); + fput(FDEV(i).bdev_file); #ifdef CONFIG_BLK_DEV_ZONED kvfree(FDEV(i).blkz_seq); #endif @@ -4247,7 +4247,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) for (i = 0; i < max_devices; i++) { if (i == 0) - FDEV(0).bdev_handle = sb_bdev_handle(sbi->sb); + FDEV(0).bdev_file = sbi->sb->s_bdev_file; else if (!RDEV(i).path[0]) break; @@ -4267,14 +4267,14 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) FDEV(i).end_blk = FDEV(i).start_blk + (FDEV(i).total_segments << sbi->log_blocks_per_seg) - 1; - FDEV(i).bdev_handle = bdev_open_by_path( + FDEV(i).bdev_file = bdev_file_open_by_path( FDEV(i).path, mode, sbi->sb, NULL); } } - if (IS_ERR(FDEV(i).bdev_handle)) - return PTR_ERR(FDEV(i).bdev_handle); + if (IS_ERR(FDEV(i).bdev_file)) + return PTR_ERR(FDEV(i).bdev_file); - FDEV(i).bdev = FDEV(i).bdev_handle->bdev; + FDEV(i).bdev = file_bdev(FDEV(i).bdev_file); /* to release errored devices */ sbi->s_ndevs = i + 1; From 6efc278c1431e5b33612c472641d4d1686223ea3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:40 +0100 Subject: [PATCH 0284/1406] jfs: port block device access to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-23-adbd023e19cc@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/jfs/jfs_logmgr.c | 26 +++++++++++++------------- fs/jfs/jfs_logmgr.h | 2 +- fs/jfs/jfs_mount.c | 2 +- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 8691463956d17a..73389c68e25170 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1058,7 +1058,7 @@ void jfs_syncpt(struct jfs_log *log, int hard_sync) int lmLogOpen(struct super_block *sb) { int rc; - struct bdev_handle *bdev_handle; + struct file 
*bdev_file; struct jfs_log *log; struct jfs_sb_info *sbi = JFS_SBI(sb); @@ -1070,7 +1070,7 @@ int lmLogOpen(struct super_block *sb) mutex_lock(&jfs_log_mutex); list_for_each_entry(log, &jfs_external_logs, journal_list) { - if (log->bdev_handle->bdev->bd_dev == sbi->logdev) { + if (file_bdev(log->bdev_file)->bd_dev == sbi->logdev) { if (!uuid_equal(&log->uuid, &sbi->loguuid)) { jfs_warn("wrong uuid on JFS journal"); mutex_unlock(&jfs_log_mutex); @@ -1100,14 +1100,14 @@ int lmLogOpen(struct super_block *sb) * file systems to log may have n-to-1 relationship; */ - bdev_handle = bdev_open_by_dev(sbi->logdev, + bdev_file = bdev_file_open_by_dev(sbi->logdev, BLK_OPEN_READ | BLK_OPEN_WRITE, log, NULL); - if (IS_ERR(bdev_handle)) { - rc = PTR_ERR(bdev_handle); + if (IS_ERR(bdev_file)) { + rc = PTR_ERR(bdev_file); goto free; } - log->bdev_handle = bdev_handle; + log->bdev_file = bdev_file; uuid_copy(&log->uuid, &sbi->loguuid); /* @@ -1141,7 +1141,7 @@ int lmLogOpen(struct super_block *sb) lbmLogShutdown(log); close: /* close external log device */ - bdev_release(bdev_handle); + fput(bdev_file); free: /* free log descriptor */ mutex_unlock(&jfs_log_mutex); @@ -1162,7 +1162,7 @@ static int open_inline_log(struct super_block *sb) init_waitqueue_head(&log->syncwait); set_bit(log_INLINELOG, &log->flag); - log->bdev_handle = sb_bdev_handle(sb); + log->bdev_file = sb->s_bdev_file; log->base = addressPXD(&JFS_SBI(sb)->logpxd); log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >> (L2LOGPSIZE - sb->s_blocksize_bits); @@ -1436,7 +1436,7 @@ int lmLogClose(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; - struct bdev_handle *bdev_handle; + struct file *bdev_file; int rc = 0; jfs_info("lmLogClose: log:0x%p", log); @@ -1482,10 +1482,10 @@ int lmLogClose(struct super_block *sb) * external log as separate logical volume */ list_del(&log->journal_list); - bdev_handle = log->bdev_handle; + bdev_file = log->bdev_file; rc = lmLogShutdown(log); - bdev_release(bdev_handle); + fput(bdev_file); kfree(log); @@ -1972,7 +1972,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bp->l_flag |= lbmREAD; - bio = bio_alloc(log->bdev_handle->bdev, 1, REQ_OP_READ, GFP_NOFS); + bio = bio_alloc(file_bdev(log->bdev_file), 1, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); __bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); @@ -2115,7 +2115,7 @@ static void lbmStartIO(struct lbuf * bp) jfs_info("lbmStartIO"); if (!log->no_integrity) - bdev = log->bdev_handle->bdev; + bdev = file_bdev(log->bdev_file); bio = bio_alloc(bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS); diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h index 84aa2d25390743..8b8994e48cd080 100644 --- a/fs/jfs/jfs_logmgr.h +++ b/fs/jfs/jfs_logmgr.h @@ -356,7 +356,7 @@ struct jfs_log { * before writing syncpt. 
*/ struct list_head journal_list; /* Global list */ - struct bdev_handle *bdev_handle; /* 4: log lv pointer */ + struct file *bdev_file; /* 4: log lv pointer */ int serial; /* 4: log mount serial number */ s64 base; /* @8: log extent address (inline log ) */ diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index 9b5c6a20b30c83..98f9a432c33662 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -431,7 +431,7 @@ int updateSuper(struct super_block *sb, uint state) if (state == FM_MOUNT) { /* record log's dev_t and mount serial number */ j_sb->s_logdev = cpu_to_le32( - new_encode_dev(sbi->log->bdev_handle->bdev->bd_dev)); + new_encode_dev(file_bdev(sbi->log->bdev_file)->bd_dev)); j_sb->s_logserial = cpu_to_le32(sbi->log->serial); } else if (state == FM_CLEAN) { /* From 5a95186f6e40f0199714ca19a435541ae32e7e0c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:41 +0100 Subject: [PATCH 0285/1406] nfs: port block device access to files Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-24-adbd023e19cc@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/nfs/blocklayout/blocklayout.h | 2 +- fs/nfs/blocklayout/dev.c | 68 ++++++++++++++++---------------- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index b4294a8aa2d4c5..f1eeb491419929 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -108,7 +108,7 @@ struct pnfs_block_dev { struct pnfs_block_dev *children; u64 chunk_size; - struct bdev_handle *bdev_handle; + struct file *bdev_file; u64 disk_offset; u64 pr_key; diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index c97ebc42ec0fee..93ef7f864980b1 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -25,17 +25,17 @@ bl_free_device(struct pnfs_block_dev *dev) } else { if (dev->pr_registered) { const struct pr_ops *ops = - dev->bdev_handle->bdev->bd_disk->fops->pr_ops; + file_bdev(dev->bdev_file)->bd_disk->fops->pr_ops; int error; - error = ops->pr_register(dev->bdev_handle->bdev, + error = ops->pr_register(file_bdev(dev->bdev_file), dev->pr_key, 0, false); if (error) pr_err("failed to unregister PR key.\n"); } - if (dev->bdev_handle) - bdev_release(dev->bdev_handle); + if (dev->bdev_file) + fput(dev->bdev_file); } } @@ -169,7 +169,7 @@ static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset, map->start = dev->start; map->len = dev->len; map->disk_offset = dev->disk_offset; - map->bdev = dev->bdev_handle->bdev; + map->bdev = file_bdev(dev->bdev_file); return true; } @@ -236,26 +236,26 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { struct pnfs_block_volume *v = &volumes[idx]; - struct bdev_handle *bdev_handle; + struct file *bdev_file; dev_t dev; dev = bl_resolve_deviceid(server, v, gfp_mask); if (!dev) return -EIO; - bdev_handle = bdev_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, + bdev_file = bdev_file_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, NULL); - if (IS_ERR(bdev_handle)) { + if (IS_ERR(bdev_file)) { printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", - MAJOR(dev), MINOR(dev), PTR_ERR(bdev_handle)); - return PTR_ERR(bdev_handle); + MAJOR(dev), MINOR(dev), PTR_ERR(bdev_file)); + return PTR_ERR(bdev_file); } - d->bdev_handle = bdev_handle; - d->len = bdev_nr_bytes(bdev_handle->bdev); + d->bdev_file = bdev_file; + d->len = 
bdev_nr_bytes(file_bdev(bdev_file)); d->map = bl_map_simple; printk(KERN_INFO "pNFS: using block device %s\n", - bdev_handle->bdev->bd_disk->disk_name); + file_bdev(bdev_file)->bd_disk->disk_name); return 0; } @@ -300,10 +300,10 @@ bl_validate_designator(struct pnfs_block_volume *v) } } -static struct bdev_handle * +static struct file * bl_open_path(struct pnfs_block_volume *v, const char *prefix) { - struct bdev_handle *bdev_handle; + struct file *bdev_file; const char *devname; devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/%s%*phN", @@ -311,15 +311,15 @@ bl_open_path(struct pnfs_block_volume *v, const char *prefix) if (!devname) return ERR_PTR(-ENOMEM); - bdev_handle = bdev_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, + bdev_file = bdev_file_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, NULL); - if (IS_ERR(bdev_handle)) { + if (IS_ERR(bdev_file)) { pr_warn("pNFS: failed to open device %s (%ld)\n", - devname, PTR_ERR(bdev_handle)); + devname, PTR_ERR(bdev_file)); } kfree(devname); - return bdev_handle; + return bdev_file; } static int @@ -327,7 +327,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { struct pnfs_block_volume *v = &volumes[idx]; - struct bdev_handle *bdev_handle; + struct file *bdev_file; const struct pr_ops *ops; int error; @@ -340,14 +340,14 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, * On other distributions like Debian, the default SCSI by-id path will * point to the dm-multipath device if one exists. */ - bdev_handle = bl_open_path(v, "dm-uuid-mpath-0x"); - if (IS_ERR(bdev_handle)) - bdev_handle = bl_open_path(v, "wwn-0x"); - if (IS_ERR(bdev_handle)) - return PTR_ERR(bdev_handle); - d->bdev_handle = bdev_handle; - - d->len = bdev_nr_bytes(d->bdev_handle->bdev); + bdev_file = bl_open_path(v, "dm-uuid-mpath-0x"); + if (IS_ERR(bdev_file)) + bdev_file = bl_open_path(v, "wwn-0x"); + if (IS_ERR(bdev_file)) + return PTR_ERR(bdev_file); + d->bdev_file = bdev_file; + + d->len = bdev_nr_bytes(file_bdev(d->bdev_file)); d->map = bl_map_simple; d->pr_key = v->scsi.pr_key; @@ -355,20 +355,20 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, return -ENODEV; pr_info("pNFS: using block device %s (reservation key 0x%llx)\n", - d->bdev_handle->bdev->bd_disk->disk_name, d->pr_key); + file_bdev(d->bdev_file)->bd_disk->disk_name, d->pr_key); - ops = d->bdev_handle->bdev->bd_disk->fops->pr_ops; + ops = file_bdev(d->bdev_file)->bd_disk->fops->pr_ops; if (!ops) { pr_err("pNFS: block device %s does not support reservations.", - d->bdev_handle->bdev->bd_disk->disk_name); + file_bdev(d->bdev_file)->bd_disk->disk_name); error = -EINVAL; goto out_blkdev_put; } - error = ops->pr_register(d->bdev_handle->bdev, 0, d->pr_key, true); + error = ops->pr_register(file_bdev(d->bdev_file), 0, d->pr_key, true); if (error) { pr_err("pNFS: failed to register key for block device %s.", - d->bdev_handle->bdev->bd_disk->disk_name); + file_bdev(d->bdev_file)->bd_disk->disk_name); goto out_blkdev_put; } @@ -376,7 +376,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, return 0; out_blkdev_put: - bdev_release(d->bdev_handle); + fput(d->bdev_file); return error; } From 3f1e2c43e35eaaeed7b817d9c888c7206b7204c9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:42 +0100 Subject: [PATCH 0286/1406] ocfs2: port block device access to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-25-adbd023e19cc@kernel.org 
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/ocfs2/cluster/heartbeat.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 4d7efefa98c5ec..1bde1281d5146d 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -213,7 +213,7 @@ struct o2hb_region { unsigned int hr_num_pages; struct page **hr_slot_data; - struct bdev_handle *hr_bdev_handle; + struct file *hr_bdev_file; struct o2hb_disk_slot *hr_slots; /* live node map of this region */ @@ -263,7 +263,7 @@ struct o2hb_region { static inline struct block_device *reg_bdev(struct o2hb_region *reg) { - return reg->hr_bdev_handle ? reg->hr_bdev_handle->bdev : NULL; + return reg->hr_bdev_file ? file_bdev(reg->hr_bdev_file) : NULL; } struct o2hb_bio_wait_ctxt { @@ -1509,8 +1509,8 @@ static void o2hb_region_release(struct config_item *item) kfree(reg->hr_slot_data); } - if (reg->hr_bdev_handle) - bdev_release(reg->hr_bdev_handle); + if (reg->hr_bdev_file) + fput(reg->hr_bdev_file); kfree(reg->hr_slots); @@ -1569,7 +1569,7 @@ static ssize_t o2hb_region_block_bytes_store(struct config_item *item, unsigned long block_bytes; unsigned int block_bits; - if (reg->hr_bdev_handle) + if (reg->hr_bdev_file) return -EINVAL; status = o2hb_read_block_input(reg, page, &block_bytes, @@ -1598,7 +1598,7 @@ static ssize_t o2hb_region_start_block_store(struct config_item *item, char *p = (char *)page; ssize_t ret; - if (reg->hr_bdev_handle) + if (reg->hr_bdev_file) return -EINVAL; ret = kstrtoull(p, 0, &tmp); @@ -1623,7 +1623,7 @@ static ssize_t o2hb_region_blocks_store(struct config_item *item, unsigned long tmp; char *p = (char *)page; - if (reg->hr_bdev_handle) + if (reg->hr_bdev_file) return -EINVAL; tmp = simple_strtoul(p, &p, 0); @@ -1642,7 +1642,7 @@ static ssize_t o2hb_region_dev_show(struct config_item *item, char *page) { unsigned int ret = 0; - if (to_o2hb_region(item)->hr_bdev_handle) + if (to_o2hb_region(item)->hr_bdev_file) ret = sprintf(page, "%pg\n", reg_bdev(to_o2hb_region(item))); return ret; @@ -1753,7 +1753,7 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg) } /* - * this is acting as commit; we set up all of hr_bdev_handle and hr_task or + * this is acting as commit; we set up all of hr_bdev_file and hr_task or * nothing */ static ssize_t o2hb_region_dev_store(struct config_item *item, @@ -1769,7 +1769,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, ssize_t ret = -EINVAL; int live_threshold; - if (reg->hr_bdev_handle) + if (reg->hr_bdev_file) goto out; /* We can't heartbeat without having had our node number @@ -1795,11 +1795,11 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, if (!S_ISBLK(f.file->f_mapping->host->i_mode)) goto out2; - reg->hr_bdev_handle = bdev_open_by_dev(f.file->f_mapping->host->i_rdev, + reg->hr_bdev_file = bdev_file_open_by_dev(f.file->f_mapping->host->i_rdev, BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL); - if (IS_ERR(reg->hr_bdev_handle)) { - ret = PTR_ERR(reg->hr_bdev_handle); - reg->hr_bdev_handle = NULL; + if (IS_ERR(reg->hr_bdev_file)) { + ret = PTR_ERR(reg->hr_bdev_file); + reg->hr_bdev_file = NULL; goto out2; } @@ -1903,8 +1903,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, out3: if (ret < 0) { - bdev_release(reg->hr_bdev_handle); - reg->hr_bdev_handle = NULL; + fput(reg->hr_bdev_file); + reg->hr_bdev_file = NULL; } out2: fdput(f); From 7065d8cd471e2cd481a6ac49046d5f7f0683bc3a Mon Sep 17 00:00:00 
2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:43 +0100 Subject: [PATCH 0287/1406] reiserfs: port block device access to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-26-adbd023e19cc@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/reiserfs/journal.c | 38 +++++++++++++++++++------------------- fs/reiserfs/procfs.c | 2 +- fs/reiserfs/reiserfs.h | 8 ++++---- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 171c912af50f6f..6474529c425306 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2386,7 +2386,7 @@ static int journal_read(struct super_block *sb) cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb); reiserfs_info(sb, "checking transaction log (%pg)\n", - journal->j_bdev_handle->bdev); + file_bdev(journal->j_bdev_file)); start = ktime_get_seconds(); /* @@ -2447,7 +2447,7 @@ static int journal_read(struct super_block *sb) * device and journal device to be the same */ d_bh = - reiserfs_breada(journal->j_bdev_handle->bdev, cur_dblock, + reiserfs_breada(file_bdev(journal->j_bdev_file), cur_dblock, sb->s_blocksize, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + SB_ONDISK_JOURNAL_SIZE(sb)); @@ -2588,9 +2588,9 @@ static void journal_list_init(struct super_block *sb) static void release_journal_dev(struct reiserfs_journal *journal) { - if (journal->j_bdev_handle) { - bdev_release(journal->j_bdev_handle); - journal->j_bdev_handle = NULL; + if (journal->j_bdev_file) { + fput(journal->j_bdev_file); + journal->j_bdev_file = NULL; } } @@ -2605,7 +2605,7 @@ static int journal_init_dev(struct super_block *super, result = 0; - journal->j_bdev_handle = NULL; + journal->j_bdev_file = NULL; jdev = SB_ONDISK_JOURNAL_DEVICE(super) ? new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev; @@ -2616,37 +2616,37 @@ static int journal_init_dev(struct super_block *super, if ((!jdev_name || !jdev_name[0])) { if (jdev == super->s_dev) holder = NULL; - journal->j_bdev_handle = bdev_open_by_dev(jdev, blkdev_mode, + journal->j_bdev_file = bdev_file_open_by_dev(jdev, blkdev_mode, holder, NULL); - if (IS_ERR(journal->j_bdev_handle)) { - result = PTR_ERR(journal->j_bdev_handle); - journal->j_bdev_handle = NULL; + if (IS_ERR(journal->j_bdev_file)) { + result = PTR_ERR(journal->j_bdev_file); + journal->j_bdev_file = NULL; reiserfs_warning(super, "sh-458", "cannot init journal device unknown-block(%u,%u): %i", MAJOR(jdev), MINOR(jdev), result); return result; } else if (jdev != super->s_dev) - set_blocksize(journal->j_bdev_handle->bdev, + set_blocksize(file_bdev(journal->j_bdev_file), super->s_blocksize); return 0; } - journal->j_bdev_handle = bdev_open_by_path(jdev_name, blkdev_mode, + journal->j_bdev_file = bdev_file_open_by_path(jdev_name, blkdev_mode, holder, NULL); - if (IS_ERR(journal->j_bdev_handle)) { - result = PTR_ERR(journal->j_bdev_handle); - journal->j_bdev_handle = NULL; + if (IS_ERR(journal->j_bdev_file)) { + result = PTR_ERR(journal->j_bdev_file); + journal->j_bdev_file = NULL; reiserfs_warning(super, "sh-457", "journal_init_dev: Cannot open '%s': %i", jdev_name, result); return result; } - set_blocksize(journal->j_bdev_handle->bdev, super->s_blocksize); + set_blocksize(file_bdev(journal->j_bdev_file), super->s_blocksize); reiserfs_info(super, "journal_init_dev: journal device: %pg\n", - journal->j_bdev_handle->bdev); + file_bdev(journal->j_bdev_file)); return 0; } @@ -2804,7 +2804,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name, "journal header magic %x (device %pg) 
does " "not match to magic found in super block %x", jh->jh_journal.jp_journal_magic, - journal->j_bdev_handle->bdev, + file_bdev(journal->j_bdev_file), sb_jp_journal_magic(rs)); brelse(bhjh); goto free_and_return; @@ -2828,7 +2828,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name, reiserfs_info(sb, "journal params: device %pg, size %u, " "journal first block %u, max trans len %u, max batch %u, " "max commit age %u, max trans age %u\n", - journal->j_bdev_handle->bdev, + file_bdev(journal->j_bdev_file), SB_ONDISK_JOURNAL_SIZE(sb), SB_ONDISK_JOURNAL_1st_BLOCK(sb), journal->j_trans_max, diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 83cb9402e0f9c5..5c68a4a52d7881 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -354,7 +354,7 @@ static int show_journal(struct seq_file *m, void *unused) "prepare: \t%12lu\n" "prepare_retry: \t%12lu\n", DJP(jp_journal_1st_block), - SB_JOURNAL(sb)->j_bdev_handle->bdev, + file_bdev(SB_JOURNAL(sb)->j_bdev_file), DJP(jp_journal_dev), DJP(jp_journal_size), DJP(jp_journal_trans_max), diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 725667880e626a..0554903f42a909 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -299,7 +299,7 @@ struct reiserfs_journal { /* oldest journal block. start here for traverse */ struct reiserfs_journal_cnode *j_first; - struct bdev_handle *j_bdev_handle; + struct file *j_bdev_file; /* first block on s_dev of reserved area journal */ int j_1st_reserved_block; @@ -2810,10 +2810,10 @@ struct reiserfs_journal_header { /* We need these to make journal.c code more readable */ #define journal_find_get_block(s, block) __find_get_block(\ - SB_JOURNAL(s)->j_bdev_handle->bdev, block, s->s_blocksize) -#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_bdev_handle->bdev,\ + file_bdev(SB_JOURNAL(s)->j_bdev_file), block, s->s_blocksize) +#define journal_getblk(s, block) __getblk(file_bdev(SB_JOURNAL(s)->j_bdev_file),\ block, s->s_blocksize) -#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_bdev_handle->bdev,\ +#define journal_bread(s, block) __bread(file_bdev(SB_JOURNAL(s)->j_bdev_file),\ block, s->s_blocksize) enum reiserfs_bh_state_bits { From 55219b9e43b23aa8c40b037c2fce49b476aed5eb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:44 +0100 Subject: [PATCH 0288/1406] bdev: remove bdev_open_by_path() Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-27-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- block/bdev.c | 40 ---------------------------------------- include/linux/blkdev.h | 2 -- 2 files changed, 42 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index e1149652c53285..4003f8e1782a48 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -1004,46 +1004,6 @@ struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, } EXPORT_SYMBOL(bdev_file_open_by_path); -/** - * bdev_open_by_path - open a block device by name - * @path: path to the block device to open - * @mode: open mode (BLK_OPEN_*) - * @holder: exclusive holder identifier - * @hops: holder operations - * - * Open the block device described by the device file at @path. If @holder is - * not %NULL, the block device is opened with exclusive access. Exclusive opens - * may nest for the same @holder. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * Handle with a reference to the block_device on success, ERR_PTR(-errno) on - * failure. 
- */ -struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, - void *holder, const struct blk_holder_ops *hops) -{ - struct bdev_handle *handle; - dev_t dev; - int error; - - error = lookup_bdev(path, &dev); - if (error) - return ERR_PTR(error); - - handle = bdev_open_by_dev(dev, mode, holder, hops); - if (!IS_ERR(handle) && (mode & BLK_OPEN_WRITE) && - bdev_read_only(handle->bdev)) { - bdev_release(handle); - return ERR_PTR(-EACCES); - } - - return handle; -} -EXPORT_SYMBOL(bdev_open_by_path); - void bdev_release(struct bdev_handle *handle) { struct block_device *bdev = handle->bdev; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 76706aa473163d..5880d5abfebe91 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1484,8 +1484,6 @@ struct bdev_handle { struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); -struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, - void *holder, const struct blk_holder_ops *hops); struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, From 5bbd0388d97331d627a0a4b03a2e22d09d34e1e4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:45 +0100 Subject: [PATCH 0289/1406] bdev: make bdev_{release, open_by_dev}() private to block layer Move both of them to the private block header. There's no caller in the tree anymore that uses them directly. Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-28-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- block/bdev.c | 2 -- block/blk.h | 4 ++++ include/linux/blkdev.h | 3 --- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 4003f8e1782a48..e6e46f24a89a92 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -916,7 +916,6 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, kfree(handle); return ERR_PTR(ret); } -EXPORT_SYMBOL(bdev_open_by_dev); /* * If BLK_OPEN_WRITE_IOCTL is set then this is a historical quirk @@ -1042,7 +1041,6 @@ void bdev_release(struct bdev_handle *handle) blkdev_put_no_open(bdev); kfree(handle); } -EXPORT_SYMBOL(bdev_release); /** * lookup_bdev() - Look up a struct block_device by name. 
diff --git a/block/blk.h b/block/blk.h index 1ef920f72e0f87..c9630774767d37 100644 --- a/block/blk.h +++ b/block/blk.h @@ -516,4 +516,8 @@ static inline int req_ref_read(struct request *req) return atomic_read(&req->ref); } +void bdev_release(struct bdev_handle *handle); +struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, + const struct blk_holder_ops *hops); + #endif /* BLK_INTERNAL_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5880d5abfebe91..495f5558720719 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1482,8 +1482,6 @@ struct bdev_handle { blk_mode_t mode; }; -struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, - const struct blk_holder_ops *hops); struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, @@ -1491,7 +1489,6 @@ struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, int bd_prepare_to_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops); void bd_abort_claiming(struct block_device *bdev, void *holder); -void bdev_release(struct bdev_handle *handle); /* just for blk-cgroup, don't use elsewhere */ struct block_device *blkdev_get_no_open(dev_t dev); From 632e7486da288654f0d00a601497e880ebc9996d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:46 +0100 Subject: [PATCH 0290/1406] bdev: make struct bdev_handle private to the block layer Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-29-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- block/bdev.c | 119 +++++++++++++++++++++-------------------- block/blk.h | 12 +++-- block/fops.c | 37 ++++++------- include/linux/blkdev.h | 7 --- include/linux/fs.h | 6 --- 5 files changed, 86 insertions(+), 95 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index e6e46f24a89a92..8f33f160e92328 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -703,6 +703,24 @@ static int blkdev_get_part(struct block_device *part, blk_mode_t mode) return ret; } +int bdev_permission(dev_t dev, blk_mode_t mode, void *holder) +{ + int ret; + + ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, + MAJOR(dev), MINOR(dev), + ((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) | + ((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0)); + if (ret) + return ret; + + /* Blocking writes requires exclusive opener */ + if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) + return -EINVAL; + + return 0; +} + static void blkdev_put_part(struct block_device *part) { struct block_device *whole = bdev_whole(part); @@ -795,69 +813,43 @@ static void bdev_yield_write_access(struct block_device *bdev, blk_mode_t mode) } /** - * bdev_open_by_dev - open a block device by device number - * @dev: device number of block device to open + * bdev_open - open a block device + * @bdev: block device to open * @mode: open mode (BLK_OPEN_*) * @holder: exclusive holder identifier * @hops: holder operations + * @bdev_file: file for the block device * - * Open the block device described by device number @dev. If @holder is not - * %NULL, the block device is opened with exclusive access. Exclusive opens may - * nest for the same @holder. - * - * Use this interface ONLY if you really do not have anything better - i.e. when - * you are behind a truly sucky interface and all you are given is a device - * number. Everything else should use bdev_open_by_path(). 
+ * Open the block device. If @holder is not %NULL, the block device is opened + * with exclusive access. Exclusive opens may nest for the same @holder. * * CONTEXT: * Might sleep. * * RETURNS: - * Handle with a reference to the block_device on success, ERR_PTR(-errno) on - * failure. + * zero on success, -errno on failure. */ -struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, - const struct blk_holder_ops *hops) +int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, + const struct blk_holder_ops *hops, struct file *bdev_file) { - struct bdev_handle *handle = kmalloc(sizeof(struct bdev_handle), - GFP_KERNEL); - struct block_device *bdev; + struct bdev_handle *handle; bool unblock_events = true; - struct gendisk *disk; + struct gendisk *disk = bdev->bd_disk; int ret; + handle = kmalloc(sizeof(struct bdev_handle), GFP_KERNEL); if (!handle) - return ERR_PTR(-ENOMEM); - - ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, - MAJOR(dev), MINOR(dev), - ((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) | - ((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0)); - if (ret) - goto free_handle; - - /* Blocking writes requires exclusive opener */ - if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) { - ret = -EINVAL; - goto free_handle; - } - - bdev = blkdev_get_no_open(dev); - if (!bdev) { - ret = -ENXIO; - goto free_handle; - } - disk = bdev->bd_disk; + return -ENOMEM; if (holder) { mode |= BLK_OPEN_EXCL; ret = bd_prepare_to_claim(bdev, holder, hops); if (ret) - goto put_blkdev; + goto free_handle; } else { if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL)) { ret = -EIO; - goto put_blkdev; + goto free_handle; } } @@ -902,7 +894,16 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, handle->bdev = bdev; handle->holder = holder; handle->mode = mode; - return handle; + + bdev_file->f_flags |= O_LARGEFILE; + bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; + if (bdev_nowait(bdev)) + bdev_file->f_mode |= FMODE_NOWAIT; + bdev_file->f_mapping = handle->bdev->bd_inode->i_mapping; + bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping); + bdev_file->private_data = handle; + + return 0; put_module: module_put(disk->fops->owner); abort_claiming: @@ -910,11 +911,9 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, bd_abort_claiming(bdev, holder); mutex_unlock(&disk->open_mutex); disk_unblock_events(disk); -put_blkdev: - blkdev_put_no_open(bdev); free_handle: kfree(handle); - return ERR_PTR(ret); + return ret; } /* @@ -951,29 +950,33 @@ struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops) { struct file *bdev_file; - struct bdev_handle *handle; + struct block_device *bdev; unsigned int flags; + int ret; - handle = bdev_open_by_dev(dev, mode, holder, hops); - if (IS_ERR(handle)) - return ERR_CAST(handle); + ret = bdev_permission(dev, mode, holder); + if (ret) + return ERR_PTR(ret); + + bdev = blkdev_get_no_open(dev); + if (!bdev) + return ERR_PTR(-ENXIO); flags = blk_to_file_flags(mode); - bdev_file = alloc_file_pseudo_noaccount(handle->bdev->bd_inode, + bdev_file = alloc_file_pseudo_noaccount(bdev->bd_inode, blockdev_mnt, "", flags | O_LARGEFILE, &def_blk_fops); if (IS_ERR(bdev_file)) { - bdev_release(handle); + blkdev_put_no_open(bdev); return bdev_file; } - ihold(handle->bdev->bd_inode); + ihold(bdev->bd_inode); - bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; - if (bdev_nowait(handle->bdev)) - bdev_file->f_mode |= FMODE_NOWAIT; - - 
bdev_file->f_mapping = handle->bdev->bd_inode->i_mapping; - bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping); - bdev_file->private_data = handle; + ret = bdev_open(bdev, mode, holder, hops, bdev_file); + if (ret) { + blkdev_put_no_open(bdev); + fput(bdev_file); + return ERR_PTR(ret); + } return bdev_file; } EXPORT_SYMBOL(bdev_file_open_by_dev); diff --git a/block/blk.h b/block/blk.h index c9630774767d37..19b15870284f43 100644 --- a/block/blk.h +++ b/block/blk.h @@ -25,6 +25,12 @@ struct blk_flush_queue { struct request *flush_rq; }; +struct bdev_handle { + struct block_device *bdev; + void *holder; + blk_mode_t mode; +}; + bool is_flush_rq(struct request *req); struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, @@ -517,7 +523,7 @@ static inline int req_ref_read(struct request *req) } void bdev_release(struct bdev_handle *handle); -struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, - const struct blk_holder_ops *hops); - +int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, + const struct blk_holder_ops *hops, struct file *bdev_file); +int bdev_permission(dev_t dev, blk_mode_t mode, void *holder); #endif /* BLK_INTERNAL_H */ diff --git a/block/fops.c b/block/fops.c index 0cf8cf72cdfa10..a1ba1a50ae7787 100644 --- a/block/fops.c +++ b/block/fops.c @@ -599,36 +599,31 @@ blk_mode_t file_to_blk_mode(struct file *file) static int blkdev_open(struct inode *inode, struct file *filp) { - struct bdev_handle *handle; + struct block_device *bdev; blk_mode_t mode; - - /* - * Preserve backwards compatibility and allow large file access - * even if userspace doesn't ask for it explicitly. Some mkfs - * binary needs it. We might want to drop this workaround - * during an unstable branch. - */ - filp->f_flags |= O_LARGEFILE; - filp->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; + void *holder; + int ret; mode = file_to_blk_mode(filp); - handle = bdev_open_by_dev(inode->i_rdev, mode, - mode & BLK_OPEN_EXCL ? filp : NULL, NULL); - if (IS_ERR(handle)) - return PTR_ERR(handle); + holder = mode & BLK_OPEN_EXCL ? filp : NULL; + ret = bdev_permission(inode->i_rdev, mode, holder); + if (ret) + return ret; - if (bdev_nowait(handle->bdev)) - filp->f_mode |= FMODE_NOWAIT; + bdev = blkdev_get_no_open(inode->i_rdev); + if (!bdev) + return -ENXIO; - filp->f_mapping = handle->bdev->bd_inode->i_mapping; - filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); - filp->private_data = handle; - return 0; + ret = bdev_open(bdev, mode, holder, NULL, filp); + if (ret) + blkdev_put_no_open(bdev); + return ret; } static int blkdev_release(struct inode *inode, struct file *filp) { - bdev_release(filp->private_data); + if (filp->private_data) + bdev_release(filp->private_data); return 0; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 495f5558720719..2f5dbde23094a7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1475,13 +1475,6 @@ extern const struct blk_holder_ops fs_holder_ops; (BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES | \ (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE)) -/* @bdev_handle will be removed soon. 
*/ -struct bdev_handle { - struct block_device *bdev; - void *holder; - blk_mode_t mode; -}; - struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, diff --git a/include/linux/fs.h b/include/linux/fs.h index e9291e27cc47f3..6e0714d35d9b2a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1327,12 +1327,6 @@ struct super_block { struct list_head s_inodes_wb; /* writeback inodes */ } __randomize_layout; -/* Temporary helper that will go away. */ -static inline struct bdev_handle *sb_bdev_handle(struct super_block *sb) -{ - return sb->s_bdev_file->private_data; -} - static inline struct user_namespace *i_user_ns(const struct inode *inode) { return inode->i_sb->s_user_ns; From ef67835f49a8d74a0172f933ba987eb8789f48bf Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:47 +0100 Subject: [PATCH 0291/1406] bdev: remove bdev pointer from struct bdev_handle We can always go directly via: * I_BDEV(bdev_file->f_inode) * I_BDEV(bdev_file->f_mapping->host) So keeping struct bdev in struct bdev_handle is redundant. Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-30-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- block/bdev.c | 26 ++++++++++++-------------- block/blk.h | 3 +-- block/fops.c | 2 +- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 8f33f160e92328..4e4527c5df00b5 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -51,8 +51,7 @@ EXPORT_SYMBOL(I_BDEV); struct block_device *file_bdev(struct file *bdev_file) { - struct bdev_handle *handle = bdev_file->private_data; - return handle->bdev; + return I_BDEV(bdev_file->f_mapping->host); } EXPORT_SYMBOL(file_bdev); @@ -891,7 +890,6 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, if (unblock_events) disk_unblock_events(disk); - handle->bdev = bdev; handle->holder = holder; handle->mode = mode; @@ -899,7 +897,7 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; if (bdev_nowait(bdev)) bdev_file->f_mode |= FMODE_NOWAIT; - bdev_file->f_mapping = handle->bdev->bd_inode->i_mapping; + bdev_file->f_mapping = bdev->bd_inode->i_mapping; bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping); bdev_file->private_data = handle; @@ -985,7 +983,7 @@ struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops) { - struct file *bdev_file; + struct file *file; dev_t dev; int error; @@ -993,22 +991,22 @@ struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, if (error) return ERR_PTR(error); - bdev_file = bdev_file_open_by_dev(dev, mode, holder, hops); - if (!IS_ERR(bdev_file) && (mode & BLK_OPEN_WRITE)) { - struct bdev_handle *handle = bdev_file->private_data; - if (bdev_read_only(handle->bdev)) { - fput(bdev_file); - bdev_file = ERR_PTR(-EACCES); + file = bdev_file_open_by_dev(dev, mode, holder, hops); + if (!IS_ERR(file) && (mode & BLK_OPEN_WRITE)) { + if (bdev_read_only(file_bdev(file))) { + fput(file); + file = ERR_PTR(-EACCES); } } - return bdev_file; + return file; } EXPORT_SYMBOL(bdev_file_open_by_path); -void bdev_release(struct bdev_handle *handle) +void bdev_release(struct file *bdev_file) { - struct block_device *bdev = handle->bdev; + struct block_device *bdev = file_bdev(bdev_file); 
+ struct bdev_handle *handle = bdev_file->private_data; struct gendisk *disk = bdev->bd_disk; /* diff --git a/block/blk.h b/block/blk.h index 19b15870284f43..7ca24814f3a0db 100644 --- a/block/blk.h +++ b/block/blk.h @@ -26,7 +26,6 @@ struct blk_flush_queue { }; struct bdev_handle { - struct block_device *bdev; void *holder; blk_mode_t mode; }; @@ -522,7 +521,7 @@ static inline int req_ref_read(struct request *req) return atomic_read(&req->ref); } -void bdev_release(struct bdev_handle *handle); +void bdev_release(struct file *bdev_file); int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file); int bdev_permission(dev_t dev, blk_mode_t mode, void *holder); diff --git a/block/fops.c b/block/fops.c index a1ba1a50ae7787..aab9b89e4c77d8 100644 --- a/block/fops.c +++ b/block/fops.c @@ -623,7 +623,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) static int blkdev_release(struct inode *inode, struct file *filp) { if (filp->private_data) - bdev_release(filp->private_data); + bdev_release(filp); return 0; } From cb6fc1becbb9f0ffa813208039b9d1cb626eb71b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:48 +0100 Subject: [PATCH 0292/1406] block: don't rely on BLK_OPEN_RESTRICT_WRITES when yielding write access Make it possible to detect a block device that was opened with restricted write access based only on BLK_OPEN_WRITE and bdev->bd_writers < 0 so we won't have to claim another FMODE_* flag. Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-31-adbd023e19cc@kernel.org Signed-off-by: Christian Brauner --- block/bdev.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 4e4527c5df00b5..efecc9b97e1eca 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -799,16 +799,21 @@ static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode) bdev->bd_writers++; } -static void bdev_yield_write_access(struct block_device *bdev, blk_mode_t mode) +static void bdev_yield_write_access(struct file *bdev_file, blk_mode_t mode) { + struct block_device *bdev; + if (bdev_allow_write_mounted) return; + bdev = file_bdev(bdev_file); /* Yield exclusive or shared write access. */ - if (mode & BLK_OPEN_RESTRICT_WRITES) - bdev_unblock_writes(bdev); - else if (mode & BLK_OPEN_WRITE) - bdev->bd_writers--; + if (mode & BLK_OPEN_WRITE) { + if (bdev_writes_blocked(bdev)) + bdev_unblock_writes(bdev); + else + bdev->bd_writers--; + } } /** @@ -1020,7 +1025,7 @@ void bdev_release(struct file *bdev_file) sync_blockdev(bdev); mutex_lock(&disk->open_mutex); - bdev_yield_write_access(bdev, handle->mode); + bdev_yield_write_access(bdev_file, handle->mode); if (handle->holder) bd_end_claim(bdev, handle->holder); From c2f16dcf04384612e9e4b6da25bbcd5e96dbf04f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:49 +0100 Subject: [PATCH 0293/1406] block: remove bdev_handle completely We just need to use the holder to indicate whether a block device open was exclusive or not. We used to do that before but had to give that up once we switched to struct bdev_handle. Before struct bdev_handle we only stashed stuff in file->private_data if this was an exclusive open but after struct bdev_handle we always set file->private_data to a struct bdev_handle and so we had to use bdev_handle->mode or bdev_handle->holder. Now that we don't use struct bdev_handle anymore we can revert back to the old behavior.
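To make the convention concrete, here is a minimal userspace sketch of the idea (an illustration only, not the kernel code; struct fake_file, open_file() and opened_exclusively() are invented names for this example):

    #include <assert.h>
    #include <stdio.h>

    struct fake_file {
            void *private_data;     /* NULL, or the holder of an exclusive open */
    };

    /* Open: only an exclusive opener stashes a holder (here: the file itself). */
    static void open_file(struct fake_file *f, int exclusive)
    {
            f->private_data = exclusive ? (void *)f : NULL;
    }

    /* Later code can infer "was this open exclusive?" from the holder alone. */
    static int opened_exclusively(const struct fake_file *f)
    {
            return f->private_data != NULL;
    }

    int main(void)
    {
            struct fake_file shared = { NULL }, excl = { NULL };

            open_file(&shared, 0);
            open_file(&excl, 1);
            assert(!opened_exclusively(&shared));
            assert(opened_exclusively(&excl));
            printf("shared=%d excl=%d\n",
                   opened_exclusively(&shared), opened_exclusively(&excl));
            return 0;
    }

This mirrors what blkdev_open() and file_to_blk_mode() do with filp->private_data in the hunks below.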
Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-32-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- block/bdev.c | 47 ++++++++++++++++++++++++----------------------- block/blk.h | 5 ----- block/fops.c | 21 ++++++++++----------- 3 files changed, 34 insertions(+), 39 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index efecc9b97e1eca..140093c99bdcf8 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -717,6 +717,13 @@ int bdev_permission(dev_t dev, blk_mode_t mode, void *holder) if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) return -EINVAL; + /* + * We're using error pointers to indicate to ->release() when we + * failed to open that block device. Also this doesn't make sense. + */ + if (WARN_ON_ONCE(IS_ERR(holder))) + return -EINVAL; + return 0; } @@ -799,7 +806,7 @@ static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode) bdev->bd_writers++; } -static void bdev_yield_write_access(struct file *bdev_file, blk_mode_t mode) +static void bdev_yield_write_access(struct file *bdev_file) { struct block_device *bdev; @@ -808,7 +815,7 @@ static void bdev_yield_write_access(struct file *bdev_file, blk_mode_t mode) bdev = file_bdev(bdev_file); /* Yield exclusive or shared write access. */ - if (mode & BLK_OPEN_WRITE) { + if (bdev_file->f_mode & FMODE_WRITE) { if (bdev_writes_blocked(bdev)) bdev_unblock_writes(bdev); else @@ -836,25 +843,18 @@ static void bdev_yield_write_access(struct file *bdev_file, blk_mode_t mode) int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file) { - struct bdev_handle *handle; bool unblock_events = true; struct gendisk *disk = bdev->bd_disk; int ret; - handle = kmalloc(sizeof(struct bdev_handle), GFP_KERNEL); - if (!handle) - return -ENOMEM; - if (holder) { mode |= BLK_OPEN_EXCL; ret = bd_prepare_to_claim(bdev, holder, hops); if (ret) - goto free_handle; + return ret; } else { - if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL)) { - ret = -EIO; - goto free_handle; - } + if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL)) + return -EIO; } disk_block_events(disk); @@ -895,8 +895,6 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, if (unblock_events) disk_unblock_events(disk); - handle->holder = holder; - handle->mode = mode; bdev_file->f_flags |= O_LARGEFILE; bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; @@ -904,7 +902,7 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, bdev_file->f_mode |= FMODE_NOWAIT; bdev_file->f_mapping = bdev->bd_inode->i_mapping; bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping); - bdev_file->private_data = handle; + bdev_file->private_data = holder; return 0; put_module: @@ -914,8 +912,6 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, bd_abort_claiming(bdev, holder); mutex_unlock(&disk->open_mutex); disk_unblock_events(disk); -free_handle: - kfree(handle); return ret; } @@ -976,7 +972,8 @@ struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, ret = bdev_open(bdev, mode, holder, hops, bdev_file); if (ret) { - blkdev_put_no_open(bdev); + /* We failed to open the block device. Let ->release() know. 
*/ + bdev_file->private_data = ERR_PTR(ret); fput(bdev_file); return ERR_PTR(ret); } @@ -1011,9 +1008,13 @@ EXPORT_SYMBOL(bdev_file_open_by_path); void bdev_release(struct file *bdev_file) { struct block_device *bdev = file_bdev(bdev_file); - struct bdev_handle *handle = bdev_file->private_data; + void *holder = bdev_file->private_data; struct gendisk *disk = bdev->bd_disk; + /* We failed to open that block device. */ + if (IS_ERR(holder)) + goto put_no_open; + /* * Sync early if it looks like we're the last one. If someone else * opens the block device between now and the decrement of bd_openers @@ -1025,10 +1026,10 @@ void bdev_release(struct file *bdev_file) sync_blockdev(bdev); mutex_lock(&disk->open_mutex); - bdev_yield_write_access(bdev_file, handle->mode); + bdev_yield_write_access(bdev_file); - if (handle->holder) - bd_end_claim(bdev, handle->holder); + if (holder) + bd_end_claim(bdev, holder); /* * Trigger event checking and tell drivers to flush MEDIA_CHANGE @@ -1044,8 +1045,8 @@ void bdev_release(struct file *bdev_file) mutex_unlock(&disk->open_mutex); module_put(disk->fops->owner); +put_no_open: blkdev_put_no_open(bdev); - kfree(handle); } /** diff --git a/block/blk.h b/block/blk.h index 7ca24814f3a0db..f02b25f22e8b34 100644 --- a/block/blk.h +++ b/block/blk.h @@ -25,11 +25,6 @@ struct blk_flush_queue { struct request *flush_rq; }; -struct bdev_handle { - void *holder; - blk_mode_t mode; -}; - bool is_flush_rq(struct request *req); struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, diff --git a/block/fops.c b/block/fops.c index aab9b89e4c77d8..029e787f011971 100644 --- a/block/fops.c +++ b/block/fops.c @@ -569,18 +569,17 @@ static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, blk_mode_t file_to_blk_mode(struct file *file) { blk_mode_t mode = 0; - struct bdev_handle *handle = file->private_data; if (file->f_mode & FMODE_READ) mode |= BLK_OPEN_READ; if (file->f_mode & FMODE_WRITE) mode |= BLK_OPEN_WRITE; /* - * do_dentry_open() clears O_EXCL from f_flags, use handle->mode to - * determine whether the open was exclusive for already open files. + * do_dentry_open() clears O_EXCL from f_flags, use file->private_data + * to determine whether the open was exclusive for already open files. */ - if (handle) - mode |= handle->mode & BLK_OPEN_EXCL; + if (file->private_data) + mode |= BLK_OPEN_EXCL; else if (file->f_flags & O_EXCL) mode |= BLK_OPEN_EXCL; if (file->f_flags & O_NDELAY) @@ -601,12 +600,13 @@ static int blkdev_open(struct inode *inode, struct file *filp) { struct block_device *bdev; blk_mode_t mode; - void *holder; int ret; mode = file_to_blk_mode(filp); - holder = mode & BLK_OPEN_EXCL ? filp : NULL; - ret = bdev_permission(inode->i_rdev, mode, holder); + /* Use the file as the holder. 
*/ + if (mode & BLK_OPEN_EXCL) + filp->private_data = filp; + ret = bdev_permission(inode->i_rdev, mode, filp->private_data); if (ret) return ret; @@ -614,7 +614,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) if (!bdev) return -ENXIO; - ret = bdev_open(bdev, mode, holder, NULL, filp); + ret = bdev_open(bdev, mode, filp->private_data, NULL, filp); if (ret) blkdev_put_no_open(bdev); return ret; @@ -622,8 +622,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) static int blkdev_release(struct inode *inode, struct file *filp) { - if (filp->private_data) - bdev_release(filp); + bdev_release(filp); return 0; } From cd9f863c1f0c2f3e41629a32a5fc6bca2667e64b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Feb 2024 03:40:23 -0800 Subject: [PATCH 0294/1406] string: Allow 2-argument strscpy_pad() Similar to strscpy(), update strscpy_pad()'s 3rd argument to be optional when the destination is a compile-time known size array. Cc: Andy Shevchenko Cc: Reviewed-by: Justin Stitt Signed-off-by: Kees Cook --- include/linux/string.h | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/include/linux/string.h b/include/linux/string.h index 0d66bf9407fdd4..96e6b1af86b5aa 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -77,6 +77,10 @@ ssize_t sized_strscpy(char *, const char *, size_t); sized_strscpy(dst, src, sizeof(dst) + __must_be_array(dst)) #define __strscpy1(dst, src, size) sized_strscpy(dst, src, size) +#define __strscpy_pad0(dst, src, ...) \ + sized_strscpy_pad(dst, src, sizeof(dst) + __must_be_array(dst)) +#define __strscpy_pad1(dst, src, size) sized_strscpy_pad(dst, src, size) + /** * strscpy - Copy a C-string into a sized buffer * @dst: Where to copy the string to @@ -102,11 +106,23 @@ ssize_t sized_strscpy(char *, const char *, size_t); #define strscpy(dst, src, ...) \ CONCATENATE(__strscpy, COUNT_ARGS(__VA_ARGS__))(dst, src, __VA_ARGS__) +#define sized_strscpy_pad(dest, src, count) ({ \ + char *__dst = (dest); \ + const char *__src = (src); \ + const size_t __count = (count); \ + ssize_t __wrote; \ + \ + __wrote = sized_strscpy(__dst, __src, __count); \ + if (__wrote >= 0 && __wrote < __count) \ + memset(__dst + __wrote + 1, 0, __count - __wrote - 1); \ + __wrote; \ +}) + /** * strscpy_pad() - Copy a C-string into a sized buffer - * @dest: Where to copy the string to + * @dst: Where to copy the string to * @src: Where to copy the string from - * @count: Size of destination buffer + * @...: Size of destination buffer * * Copy the string, or as much of it as fits, into the dest buffer. The * behavior is undefined if the string buffers overlap. The destination @@ -122,17 +138,8 @@ ssize_t sized_strscpy(char *, const char *, size_t); * * The number of characters copied (not including the trailing %NULs) * * -E2BIG if count is 0 or @src was truncated. */ -#define strscpy_pad(dest, src, count) ({ \ - char *__dst = (dest); \ - const char *__src = (src); \ - const size_t __count = (count); \ - ssize_t __wrote; \ - \ - __wrote = strscpy(__dst, __src, __count); \ - if (__wrote >= 0 && __wrote < __count) \ - memset(__dst + __wrote + 1, 0, __count - __wrote - 1); \ - __wrote; \ -}) +#define strscpy_pad(dst, src, ...) 
\ + CONCATENATE(__strscpy_pad, COUNT_ARGS(__VA_ARGS__))(dst, src, __VA_ARGS__) #ifndef __HAVE_ARCH_STRCAT extern char * strcat(char *, const char *); From 991ce752683ae8f7f969a4f330da4cb35b2d31d8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Feb 2024 03:55:00 -0800 Subject: [PATCH 0295/1406] um: Convert strscpy() usage to 2-argument style The ARCH=um build has its own idea about strscpy()'s definition. Adjust the callers to remove the redundant sizeof() arguments ahead of treewide changes, since it needs a manual adjustment for the newly named sized_strscpy() export. Cc: Richard Weinberger Cc: linux-um@lists.infradead.org Signed-off-by: Kees Cook --- arch/um/drivers/net_kern.c | 2 +- arch/um/drivers/vector_kern.c | 2 +- arch/um/drivers/vector_user.c | 4 ++-- arch/um/include/shared/user.h | 2 +- arch/um/os-Linux/drivers/ethertap_user.c | 2 +- arch/um/os-Linux/drivers/tuntap_user.c | 2 +- arch/um/os-Linux/umid.c | 6 +++--- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c index cabcc501b448a3..77c4afb8ab9071 100644 --- a/arch/um/drivers/net_kern.c +++ b/arch/um/drivers/net_kern.c @@ -265,7 +265,7 @@ static void uml_net_poll_controller(struct net_device *dev) static void uml_net_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { - strscpy(info->driver, DRIVER_NAME, sizeof(info->driver)); + strscpy(info->driver, DRIVER_NAME); } static const struct ethtool_ops uml_net_ethtool_ops = { diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 131b7cb2957672..dc2feae789cbb2 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -1373,7 +1373,7 @@ static void vector_net_poll_controller(struct net_device *dev) static void vector_net_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { - strscpy(info->driver, DRIVER_NAME, sizeof(info->driver)); + strscpy(info->driver, DRIVER_NAME); } static int vector_net_load_bpf_flash(struct net_device *dev, diff --git a/arch/um/drivers/vector_user.c b/arch/um/drivers/vector_user.c index c719e1ec464512..b16a5e5619d31f 100644 --- a/arch/um/drivers/vector_user.c +++ b/arch/um/drivers/vector_user.c @@ -141,7 +141,7 @@ static int create_tap_fd(char *iface) } memset(&ifr, 0, sizeof(ifr)); ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; - strscpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name)); + strscpy(ifr.ifr_name, iface); err = ioctl(fd, TUNSETIFF, (void *) &ifr); if (err != 0) { @@ -171,7 +171,7 @@ static int create_raw_fd(char *iface, int flags, int proto) goto raw_fd_cleanup; } memset(&ifr, 0, sizeof(ifr)); - strscpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name)); + strscpy(ifr.ifr_name, iface); if (ioctl(fd, SIOCGIFINDEX, (void *) &ifr) < 0) { err = -errno; goto raw_fd_cleanup; diff --git a/arch/um/include/shared/user.h b/arch/um/include/shared/user.h index 9568cc04cbb7b0..326e52450e4143 100644 --- a/arch/um/include/shared/user.h +++ b/arch/um/include/shared/user.h @@ -52,7 +52,7 @@ static inline int printk(const char *fmt, ...) 
extern int in_aton(char *str); extern size_t strlcat(char *, const char *, size_t); extern size_t sized_strscpy(char *, const char *, size_t); -#define strscpy(dst, src, size) sized_strscpy(dst, src, size) +#define strscpy(dst, src) sized_strscpy(dst, src, sizeof(dst)) /* Copied from linux/compiler-gcc.h since we can't include it directly */ #define barrier() __asm__ __volatile__("": : :"memory") diff --git a/arch/um/os-Linux/drivers/ethertap_user.c b/arch/um/os-Linux/drivers/ethertap_user.c index 3363851a4ae81e..bdf215c0eca75d 100644 --- a/arch/um/os-Linux/drivers/ethertap_user.c +++ b/arch/um/os-Linux/drivers/ethertap_user.c @@ -105,7 +105,7 @@ static int etap_tramp(char *dev, char *gate, int control_me, sprintf(data_fd_buf, "%d", data_remote); sprintf(version_buf, "%d", UML_NET_VERSION); if (gate != NULL) { - strscpy(gate_buf, gate, sizeof(gate_buf)); + strscpy(gate_buf, gate); args = setup_args; } else args = nosetup_args; diff --git a/arch/um/os-Linux/drivers/tuntap_user.c b/arch/um/os-Linux/drivers/tuntap_user.c index 2284e9c1cbbbf7..91f0e27ca3a6e4 100644 --- a/arch/um/os-Linux/drivers/tuntap_user.c +++ b/arch/um/os-Linux/drivers/tuntap_user.c @@ -146,7 +146,7 @@ static int tuntap_open(void *data) } memset(&ifr, 0, sizeof(ifr)); ifr.ifr_flags = IFF_TAP | IFF_NO_PI; - strscpy(ifr.ifr_name, pri->dev_name, sizeof(ifr.ifr_name)); + strscpy(ifr.ifr_name, pri->dev_name); if (ioctl(pri->fd, TUNSETIFF, &ifr) < 0) { err = -errno; printk(UM_KERN_ERR "TUNSETIFF failed, errno = %d\n", diff --git a/arch/um/os-Linux/umid.c b/arch/um/os-Linux/umid.c index 288c422bfa9647..e09d65b05d1cab 100644 --- a/arch/um/os-Linux/umid.c +++ b/arch/um/os-Linux/umid.c @@ -40,7 +40,7 @@ static int __init make_uml_dir(void) __func__); goto err; } - strscpy(dir, home, sizeof(dir)); + strscpy(dir, home); uml_dir++; } strlcat(dir, uml_dir, sizeof(dir)); @@ -243,7 +243,7 @@ int __init set_umid(char *name) if (strlen(name) > UMID_LEN - 1) return -E2BIG; - strscpy(umid, name, sizeof(umid)); + strscpy(umid, name); return 0; } @@ -262,7 +262,7 @@ static int __init make_umid(void) make_uml_dir(); if (*umid == '\0') { - strscpy(tmp, uml_dir, sizeof(tmp)); + strscpy(tmp, uml_dir); strlcat(tmp, "XXXXXX", sizeof(tmp)); fd = mkstemp(tmp); if (fd < 0) { From e293defd26cb1365ecb6d795b16a1987c1ac4f0f Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 8 Feb 2024 10:10:44 +0100 Subject: [PATCH 0296/1406] hardening: drop obsolete UBSAN_SANITIZE_ALL from config fragment Commit 7a628f818499 ("ubsan: Remove CONFIG_UBSAN_SANITIZE_ALL") removes the config UBSAN_SANITIZE_ALL, but one reference to that config is left in the hardening.config fragment. Drop this reference in hardening.config fragment. Note that CONFIG_UBSAN is still enabled in the hardening.config fragment, so the functionality when using this fragment remains the same. Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20240208091045.9219-2-lukas.bulwahn@gmail.com Signed-off-by: Kees Cook --- kernel/configs/hardening.config | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index 95a400f042b12c..4dc0cd342cede3 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -44,7 +44,6 @@ CONFIG_UBSAN_BOUNDS=y # CONFIG_UBSAN_BOOL # CONFIG_UBSAN_ENUM # CONFIG_UBSAN_ALIGNMENT -CONFIG_UBSAN_SANITIZE_ALL=y # Linked list integrity checking. 
CONFIG_LIST_HARDENED=y From b5c35b957eee55fac7770d6dd724b0b09254b950 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 8 Feb 2024 10:10:45 +0100 Subject: [PATCH 0297/1406] hardening: drop obsolete DRM_LEGACY from config fragment Commit 94f8f319cbcb ("drm: Remove Kconfig option for legacy support (CONFIG_DRM_LEGACY)") removes the config DRM_LEGACY, but one reference to that config is left in the hardening.config fragment. As there is no drm legacy driver left, we do not need to recommend this attack surface reduction anymore. Drop this reference in hardening.config fragment. Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20240208091045.9219-3-lukas.bulwahn@gmail.com Signed-off-by: Kees Cook --- kernel/configs/hardening.config | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index 4dc0cd342cede3..ed126d7b5e83b7 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -92,6 +92,3 @@ CONFIG_SYN_COOKIES=y # Attack surface reduction: Use the modern PTY interface (devpts) only. # CONFIG_LEGACY_PTYS is not set - -# Attack surface reduction: Use only modesetting video drivers. -# CONFIG_DRM_LEGACY is not set From 3f643cd2351099e6b859533b6f984463e5315e5f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 9 Feb 2024 15:49:45 +0100 Subject: [PATCH 0298/1406] pidfd: allow to override signal scope in pidfd_send_signal() Right now we determine the scope of the signal based on the type of pidfd. There are use-cases where it's useful to override the scope of the signal. For example in [1]. Add flags to determine the scope of the signal: (1) PIDFD_SIGNAL_THREAD: send signal to specific thread referenced by @pidfd (2) PIDFD_SIGNAL_THREAD_GROUP: send signal to thread-group of @pidfd (3) PIDFD_SIGNAL_PROCESS_GROUP: send signal to process-group of @pidfd Reviewed-by: Oleg Nesterov Link: https://github.com/systemd/systemd/issues/31093 [1] Link: https://lore.kernel.org/r/20240210-chihuahua-hinzog-3945b6abd44a@brauner Signed-off-by: Christian Brauner --- include/uapi/linux/pidfd.h | 5 +++++ kernel/signal.c | 44 +++++++++++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index 2e6461459877ba..72ec000a97cda3 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -10,4 +10,9 @@ #define PIDFD_NONBLOCK O_NONBLOCK #define PIDFD_THREAD O_EXCL +/* Flags for pidfd_send_signal(). 
*/ +#define PIDFD_SIGNAL_THREAD (1UL << 0) +#define PIDFD_SIGNAL_THREAD_GROUP (1UL << 1) +#define PIDFD_SIGNAL_PROCESS_GROUP (1UL << 2) + #endif /* _UAPI_LINUX_PIDFD_H */ diff --git a/kernel/signal.c b/kernel/signal.c index 8b81696238503c..cf6539a6b1cb36 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1905,16 +1905,19 @@ int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno, return send_sig_info(info.si_signo, &info, t); } -int kill_pgrp(struct pid *pid, int sig, int priv) +static int kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) { int ret; - read_lock(&tasklist_lock); - ret = __kill_pgrp_info(sig, __si_special(priv), pid); + ret = __kill_pgrp_info(sig, info, pgrp); read_unlock(&tasklist_lock); - return ret; } + +int kill_pgrp(struct pid *pid, int sig, int priv) +{ + return kill_pgrp_info(sig, __si_special(priv), pid); +} EXPORT_SYMBOL(kill_pgrp); int kill_pid(struct pid *pid, int sig, int priv) @@ -3873,6 +3876,10 @@ static struct pid *pidfd_to_pid(const struct file *file) return tgid_pidfd_to_pid(file); } +#define PIDFD_SEND_SIGNAL_FLAGS \ + (PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \ + PIDFD_SIGNAL_PROCESS_GROUP) + /** * sys_pidfd_send_signal - Signal a process through a pidfd * @pidfd: file descriptor of the process @@ -3897,7 +3904,11 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, enum pid_type type; /* Enforce flags be set to 0 until we add an extension. */ - if (flags) + if (flags & ~PIDFD_SEND_SIGNAL_FLAGS) return -EINVAL; + + /* Ensure that only a single signal scope determining flag is set. */ + if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1) return -EINVAL; f = fdget(pidfd); @@ -3915,10 +3926,24 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, if (!access_pidfd_pidns(pid)) goto err; - if (f.file->f_flags & PIDFD_THREAD) + switch (flags) { + case 0: + /* Infer scope from the type of pidfd. */ + if (f.file->f_flags & PIDFD_THREAD) + type = PIDTYPE_PID; + else + type = PIDTYPE_TGID; + break; + case PIDFD_SIGNAL_THREAD: type = PIDTYPE_PID; - else + break; + case PIDFD_SIGNAL_THREAD_GROUP: type = PIDTYPE_TGID; + break; + case PIDFD_SIGNAL_PROCESS_GROUP: + type = PIDTYPE_PGID; + break; + } if (info) { ret = copy_siginfo_from_user_any(&kinfo, info); @@ -3938,7 +3963,10 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, prepare_kill_siginfo(sig, &kinfo, type); } - ret = kill_pid_info_type(sig, &kinfo, pid, type); + if (type == PIDTYPE_PGID) + ret = kill_pgrp_info(sig, &kinfo, pid); + else + ret = kill_pid_info_type(sig, &kinfo, pid, type); err: fdput(f); return ret; From 3fd5f075759bc0026e4b26393f209ba33a3fb1c3 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 9 Feb 2024 11:40:16 +0000 Subject: [PATCH 0299/1406] dt-bindings: vendor-prefixes: add Jide Jide Tech once created the Remix OS Android system and shipped it on some custom hardware. Add their name to the bindings. 
Signed-off-by: Andre Przywara Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240209114018.3580370-2-andre.przywara@arm.com Signed-off-by: Jernej Skrabec --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index 1a0dc04f1db478..dfb834a9279f20 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -719,6 +719,8 @@ patternProperties: description: JetHome (IP Sokolov P.A.) "^jianda,.*": description: Jiandangjing Technology Co., Ltd. + "^jide,.*": + description: Jide Tech "^joz,.*": description: JOZ BV "^kam,.*": From e71abf65df471cb6eaf67ad25bab4996d5bb3f9a Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 9 Feb 2024 11:40:17 +0000 Subject: [PATCH 0300/1406] dt-bindings: arm: sunxi: document Remix Mini PC name The Jide Remix Mini PC is a mini computer that ships with the Remix OS Android based system. The SoC is an Allwinner H64, which is very close, if not identical to the Allwinner A64. Add the board/SoC compatible string pair to the list of known boards. There are some drivers that look explicitly for the A64 compatible name, so retain this name to increase compatibility. Signed-off-by: Andre Przywara Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240209114018.3580370-3-andre.przywara@arm.com Signed-off-by: Jernej Skrabec --- Documentation/devicetree/bindings/arm/sunxi.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/arm/sunxi.yaml b/Documentation/devicetree/bindings/arm/sunxi.yaml index a9d8e85565b899..dab7a248c88da2 100644 --- a/Documentation/devicetree/bindings/arm/sunxi.yaml +++ b/Documentation/devicetree/bindings/arm/sunxi.yaml @@ -815,6 +815,12 @@ properties: - const: allwinner,r7-tv-dongle - const: allwinner,sun5i-a10s + - description: Remix Mini PC + items: + - const: jide,remix-mini-pc + - const: allwinner,sun50i-h64 + - const: allwinner,sun50i-a64 + - description: RerVision H3-DVK items: - const: rervision,h3-dvk From f0d86f545a47f6c6ea9c5b78d3da1b5dae4669bb Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 9 Feb 2024 11:40:18 +0000 Subject: [PATCH 0301/1406] arm64: dts: allwinner: Add Jide Remix Mini PC support The Remix Mini PC is a "mini computer" using the Allwinner H64 SoC, which appears to be just a relabelled A64. It was launched in 2015 by the now defunct company Jide, and shipped with a desktop optimised version of Android. It features - Allwinner H64 Soc (4 * Arm Cortex-A53 cores) - 1 or 2 GB DRAM - 8 or 16 GB eMMC flash - 100 MBit Ethernet port (using an X-Powers AC200 PHY) - RTL8723BS WiFi & Bluetooth chip - HDMI port - two USB 2.0 ports - 3.5mm AV port - microSD card slot The devicetree covers most peripherals, though there is no agreed binding for the PHY chip yet, so this is left out. The eMMC did not work with the MMC DDR speed mode, so this mode property is omitted. 
Signed-off-by: Andre Przywara Reviewed-by: Jernej Skrabec Link: https://lore.kernel.org/r/20240209114018.3580370-4-andre.przywara@arm.com Signed-off-by: Jernej Skrabec --- arch/arm64/boot/dts/allwinner/Makefile | 1 + .../allwinner/sun50i-h64-remix-mini-pc.dts | 356 ++++++++++++++++++ 2 files changed, 357 insertions(+) create mode 100644 arch/arm64/boot/dts/allwinner/sun50i-h64-remix-mini-pc.dts diff --git a/arch/arm64/boot/dts/allwinner/Makefile b/arch/arm64/boot/dts/allwinner/Makefile index 91d505b385de5a..2db3b15ad09f2c 100644 --- a/arch/arm64/boot/dts/allwinner/Makefile +++ b/arch/arm64/boot/dts/allwinner/Makefile @@ -16,6 +16,7 @@ dtb-$(CONFIG_ARCH_SUNXI) += sun50i-a64-pinetab.dtb dtb-$(CONFIG_ARCH_SUNXI) += sun50i-a64-pinetab-early-adopter.dtb dtb-$(CONFIG_ARCH_SUNXI) += sun50i-a64-sopine-baseboard.dtb dtb-$(CONFIG_ARCH_SUNXI) += sun50i-a64-teres-i.dtb +dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h64-remix-mini-pc.dtb dtb-$(CONFIG_ARCH_SUNXI) += sun50i-a100-allwinner-perf1.dtb dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h5-bananapi-m2-plus.dtb dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h5-bananapi-m2-plus-v1.2.dtb diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h64-remix-mini-pc.dts b/arch/arm64/boot/dts/allwinner/sun50i-h64-remix-mini-pc.dts new file mode 100644 index 00000000000000..b6e3c169797f05 --- /dev/null +++ b/arch/arm64/boot/dts/allwinner/sun50i-h64-remix-mini-pc.dts @@ -0,0 +1,356 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +// Copyright (c) 2023 ARM Ltd. + +/dts-v1/; + +#include "sun50i-a64.dtsi" +#include "sun50i-a64-cpu-opp.dtsi" + +#include <dt-bindings/gpio/gpio.h> + +/ { + model = "Remix Mini PC"; + compatible = "jide,remix-mini-pc", "allwinner,sun50i-h64", + "allwinner,sun50i-a64"; + + aliases { + ethernet1 = &rtl8723bs; + serial0 = &uart0; + }; + + chosen { + stdout-path = "serial0:115200n8"; + }; + + hdmi-connector { + compatible = "hdmi-connector"; + type = "a"; + + port { + hdmi_con_in: endpoint { + remote-endpoint = <&hdmi_out_con>; + }; + }; + }; + + reg_vcc5v: regulator-5v { + /* board wide 5V supply directly from the DC input */ + compatible = "regulator-fixed"; + regulator-name = "vcc-5v"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + regulator-always-on; + }; + + wifi_pwrseq: wifi_pwrseq { + compatible = "mmc-pwrseq-simple"; + reset-gpios = <&r_pio 0 2 GPIO_ACTIVE_LOW>; /* PL2 */ + post-power-on-delay-ms = <200>; + }; +}; + +&codec { + status = "okay"; +}; + +&codec_analog { + cpvdd-supply = <&reg_eldo1>; + status = "okay"; +}; + +&cpu0 { + cpu-supply = <&reg_dcdc2>; }; + +&cpu1 { + cpu-supply = <&reg_dcdc2>; +}; + +&cpu2 { + cpu-supply = <&reg_dcdc2>; +}; + +&cpu3 { + cpu-supply = <&reg_dcdc2>; +}; + +&dai { + status = "okay"; +}; + +&de { + status = "okay"; +}; + +&ehci0 { + status = "okay"; +}; + +&ehci1 { + status = "okay"; +}; + +&hdmi { + hvcc-supply = <&reg_dldo1>; + status = "okay"; +}; + +&hdmi_out { + hdmi_out_con: endpoint { + remote-endpoint = <&hdmi_con_in>; + }; +}; + +/* Connects to the AC200 chip */ +&i2c0 { + pinctrl-names = "default"; + pinctrl-0 = <&i2c0_pins>; + status = "okay"; +}; + +&i2c0_pins { + bias-pull-up; +}; + +&mmc0 { + pinctrl-names = "default"; + pinctrl-0 = <&mmc0_pins>; + vmmc-supply = <&reg_dcdc1>; + cd-gpios = <&pio 5 6 GPIO_ACTIVE_LOW>; + disable-wp; + bus-width = <4>; + status = "okay"; +}; + +&mmc1 { + pinctrl-names = "default"; + pinctrl-0 = <&mmc1_pins>; + vmmc-supply = <&reg_aldo1>; + vqmmc-supply = <&reg_dldo4>; + mmc-pwrseq = <&wifi_pwrseq>; + bus-width = <4>; + non-removable; + status = "okay"; + + rtl8723bs: wifi@1 { + reg = <1>; 
interrupt-parent = <&r_pio>; + interrupts = <0 3 IRQ_TYPE_LEVEL_LOW>; /* PL3 */ + interrupt-names = "host-wake"; + }; +}; + +&mmc2 { + pinctrl-names = "default"; + pinctrl-0 = <&mmc2_pins>, <&mmc2_ds_pin>; + vmmc-supply = <&reg_dcdc1>; + vqmmc-supply = <&reg_eldo1>; + bus-width = <8>; + non-removable; + mmc-hs200-1_8v; + mmc-hs400-1_8v; + cap-mmc-hw-reset; + status = "okay"; +}; + +&ohci0 { + status = "okay"; +}; + +&ohci1 { + status = "okay"; +}; + +&pio { + vcc-pb-supply = <&reg_dcdc1>; + vcc-pc-supply = <&reg_dcdc1>; + vcc-pd-supply = <&reg_dcdc1>; + vcc-pe-supply = <&reg_dcdc1>; + vcc-pf-supply = <&reg_dcdc1>; + vcc-pg-supply = <&reg_dldo4>; + vcc-ph-supply = <&reg_dcdc1>; +}; + +&r_ir { + status = "okay"; +}; + +&r_pio { + /* + * We cannot add that supply for now since it would create a circular + * dependency between pinctrl, the regulator and the RSB Bus. + * + * vcc-pl-supply = <&reg_aldo2>; + */ +}; + +&r_rsb { + status = "okay"; + + axp803: pmic@3a3 { + compatible = "x-powers,axp803"; + reg = <0x3a3>; + interrupt-parent = <&r_intc>; + interrupts = <GIC_SPI 32 IRQ_TYPE_LEVEL_LOW>; + x-powers,drive-vbus-en; + + vin1-supply = <&reg_vcc5v>; + vin2-supply = <&reg_vcc5v>; + vin3-supply = <&reg_vcc5v>; + vin5-supply = <&reg_vcc5v>; + vin6-supply = <&reg_vcc5v>; + aldoin-supply = <&reg_vcc5v>; + dldoin-supply = <&reg_vcc5v>; + eldoin-supply = <&reg_vcc5v>; + fldoin-supply = <&reg_vcc5v>; + drivevbus-supply = <&reg_vcc5v>; + ips-supply = <&reg_vcc5v>; + + status = "okay"; + }; +}; + +#include "axp803.dtsi" + +&ac_power_supply { + status = "okay"; +}; + +&reg_dcdc1 { + regulator-always-on; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + regulator-name = "vcc-3v3"; +}; + +&reg_dcdc2 { + regulator-always-on; + regulator-min-microvolt = <1040000>; + regulator-max-microvolt = <1300000>; + regulator-name = "vdd-cpux"; +}; + +/* DCDC3 is polyphased with DCDC2 */ + +&reg_dcdc5 { + regulator-always-on; + regulator-min-microvolt = <1500000>; + regulator-max-microvolt = <1500000>; + regulator-name = "vcc-dram"; +}; + +/* Deviates from the reset default of 1.1V. */ +&reg_dcdc6 { + regulator-always-on; + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <1200000>; + regulator-name = "vdd-sys"; +}; + +&reg_aldo1 { + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + regulator-name = "vcc-wifi"; +}; + +&reg_aldo2 { + /* Specifying R_PIO consumer would create circular dependency. */ + regulator-always-on; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + regulator-name = "vcc-pl"; +}; + +&reg_aldo3 { + regulator-always-on; + regulator-min-microvolt = <3000000>; + regulator-max-microvolt = <3000000>; + regulator-name = "vcc-pll-avcc"; +}; + +/* AC200 power supply */ +&reg_dldo1 { + regulator-always-on; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + regulator-name = "vcc-ave-33"; +}; + +&reg_dldo4 { + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + regulator-name = "vcc-wifi-io"; +}; + +&reg_drivevbus { + regulator-name = "usb0-vbus"; + status = "okay"; +}; + +&reg_eldo1 { + regulator-always-on; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + regulator-name = "vcc-cpvdd-dram-emmc"; +}; + +/* Supplies the arisc management core, needed by TF-A to power off cores. 
*/ +&reg_fldo2 { + regulator-always-on; + regulator-min-microvolt = <1100000>; + regulator-max-microvolt = <1100000>; + regulator-name = "vdd-cpus"; +}; + +&reg_rtc_ldo { + regulator-name = "vcc-rtc"; +}; + +&simplefb_hdmi { + vcc-hdmi-supply = <&reg_dcdc1>; +}; + +&sound { + simple-audio-card,aux-devs = <&codec_analog>; + simple-audio-card,widgets = "Microphone", "Microphone Jack", + "Headphone", "Headphone Jack"; + simple-audio-card,routing = + "Left DAC", "DACL", + "Right DAC", "DACR", + "Headphone Jack", "HP", + "ADCL", "Left ADC", + "ADCR", "Right ADC", + "MIC2", "Microphone Jack"; + status = "okay"; +}; + +/* On the (unpopulated) UART pads. */ +&uart0 { + pinctrl-names = "default"; + pinctrl-0 = <&uart0_pb_pins>; + status = "okay"; +}; + +&uart1 { + pinctrl-names = "default"; + pinctrl-0 = <&uart1_pins>, <&uart1_rts_cts_pins>; + uart-has-rtscts; + status = "okay"; + + bluetooth { + compatible = "realtek,rtl8723bs-bt"; + enable-gpios = <&r_pio 0 4 GPIO_ACTIVE_HIGH>; /* PL4 */ + max-speed = <1500000>; + }; +}; + +&usb_otg { + dr_mode = "host"; + status = "okay"; +}; + +&usbphy { + usb0_vbus-supply = <&reg_drivevbus>; + usb1_vbus-supply = <&reg_drivevbus>; + status = "okay"; +}; From 69b92598af490f8eefb12510cd3c4f3926bf7b76 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 9 Feb 2024 11:57:58 +0000 Subject: [PATCH 0302/1406] arm64: dts: allwinner: h616: Add 32K fanout pin On some boards the designers saved on a 32 kHz crystal for some external chips, so the SoC has to help out by providing a 32 kHz clock signal. Add a pinctrl group node to allow DT nodes to reference this fanout signal. Signed-off-by: Andre Przywara Reviewed-by: Jernej Skrabec Link: https://lore.kernel.org/r/20240209115759.3582869-2-andre.przywara@arm.com Signed-off-by: Jernej Skrabec --- arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi index b1bf4fb5fc58b8..a07d3aa789f925 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi @@ -274,6 +274,12 @@ pins = "PG8", "PG9"; function = "uart1"; }; + + /omit-if-no-ref/ + x32clk_fanout_pin: x32clk-fanout-pin { + pins = "PG10"; + function = "clock"; + }; }; gic: interrupt-controller@3021000 { From c46671c251c1f432105423fca5dbbd8e0ac754ce Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 9 Feb 2024 11:57:59 +0000 Subject: [PATCH 0303/1406] arm64: dts: allwinner: Transpeed 8K618-T: add WiFi nodes In contrast to other devices using Allwinner SoCs, the Transpeed 8K618-T TV box uses a mainline-supported WiFi chip: it's Broadcom 4335 compatible, packaged by Murata. Add the required DT nodes to let DT users know about the SDIO device. The WiFi child node under the MMC controller is otherwise empty; it exists to receive the MAC address that firmware might want to write in there. 
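As an illustration of how that otherwise empty node might look once firmware (for example a U-Boot fixup) has stored the address, consider this hypothetical sketch; the property name follows the generic local-mac-address convention, the address bytes are pure placeholders, and whether the WiFi driver honours the property depends on its binding:

    &mmc1 {
            wifi@1 {
                    reg = <1>;
                    /* hypothetical value written by firmware */
                    local-mac-address = [02 00 11 22 33 44];
            };
    };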
Signed-off-by: Andre Przywara Reviewed-by: Jernej Skrabec Link: https://lore.kernel.org/r/20240209115759.3582869-3-andre.przywara@arm.com Signed-off-by: Jernej Skrabec --- .../sun50i-h618-transpeed-8k618-t.dts | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h618-transpeed-8k618-t.dts b/arch/arm64/boot/dts/allwinner/sun50i-h618-transpeed-8k618-t.dts index 8ea1fd41aebaa0..ac0a2b7ea6f310 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h618-transpeed-8k618-t.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h618-transpeed-8k618-t.dts @@ -15,6 +15,7 @@ compatible = "transpeed,8k618-t", "allwinner,sun50i-h618"; aliases { + ethernet1 = &sdio_wifi; serial0 = &uart0; }; @@ -39,6 +40,15 @@ regulator-max-microvolt = <3300000>; regulator-always-on; }; + + wifi_pwrseq: wifi_pwrseq { + compatible = "mmc-pwrseq-simple"; + clocks = <&rtc CLK_OSC32K_FANOUT>; + clock-names = "ext_clock"; + pinctrl-0 = <&x32clk_fanout_pin>; + pinctrl-names = "default"; + reset-gpios = <&pio 6 18 GPIO_ACTIVE_LOW>; /* PG18 */ + }; }; &ehci0 { @@ -60,6 +70,19 @@ status = "okay"; }; +&mmc1 { + vmmc-supply = <&reg_dldo1>; + vqmmc-supply = <&reg_aldo1>; + mmc-pwrseq = <&wifi_pwrseq>; + bus-width = <4>; + non-removable; + status = "okay"; + + sdio_wifi: wifi@1 { + reg = <1>; + }; +}; + &mmc2 { vmmc-supply = <&reg_dldo1>; vqmmc-supply = <&reg_aldo1>; From 668aa84be1795a5b04fc17b0f5e9ee060d679e65 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 8 Feb 2024 11:53:00 +0100 Subject: [PATCH 0304/1406] arm64: dts: allwinner: use capital "OR" for multiple licenses in SPDX Documentation/process/license-rules.rst and checkpatch expect the SPDX identifier syntax for multiple licenses to use capital "OR". Correct it to keep consistent format and avoid copy-paste issues. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Andre Przywara Acked-by: Jernej Skrabec Link: https://lore.kernel.org/r/20240208105301.129005-1-krzysztof.kozlowski@linaro.org Signed-off-by: Jernej Skrabec --- .../boot/dts/allwinner/sun50i-h616-bigtreetech-cb1-manta.dts | 2 +- arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1.dtsi | 2 +- arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-pi.dts | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1-manta.dts b/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1-manta.dts index dbce61b355d65e..4bfb52609c942a 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1-manta.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1-manta.dts @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: (GPL-2.0+ or MIT) +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) /* * Copyright (C) 2023 Martin Botka . */ diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1.dtsi index 1fed2b46cfe87a..b2988f50023197 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1.dtsi @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: (GPL-2.0+ or MIT) +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) /* * Copyright (C) 2023 Martin Botka . 
*/ diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-pi.dts b/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-pi.dts index 832f08b2b26080..ff84a379447036 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-pi.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-pi.dts @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: (GPL-2.0+ or MIT) +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) /* * Copyright (C) 2023 Martin Botka . */ From 3db737fa0d5a73de64769aef8f1715c0564afe25 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 8 Feb 2024 11:53:01 +0100 Subject: [PATCH 0305/1406] arm64: dts: allwinner: h616: minor whitespace cleanup The DTS code coding style expects exactly one space before '{' character. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Andre Przywara Acked-by: Jernej Skrabec Link: https://lore.kernel.org/r/20240208105301.129005-2-krzysztof.kozlowski@linaro.org Signed-off-by: Jernej Skrabec --- arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1.dtsi index b2988f50023197..af421ba24ce0c6 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1.dtsi @@ -93,7 +93,7 @@ interrupt-controller; #interrupt-cells = <1>; - regulators{ + regulators { reg_dcdc1: dcdc1 { regulator-name = "vdd-gpu-sys"; regulator-min-microvolt = <810000>; From abab990ce4198a817d7d33be4664481f565cd97a Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Mon, 29 Jan 2024 17:13:25 +0100 Subject: [PATCH 0306/1406] hwmon: ltc4282: add support for the LTC4282 chip The LTC4282 hot swap controller allows a board to be safely inserted and removed from a live backplane. Using one or more external N-channel pass transistors, board supply voltage and inrush current are ramped up at an adjustable rate. An I2C interface and onboard ADC allows for monitoring of board current, voltage, power, energy and fault status. Signed-off-by: Nuno Sa Link: https://lore.kernel.org/r/20240129-b4-ltc4282-support-v4-3-fe75798164cc@analog.com [groeck: clamp value range in ltc4282_write_voltage_byte_cached()] Signed-off-by: Guenter Roeck --- Documentation/hwmon/index.rst | 1 + Documentation/hwmon/ltc4282.rst | 133 +++ MAINTAINERS | 2 + drivers/hwmon/Kconfig | 11 + drivers/hwmon/Makefile | 1 + drivers/hwmon/ltc4282.c | 1782 +++++++++++++++++++++++++++++++ 6 files changed, 1930 insertions(+) create mode 100644 Documentation/hwmon/ltc4282.rst create mode 100644 drivers/hwmon/ltc4282.c diff --git a/Documentation/hwmon/index.rst b/Documentation/hwmon/index.rst index c7ed1f73ac0661..f16c6dfaec7dc9 100644 --- a/Documentation/hwmon/index.rst +++ b/Documentation/hwmon/index.rst @@ -129,6 +129,7 @@ Hardware Monitoring Kernel Drivers ltc4245 ltc4260 ltc4261 + ltc4282 ltc4286 max127 max15301 diff --git a/Documentation/hwmon/ltc4282.rst b/Documentation/hwmon/ltc4282.rst new file mode 100644 index 00000000000000..a87ec3564998fe --- /dev/null +++ b/Documentation/hwmon/ltc4282.rst @@ -0,0 +1,133 @@ +.. 
SPDX-License-Identifier: GPL-2.0-only + +Kernel drivers ltc4282 +========================================== + +Supported chips: + + * Analog Devices LTC4282 + + Prefix: 'ltc4282' + + Addresses scanned: - I2C 0x40 - 0x5A (7-bit) + Addresses scanned: - I2C 0x80 - 0xB4 with a step of 2 (8-bit) + + Datasheet: + + https://www.analog.com/media/en/technical-documentation/data-sheets/ltc4282.pdf + +Author: Nuno Sá + +Description +___________ + +The LTC4282 hot swap controller allows a board to be safely inserted and removed +from a live backplane. Using one or more external N-channel pass transistors, +board supply voltage and inrush current are ramped up at an adjustable rate. An +I2C interface and onboard ADC allows for monitoring of board current, voltage, +power, energy and fault status. The device features analog foldback current +limiting and supply monitoring for applications from 2.9V to 33V. Dual 12V gate +drive allows high power applications to either share safe operating area across +parallel MOSFETs or support a 2-stage start-up that first charges the load +capacitance followed by enabling a low on-resistance path to the load. The +LTC4282 is well suited to high power applications because the precise monitoring +capability and accurate current limiting reduce the extremes in which both loads +and power supplies must safely operate. Non-volatile configuration allows for +flexibility in the autonomous generation of alerts and response to faults. + +Sysfs entries +_____________ + +The following attributes are supported. Limits are read-write and all the other +attributes are read-only. Note that in0 and in1 are mutually exclusive. Enabling +one disables the other and disabling one enables the other. + +======================= ========================================== +in0_input Output voltage (mV). +in0_min Undervoltage threshold +in0_max Overvoltage threshold +in0_lowest Lowest measured voltage +in0_highest Highest measured voltage +in0_reset_history Write 1 to reset in0 history. + Also clears fet bad and short fault logs. +in0_min_alarm Undervoltage alarm +in0_max_alarm Overvoltage alarm +in0_enable Enable/Disable VSOURCE monitoring +in0_fault Failure in the MOSFETs. Either bad or shorted FET. +in0_label Channel label (VSOURCE) + +in1_input Input voltage (mV). +in1_min Undervoltage threshold +in1_max Overvoltage threshold +in1_lowest Lowest measured voltage +in1_highest Highest measured voltage +in1_reset_history Write 1 to reset in1 history. + Also clears over/undervoltage fault logs. +in1_min_alarm Undervoltage alarm +in1_max_alarm Overvoltage alarm +in1_lcrit_alarm Critical Undervoltage alarm +in1_crit_alarm Critical Overvoltage alarm +in1_enable Enable/Disable VDD monitoring +in1_label Channel label (VDD) + +in2_input GPIO voltage (mV) +in2_min Undervoltage threshold +in2_max Overvoltage threshold +in2_lowest Lowest measured voltage +in2_highest Highest measured voltage +in2_reset_history Write 1 to reset in2 history +in2_min_alarm Undervoltage alarm +in2_max_alarm Overvoltage alarm +in2_label Channel label (VGPIO) + +curr1_input Sense current (mA) +curr1_min Undercurrent threshold +curr1_max Overcurrent threshold +curr1_lowest Lowest measured current +curr1_highest Highest measured current +curr1_reset_history Write 1 to reset curr1 history. + Also clears overcurrent fault logs. 
+curr1_min_alarm Undercurrent alarm +curr1_max_alarm Overcurrent alarm +curr1_crit_alarm Critical Overcurrent alarm +curr1_label Channel label (ISENSE) + +power1_input Power (in uW) +power1_min Low power threshold +power1_max High power threshold +power1_input_lowest Historical minimum power use +power1_input_highest Historical maximum power use +power1_reset_history Write 1 to reset power1 history. + Also clears power bad fault logs. +power1_min_alarm Low power alarm +power1_max_alarm High power alarm +power1_label Channel label (Power) + +energy1_input Measured energy over time (in microJoule) +energy1_enable Enable/Disable Energy accumulation +======================= ========================================== + +DebugFs entries +_______________ + +The chip also has a fault log register where failures can be logged. Hence, +as these are logging events, we give access to them in debugfs. Note that +even if some failure is detected in these logs, it does not necessarily mean +that the failure is still present. As mentioned in the proper Sysfs entries, +these logs can be cleared by writing to the proper reset_history attribute. + +.. warning:: The debugfs interface is subject to change without notice + and is only available when the kernel is compiled with + ``CONFIG_DEBUG_FS`` defined. + +``/sys/kernel/debug/ltc4282-hwmon[X]/`` +contains the following attributes: + +======================= ========================================== +power1_bad_fault_log Set to 1 by a power1 bad fault occurring. +in0_fet_short_fault_log Set to 1 when the ADC detects a FET-short fault. +in0_fet_bad_fault_log Set to 1 when a FET-BAD fault occurs. +in1_crit_fault_log Set to 1 by a VDD overvoltage fault occurring. +in1_lcrit_fault_log Set to 1 by a VDD undervoltage fault occurring. +curr1_crit_fault_log Set to 1 by an overcurrent fault occurring. +======================= ========================================== diff --git a/MAINTAINERS b/MAINTAINERS index 3625754d7d1bba..df8ea2d94a0cb7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12765,6 +12765,8 @@ M: Nuno Sa L: linux-hwmon@vger.kernel.org S: Supported F: Documentation/devicetree/bindings/hwmon/adi,ltc4282.yaml +F: Documentation/hwmon/ltc4282.rst +F: drivers/hwmon/ltc4282.c LTC4286 HARDWARE MONITOR DRIVER M: Delphine CC Chiu diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index a608264da87df8..f6160cc7007773 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -1038,6 +1038,17 @@ config SENSORS_LTC4261 This driver can also be built as a module. If so, the module will be called ltc4261. +config SENSORS_LTC4282 + tristate "Analog Devices LTC4282" + depends on I2C + select REGMAP_I2C + help + If you say yes here you get support for Analog Devices LTC4282 + High Current Hot Swap Controller I2C interface. + + This driver can also be built as a module. If so, the module will + be called ltc4282.
+ config SENSORS_LTQ_CPUTEMP bool "Lantiq cpu temperature sensor driver" depends on SOC_XWAY diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index 47be39af5c0381..8bfc422a29e532 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -136,6 +136,7 @@ obj-$(CONFIG_SENSORS_LTC4222) += ltc4222.o obj-$(CONFIG_SENSORS_LTC4245) += ltc4245.o obj-$(CONFIG_SENSORS_LTC4260) += ltc4260.o obj-$(CONFIG_SENSORS_LTC4261) += ltc4261.o +obj-$(CONFIG_SENSORS_LTC4282) += ltc4282.o obj-$(CONFIG_SENSORS_LTQ_CPUTEMP) += ltq-cputemp.o obj-$(CONFIG_SENSORS_MAX1111) += max1111.o obj-$(CONFIG_SENSORS_MAX127) += max127.o diff --git a/drivers/hwmon/ltc4282.c b/drivers/hwmon/ltc4282.c new file mode 100644 index 00000000000000..4f608a3790fb72 --- /dev/null +++ b/drivers/hwmon/ltc4282.c @@ -0,0 +1,1782 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Analog Devices LTC4282 I2C High Current Hot Swap Controller over I2C + * + * Copyright 2023 Analog Devices Inc. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LTC4282_CTRL_LSB 0x00 + #define LTC4282_CTRL_OV_RETRY_MASK BIT(0) + #define LTC4282_CTRL_UV_RETRY_MASK BIT(1) + #define LTC4282_CTRL_OC_RETRY_MASK BIT(2) + #define LTC4282_CTRL_ON_ACTIVE_LOW_MASK BIT(5) + #define LTC4282_CTRL_ON_DELAY_MASK BIT(6) +#define LTC4282_CTRL_MSB 0x01 + #define LTC4282_CTRL_VIN_MODE_MASK GENMASK(1, 0) + #define LTC4282_CTRL_OV_MODE_MASK GENMASK(3, 2) + #define LTC4282_CTRL_UV_MODE_MASK GENMASK(5, 4) +#define LTC4282_FAULT_LOG 0x04 + #define LTC4282_OV_FAULT_MASK BIT(0) + #define LTC4282_UV_FAULT_MASK BIT(1) + #define LTC4282_VDD_FAULT_MASK \ + (LTC4282_OV_FAULT_MASK | LTC4282_UV_FAULT_MASK) + #define LTC4282_OC_FAULT_MASK BIT(2) + #define LTC4282_POWER_BAD_FAULT_MASK BIT(3) + #define LTC4282_FET_SHORT_FAULT_MASK BIT(5) + #define LTC4282_FET_BAD_FAULT_MASK BIT(6) + #define LTC4282_FET_FAILURE_FAULT_MASK \ + (LTC4282_FET_SHORT_FAULT_MASK | LTC4282_FET_BAD_FAULT_MASK) +#define LTC4282_ADC_ALERT_LOG 0x05 + #define LTC4282_GPIO_ALARM_L_MASK BIT(0) + #define LTC4282_GPIO_ALARM_H_MASK BIT(1) + #define LTC4282_VSOURCE_ALARM_L_MASK BIT(2) + #define LTC4282_VSOURCE_ALARM_H_MASK BIT(3) + #define LTC4282_VSENSE_ALARM_L_MASK BIT(4) + #define LTC4282_VSENSE_ALARM_H_MASK BIT(5) + #define LTC4282_POWER_ALARM_L_MASK BIT(6) + #define LTC4282_POWER_ALARM_H_MASK BIT(7) +#define LTC4282_FET_BAD_FAULT_TIMEOUT 0x06 + #define LTC4282_FET_BAD_MAX_TIMEOUT 255 +#define LTC4282_GPIO_CONFIG 0x07 + #define LTC4282_GPIO_2_FET_STRESS_MASK BIT(1) + #define LTC4282_GPIO_1_CONFIG_MASK GENMASK(5, 4) +#define LTC4282_VGPIO_MIN 0x08 +#define LTC4282_VGPIO_MAX 0x09 +#define LTC4282_VSOURCE_MIN 0x0a +#define LTC4282_VSOURCE_MAX 0x0b +#define LTC4282_VSENSE_MIN 0x0c +#define LTC4282_VSENSE_MAX 0x0d +#define LTC4282_POWER_MIN 0x0e +#define LTC4282_POWER_MAX 0x0f +#define LTC4282_CLK_DIV 0x10 + #define LTC4282_CLK_DIV_MASK GENMASK(4, 0) + #define LTC4282_CLKOUT_MASK GENMASK(6, 5) +#define LTC4282_ILIM_ADJUST 0x11 + #define LTC4282_GPIO_MODE_MASK BIT(1) + #define LTC4282_VDD_MONITOR_MASK BIT(2) + #define LTC4282_FOLDBACK_MODE_MASK GENMASK(4, 3) + #define LTC4282_ILIM_ADJUST_MASK GENMASK(7, 5) +#define LTC4282_ENERGY 0x12 +#define LTC4282_TIME_COUNTER 0x18 +#define LTC4282_ALERT_CTRL 0x1c + #define LTC4282_ALERT_OUT_MASK BIT(6) +#define LTC4282_ADC_CTRL 0x1d + #define LTC4282_FAULT_LOG_EN_MASK BIT(2) + #define LTC4282_METER_HALT_MASK BIT(5) + 
#define LTC4282_METER_RESET_MASK BIT(6) + #define LTC4282_RESET_MASK BIT(7) +#define LTC4282_STATUS_LSB 0x1e + #define LTC4282_OV_STATUS_MASK BIT(0) + #define LTC4282_UV_STATUS_MASK BIT(1) + #define LTC4282_VDD_STATUS_MASK \ + (LTC4282_OV_STATUS_MASK | LTC4282_UV_STATUS_MASK) + #define LTC4282_OC_STATUS_MASK BIT(2) + #define LTC4282_POWER_GOOD_MASK BIT(3) + #define LTC4282_FET_FAILURE_MASK GENMASK(6, 5) +#define LTC4282_STATUS_MSB 0x1f +#define LTC4282_RESERVED_1 0x32 +#define LTC4282_RESERVED_2 0x33 +#define LTC4282_VGPIO 0x34 +#define LTC4282_VGPIO_LOWEST 0x36 +#define LTC4282_VGPIO_HIGHEST 0x38 +#define LTC4282_VSOURCE 0x3a +#define LTC4282_VSOURCE_LOWEST 0x3c +#define LTC4282_VSOURCE_HIGHEST 0x3e +#define LTC4282_VSENSE 0x40 +#define LTC4282_VSENSE_LOWEST 0x42 +#define LTC4282_VSENSE_HIGHEST 0x44 +#define LTC4282_POWER 0x46 +#define LTC4282_POWER_LOWEST 0x48 +#define LTC4282_POWER_HIGHEST 0x4a +#define LTC4282_RESERVED_3 0x50 + +#define LTC4282_CLKIN_MIN (250 * KILO) +#define LTC4282_CLKIN_MAX (15500 * KILO) +#define LTC4282_CLKIN_RANGE (LTC4282_CLKIN_MAX - LTC4282_CLKIN_MIN + 1) +#define LTC4282_CLKOUT_SYSTEM (250 * KILO) +#define LTC4282_CLKOUT_CNV 15 + +enum { + LTC4282_CHAN_VSOURCE, + LTC4282_CHAN_VDD, + LTC4282_CHAN_VGPIO, +}; + +struct ltc4282_cache { + u32 in_max_raw; + u32 in_min_raw; + long in_highest; + long in_lowest; + bool en; +}; + +struct ltc4282_state { + struct regmap *map; + /* Protect against multiple accesses to the device registers */ + struct mutex lock; + struct clk_hw clk_hw; + /* + * Used to cache values for VDD/VSOURCE depending which will be used + * when hwmon is not enabled for that channel. Needed because they share + * the same registers. + */ + struct ltc4282_cache in0_1_cache[LTC4282_CHAN_VGPIO]; + u32 vsense_max; + long power_max; + u32 rsense; + u16 vdd; + u16 vfs_out; + bool energy_en; +}; + +enum { + LTC4282_CLKOUT_NONE, + LTC4282_CLKOUT_INT, + LTC4282_CLKOUT_TICK, +}; + +static int ltc4282_set_rate(struct clk_hw *hw, + unsigned long rate, unsigned long parent_rate) +{ + struct ltc4282_state *st = container_of(hw, struct ltc4282_state, + clk_hw); + u32 val = LTC4282_CLKOUT_INT; + + if (rate == LTC4282_CLKOUT_CNV) + val = LTC4282_CLKOUT_TICK; + + return regmap_update_bits(st->map, LTC4282_CLK_DIV, LTC4282_CLKOUT_MASK, + FIELD_PREP(LTC4282_CLKOUT_MASK, val)); +} + +/* + * Note the 15HZ conversion rate assumes 12bit ADC which is what we are + * supporting for now. 
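+ * In other words (a derived illustration, not a datasheet quote): one + * 12-bit conversion completes about every 65.535 ms, which is where the + * roughly 15 Hz tick rate (1 / 0.065535 ~= 15.26 Hz) comes from.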
+ */ +static const unsigned int ltc4282_out_rates[] = { + LTC4282_CLKOUT_CNV, LTC4282_CLKOUT_SYSTEM +}; + +static long ltc4282_round_rate(struct clk_hw *hw, unsigned long rate, + unsigned long *parent_rate) +{ + int idx = find_closest(rate, ltc4282_out_rates, + ARRAY_SIZE(ltc4282_out_rates)); + + return ltc4282_out_rates[idx]; +} + +static unsigned long ltc4282_recalc_rate(struct clk_hw *hw, + unsigned long parent) +{ + struct ltc4282_state *st = container_of(hw, struct ltc4282_state, + clk_hw); + u32 clkdiv; + int ret; + + ret = regmap_read(st->map, LTC4282_CLK_DIV, &clkdiv); + if (ret) + return 0; + + clkdiv = FIELD_GET(LTC4282_CLKOUT_MASK, clkdiv); + if (!clkdiv) + return 0; + if (clkdiv == LTC4282_CLKOUT_INT) + return LTC4282_CLKOUT_SYSTEM; + + return LTC4282_CLKOUT_CNV; +} + +static void ltc4282_disable(struct clk_hw *clk_hw) +{ + struct ltc4282_state *st = container_of(clk_hw, struct ltc4282_state, + clk_hw); + + regmap_clear_bits(st->map, LTC4282_CLK_DIV, LTC4282_CLKOUT_MASK); +} + +static int ltc4282_read_voltage_word(const struct ltc4282_state *st, u32 reg, + u32 fs, long *val) +{ + __be16 in; + int ret; + + ret = regmap_bulk_read(st->map, reg, &in, sizeof(in)); + if (ret) + return ret; + + /* + * This is also used to calculate current in which case fs comes in + * 10 * uV. Hence the ULL usage. + */ + *val = DIV_ROUND_CLOSEST_ULL(be16_to_cpu(in) * (u64)fs, U16_MAX); + return 0; +} + +static int ltc4282_read_voltage_byte_cached(const struct ltc4282_state *st, + u32 reg, u32 fs, long *val, + u32 *cached_raw) +{ + int ret; + u32 in; + + if (cached_raw) { + in = *cached_raw; + } else { + ret = regmap_read(st->map, reg, &in); + if (ret) + return ret; + } + + *val = DIV_ROUND_CLOSEST(in * fs, U8_MAX); + return 0; +} + +static int ltc4282_read_voltage_byte(const struct ltc4282_state *st, u32 reg, + u32 fs, long *val) +{ + return ltc4282_read_voltage_byte_cached(st, reg, fs, val, NULL); +} + +static int __ltc4282_read_alarm(struct ltc4282_state *st, u32 reg, u32 mask, + long *val) +{ + u32 alarm; + int ret; + + ret = regmap_read(st->map, reg, &alarm); + if (ret) + return ret; + + *val = !!(alarm & mask); + + /* if not status/fault logs, clear the alarm after reading it */ + if (reg != LTC4282_STATUS_LSB && reg != LTC4282_FAULT_LOG) + return regmap_clear_bits(st->map, reg, mask); + + return 0; +} + +static int ltc4282_read_alarm(struct ltc4282_state *st, u32 reg, u32 mask, + long *val) +{ + guard(mutex)(&st->lock); + return __ltc4282_read_alarm(st, reg, mask, val); +} + +static int ltc4282_vdd_source_read_in(struct ltc4282_state *st, u32 channel, + long *val) +{ + guard(mutex)(&st->lock); + if (!st->in0_1_cache[channel].en) + return -ENODATA; + + return ltc4282_read_voltage_word(st, LTC4282_VSOURCE, st->vfs_out, val); +} + +static int ltc4282_vdd_source_read_hist(struct ltc4282_state *st, u32 reg, + u32 channel, long *cached, long *val) +{ + int ret; + + guard(mutex)(&st->lock); + if (!st->in0_1_cache[channel].en) { + *val = *cached; + return 0; + } + + ret = ltc4282_read_voltage_word(st, reg, st->vfs_out, val); + if (ret) + return ret; + + *cached = *val; + return 0; +} + +static int ltc4282_vdd_source_read_lim(struct ltc4282_state *st, u32 reg, + u32 channel, u32 *cached, long *val) +{ + guard(mutex)(&st->lock); + if (!st->in0_1_cache[channel].en) + return ltc4282_read_voltage_byte_cached(st, reg, st->vfs_out, + val, cached); + + return ltc4282_read_voltage_byte(st, reg, st->vfs_out, val); +} + +static int ltc4282_vdd_source_read_alm(struct ltc4282_state *st, u32 mask, + u32 channel, 
long *val) +{ + guard(mutex)(&st->lock); + if (!st->in0_1_cache[channel].en) { + /* + * Do this otherwise alarms can get confused because we clear + * them after reading them. So, if someone mistakenly reads + * VSOURCE right before VDD (or the other way around), we might + * get no alarm just because it was cleared when reading VSOURCE + * and had no time for a new conversion to raise it again. + */ + *val = 0; + return 0; + } + + return __ltc4282_read_alarm(st, LTC4282_ADC_ALERT_LOG, mask, val); +} + +static int ltc4282_read_in(struct ltc4282_state *st, u32 attr, long *val, + u32 channel) +{ + switch (attr) { + case hwmon_in_input: + if (channel == LTC4282_CHAN_VGPIO) + return ltc4282_read_voltage_word(st, LTC4282_VGPIO, + 1280, val); + + return ltc4282_vdd_source_read_in(st, channel, val); + case hwmon_in_highest: + if (channel == LTC4282_CHAN_VGPIO) + return ltc4282_read_voltage_word(st, + LTC4282_VGPIO_HIGHEST, + 1280, val); + + return ltc4282_vdd_source_read_hist(st, LTC4282_VSOURCE_HIGHEST, + channel, + &st->in0_1_cache[channel].in_highest, val); + case hwmon_in_lowest: + if (channel == LTC4282_CHAN_VGPIO) + return ltc4282_read_voltage_word(st, LTC4282_VGPIO_LOWEST, + 1280, val); + + return ltc4282_vdd_source_read_hist(st, LTC4282_VSOURCE_LOWEST, + channel, + &st->in0_1_cache[channel].in_lowest, val); + case hwmon_in_max_alarm: + if (channel == LTC4282_CHAN_VGPIO) + return ltc4282_read_alarm(st, LTC4282_ADC_ALERT_LOG, + LTC4282_GPIO_ALARM_H_MASK, + val); + + return ltc4282_vdd_source_read_alm(st, + LTC4282_VSOURCE_ALARM_H_MASK, + channel, val); + case hwmon_in_min_alarm: + if (channel == LTC4282_CHAN_VGPIO) + return ltc4282_read_alarm(st, LTC4282_ADC_ALERT_LOG, + LTC4282_GPIO_ALARM_L_MASK, val); + + return ltc4282_vdd_source_read_alm(st, + LTC4282_VSOURCE_ALARM_L_MASK, + channel, val); + case hwmon_in_crit_alarm: + return ltc4282_read_alarm(st, LTC4282_STATUS_LSB, + LTC4282_OV_STATUS_MASK, val); + case hwmon_in_lcrit_alarm: + return ltc4282_read_alarm(st, LTC4282_STATUS_LSB, + LTC4282_UV_STATUS_MASK, val); + case hwmon_in_max: + if (channel == LTC4282_CHAN_VGPIO) + return ltc4282_read_voltage_byte(st, LTC4282_VGPIO_MAX, + 1280, val); + + return ltc4282_vdd_source_read_lim(st, LTC4282_VSOURCE_MAX, + channel, + &st->in0_1_cache[channel].in_max_raw, val); + case hwmon_in_min: + if (channel == LTC4282_CHAN_VGPIO) + return ltc4282_read_voltage_byte(st, LTC4282_VGPIO_MIN, + 1280, val); + + return ltc4282_vdd_source_read_lim(st, LTC4282_VSOURCE_MIN, + channel, + &st->in0_1_cache[channel].in_min_raw, val); + case hwmon_in_enable: + scoped_guard(mutex, &st->lock) { + *val = st->in0_1_cache[channel].en; + } + return 0; + case hwmon_in_fault: + /* + * We report failure if we detect either a fet_bad or a + * fet_short in the status register. + */ + return ltc4282_read_alarm(st, LTC4282_STATUS_LSB, + LTC4282_FET_FAILURE_MASK, val); + default: + return -EOPNOTSUPP; + } +} + +static int ltc4282_read_current_word(const struct ltc4282_state *st, u32 reg, + long *val) +{ + long in; + int ret; + + /* + * We pass in full scale in 10 * micro (note that 40 is already + * millivolt) so we have better approximations to calculate current.
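+ * Rough worked example (shunt value assumed for illustration, not taken + * from the datasheet): with a 2 mOhm shunt, st->rsense holds 20000 + * (nano-ohms divided by 100) and a full-scale code gives in = 400000 + * (tenths of uV), so *val = 400000 * MILLI / 20000 = 20000 mA, matching + * 40 mV / 2 mOhm = 20 A.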
+ */ + ret = ltc4282_read_voltage_word(st, reg, DECA * 40 * MILLI, &in); + if (ret) + return ret; + + *val = DIV_ROUND_CLOSEST(in * MILLI, st->rsense); + + return 0; +} + +static int ltc4282_read_current_byte(const struct ltc4282_state *st, u32 reg, + long *val) +{ + long in; + int ret; + + ret = ltc4282_read_voltage_byte(st, reg, DECA * 40 * MILLI, &in); + if (ret) + return ret; + + *val = DIV_ROUND_CLOSEST(in * MILLI, st->rsense); + + return 0; +} + +static int ltc4282_read_curr(struct ltc4282_state *st, const u32 attr, + long *val) +{ + switch (attr) { + case hwmon_curr_input: + return ltc4282_read_current_word(st, LTC4282_VSENSE, val); + case hwmon_curr_highest: + return ltc4282_read_current_word(st, LTC4282_VSENSE_HIGHEST, + val); + case hwmon_curr_lowest: + return ltc4282_read_current_word(st, LTC4282_VSENSE_LOWEST, + val); + case hwmon_curr_max: + return ltc4282_read_current_byte(st, LTC4282_VSENSE_MAX, val); + case hwmon_curr_min: + return ltc4282_read_current_byte(st, LTC4282_VSENSE_MIN, val); + case hwmon_curr_max_alarm: + return ltc4282_read_alarm(st, LTC4282_ADC_ALERT_LOG, + LTC4282_VSENSE_ALARM_H_MASK, val); + case hwmon_curr_min_alarm: + return ltc4282_read_alarm(st, LTC4282_ADC_ALERT_LOG, + LTC4282_VSENSE_ALARM_L_MASK, val); + case hwmon_curr_crit_alarm: + return ltc4282_read_alarm(st, LTC4282_STATUS_LSB, + LTC4282_OC_STATUS_MASK, val); + default: + return -EOPNOTSUPP; + } +} + +static int ltc4282_read_power_word(const struct ltc4282_state *st, u32 reg, + long *val) +{ + u64 temp = DECA * 40ULL * st->vfs_out * BIT(16), temp_2; + __be16 raw; + u16 power; + int ret; + + ret = regmap_bulk_read(st->map, reg, &raw, sizeof(raw)); + if (ret) + return ret; + + power = be16_to_cpu(raw); + /* + * Power is given by: + * P = CODE(16b) * 0.040 * Vfs(out) * 2^16 / ((2^16 - 1)^2 * Rsense) + */ + if (check_mul_overflow(power * temp, MICRO, &temp_2)) { + temp = DIV_ROUND_CLOSEST_ULL(power * temp, U16_MAX); + *val = DIV64_U64_ROUND_CLOSEST(temp * MICRO, + U16_MAX * (u64)st->rsense); + return 0; + } + + *val = DIV64_U64_ROUND_CLOSEST(temp_2, + st->rsense * int_pow(U16_MAX, 2)); + + return 0; +} + +static int ltc4282_read_power_byte(const struct ltc4282_state *st, u32 reg, + long *val) +{ + u32 power; + u64 temp; + int ret; + + ret = regmap_read(st->map, reg, &power); + if (ret) + return ret; + + temp = power * 40 * DECA * st->vfs_out * BIT_ULL(8); + *val = DIV64_U64_ROUND_CLOSEST(temp * MICRO, + int_pow(U8_MAX, 2) * st->rsense); + + return 0; +} + +static int ltc4282_read_energy(const struct ltc4282_state *st, u64 *val) +{ + u64 temp, energy; + __be64 raw; + int ret; + + ret = regmap_bulk_read(st->map, LTC4282_ENERGY, &raw, 6); + if (ret) + return ret; + + energy = be64_to_cpu(raw) >> 16; + /* + * The formula for energy is given by: + * E = CODE(48b) * 0.040 * Vfs(out) * Tconv * 256 / + * ((2^16 - 1)^2 * Rsense) + * + * Since we only support 12bit ADC, Tconv = 0.065535s. Passing Vfs(out) + * and 0.040 to mV and Tconv to us, we can simplify the formula to: + * E = CODE(48b) * 40 * Vfs(out) * 256 / (U16_MAX * Rsense) + * + * As Rsense can have tenths of micro-ohm resolution, we need to + * multiply by DECA to get microjoule.
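+ * Rough worked example (values assumed for illustration): in 12 V mode + * (vfs_out = 16640) with a 2 mOhm shunt (rsense = 20000), one LSB of the + * 48-bit accumulator is worth 40 * 16640 * 256 * DECA / (U16_MAX * 20000), + * i.e. about 1.3 microjoule.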
+ */ + if (check_mul_overflow(DECA * st->vfs_out * 40 * BIT(8), energy, &temp)) { + temp = DIV_ROUND_CLOSEST(DECA * st->vfs_out * 40 * BIT(8), U16_MAX); + *val = DIV_ROUND_CLOSEST_ULL(temp * energy, st->rsense); + return 0; + } + + *val = DIV64_U64_ROUND_CLOSEST(temp, U16_MAX * (u64)st->rsense); + + return 0; +} + +static int ltc4282_read_power(struct ltc4282_state *st, const u32 attr, + long *val) +{ + switch (attr) { + case hwmon_power_input: + return ltc4282_read_power_word(st, LTC4282_POWER, val); + case hwmon_power_input_highest: + return ltc4282_read_power_word(st, LTC4282_POWER_HIGHEST, val); + case hwmon_power_input_lowest: + return ltc4282_read_power_word(st, LTC4282_POWER_LOWEST, val); + case hwmon_power_max_alarm: + return ltc4282_read_alarm(st, LTC4282_ADC_ALERT_LOG, + LTC4282_POWER_ALARM_H_MASK, val); + case hwmon_power_min_alarm: + return ltc4282_read_alarm(st, LTC4282_ADC_ALERT_LOG, + LTC4282_POWER_ALARM_L_MASK, val); + case hwmon_power_max: + return ltc4282_read_power_byte(st, LTC4282_POWER_MAX, val); + case hwmon_power_min: + return ltc4282_read_power_byte(st, LTC4282_POWER_MIN, val); + default: + return -EOPNOTSUPP; + } +} + +static int ltc4282_read(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, long *val) +{ + struct ltc4282_state *st = dev_get_drvdata(dev); + + switch (type) { + case hwmon_in: + return ltc4282_read_in(st, attr, val, channel); + case hwmon_curr: + return ltc4282_read_curr(st, attr, val); + case hwmon_power: + return ltc4282_read_power(st, attr, val); + case hwmon_energy: + scoped_guard(mutex, &st->lock) { + *val = st->energy_en; + } + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int ltc4282_write_power_byte(const struct ltc4282_state *st, u32 reg, + long val) +{ + u32 power; + u64 temp; + + if (val > st->power_max) + val = st->power_max; + + temp = val * int_pow(U8_MAX, 2) * st->rsense; + power = DIV64_U64_ROUND_CLOSEST(temp, + MICRO * DECA * 256ULL * st->vfs_out * 40); + + return regmap_write(st->map, reg, power); +} + +static int ltc4282_write_power_word(const struct ltc4282_state *st, u32 reg, + long val) +{ + u64 temp = int_pow(U16_MAX, 2) * st->rsense, temp_2; + __be16 __raw; + u16 code; + + if (check_mul_overflow(temp, val, &temp_2)) { + temp = DIV_ROUND_CLOSEST_ULL(temp, DECA * MICRO); + code = DIV64_U64_ROUND_CLOSEST(temp * val, + 40ULL * BIT(16) * st->vfs_out); + } else { + temp = DECA * MICRO * 40ULL * BIT(16) * st->vfs_out; + code = DIV64_U64_ROUND_CLOSEST(temp_2, temp); + } + + __raw = cpu_to_be16(code); + return regmap_bulk_write(st->map, reg, &__raw, sizeof(__raw)); +} + +static int __ltc4282_in_write_history(const struct ltc4282_state *st, u32 reg, + long lowest, long highest, u32 fs) +{ + __be16 __raw; + u16 tmp; + int ret; + + tmp = DIV_ROUND_CLOSEST(U16_MAX * lowest, fs); + + __raw = cpu_to_be16(tmp); + + ret = regmap_bulk_write(st->map, reg, &__raw, 2); + if (ret) + return ret; + + tmp = DIV_ROUND_CLOSEST(U16_MAX * highest, fs); + + __raw = cpu_to_be16(tmp); + + return regmap_bulk_write(st->map, reg + 2, &__raw, 2); +} + +static int ltc4282_in_write_history(struct ltc4282_state *st, u32 reg, + long lowest, long highest, u32 fs) +{ + guard(mutex)(&st->lock); + return __ltc4282_in_write_history(st, reg, lowest, highest, fs); +} + +static int ltc4282_power_reset_hist(struct ltc4282_state *st) +{ + int ret; + + guard(mutex)(&st->lock); + + ret = ltc4282_write_power_word(st, LTC4282_POWER_LOWEST, + st->power_max); + if (ret) + return ret; + + ret = ltc4282_write_power_word(st, 
LTC4282_POWER_HIGHEST, 0); + if (ret) + return ret; + + /* now, let's also clear possible power_bad fault logs */ + return regmap_clear_bits(st->map, LTC4282_FAULT_LOG, + LTC4282_POWER_BAD_FAULT_MASK); +} + +static int ltc4282_write_power(struct ltc4282_state *st, u32 attr, + long val) +{ + switch (attr) { + case hwmon_power_max: + return ltc4282_write_power_byte(st, LTC4282_POWER_MAX, val); + case hwmon_power_min: + return ltc4282_write_power_byte(st, LTC4282_POWER_MIN, val); + case hwmon_power_reset_history: + return ltc4282_power_reset_hist(st); + default: + return -EOPNOTSUPP; + } +} + +static int ltc4282_write_voltage_byte_cached(const struct ltc4282_state *st, + u32 reg, u32 fs, long val, + u32 *cache_raw) +{ + u32 in; + + val = clamp_val(val, 0, fs); + in = DIV_ROUND_CLOSEST(val * U8_MAX, fs); + + if (cache_raw) { + *cache_raw = in; + return 0; + } + + return regmap_write(st->map, reg, in); +} + +static int ltc4282_write_voltage_byte(const struct ltc4282_state *st, u32 reg, + u32 fs, long val) +{ + return ltc4282_write_voltage_byte_cached(st, reg, fs, val, NULL); +} + +static int ltc4282_cache_history(struct ltc4282_state *st, u32 channel) +{ + long val; + int ret; + + ret = ltc4282_read_voltage_word(st, LTC4282_VSOURCE_LOWEST, st->vfs_out, + &val); + if (ret) + return ret; + + st->in0_1_cache[channel].in_lowest = val; + + ret = ltc4282_read_voltage_word(st, LTC4282_VSOURCE_HIGHEST, + st->vfs_out, &val); + if (ret) + return ret; + + st->in0_1_cache[channel].in_highest = val; + + ret = regmap_read(st->map, LTC4282_VSOURCE_MIN, + &st->in0_1_cache[channel].in_min_raw); + if (ret) + return ret; + + return regmap_read(st->map, LTC4282_VSOURCE_MAX, + &st->in0_1_cache[channel].in_max_raw); +} + +static int ltc4282_cache_sync(struct ltc4282_state *st, u32 channel) +{ + int ret; + + ret = __ltc4282_in_write_history(st, LTC4282_VSOURCE_LOWEST, + st->in0_1_cache[channel].in_lowest, + st->in0_1_cache[channel].in_highest, + st->vfs_out); + if (ret) + return ret; + + ret = regmap_write(st->map, LTC4282_VSOURCE_MIN, + st->in0_1_cache[channel].in_min_raw); + if (ret) + return ret; + + return regmap_write(st->map, LTC4282_VSOURCE_MAX, + st->in0_1_cache[channel].in_max_raw); +} + +static int ltc4282_vdd_source_write_lim(struct ltc4282_state *st, u32 reg, + int channel, u32 *cache, long val) +{ + int ret; + + guard(mutex)(&st->lock); + if (st->in0_1_cache[channel].en) + ret = ltc4282_write_voltage_byte(st, reg, st->vfs_out, val); + else + ret = ltc4282_write_voltage_byte_cached(st, reg, st->vfs_out, + val, cache); + + return ret; +} + +static int ltc4282_vdd_source_reset_hist(struct ltc4282_state *st, int channel) +{ + long lowest = st->vfs_out; + int ret; + + if (channel == LTC4282_CHAN_VDD) + lowest = st->vdd; + + guard(mutex)(&st->lock); + if (st->in0_1_cache[channel].en) { + ret = __ltc4282_in_write_history(st, LTC4282_VSOURCE_LOWEST, + lowest, 0, st->vfs_out); + if (ret) + return ret; + } + + st->in0_1_cache[channel].in_lowest = lowest; + st->in0_1_cache[channel].in_highest = 0; + + /* + * We are also clearing possible fault logs in reset_history. Clearing + * the logs might be important when the auto retry bits are not enabled + * as the chip only enables the output again after having these logs + * cleared. As some of these logs are related to limits, it makes sense + * to clear them in here. For VDD, we need to clear under/over voltage + * events. For VSOURCE, fet_short and fet_bad... 
+ */ + if (channel == LTC4282_CHAN_VSOURCE) + return regmap_clear_bits(st->map, LTC4282_FAULT_LOG, + LTC4282_FET_FAILURE_FAULT_MASK); + + return regmap_clear_bits(st->map, LTC4282_FAULT_LOG, + LTC4282_VDD_FAULT_MASK); +} + +/* + * We need to mux between VSOURCE and VDD which means they are mutually + * exclusive. Moreover, we can't really disable both VDD and VSOURCE as the ADC + * is continuously running (we cannot independently halt it without also + * stopping VGPIO). Hence, the logic is that disabling or enabling VDD will + * automatically have the reverse effect on VSOURCE and vice-versa. + */ +static int ltc4282_vdd_source_enable(struct ltc4282_state *st, int channel, + long val) +{ + int ret, other_chan = ~channel & 0x1; + u8 __val = val; + + guard(mutex)(&st->lock); + if (st->in0_1_cache[channel].en == !!val) + return 0; + + /* clearing the bit makes the ADC monitor VDD */ + if (channel == LTC4282_CHAN_VDD) + __val = !__val; + + ret = regmap_update_bits(st->map, LTC4282_ILIM_ADJUST, + LTC4282_VDD_MONITOR_MASK, + FIELD_PREP(LTC4282_VDD_MONITOR_MASK, !!__val)); + if (ret) + return ret; + + st->in0_1_cache[channel].en = !!val; + st->in0_1_cache[other_chan].en = !val; + + if (st->in0_1_cache[channel].en) { + /* + * Then, we are disabling @other_chan. Let's save its current + * history. + */ + ret = ltc4282_cache_history(st, other_chan); + if (ret) + return ret; + + return ltc4282_cache_sync(st, channel); + } + /* + * Then, we are enabling @other_chan. We need to do the opposite from + * above. + */ + ret = ltc4282_cache_history(st, channel); + if (ret) + return ret; + + return ltc4282_cache_sync(st, other_chan); +} + +static int ltc4282_write_in(struct ltc4282_state *st, u32 attr, long val, + int channel) +{ + switch (attr) { + case hwmon_in_max: + if (channel == LTC4282_CHAN_VGPIO) + return ltc4282_write_voltage_byte(st, LTC4282_VGPIO_MAX, + 1280, val); + + return ltc4282_vdd_source_write_lim(st, LTC4282_VSOURCE_MAX, + channel, + &st->in0_1_cache[channel].in_max_raw, val); + case hwmon_in_min: + if (channel == LTC4282_CHAN_VGPIO) + return ltc4282_write_voltage_byte(st, LTC4282_VGPIO_MIN, + 1280, val); + + return ltc4282_vdd_source_write_lim(st, LTC4282_VSOURCE_MIN, + channel, + &st->in0_1_cache[channel].in_min_raw, val); + case hwmon_in_reset_history: + if (channel == LTC4282_CHAN_VGPIO) + return ltc4282_in_write_history(st, + LTC4282_VGPIO_LOWEST, + 1280, 0, 1280); + + return ltc4282_vdd_source_reset_hist(st, channel); + case hwmon_in_enable: + return ltc4282_vdd_source_enable(st, channel, val); + default: + return -EOPNOTSUPP; + } +} + +static int ltc4282_curr_reset_hist(struct ltc4282_state *st) +{ + int ret; + + guard(mutex)(&st->lock); + + ret = __ltc4282_in_write_history(st, LTC4282_VSENSE_LOWEST, + st->vsense_max, 0, 40 * MILLI); + if (ret) + return ret; + + /* now, let's also clear possible overcurrent fault logs */ + return regmap_clear_bits(st->map, LTC4282_FAULT_LOG, + LTC4282_OC_FAULT_MASK); +} + +static int ltc4282_write_curr(struct ltc4282_state *st, u32 attr, + long val) +{ + /* need to pass it in millivolt */ + u32 in = DIV_ROUND_CLOSEST_ULL((u64)val * st->rsense, DECA * MICRO); + + switch (attr) { + case hwmon_curr_max: + return ltc4282_write_voltage_byte(st, LTC4282_VSENSE_MAX, 40, + in); + case hwmon_curr_min: + return ltc4282_write_voltage_byte(st, LTC4282_VSENSE_MIN, 40, + in); + case hwmon_curr_reset_history: + return ltc4282_curr_reset_hist(st); + default: + return -EOPNOTSUPP; + } +} + +static int ltc4282_energy_enable_set(struct ltc4282_state *st, long
val) +{ + int ret; + + guard(mutex)(&st->lock); + /* setting the bit halts the meter */ + ret = regmap_update_bits(st->map, LTC4282_ADC_CTRL, + LTC4282_METER_HALT_MASK, + FIELD_PREP(LTC4282_METER_HALT_MASK, !val)); + if (ret) + return ret; + + st->energy_en = !!val; + + return 0; +} + +static int ltc4282_write(struct device *dev, + enum hwmon_sensor_types type, + u32 attr, int channel, long val) +{ + struct ltc4282_state *st = dev_get_drvdata(dev); + + switch (type) { + case hwmon_power: + return ltc4282_write_power(st, attr, val); + case hwmon_in: + return ltc4282_write_in(st, attr, val, channel); + case hwmon_curr: + return ltc4282_write_curr(st, attr, val); + case hwmon_energy: + return ltc4282_energy_enable_set(st, val); + default: + return -EOPNOTSUPP; + } +} + +static umode_t ltc4282_in_is_visible(const struct ltc4282_state *st, u32 attr) +{ + switch (attr) { + case hwmon_in_input: + case hwmon_in_highest: + case hwmon_in_lowest: + case hwmon_in_max_alarm: + case hwmon_in_min_alarm: + case hwmon_in_label: + case hwmon_in_lcrit_alarm: + case hwmon_in_crit_alarm: + case hwmon_in_fault: + return 0444; + case hwmon_in_max: + case hwmon_in_min: + case hwmon_in_enable: + case hwmon_in_reset_history: + return 0644; + default: + return 0; + } +} + +static umode_t ltc4282_curr_is_visible(u32 attr) +{ + switch (attr) { + case hwmon_curr_input: + case hwmon_curr_highest: + case hwmon_curr_lowest: + case hwmon_curr_max_alarm: + case hwmon_curr_min_alarm: + case hwmon_curr_crit_alarm: + case hwmon_curr_label: + return 0444; + case hwmon_curr_max: + case hwmon_curr_min: + case hwmon_curr_reset_history: + return 0644; + default: + return 0; + } +} + +static umode_t ltc4282_power_is_visible(u32 attr) +{ + switch (attr) { + case hwmon_power_input: + case hwmon_power_input_highest: + case hwmon_power_input_lowest: + case hwmon_power_label: + case hwmon_power_max_alarm: + case hwmon_power_min_alarm: + return 0444; + case hwmon_power_max: + case hwmon_power_min: + case hwmon_power_reset_history: + return 0644; + default: + return 0; + } +} + +static umode_t ltc4282_is_visible(const void *data, + enum hwmon_sensor_types type, + u32 attr, int channel) +{ + switch (type) { + case hwmon_in: + return ltc4282_in_is_visible(data, attr); + case hwmon_curr: + return ltc4282_curr_is_visible(attr); + case hwmon_power: + return ltc4282_power_is_visible(attr); + case hwmon_energy: + /* hwmon_energy_enable */ + return 0644; + default: + return 0; + } +} + +static const char * const ltc4282_in_strs[] = { + "VSOURCE", "VDD", "VGPIO" +}; + +static int ltc4282_read_labels(struct device *dev, + enum hwmon_sensor_types type, + u32 attr, int channel, const char **str) +{ + switch (type) { + case hwmon_in: + *str = ltc4282_in_strs[channel]; + return 0; + case hwmon_curr: + *str = "ISENSE"; + return 0; + case hwmon_power: + *str = "Power"; + return 0; + default: + return -EOPNOTSUPP; + } +} + +static ssize_t ltc4282_energy_show(struct device *dev, + struct device_attribute *da, char *buf) +{ + struct ltc4282_state *st = dev_get_drvdata(dev); + u64 energy; + int ret; + + guard(mutex)(&st->lock); + if (!st->energy_en) + return -ENODATA; + + ret = ltc4282_read_energy(st, &energy); + if (ret < 0) + return ret; + + return sysfs_emit(buf, "%llu\n", energy); +} + +static const struct clk_ops ltc4282_ops = { + .recalc_rate = ltc4282_recalc_rate, + .round_rate = ltc4282_round_rate, + .set_rate = ltc4282_set_rate, + .disable = ltc4282_disable, +}; + +static int ltc428_clk_provider_setup(struct ltc4282_state *st, + struct device *dev) 
+{ + struct clk_init_data init; + int ret; + + if (!IS_ENABLED(CONFIG_COMMON_CLK)) + return 0; + + init.name = devm_kasprintf(dev, GFP_KERNEL, "%s-clk", + fwnode_get_name(dev_fwnode(dev))); + if (!init.name) + return -ENOMEM; + + init.ops = &ltc4282_ops; + init.flags = CLK_GET_RATE_NOCACHE; + st->clk_hw.init = &init; + + ret = devm_clk_hw_register(dev, &st->clk_hw); + if (ret) + return ret; + + return devm_of_clk_add_hw_provider(dev, of_clk_hw_simple_get, + &st->clk_hw); +} + +static int ltc428_clks_setup(struct ltc4282_state *st, struct device *dev) +{ + unsigned long rate; + struct clk *clkin; + u32 val; + int ret; + + ret = ltc428_clk_provider_setup(st, dev); + if (ret) + return ret; + + clkin = devm_clk_get_optional_enabled(dev, NULL); + if (IS_ERR(clkin)) + return dev_err_probe(dev, PTR_ERR(clkin), + "Failed to get clkin"); + if (!clkin) + return 0; + + rate = clk_get_rate(clkin); + if (!in_range(rate, LTC4282_CLKIN_MIN, LTC4282_CLKIN_RANGE)) + return dev_err_probe(dev, -EINVAL, + "Invalid clkin range(%lu) [%lu %lu]\n", + rate, LTC4282_CLKIN_MIN, + LTC4282_CLKIN_MAX); + + /* + * Clocks faster than 250KHZ should be reduced to 250KHZ. The clock + * frequency is divided by twice the value in the register. + */ + val = rate / (2 * LTC4282_CLKIN_MIN); + + return regmap_update_bits(st->map, LTC4282_CLK_DIV, + LTC4282_CLK_DIV_MASK, + FIELD_PREP(LTC4282_CLK_DIV_MASK, val)); +} + +static const int ltc4282_curr_lim_uv[] = { + 12500, 15625, 18750, 21875, 25000, 28125, 31250, 34375 +}; + +static int ltc4282_get_defaults(struct ltc4282_state *st, u32 *vin_mode) +{ + u32 reg_val, ilm_adjust; + int ret; + + ret = regmap_read(st->map, LTC4282_ADC_CTRL, &reg_val); + if (ret) + return ret; + + st->energy_en = !FIELD_GET(LTC4282_METER_HALT_MASK, reg_val); + + ret = regmap_read(st->map, LTC4282_CTRL_MSB, &reg_val); + if (ret) + return ret; + + *vin_mode = FIELD_GET(LTC4282_CTRL_VIN_MODE_MASK, reg_val); + + ret = regmap_read(st->map, LTC4282_ILIM_ADJUST, &reg_val); + if (ret) + return ret; + + ilm_adjust = FIELD_GET(LTC4282_ILIM_ADJUST_MASK, reg_val); + st->vsense_max = ltc4282_curr_lim_uv[ilm_adjust]; + + st->in0_1_cache[LTC4282_CHAN_VSOURCE].en = FIELD_GET(LTC4282_VDD_MONITOR_MASK, + ilm_adjust); + if (!st->in0_1_cache[LTC4282_CHAN_VSOURCE].en) { + st->in0_1_cache[LTC4282_CHAN_VDD].en = true; + return regmap_read(st->map, LTC4282_VSOURCE_MAX, + &st->in0_1_cache[LTC4282_CHAN_VSOURCE].in_max_raw); + } + + return regmap_read(st->map, LTC4282_VSOURCE_MAX, + &st->in0_1_cache[LTC4282_CHAN_VDD].in_max_raw); +} + +/* + * Set max limits for ISENSE and Power as that depends on the max voltage on + * rsense that is defined in ILIM_ADJUST. This is specially important for power + * because for some rsense and vfsout values, if we allow the default raw 255 + * value, that would overflow long in 32bit archs when reading back the max + * power limit. + * + * Also set meaningful historic values for VDD and VSOURCE + * (0 would not mean much). + */ +static int ltc4282_set_max_limits(struct ltc4282_state *st) +{ + int ret; + + ret = ltc4282_write_voltage_byte(st, LTC4282_VSENSE_MAX, 40 * MILLI, + st->vsense_max); + if (ret) + return ret; + + /* Power is given by ISENSE * Vout.
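+ * Rough worked example (shunt value assumed for illustration): with a + * 2 mOhm shunt (rsense = 20000) and the largest vsense_max of 34375 uV, + * Imax = 34375 * DECA * MILLI / 20000 ~= 17188 mA (about 17.2 A), and + * power_max below is that current times vfs_out, in uW.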
*/ + st->power_max = DIV_ROUND_CLOSEST(st->vsense_max * DECA * MILLI, st->rsense) * st->vfs_out; + ret = ltc4282_write_power_byte(st, LTC4282_POWER_MAX, st->power_max); + if (ret) + return ret; + + if (st->in0_1_cache[LTC4282_CHAN_VDD].en) { + st->in0_1_cache[LTC4282_CHAN_VSOURCE].in_lowest = st->vfs_out; + return __ltc4282_in_write_history(st, LTC4282_VSOURCE_LOWEST, + st->vdd, 0, st->vfs_out); + } + + st->in0_1_cache[LTC4282_CHAN_VDD].in_lowest = st->vdd; + return __ltc4282_in_write_history(st, LTC4282_VSOURCE_LOWEST, + st->vfs_out, 0, st->vfs_out); +} + +static const char * const ltc4282_gpio1_modes[] = { + "power_bad", "power_good" +}; + +static const char * const ltc4282_gpio2_modes[] = { + "adc_input", "stress_fet" +}; + +static int ltc4282_gpio_setup(struct ltc4282_state *st, struct device *dev) +{ + const char *func = NULL; + int ret; + + ret = device_property_read_string(dev, "adi,gpio1-mode", &func); + if (!ret) { + ret = match_string(ltc4282_gpio1_modes, + ARRAY_SIZE(ltc4282_gpio1_modes), func); + if (ret < 0) + return dev_err_probe(dev, ret, + "Invalid func(%s) for gpio1\n", + func); + + ret = regmap_update_bits(st->map, LTC4282_GPIO_CONFIG, + LTC4282_GPIO_1_CONFIG_MASK, + FIELD_PREP(LTC4282_GPIO_1_CONFIG_MASK, ret)); + if (ret) + return ret; + } + + ret = device_property_read_string(dev, "adi,gpio2-mode", &func); + if (!ret) { + ret = match_string(ltc4282_gpio2_modes, + ARRAY_SIZE(ltc4282_gpio2_modes), func); + if (ret < 0) + return dev_err_probe(dev, ret, + "Invalid func(%s) for gpio2\n", + func); + if (!ret) { + /* setting the bit to 1 so the ADC monitors GPIO2 */ + ret = regmap_set_bits(st->map, LTC4282_ILIM_ADJUST, + LTC4282_GPIO_MODE_MASK); + } else { + ret = regmap_update_bits(st->map, LTC4282_GPIO_CONFIG, + LTC4282_GPIO_2_FET_STRESS_MASK, + FIELD_PREP(LTC4282_GPIO_2_FET_STRESS_MASK, 1)); + } + + if (ret) + return ret; + } + + if (!device_property_read_bool(dev, "adi,gpio3-monitor-enable")) + return 0; + + if (func && !strcmp(func, "adc_input")) + return dev_err_probe(dev, -EINVAL, + "Cannot have both gpio2 and gpio3 muxed into the ADC"); + + return regmap_clear_bits(st->map, LTC4282_ILIM_ADJUST, + LTC4282_GPIO_MODE_MASK); +} + +static const char * const ltc4282_dividers[] = { + "external", "vdd_5_percent", "vdd_10_percent", "vdd_15_percent" +}; + +/* This maps the Vout full scale for the given Vin mode */ +static const u16 ltc4282_vfs_milli[] = { 5540, 8320, 16640, 33280 }; + +static const u16 ltc4282_vdd_milli[] = { 3300, 5000, 12000, 24000 }; + +enum { + LTC4282_VIN_3_3V, + LTC4282_VIN_5V, + LTC4282_VIN_12V, + LTC4282_VIN_24V, +}; + +static int ltc4282_setup(struct ltc4282_state *st, struct device *dev) +{ + const char *divider; + u32 val, vin_mode; + int ret; + + /* The part has an eeprom so let's get the needed defaults from it */ + ret = ltc4282_get_defaults(st, &vin_mode); + if (ret) + return ret; + + ret = device_property_read_u32(dev, "adi,rsense-nano-ohms", + &st->rsense); + if (ret) + return dev_err_probe(dev, ret, + "Failed to read adi,rsense-nano-ohms\n"); + if (st->rsense < CENTI) + return dev_err_probe(dev, -EINVAL, + "adi,rsense-nano-ohms too small (< %lu)\n", + CENTI); + + /* + * The resolution for rsense is tenths of micro (eg: 62.5 uOhm) which + * means we need nano in the bindings. However, to make things easier to + * handle (with respect to overflows) we divide it by 100 as we don't + * really need the last two digits.
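+ * For instance, the 62.5 uOhm shunt mentioned above is specified in the + * devicetree as 62500 nano-ohms and ends up stored as 625 after the + * division below.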
+ */ + st->rsense /= CENTI; + + val = vin_mode; + ret = device_property_read_u32(dev, "adi,vin-mode-microvolt", &val); + if (!ret) { + switch (val) { + case 3300000: + val = LTC4282_VIN_3_3V; + break; + case 5000000: + val = LTC4282_VIN_5V; + break; + case 12000000: + val = LTC4282_VIN_12V; + break; + case 24000000: + val = LTC4282_VIN_24V; + break; + default: + return dev_err_probe(dev, -EINVAL, + "Invalid val(%u) for vin-mode-microvolt\n", + val); + } + + ret = regmap_update_bits(st->map, LTC4282_CTRL_MSB, + LTC4282_CTRL_VIN_MODE_MASK, + FIELD_PREP(LTC4282_CTRL_VIN_MODE_MASK, val)); + if (ret) + return ret; + + /* Foldback mode should also be set to the input voltage */ + ret = regmap_update_bits(st->map, LTC4282_ILIM_ADJUST, + LTC4282_FOLDBACK_MODE_MASK, + FIELD_PREP(LTC4282_FOLDBACK_MODE_MASK, val)); + if (ret) + return ret; + } + + st->vfs_out = ltc4282_vfs_milli[val]; + st->vdd = ltc4282_vdd_milli[val]; + + ret = device_property_read_u32(dev, "adi,current-limit-sense-microvolt", + &st->vsense_max); + if (!ret) { + int reg_val; + + switch (st->vsense_max) { + case 12500: + reg_val = 0; + break; + case 15625: + reg_val = 1; + break; + case 18750: + reg_val = 2; + break; + case 21875: + reg_val = 3; + break; + case 25000: + reg_val = 4; + break; + case 28125: + reg_val = 5; + break; + case 31250: + reg_val = 6; + break; + case 34375: + reg_val = 7; + break; + default: + return dev_err_probe(dev, -EINVAL, + "Invalid val(%u) for adi,current-limit-microvolt\n", + st->vsense_max); + } + + ret = regmap_update_bits(st->map, LTC4282_ILIM_ADJUST, + LTC4282_ILIM_ADJUST_MASK, + FIELD_PREP(LTC4282_ILIM_ADJUST_MASK, reg_val)); + if (ret) + return ret; + } + + ret = ltc4282_set_max_limits(st); + if (ret) + return ret; + + ret = device_property_read_string(dev, "adi,overvoltage-dividers", + &divider); + if (!ret) { + int div = match_string(ltc4282_dividers, + ARRAY_SIZE(ltc4282_dividers), divider); + if (div < 0) + return dev_err_probe(dev, -EINVAL, + "Invalid val(%s) for adi,overvoltage-divider\n", + divider); + + ret = regmap_update_bits(st->map, LTC4282_CTRL_MSB, + LTC4282_CTRL_OV_MODE_MASK, + FIELD_PREP(LTC4282_CTRL_OV_MODE_MASK, div)); + } + + ret = device_property_read_string(dev, "adi,undervoltage-dividers", + &divider); + if (!ret) { + int div = match_string(ltc4282_dividers, + ARRAY_SIZE(ltc4282_dividers), divider); + if (div < 0) + return dev_err_probe(dev, -EINVAL, + "Invalid val(%s) for adi,undervoltage-divider\n", + divider); + + ret = regmap_update_bits(st->map, LTC4282_CTRL_MSB, + LTC4282_CTRL_UV_MODE_MASK, + FIELD_PREP(LTC4282_CTRL_UV_MODE_MASK, div)); + } + + if (device_property_read_bool(dev, "adi,overcurrent-retry")) { + ret = regmap_set_bits(st->map, LTC4282_CTRL_LSB, + LTC4282_CTRL_OC_RETRY_MASK); + if (ret) + return ret; + } + + if (device_property_read_bool(dev, "adi,overvoltage-retry-disable")) { + ret = regmap_clear_bits(st->map, LTC4282_CTRL_LSB, + LTC4282_CTRL_OV_RETRY_MASK); + if (ret) + return ret; + } + + if (device_property_read_bool(dev, "adi,undervoltage-retry-disable")) { + ret = regmap_clear_bits(st->map, LTC4282_CTRL_LSB, + LTC4282_CTRL_UV_RETRY_MASK); + if (ret) + return ret; + } + + if (device_property_read_bool(dev, "adi,fault-log-enable")) { + ret = regmap_set_bits(st->map, LTC4282_ADC_CTRL, + LTC4282_FAULT_LOG_EN_MASK); + if (ret) + return ret; + } + + ret = device_property_read_u32(dev,
"adi,fet-bad-timeout-ms", &val); + if (!ret) { + if (val > LTC4282_FET_BAD_MAX_TIMEOUT) + return dev_err_probe(dev, -EINVAL, + "Invalid value(%u) for adi,fet-bad-timeout-ms", + val); + + ret = regmap_write(st->map, LTC4282_FET_BAD_FAULT_TIMEOUT, val); + if (ret) + return ret; + } + + return ltc4282_gpio_setup(st, dev); +} + +static bool ltc4282_readable_reg(struct device *dev, unsigned int reg) +{ + if (reg == LTC4282_RESERVED_1 || reg == LTC4282_RESERVED_2) + return false; + + return true; +} + +static bool ltc4282_writable_reg(struct device *dev, unsigned int reg) +{ + if (reg == LTC4282_STATUS_LSB || reg == LTC4282_STATUS_MSB) + return false; + if (reg == LTC4282_RESERVED_1 || reg == LTC4282_RESERVED_2) + return false; + + return true; +} + +static const struct regmap_config ltc4282_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = LTC4282_RESERVED_3, + .readable_reg = ltc4282_readable_reg, + .writeable_reg = ltc4282_writable_reg, +}; + +static const struct hwmon_channel_info * const ltc4282_info[] = { + HWMON_CHANNEL_INFO(in, + HWMON_I_INPUT | HWMON_I_LOWEST | HWMON_I_HIGHEST | + HWMON_I_MAX | HWMON_I_MIN | HWMON_I_MIN_ALARM | + HWMON_I_MAX_ALARM | HWMON_I_ENABLE | + HWMON_I_RESET_HISTORY | HWMON_I_FAULT | + HWMON_I_LABEL, + HWMON_I_INPUT | HWMON_I_LOWEST | HWMON_I_HIGHEST | + HWMON_I_MAX | HWMON_I_MIN | HWMON_I_MIN_ALARM | + HWMON_I_MAX_ALARM | HWMON_I_LCRIT_ALARM | + HWMON_I_CRIT_ALARM | HWMON_I_ENABLE | + HWMON_I_RESET_HISTORY | HWMON_I_LABEL, + HWMON_I_INPUT | HWMON_I_LOWEST | HWMON_I_HIGHEST | + HWMON_I_MAX | HWMON_I_MIN | HWMON_I_MIN_ALARM | + HWMON_I_RESET_HISTORY | HWMON_I_MAX_ALARM | + HWMON_I_LABEL), + HWMON_CHANNEL_INFO(curr, + HWMON_C_INPUT | HWMON_C_LOWEST | HWMON_C_HIGHEST | + HWMON_C_MAX | HWMON_C_MIN | HWMON_C_MIN_ALARM | + HWMON_C_MAX_ALARM | HWMON_C_CRIT_ALARM | + HWMON_C_RESET_HISTORY | HWMON_C_LABEL), + HWMON_CHANNEL_INFO(power, + HWMON_P_INPUT | HWMON_P_INPUT_LOWEST | + HWMON_P_INPUT_HIGHEST | HWMON_P_MAX | HWMON_P_MIN | + HWMON_P_MAX_ALARM | HWMON_P_MIN_ALARM | + HWMON_P_RESET_HISTORY | HWMON_P_LABEL), + HWMON_CHANNEL_INFO(energy, + HWMON_E_ENABLE), + NULL +}; + +static const struct hwmon_ops ltc4282_hwmon_ops = { + .read = ltc4282_read, + .write = ltc4282_write, + .is_visible = ltc4282_is_visible, + .read_string = ltc4282_read_labels, +}; + +static const struct hwmon_chip_info ltc2947_chip_info = { + .ops = <c4282_hwmon_ops, + .info = ltc4282_info, +}; + +/* energy attributes are 6bytes wide so we need u64 */ +static SENSOR_DEVICE_ATTR_RO(energy1_input, ltc4282_energy, 0); + +static struct attribute *ltc4282_attrs[] = { + &sensor_dev_attr_energy1_input.dev_attr.attr, + NULL +}; +ATTRIBUTE_GROUPS(ltc4282); + +static int ltc4282_show_fault_log(void *arg, u64 *val, u32 mask) +{ + struct ltc4282_state *st = arg; + long alarm; + int ret; + + ret = ltc4282_read_alarm(st, LTC4282_FAULT_LOG, mask, &alarm); + if (ret) + return ret; + + *val = alarm; + + return 0; +} + +static int ltc4282_show_curr1_crit_fault_log(void *arg, u64 *val) +{ + return ltc4282_show_fault_log(arg, val, LTC4282_OC_FAULT_MASK); +} +DEFINE_DEBUGFS_ATTRIBUTE(ltc4282_curr1_crit_fault_log, + ltc4282_show_curr1_crit_fault_log, NULL, "%llu\n"); + +static int ltc4282_show_in1_lcrit_fault_log(void *arg, u64 *val) +{ + return ltc4282_show_fault_log(arg, val, LTC4282_UV_FAULT_MASK); +} +DEFINE_DEBUGFS_ATTRIBUTE(ltc4282_in1_lcrit_fault_log, + ltc4282_show_in1_lcrit_fault_log, NULL, "%llu\n"); + +static int ltc4282_show_in1_crit_fault_log(void *arg, u64 *val) +{ + return 
ltc4282_show_fault_log(arg, val, LTC4282_OV_FAULT_MASK); +} +DEFINE_DEBUGFS_ATTRIBUTE(ltc4282_in1_crit_fault_log, + ltc4282_show_in1_crit_fault_log, NULL, "%llu\n"); + +static int ltc4282_show_fet_bad_fault_log(void *arg, u64 *val) +{ + return ltc4282_show_fault_log(arg, val, LTC4282_FET_BAD_FAULT_MASK); +} +DEFINE_DEBUGFS_ATTRIBUTE(ltc4282_fet_bad_fault_log, + ltc4282_show_fet_bad_fault_log, NULL, "%llu\n"); + +static int ltc4282_show_fet_short_fault_log(void *arg, u64 *val) +{ + return ltc4282_show_fault_log(arg, val, LTC4282_FET_SHORT_FAULT_MASK); +} +DEFINE_DEBUGFS_ATTRIBUTE(ltc4282_fet_short_fault_log, + ltc4282_show_fet_short_fault_log, NULL, "%llu\n"); + +static int ltc4282_show_power1_bad_fault_log(void *arg, u64 *val) +{ + return ltc4282_show_fault_log(arg, val, LTC4282_POWER_BAD_FAULT_MASK); +} +DEFINE_DEBUGFS_ATTRIBUTE(ltc4282_power1_bad_fault_log, + ltc4282_show_power1_bad_fault_log, NULL, "%llu\n"); + +static void ltc4282_debugfs_remove(void *dir) +{ + debugfs_remove_recursive(dir); +} + +static void ltc4282_debugfs_init(struct ltc4282_state *st, + struct i2c_client *i2c, + const struct device *hwmon) +{ + const char *debugfs_name; + struct dentry *dentry; + int ret; + + if (!IS_ENABLED(CONFIG_DEBUG_FS)) + return; + + debugfs_name = devm_kasprintf(&i2c->dev, GFP_KERNEL, "ltc4282-%s", + dev_name(hwmon)); + if (!debugfs_name) + return; + + dentry = debugfs_create_dir(debugfs_name, NULL); + if (IS_ERR(dentry)) + return; + + ret = devm_add_action_or_reset(&i2c->dev, ltc4282_debugfs_remove, + dentry); + if (ret) + return; + + debugfs_create_file_unsafe("power1_bad_fault_log", 0400, dentry, st, + &ltc4282_power1_bad_fault_log); + debugfs_create_file_unsafe("in0_fet_short_fault_log", 0400, dentry, st, + &ltc4282_fet_short_fault_log); + debugfs_create_file_unsafe("in0_fet_bad_fault_log", 0400, dentry, st, + &ltc4282_fet_bad_fault_log); + debugfs_create_file_unsafe("in1_crit_fault_log", 0400, dentry, st, + &ltc4282_in1_crit_fault_log); + debugfs_create_file_unsafe("in1_lcrit_fault_log", 0400, dentry, st, + &ltc4282_in1_lcrit_fault_log); + debugfs_create_file_unsafe("curr1_crit_fault_log", 0400, dentry, st, + &ltc4282_curr1_crit_fault_log); +} + +static int ltc4282_probe(struct i2c_client *i2c) +{ + struct device *dev = &i2c->dev, *hwmon; + struct ltc4282_state *st; + int ret; + + st = devm_kzalloc(dev, sizeof(*st), GFP_KERNEL); + if (!st) + return dev_err_probe(dev, -ENOMEM, + "Failed to allocate memory\n"); + + st->map = devm_regmap_init_i2c(i2c, &ltc4282_regmap_config); + if (IS_ERR(st->map)) + return dev_err_probe(dev, PTR_ERR(st->map), + "failed regmap init\n"); + + /* Soft reset */ + ret = regmap_set_bits(st->map, LTC4282_ADC_CTRL, LTC4282_RESET_MASK); + if (ret) + return ret; + + /* Yes, it's big but it is as specified in the datasheet */ + msleep(3200); + + ret = ltc428_clks_setup(st, dev); + if (ret) + return ret; + + ret = ltc4282_setup(st, dev); + if (ret) + return ret; + + mutex_init(&st->lock); + hwmon = devm_hwmon_device_register_with_info(dev, "ltc4282", st, + &ltc2947_chip_info, + ltc4282_groups); + if (IS_ERR(hwmon)) + return PTR_ERR(hwmon); + + ltc4282_debugfs_init(st, i2c, hwmon); + + return 0; +} + +static const struct of_device_id ltc4282_of_match[] = { + { .compatible = "adi,ltc4282" }, + {} +}; +MODULE_DEVICE_TABLE(of, ltc4282_of_match); + +static struct i2c_driver ltc4282_driver = { + .driver = { + .name = "ltc4282", + .of_match_table = ltc4282_of_match, + }, + .probe = ltc4282_probe, +}; +module_i2c_driver(ltc4282_driver); + +MODULE_AUTHOR("Nuno Sa ");
+MODULE_DESCRIPTION("LTC4282 I2C High Current Hot Swap Controller"); +MODULE_LICENSE("GPL"); From c34fd707333a94e71745f01c2fd91f383a113880 Mon Sep 17 00:00:00 2001 From: Ivor Wanders Date: Tue, 30 Jan 2024 19:58:55 -0500 Subject: [PATCH 0307/1406] hwmon: add fan speed monitoring driver for Surface devices Adds a driver that provides read only access to the fan speed for Microsoft Surface Pro devices. The fan speed is always regulated by the EC and cannot be influenced directly. Signed-off-by: Ivor Wanders Link: https://github.com/linux-surface/kernel/pull/144 Reviewed-by: Maximilian Luz Reviewed-by: Armin Wolf Link: https://lore.kernel.org/r/20240131005856.10180-2-ivor@iwanders.net [groeck: - Declare surface_fan_hwmon_is_visible() static - Add dependency on SURFACE_AGGREGATOR_BUS ] Signed-off-by: Guenter Roeck --- Documentation/hwmon/index.rst | 1 + Documentation/hwmon/surface_fan.rst | 25 ++++++++ MAINTAINERS | 8 +++ drivers/hwmon/Kconfig | 14 +++++ drivers/hwmon/Makefile | 1 + drivers/hwmon/surface_fan.c | 91 +++++++++++++++++++++++++++++ 6 files changed, 140 insertions(+) create mode 100644 Documentation/hwmon/surface_fan.rst create mode 100644 drivers/hwmon/surface_fan.c diff --git a/Documentation/hwmon/index.rst b/Documentation/hwmon/index.rst index f16c6dfaec7dc9..8f73badecba11c 100644 --- a/Documentation/hwmon/index.rst +++ b/Documentation/hwmon/index.rst @@ -209,6 +209,7 @@ Hardware Monitoring Kernel Drivers smsc47m1 sparx5-temp stpddc60 + surface_fan sy7636a-hwmon tc654 tc74 diff --git a/Documentation/hwmon/surface_fan.rst b/Documentation/hwmon/surface_fan.rst new file mode 100644 index 00000000000000..07942574c4f0cf --- /dev/null +++ b/Documentation/hwmon/surface_fan.rst @@ -0,0 +1,25 @@ +.. SPDX-License-Identifier: GPL-2.0-or-later + +Kernel driver surface_fan +========================= + +Supported Devices: + + * Microsoft Surface Pro 9 + +Author: Ivor Wanders + +Description +----------- + +This provides monitoring of the fan found in some Microsoft Surface Pro devices, +like the Surface Pro 9. The fan is always controlled by the onboard controller. + +Sysfs interface +--------------- + +======================= ======= ========================================= +Name Perm Description +======================= ======= ========================================= +``fan1_input`` RO Current fan speed in RPM. +======================= ======= ========================================= diff --git a/MAINTAINERS b/MAINTAINERS index df8ea2d94a0cb7..dd69696f3c0e83 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14549,6 +14549,14 @@ F: Documentation/driver-api/surface_aggregator/clients/dtx.rst F: drivers/platform/surface/surface_dtx.c F: include/uapi/linux/surface_aggregator/dtx.h +MICROSOFT SURFACE SENSOR FAN DRIVER +M: Maximilian Luz +M: Ivor Wanders +L: linux-hwmon@vger.kernel.org +S: Maintained +F: Documentation/hwmon/surface_fan.rst +F: drivers/hwmon/surface_fan.c + MICROSOFT SURFACE GPE LID SUPPORT DRIVER M: Maximilian Luz L: platform-driver-x86@vger.kernel.org diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index f6160cc7007773..cdf228fe50e2be 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -2005,6 +2005,20 @@ config SENSORS_SFCTEMP This driver can also be built as a module. If so, the module will be called sfctemp. +config SENSORS_SURFACE_FAN + tristate "Surface Fan Driver" + depends on SURFACE_AGGREGATOR + depends on SURFACE_AGGREGATOR_BUS + help + Driver that provides monitoring of the fan on Surface Pro devices that + have a fan, like the Surface Pro 9. 
+ + This makes the fan's current speed accessible through the hwmon + system. It does not provide control over the fan, the firmware is + responsible for that, this driver merely provides monitoring. + + Select M or Y here, if you want to be able to read the fan's speed. + config SENSORS_ADC128D818 tristate "Texas Instruments ADC128D818" depends on I2C diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index 8bfc422a29e532..a49704cd48a4b5 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -202,6 +202,7 @@ obj-$(CONFIG_SENSORS_SMSC47M1) += smsc47m1.o obj-$(CONFIG_SENSORS_SMSC47M192)+= smsc47m192.o obj-$(CONFIG_SENSORS_SPARX5) += sparx5-temp.o obj-$(CONFIG_SENSORS_STTS751) += stts751.o +obj-$(CONFIG_SENSORS_SURFACE_FAN)+= surface_fan.o obj-$(CONFIG_SENSORS_SY7636A) += sy7636a-hwmon.o obj-$(CONFIG_SENSORS_AMC6821) += amc6821.o obj-$(CONFIG_SENSORS_TC74) += tc74.o diff --git a/drivers/hwmon/surface_fan.c b/drivers/hwmon/surface_fan.c new file mode 100644 index 00000000000000..de3c5a2409c618 --- /dev/null +++ b/drivers/hwmon/surface_fan.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Surface Fan driver for Surface System Aggregator Module. It provides access + * to the fan's rpm through the hwmon system. + * + * Copyright (C) 2023 Ivor Wanders + */ + +#include +#include +#include +#include +#include + +// SSAM +SSAM_DEFINE_SYNC_REQUEST_CL_R(__ssam_fan_rpm_get, __le16, { + .target_category = SSAM_SSH_TC_FAN, + .command_id = 0x01, +}); + +// hwmon +static umode_t surface_fan_hwmon_is_visible(const void *drvdata, + enum hwmon_sensor_types type, u32 attr, + int channel) +{ + return 0444; +} + +static int surface_fan_hwmon_read(struct device *dev, + enum hwmon_sensor_types type, u32 attr, + int channel, long *val) +{ + struct ssam_device *sdev = dev_get_drvdata(dev); + int ret; + __le16 value; + + ret = __ssam_fan_rpm_get(sdev, &value); + if (ret) + return ret; + + *val = le16_to_cpu(value); + + return 0; +} + +static const struct hwmon_channel_info *const surface_fan_info[] = { + HWMON_CHANNEL_INFO(fan, HWMON_F_INPUT), + NULL +}; + +static const struct hwmon_ops surface_fan_hwmon_ops = { + .is_visible = surface_fan_hwmon_is_visible, + .read = surface_fan_hwmon_read, +}; + +static const struct hwmon_chip_info surface_fan_chip_info = { + .ops = &surface_fan_hwmon_ops, + .info = surface_fan_info, +}; + +static int surface_fan_probe(struct ssam_device *sdev) +{ + struct device *hdev; + + hdev = devm_hwmon_device_register_with_info(&sdev->dev, + "surface_fan", sdev, + &surface_fan_chip_info, + NULL); + + return PTR_ERR_OR_ZERO(hdev); +} + +static const struct ssam_device_id ssam_fan_match[] = { + { SSAM_SDEV(FAN, SAM, 0x01, 0x01) }, + {}, +}; +MODULE_DEVICE_TABLE(ssam, ssam_fan_match); + +static struct ssam_device_driver surface_fan = { + .probe = surface_fan_probe, + .match_table = ssam_fan_match, + .driver = { + .name = "surface_fan", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, + }, +}; +module_ssam_device_driver(surface_fan); + +MODULE_AUTHOR("Ivor Wanders "); +MODULE_DESCRIPTION("Fan Driver for Surface System Aggregator Module"); +MODULE_LICENSE("GPL"); From ce70fba87f41bf61ecea2e0e90758779b084f2bb Mon Sep 17 00:00:00 2001 From: Charles Hsu Date: Wed, 31 Jan 2024 15:48:21 +0800 Subject: [PATCH 0308/1406] dt-bindings: Add MPQ8785 voltage regulator device Monolithic Power Systems, Inc. (MPS) synchronous step-down converter. 
Signed-off-by: Charles Hsu Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240131074822.2962078-1-ythsu0511@gmail.com Signed-off-by: Guenter Roeck --- Documentation/devicetree/bindings/trivial-devices.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/trivial-devices.yaml b/Documentation/devicetree/bindings/trivial-devices.yaml index 79dcd92c4a4345..088b23ed2ae6c8 100644 --- a/Documentation/devicetree/bindings/trivial-devices.yaml +++ b/Documentation/devicetree/bindings/trivial-devices.yaml @@ -129,6 +129,8 @@ properties: - mps,mp2975 # Monolithic Power Systems Inc. multi-phase hot-swap controller mp5990 - mps,mp5990 + # Monolithic Power Systems Inc. synchronous step-down converter mpq8785 + - mps,mpq8785 # Honeywell Humidicon HIH-6130 humidity/temperature sensor - honeywell,hi6130 # IBM Common Form Factor Power Supply Versions (all versions) From c5068b4d9076ec558522af0b50a589e22c674efc Mon Sep 17 00:00:00 2001 From: Charles Hsu Date: Wed, 31 Jan 2024 15:48:22 +0800 Subject: [PATCH 0309/1406] hwmon: Add driver for MPS MPQ8785 Synchronous Step-Down Converter Add support for mpq8785 device from Monolithic Power Systems, Inc. (MPS) vendor. This is synchronous step-down controller. Signed-off-by: Charles Hsu Link: https://lore.kernel.org/r/20240131074822.2962078-2-ythsu0511@gmail.com [groeck: probe_new --> probe; add MODULE_IMPORT_NS(PMBUS)] Signed-off-by: Guenter Roeck --- Documentation/hwmon/index.rst | 1 + Documentation/hwmon/mpq8785.rst | 94 +++++++++++++++++++++++++++++++++ drivers/hwmon/pmbus/Kconfig | 9 ++++ drivers/hwmon/pmbus/Makefile | 1 + drivers/hwmon/pmbus/mpq8785.c | 90 +++++++++++++++++++++++++++++++ 5 files changed, 195 insertions(+) create mode 100644 Documentation/hwmon/mpq8785.rst create mode 100644 drivers/hwmon/pmbus/mpq8785.c diff --git a/Documentation/hwmon/index.rst b/Documentation/hwmon/index.rst index 8f73badecba11c..9ac087dd3e78d1 100644 --- a/Documentation/hwmon/index.rst +++ b/Documentation/hwmon/index.rst @@ -164,6 +164,7 @@ Hardware Monitoring Kernel Drivers mp2975 mp5023 mp5990 + mpq8785 nct6683 nct6775 nct7802 diff --git a/Documentation/hwmon/mpq8785.rst b/Documentation/hwmon/mpq8785.rst new file mode 100644 index 00000000000000..bf8176b8708687 --- /dev/null +++ b/Documentation/hwmon/mpq8785.rst @@ -0,0 +1,94 @@ +.. SPDX-License-Identifier: GPL-2.0-only + +Kernel driver mpq8785 +======================= + +Supported chips: + + * MPS MPQ8785 + + Prefix: 'mpq8785' + +Author: Charles Hsu + +Description +----------- + +The MPQ8785 is a fully integrated, PMBus-compatible, high-frequency, synchronous +buck converter. The MPQ8785 offers a very compact solution that achieves up to +40A output current per phase, with excellent load and line regulation over a +wide input supply range. The MPQ8785 operates at high efficiency over a wide +output current load range. + +The PMBus interface provides converter configurations and key parameters +monitoring. + +The MPQ8785 adopts MPS's proprietary multi-phase digital constant-on-time (MCOT) +control, which provides fast transient response and eases loop stabilization. +The MCOT scheme also allows multiple MPQ8785 devices to be connected in parallel +with excellent current sharing and phase interleaving for high-current +applications. + +Fully integrated protection features include over-current protection (OCP), +over-voltage protection (OVP), under-voltage protection (UVP), and +over-temperature protection (OTP). 
+ +The MPQ8785 requires a minimal number of readily available, standard external +components, and is available in a TLGA (5mmx6mm) package. + +Device compliant with: + +- PMBus rev 1.3 interface. + +The driver exports the following attributes via the 'sysfs' files +for input voltage: + +**in1_input** + +**in1_label** + +**in1_max** + +**in1_max_alarm** + +**in1_min** + +**in1_min_alarm** + +**in1_crit** + +**in1_crit_alarm** + +The driver provides the following attributes for output voltage: + +**in2_input** + +**in2_label** + +**in2_alarm** + +The driver provides the following attributes for output current: + +**curr1_input** + +**curr1_label** + +**curr1_max** + +**curr1_max_alarm** + +**curr1_crit** + +**curr1_crit_alarm** + +The driver provides the following attributes for temperature: + +**temp1_input** + +**temp1_max** + +**temp1_max_alarm** + +**temp1_crit** + +**temp1_crit_alarm** diff --git a/drivers/hwmon/pmbus/Kconfig b/drivers/hwmon/pmbus/Kconfig index 294808f5240abd..557ae0c414b09e 100644 --- a/drivers/hwmon/pmbus/Kconfig +++ b/drivers/hwmon/pmbus/Kconfig @@ -377,6 +377,15 @@ config SENSORS_MPQ7932 This driver can also be built as a module. If so, the module will be called mpq7932. +config SENSORS_MPQ8785 + tristate "MPS MPQ8785" + help + If you say yes here you get hardware monitoring functionality support + for power management IC MPS MPQ8785. + + This driver can also be built as a module. If so, the module will + be called mpq8785. + config SENSORS_PIM4328 tristate "Flex PIM4328 and compatibles" help diff --git a/drivers/hwmon/pmbus/Makefile b/drivers/hwmon/pmbus/Makefile index cf8a767445456a..f14ecf03ad7790 100644 --- a/drivers/hwmon/pmbus/Makefile +++ b/drivers/hwmon/pmbus/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_SENSORS_MP2975) += mp2975.o obj-$(CONFIG_SENSORS_MP5023) += mp5023.o obj-$(CONFIG_SENSORS_MP5990) += mp5990.o obj-$(CONFIG_SENSORS_MPQ7932) += mpq7932.o +obj-$(CONFIG_SENSORS_MPQ8785) += mpq8785.o obj-$(CONFIG_SENSORS_PLI1209BC) += pli1209bc.o obj-$(CONFIG_SENSORS_PM6764TR) += pm6764tr.o obj-$(CONFIG_SENSORS_PXE1610) += pxe1610.o diff --git a/drivers/hwmon/pmbus/mpq8785.c b/drivers/hwmon/pmbus/mpq8785.c new file mode 100644 index 00000000000000..4e2549cc81203f --- /dev/null +++ b/drivers/hwmon/pmbus/mpq8785.c @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Driver for MPS MPQ8785 Step-Down Converter + */ + +#include +#include +#include +#include "pmbus.h" + +static int mpq8785_identify(struct i2c_client *client, + struct pmbus_driver_info *info) +{ + int vout_mode; + + vout_mode = pmbus_read_byte_data(client, 0, PMBUS_VOUT_MODE); + if (vout_mode < 0 || vout_mode == 0xff) + return vout_mode < 0 ? 
vout_mode : -ENODEV;
+	switch (vout_mode >> 5) {
+	case 0:
+		info->format[PSC_VOLTAGE_OUT] = linear;
+		break;
+	case 1:
+	case 2:
+		info->format[PSC_VOLTAGE_OUT] = direct;
+		info->m[PSC_VOLTAGE_OUT] = 64;
+		info->b[PSC_VOLTAGE_OUT] = 0;
+		info->R[PSC_VOLTAGE_OUT] = 1;
+		break;
+	default:
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+static struct pmbus_driver_info mpq8785_info = {
+	.pages = 1,
+	.format[PSC_VOLTAGE_IN] = direct,
+	.format[PSC_CURRENT_OUT] = direct,
+	.format[PSC_TEMPERATURE] = direct,
+	.m[PSC_VOLTAGE_IN] = 4,
+	.b[PSC_VOLTAGE_IN] = 0,
+	.R[PSC_VOLTAGE_IN] = 1,
+	.m[PSC_CURRENT_OUT] = 16,
+	.b[PSC_CURRENT_OUT] = 0,
+	.R[PSC_CURRENT_OUT] = 0,
+	.m[PSC_TEMPERATURE] = 1,
+	.b[PSC_TEMPERATURE] = 0,
+	.R[PSC_TEMPERATURE] = 0,
+	.func[0] =
+		PMBUS_HAVE_VIN | PMBUS_HAVE_STATUS_INPUT |
+		PMBUS_HAVE_VOUT | PMBUS_HAVE_STATUS_VOUT |
+		PMBUS_HAVE_IOUT | PMBUS_HAVE_STATUS_IOUT |
+		PMBUS_HAVE_TEMP | PMBUS_HAVE_STATUS_TEMP,
+	.identify = mpq8785_identify,
+};
+
+static int mpq8785_probe(struct i2c_client *client)
+{
+	return pmbus_do_probe(client, &mpq8785_info);
+}
+
+static const struct i2c_device_id mpq8785_id[] = {
+	{ "mpq8785", 0 },
+	{ },
+};
+MODULE_DEVICE_TABLE(i2c, mpq8785_id);
+
+static const struct of_device_id __maybe_unused mpq8785_of_match[] = {
+	{ .compatible = "mps,mpq8785" },
+	{}
+};
+MODULE_DEVICE_TABLE(of, mpq8785_of_match);
+
+static struct i2c_driver mpq8785_driver = {
+	.driver = {
+		.name = "mpq8785",
+		.of_match_table = of_match_ptr(mpq8785_of_match),
+	},
+	.probe = mpq8785_probe,
+	.id_table = mpq8785_id,
+};
+
+module_i2c_driver(mpq8785_driver);
+
+MODULE_AUTHOR("Charles Hsu ");
+MODULE_DESCRIPTION("PMBus driver for MPS MPQ8785");
+MODULE_LICENSE("GPL");
+MODULE_IMPORT_NS(PMBUS);
From 11d1be0fd07e78f4a684b5dc153a04322432688f Mon Sep 17 00:00:00 2001
From: Stefan Gloor
Date: Wed, 31 Jan 2024 12:15:12 +0100
Subject: [PATCH 0310/1406] hwmon: (sht3x) read out sensor serial number

The temperature/humidity sensors of the STS3x/SHT3x family are
calibrated and factory-programmed with a unique serial number. For some
sensors, this serial number can be used to obtain a calibration
certificate via an API provided by the manufacturer (Sensirion).

Expose the serial number via debugfs.

Tested with: 2x STS31, 1x STS32, 1x SHT31

Signed-off-by: Stefan Gloor
Link: https://lore.kernel.org/r/20240131111512.25321-2-code@stefan-gloor.ch
Signed-off-by: Guenter Roeck
---
 Documentation/hwmon/sht3x.rst | 11 ++++++
 drivers/hwmon/sht3x.c         | 66 ++++++++++++++++++++++++++++++++++-
 2 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/Documentation/hwmon/sht3x.rst b/Documentation/hwmon/sht3x.rst
index 957c854f5d088c..9585fa7c5a5d87 100644
--- a/Documentation/hwmon/sht3x.rst
+++ b/Documentation/hwmon/sht3x.rst
@@ -65,6 +65,10 @@ When the temperature and humidity readings move back between the hysteresis
 values, the alert bit is set to 0 and the alert pin on the sensor is set to
 low.
 
+The serial number exposed to debugfs allows for unique identification of the
+sensors. For sts32, sts33 and sht33, the manufacturer provides calibration
+certificates through an API.
+ sysfs-Interface --------------- @@ -99,3 +103,10 @@ repeatability: write or read repeatability, higher repeatability means - 1: medium repeatability - 2: high repeatability =================== ============================================================ + +debugfs-Interface +----------------- + +=================== ============================================================ +serial_number: unique serial number of the sensor in decimal +=================== ============================================================ diff --git a/drivers/hwmon/sht3x.c b/drivers/hwmon/sht3x.c index 79657910b79e64..c0d02fbcdb76c2 100644 --- a/drivers/hwmon/sht3x.c +++ b/drivers/hwmon/sht3x.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -41,6 +42,9 @@ static const unsigned char sht3x_cmd_heater_off[] = { 0x30, 0x66 }; /* other commands */ static const unsigned char sht3x_cmd_read_status_reg[] = { 0xf3, 0x2d }; static const unsigned char sht3x_cmd_clear_status_reg[] = { 0x30, 0x41 }; +static const unsigned char sht3x_cmd_read_serial_number[] = { 0x37, 0x80 }; + +static struct dentry *debugfs; /* delays for single-shot mode i2c commands, both in us */ #define SHT3X_SINGLE_WAIT_TIME_HPM 15000 @@ -163,12 +167,14 @@ struct sht3x_data { enum sht3x_chips chip_id; struct mutex i2c_lock; /* lock for sending i2c commands */ struct mutex data_lock; /* lock for updating driver data */ + struct dentry *sensor_dir; u8 mode; const unsigned char *command; u32 wait_time; /* in us*/ unsigned long last_update; /* last update in periodic mode*/ enum sht3x_repeatability repeatability; + u32 serial_number; /* * cached values for temperature and humidity and limits @@ -831,6 +837,40 @@ static int sht3x_write(struct device *dev, enum hwmon_sensor_types type, } } +static void sht3x_debugfs_init(struct sht3x_data *data) +{ + char name[32]; + + snprintf(name, sizeof(name), "i2c%u-%02x", + data->client->adapter->nr, data->client->addr); + data->sensor_dir = debugfs_create_dir(name, debugfs); + debugfs_create_u32("serial_number", 0444, + data->sensor_dir, &data->serial_number); +} + +static void sht3x_debugfs_remove(void *sensor_dir) +{ + debugfs_remove_recursive(sensor_dir); +} + +static int sht3x_serial_number_read(struct sht3x_data *data) +{ + int ret; + char buffer[SHT3X_RESPONSE_LENGTH]; + struct i2c_client *client = data->client; + + ret = sht3x_read_from_command(client, data, + sht3x_cmd_read_serial_number, + buffer, + SHT3X_RESPONSE_LENGTH, 0); + if (ret) + return ret; + + data->serial_number = (buffer[0] << 24) | (buffer[1] << 16) | + (buffer[3] << 8) | buffer[4]; + return ret; +} + static const struct hwmon_ops sht3x_ops = { .is_visible = sht3x_is_visible, .read = sht3x_read, @@ -899,6 +939,18 @@ static int sht3x_probe(struct i2c_client *client) if (ret) return ret; + ret = sht3x_serial_number_read(data); + if (ret) { + dev_dbg(dev, "unable to read serial number\n"); + } else { + sht3x_debugfs_init(data); + ret = devm_add_action_or_reset(dev, + sht3x_debugfs_remove, + data->sensor_dir); + if (ret) + return ret; + } + hwmon_dev = devm_hwmon_device_register_with_info(dev, client->name, data, @@ -917,7 +969,19 @@ static struct i2c_driver sht3x_i2c_driver = { .id_table = sht3x_ids, }; -module_i2c_driver(sht3x_i2c_driver); +static int __init sht3x_init(void) +{ + debugfs = debugfs_create_dir("sht3x", NULL); + return i2c_add_driver(&sht3x_i2c_driver); +} +module_init(sht3x_init); + +static void __exit sht3x_cleanup(void) +{ + debugfs_remove_recursive(debugfs); + i2c_del_driver(&sht3x_i2c_driver); +} 
+module_exit(sht3x_cleanup); MODULE_AUTHOR("David Frey "); MODULE_AUTHOR("Pascal Sachs "); From 21d23ec005b3f8c742dc1a30205d4580344c8686 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Fri, 2 Feb 2024 02:13:55 -0500 Subject: [PATCH 0311/1406] hwmon: (adt7x10) convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240202071355.40666-1-liubo03@inspur.com Signed-off-by: Guenter Roeck --- drivers/hwmon/adt7310.c | 2 +- drivers/hwmon/adt7410.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hwmon/adt7310.c b/drivers/hwmon/adt7310.c index 067865f4887a79..25281739aa3b16 100644 --- a/drivers/hwmon/adt7310.c +++ b/drivers/hwmon/adt7310.c @@ -124,7 +124,7 @@ static int adt7310_reg_write(void *context, unsigned int reg, unsigned int val) static const struct regmap_config adt7310_regmap_config = { .reg_bits = 8, .val_bits = 16, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .volatile_reg = adt7310_regmap_is_volatile, .reg_read = adt7310_reg_read, .reg_write = adt7310_reg_write, diff --git a/drivers/hwmon/adt7410.c b/drivers/hwmon/adt7410.c index fd214d9b3a895a..d15f64d4b6e755 100644 --- a/drivers/hwmon/adt7410.c +++ b/drivers/hwmon/adt7410.c @@ -69,7 +69,7 @@ static const struct regmap_config adt7410_regmap_config = { .reg_bits = 8, .val_bits = 16, .max_register = ADT7X10_ID, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .volatile_reg = adt7410_regmap_is_volatile, .reg_read = adt7410_reg_read, .reg_write = adt7410_reg_write, From e947f1cfda5c99d3053eae38dc0966f8aa7e3def Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Fri, 2 Feb 2024 02:14:52 -0500 Subject: [PATCH 0312/1406] hwmon: (emc1403) convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240202071452.40778-1-liubo03@inspur.com Signed-off-by: Guenter Roeck --- drivers/hwmon/emc1403.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/emc1403.c b/drivers/hwmon/emc1403.c index 1332e4ac078c1d..d370efd6f9864a 100644 --- a/drivers/hwmon/emc1403.c +++ b/drivers/hwmon/emc1403.c @@ -385,7 +385,7 @@ static bool emc1403_regmap_is_volatile(struct device *dev, unsigned int reg) static const struct regmap_config emc1403_regmap_config = { .reg_bits = 8, .val_bits = 8, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .volatile_reg = emc1403_regmap_is_volatile, }; From 64334bbfd2c47228f2e101faa9946251f3662c91 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Fri, 2 Feb 2024 02:15:38 -0500 Subject: [PATCH 0313/1406] hwmon: (ina3221) convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. 
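As a reference for readers following the series: each of these conversions is the same mechanical, one-line change to the driver's regmap configuration. A minimal sketch with a hypothetical foo driver name (not code from any of the patches above):

#include <linux/regmap.h>

static const struct regmap_config foo_regmap_config = {
	.reg_bits = 8,
	.val_bits = 8,
	/* Opt into the maple tree register cache; was REGCACHE_RBTREE. */
	.cache_type = REGCACHE_MAPLE,
};

No other driver code changes; the regmap core handles the cache internally.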
Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240202071538.40877-1-liubo03@inspur.com Signed-off-by: Guenter Roeck --- drivers/hwmon/ina3221.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/ina3221.c b/drivers/hwmon/ina3221.c index 5ffdc94db436d4..2c9530b6f19213 100644 --- a/drivers/hwmon/ina3221.c +++ b/drivers/hwmon/ina3221.c @@ -762,7 +762,7 @@ static const struct regmap_config ina3221_regmap_config = { .reg_bits = 8, .val_bits = 16, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .volatile_table = &ina3221_volatile_table, }; From 54a47150af1920d1215f672ece8936a31e716619 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Fri, 2 Feb 2024 02:16:28 -0500 Subject: [PATCH 0314/1406] hwmon: (jc42) convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240202071628.40990-1-liubo03@inspur.com Signed-off-by: Guenter Roeck --- drivers/hwmon/jc42.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/jc42.c b/drivers/hwmon/jc42.c index f958e830b23cfa..75dc25df0f8bbe 100644 --- a/drivers/hwmon/jc42.c +++ b/drivers/hwmon/jc42.c @@ -497,7 +497,7 @@ static const struct regmap_config jc42_regmap_config = { .writeable_reg = jc42_writable_reg, .readable_reg = jc42_readable_reg, .volatile_reg = jc42_volatile_reg, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, }; static int jc42_probe(struct i2c_client *client) From 052a7074c11e3c91b0a137a9ead2bcf7c2852957 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Fri, 2 Feb 2024 02:18:00 -0500 Subject: [PATCH 0315/1406] hwmon: (lm83) convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240202071800.41113-1-liubo03@inspur.com Signed-off-by: Guenter Roeck --- drivers/hwmon/lm83.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/lm83.c b/drivers/hwmon/lm83.c index 5befedca6abb4f..b333c9bde4e642 100644 --- a/drivers/hwmon/lm83.c +++ b/drivers/hwmon/lm83.c @@ -165,7 +165,7 @@ static bool lm83_regmap_is_volatile(struct device *dev, unsigned int reg) static const struct regmap_config lm83_regmap_config = { .reg_bits = 8, .val_bits = 8, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .volatile_reg = lm83_regmap_is_volatile, .reg_read = lm83_regmap_reg_read, .reg_write = lm83_regmap_reg_write, From c7e7ec2411d7707ac7c97d48eedb10685ba4b044 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Fri, 2 Feb 2024 02:19:27 -0500 Subject: [PATCH 0316/1406] hwmon: (max31760) convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. 
Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240202071927.41213-1-liubo03@inspur.com Signed-off-by: Guenter Roeck --- drivers/hwmon/max31760.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/max31760.c b/drivers/hwmon/max31760.c index 1b6f71bc61cb54..127e31ca3c8737 100644 --- a/drivers/hwmon/max31760.c +++ b/drivers/hwmon/max31760.c @@ -60,7 +60,7 @@ static const struct regmap_config regmap_config = { .reg_bits = 8, .val_bits = 8, .max_register = 0x5B, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .volatile_reg = max31760_volatile_reg, }; From 5afba04453a6eb750326757b7cab294f626fed6b Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Fri, 2 Feb 2024 02:20:07 -0500 Subject: [PATCH 0317/1406] hwmon: (nct7802) convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240202072007.41316-1-liubo03@inspur.com Signed-off-by: Guenter Roeck --- drivers/hwmon/nct7802.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/nct7802.c b/drivers/hwmon/nct7802.c index 024cff151c365b..a0e664d5ebfea4 100644 --- a/drivers/hwmon/nct7802.c +++ b/drivers/hwmon/nct7802.c @@ -1051,7 +1051,7 @@ static bool nct7802_regmap_is_volatile(struct device *dev, unsigned int reg) static const struct regmap_config nct7802_regmap_config = { .reg_bits = 8, .val_bits = 8, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .volatile_reg = nct7802_regmap_is_volatile, }; From 746a8a938512320ef4d3d53953d80d9085efe2d8 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Fri, 2 Feb 2024 02:20:39 -0500 Subject: [PATCH 0318/1406] hwmon: (sch5627) convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240202072039.41419-1-liubo03@inspur.com Signed-off-by: Guenter Roeck --- drivers/hwmon/sch5627.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/sch5627.c b/drivers/hwmon/sch5627.c index 1891d4d75aa94d..33e997b5c1f57d 100644 --- a/drivers/hwmon/sch5627.c +++ b/drivers/hwmon/sch5627.c @@ -116,7 +116,7 @@ static const struct regmap_config sch5627_regmap_config = { .val_bits = 8, .wr_table = &sch5627_tunables_table, .rd_table = &sch5627_tunables_table, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .use_single_read = true, .use_single_write = true, .can_sleep = true, From b00d69193ab012edc02d8a2e4fb7b001f9a9ebe0 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Fri, 2 Feb 2024 02:22:35 -0500 Subject: [PATCH 0319/1406] hwmon: (tmp401) convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. 
Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20240202072235.41614-1-liubo03@inspur.com Signed-off-by: Guenter Roeck --- drivers/hwmon/tmp401.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/tmp401.c b/drivers/hwmon/tmp401.c index 91f2314568cfaf..df1b45a62e804e 100644 --- a/drivers/hwmon/tmp401.c +++ b/drivers/hwmon/tmp401.c @@ -256,7 +256,7 @@ static int tmp401_reg_write(void *context, unsigned int reg, unsigned int val) static const struct regmap_config tmp401_regmap_config = { .reg_bits = 8, .val_bits = 16, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .volatile_reg = tmp401_regmap_is_volatile, .reg_read = tmp401_reg_read, .reg_write = tmp401_reg_write, From 097e8677537fd49ab6244d167b6a3264b3f17e60 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Tue, 30 Jan 2024 22:06:44 +0100 Subject: [PATCH 0320/1406] dt-bindings: vendor-prefixes: add Amphenol Add vendor prefix for Amphenol (https://www.amphenol-sensors.com) Acked-by: Krzysztof Kozlowski Signed-off-by: Javier Carrasco Link: https://lore.kernel.org/r/20240130-topic-chipcap2-v6-1-260bea05cf9b@gmail.com Signed-off-by: Guenter Roeck --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index 1a0dc04f1db478..25158559471ca6 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -107,6 +107,8 @@ patternProperties: description: Amlogic, Inc. "^ampere,.*": description: Ampere Computing LLC + "^amphenol,.*": + description: Amphenol Advanced Sensors "^ampire,.*": description: Ampire Co., Ltd. "^ams,.*": From f4ef2317c4333c29aedc7b86428c4f711c760d39 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Tue, 30 Jan 2024 22:06:45 +0100 Subject: [PATCH 0321/1406] hwmon: (core) Add support for humidity min/max alarm Add min_alarm and max_alarm attributes for humidityX to support devices that can generate these alarms. Such attributes already exist for other magnitudes such as tempX. Tested with a ChipCap 2 temperature-humidity sensor. 
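To illustrate how a client driver consumes the new attributes, here is a hedged sketch with hypothetical foo names (not code from this patch; the ChipCap 2 driver added later in this series follows the same pattern):

#include <linux/hwmon.h>
#include <linux/interrupt.h>

struct foo_data {
	struct device *hwmon;	/* hwmon device registered in probe (not shown) */
};

/* Advertise the new alarm attributes on humidity channel 0. */
static const struct hwmon_channel_info *const foo_info[] = {
	HWMON_CHANNEL_INFO(humidity,
			   HWMON_H_INPUT | HWMON_H_MIN_ALARM | HWMON_H_MAX_ALARM),
	NULL
};

/* Raise the event from an alarm interrupt so userspace pollers wake up. */
static irqreturn_t foo_min_alarm_irq(int irq, void *dev_id)
{
	struct foo_data *foo = dev_id;

	hwmon_notify_event(foo->hwmon, hwmon_humidity,
			   hwmon_humidity_min_alarm, 0);
	return IRQ_HANDLED;
}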
Signed-off-by: Javier Carrasco
Link: https://lore.kernel.org/r/20240130-topic-chipcap2-v6-2-260bea05cf9b@gmail.com
Signed-off-by: Guenter Roeck
---
 drivers/hwmon/hwmon.c | 2 ++
 include/linux/hwmon.h | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c
index 18705049ad610e..3b259c425ab701 100644
--- a/drivers/hwmon/hwmon.c
+++ b/drivers/hwmon/hwmon.c
@@ -587,6 +587,8 @@ static const char * const hwmon_humidity_attr_templates[] = {
 	[hwmon_humidity_fault] = "humidity%d_fault",
 	[hwmon_humidity_rated_min] = "humidity%d_rated_min",
 	[hwmon_humidity_rated_max] = "humidity%d_rated_max",
+	[hwmon_humidity_min_alarm] = "humidity%d_min_alarm",
+	[hwmon_humidity_max_alarm] = "humidity%d_max_alarm",
 };
 
 static const char * const hwmon_fan_attr_templates[] = {
diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h
index c7885fdce88f09..edf96f249eb50a 100644
--- a/include/linux/hwmon.h
+++ b/include/linux/hwmon.h
@@ -295,6 +295,8 @@ enum hwmon_humidity_attributes {
 	hwmon_humidity_fault,
 	hwmon_humidity_rated_min,
 	hwmon_humidity_rated_max,
+	hwmon_humidity_min_alarm,
+	hwmon_humidity_max_alarm,
 };
 
 #define HWMON_H_ENABLE		BIT(hwmon_humidity_enable)
@@ -308,6 +310,8 @@ enum hwmon_humidity_attributes {
 #define HWMON_H_FAULT		BIT(hwmon_humidity_fault)
 #define HWMON_H_RATED_MIN	BIT(hwmon_humidity_rated_min)
 #define HWMON_H_RATED_MAX	BIT(hwmon_humidity_rated_max)
+#define HWMON_H_MIN_ALARM	BIT(hwmon_humidity_min_alarm)
+#define HWMON_H_MAX_ALARM	BIT(hwmon_humidity_max_alarm)
 
 enum hwmon_fan_attributes {
 	hwmon_fan_enable,
From a64431ea5cf4511d972d909db3f98a4e9dd5d510 Mon Sep 17 00:00:00 2001
From: Javier Carrasco
Date: Tue, 30 Jan 2024 22:06:46 +0100
Subject: [PATCH 0322/1406] ABI: sysfs-class-hwmon: add descriptions for
 humidity min/max alarms

These attributes have been recently introduced and require the
corresponding ABI documentation.

Signed-off-by: Javier Carrasco
Link: https://lore.kernel.org/r/20240130-topic-chipcap2-v6-3-260bea05cf9b@gmail.com
Signed-off-by: Guenter Roeck
---
 Documentation/ABI/testing/sysfs-class-hwmon | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-class-hwmon b/Documentation/ABI/testing/sysfs-class-hwmon
index 6c4e68ad4a8331..cfd0d0bab48398 100644
--- a/Documentation/ABI/testing/sysfs-class-hwmon
+++ b/Documentation/ABI/testing/sysfs-class-hwmon
@@ -977,6 +977,15 @@ Description:
 
 		RW
 
+What:		/sys/class/hwmon/hwmonX/humidityY_max_alarm
+Description:
+		Maximum humidity detection
+
+		- 0: OK
+		- 1: Maximum humidity detected
+
+		RO
+
 What:		/sys/class/hwmon/hwmonX/humidityY_max_hyst
 Description:
 		Humidity hysteresis value for max limit.
@@ -996,6 +1005,15 @@ Description:
 
 		RW
 
+What:		/sys/class/hwmon/hwmonX/humidityY_min_alarm
+Description:
+		Minimum humidity detection
+
+		- 0: OK
+		- 1: Minimum humidity detected
+
+		RO
+
 What:		/sys/class/hwmon/hwmonX/humidityY_min_hyst
 Description:
 		Humidity hysteresis value for min limit.
From d2cec120bfd983d0e207126eed8abbf4efccadd0 Mon Sep 17 00:00:00 2001
From: Javier Carrasco
Date: Tue, 30 Jan 2024 22:06:47 +0100
Subject: [PATCH 0323/1406] dt-bindings: hwmon: Add Amphenol ChipCap 2

Add device tree bindings and an example for the ChipCap 2 humidity and
temperature sensor.
Reviewed-by: Conor Dooley
Signed-off-by: Javier Carrasco
Link: https://lore.kernel.org/r/20240130-topic-chipcap2-v6-4-260bea05cf9b@gmail.com
Signed-off-by: Guenter Roeck
---
 .../bindings/hwmon/amphenol,chipcap2.yaml | 77 +++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/hwmon/amphenol,chipcap2.yaml

diff --git a/Documentation/devicetree/bindings/hwmon/amphenol,chipcap2.yaml b/Documentation/devicetree/bindings/hwmon/amphenol,chipcap2.yaml
new file mode 100644
index 00000000000000..17351fdbefce6c
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/amphenol,chipcap2.yaml
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/amphenol,chipcap2.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ChipCap 2 humidity and temperature iio sensor
+
+maintainers:
+  - Javier Carrasco
+
+description: |
+  Relative humidity and temperature sensor on I2C bus.
+
+  Datasheets:
+    https://www.amphenol-sensors.com/en/telaire/humidity/527-humidity-sensors/3095-chipcap-2
+
+properties:
+  compatible:
+    oneOf:
+      - const: amphenol,cc2d23
+      - items:
+          - enum:
+              - amphenol,cc2d23s
+              - amphenol,cc2d25
+              - amphenol,cc2d25s
+              - amphenol,cc2d33
+              - amphenol,cc2d33s
+              - amphenol,cc2d35
+              - amphenol,cc2d35s
+          - const: amphenol,cc2d23
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    items:
+      - description: measurement ready indicator
+      - description: low humidity alarm
+      - description: high humidity alarm
+
+  interrupt-names:
+    items:
+      - const: ready
+      - const: low
+      - const: high
+
+  vdd-supply:
+    description:
+      Dedicated, controllable supply-regulator to reset the device and
+      enter in command mode.
+
+required:
+  - compatible
+  - reg
+  - vdd-supply
+
+additionalProperties: false
+
+examples:
+  - |
+    #include
+    i2c {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        humidity@28 {
+            compatible = "amphenol,cc2d23s", "amphenol,cc2d23";
+            reg = <0x28>;
+            interrupt-parent = <&gpio>;
+            interrupts = <4 IRQ_TYPE_EDGE_RISING>,
+                         <5 IRQ_TYPE_EDGE_RISING>,
+                         <6 IRQ_TYPE_EDGE_RISING>;
+            interrupt-names = "ready", "low", "high";
+            vdd-supply = <&reg_vdd>;
+        };
+    };
From 0149debb023c6dc5028d7281320fb965fbe1c400 Mon Sep 17 00:00:00 2001
From: Javier Carrasco
Date: Tue, 30 Jan 2024 22:06:48 +0100
Subject: [PATCH 0324/1406] hwmon: Add support for Amphenol ChipCap 2

The Amphenol ChipCap 2 is a capacitive polymer humidity and temperature
sensor with an integrated EEPROM and minimum/maximum humidity alarms.

All device variants offer an I2C interface and depending on the part
number, two different output modes:
- CC2D: digital output
- CC2A: analog (PDM) output

This driver adds support for the digital variant (CC2D part numbers),
which includes the following part numbers:
- non-sleep measurement mode (CC2D23, CC2D25, CC2D33, CC2D35)
- sleep measurement mode (CC2D23S, CC2D25S, CC2D33S, CC2D35S)

The Chipcap 2 EEPROM can be accessed to configure a series of parameters
like the minimum/maximum humidity alarm threshold and hysteresis. The
EEPROM is only accessible in the command window after a power-on reset.
The default window lasts 10 ms if no Start_CM command is sent. After the
command window is finished (either after the mentioned timeout or after
a Start_NOM command is sent), the device enters the normal operation
mode and makes a first measurement automatically.
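In outline, an access sequence built on this command window looks as follows. This is a condensed, hypothetical sketch using the constants from the driver below; the real logic, with IRQ-based waits, status checks and retries, lives in cc2_command_mode_start() and cc2_command_mode_finish():

#include <linux/i2c.h>
#include <linux/regulator/consumer.h>

/* Condensed sketch of one EEPROM access; waits and retries trimmed. */
static int cc2_eeprom_access_sketch(struct cc2_data *data, u8 reg)
{
	int ret;

	/* Power cycle first: the command window only exists after a POR. */
	ret = regulator_enable(data->regulator);
	if (ret)
		return ret;

	/* Start_CM must land inside the ~10 ms window after power-up. */
	ret = i2c_smbus_write_word_data(data->client, CC2_START_CM, 0);
	if (!ret)
		ret = i2c_smbus_write_word_data(data->client, reg, 0);

	/* ... wait for the reply and read it back with i2c_master_recv() ... */

	/* Start_NOM ends command mode; the device then measures on its own. */
	if (!ret)
		ret = i2c_smbus_write_word_data(data->client, CC2_START_NOM, 0);

	regulator_disable(data->regulator);
	return ret;
}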
Unfortunately, the device does not provide any hardware or software reset
and therefore the driver must trigger power cycles to enter the command
mode. A dedicated, external regulator is required for that.

This driver keeps the device off until a measurement or access to the
EEPROM is required, making use of the first automatic measurement to avoid
different code paths for sleep and non-sleep devices.

The minimum and maximum humidity alarms are configured with two registers
per alarm: one stores the alarm threshold and the other one keeps the
value that turns off the alarm. The alarm signals are only updated when a
measurement is carried out.

Signed-off-by: Javier Carrasco
Link: https://lore.kernel.org/r/20240130-topic-chipcap2-v6-5-260bea05cf9b@gmail.com
Signed-off-by: Guenter Roeck
---
 Documentation/hwmon/chipcap2.rst |  73 +++
 Documentation/hwmon/index.rst    |   1 +
 MAINTAINERS                      |   8 +
 drivers/hwmon/Kconfig            |  10 +
 drivers/hwmon/Makefile           |   1 +
 drivers/hwmon/chipcap2.c         | 816 +++++++++++++++++++++++++++++++
 6 files changed, 909 insertions(+)
 create mode 100644 Documentation/hwmon/chipcap2.rst
 create mode 100644 drivers/hwmon/chipcap2.c

diff --git a/Documentation/hwmon/chipcap2.rst b/Documentation/hwmon/chipcap2.rst
new file mode 100644
index 00000000000000..dc165becc64cff
--- /dev/null
+++ b/Documentation/hwmon/chipcap2.rst
@@ -0,0 +1,73 @@
+.. SPDX-License-Identifier: GPL-2.0-or-later
+
+Kernel driver ChipCap2
+======================
+
+Supported chips:
+
+  * Amphenol CC2D23, CC2D23S, CC2D25, CC2D25S, CC2D33, CC2D33S, CC2D35, CC2D35S
+
+    Prefix: 'chipcap2'
+
+    Addresses scanned: -
+
+    Datasheet: https://www.amphenol-sensors.com/en/telaire/humidity/527-humidity-sensors/3095-chipcap-2
+
+Author:
+
+  - Javier Carrasco
+
+Description
+-----------
+
+This driver implements support for the Amphenol ChipCap 2, a humidity and
+temperature chip family. Temperature is measured in milli degrees celsius,
+relative humidity is expressed as a per cent mille. The measurement ranges
+are the following:
+
+  - Relative humidity: 0 to 100000 pcm (14-bit resolution)
+  - Temperature: -40000 to +125000 m°C (14-bit resolution)
+
+The device communicates with the I2C protocol and uses the I2C address 0x28
+by default.
+
+Depending on the hardware configuration, up to two humidity alarms to control
+minimum and maximum values are provided. Their thresholds and hysteresis can be
+configured via sysfs.
+
+Thresholds and hysteresis must be provided as a per cent mille. These values
+might be truncated to match the 14-bit device resolution (6.1 pcm/LSB).
+
+Known Issues
+------------
+
+The driver does not support I2C address and command window length modification.
+
+sysfs-Interface
+---------------
+
+The following list includes the sysfs attributes that the driver always provides,
+their permissions and a short description:
+
+=============================== ======= ========================================
+Name                            Perm    Description
+=============================== ======= ========================================
+temp1_input:                    RO      temperature input
+humidity1_input:                RO      humidity input
+=============================== ======= ========================================
+
+The following list includes the sysfs attributes that the driver may provide
+depending on the hardware configuration:
+
+=============================== ======= ========================================
+Name                            Perm    Description
+=============================== ======= ========================================
+humidity1_min:                  RW      humidity low limit.
Measurements under
+                                        this limit trigger a humidity low alarm
+humidity1_max:                  RW      humidity high limit. Measurements above
+                                        this limit trigger a humidity high alarm
+humidity1_min_hyst:             RW      humidity low hysteresis
+humidity1_max_hyst:             RW      humidity high hysteresis
+humidity1_min_alarm:            RO      humidity low alarm indicator
+humidity1_max_alarm:            RO      humidity high alarm indicator
+=============================== ======= ========================================
diff --git a/Documentation/hwmon/index.rst b/Documentation/hwmon/index.rst
index 9ac087dd3e78d1..0d12254c0f9ee8 100644
--- a/Documentation/hwmon/index.rst
+++ b/Documentation/hwmon/index.rst
@@ -51,6 +51,7 @@ Hardware Monitoring Kernel Drivers
    bel-pfe
    bpa-rs600
    bt1-pvt
+   chipcap2
    coretemp
    corsair-cpro
    corsair-psu
diff --git a/MAINTAINERS b/MAINTAINERS
index dd69696f3c0e83..b00657d2536f5d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1098,6 +1098,14 @@ F: Documentation/devicetree/bindings/perf/amlogic,g12-ddr-pmu.yaml
 F: drivers/perf/amlogic/
 F: include/soc/amlogic/
 
+AMPHENOL CHIPCAP 2 HUMIDITY-TEMPERATURE IIO DRIVER
+M: Javier Carrasco
+L: linux-hwmon@vger.kernel.org
+S: Maintained
+F: Documentation/devicetree/bindings/hwmon/amphenol,chipcap2.yaml
+F: Documentation/hwmon/chipcap2.rst
+F: drivers/hwmon/chipcap2.c
+
 AMPHION VPU CODEC V4L2 DRIVER
 M: Ming Qian
 M: Zhou Peng
diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index cdf228fe50e2be..5c85c976d795cb 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -452,6 +452,16 @@ config SENSORS_BT1_PVT_ALARMS
 	  the data conversion will be periodically performed and the data will be
 	  saved in the internal driver cache.
 
+config SENSORS_CHIPCAP2
+	tristate "Amphenol ChipCap 2 relative humidity and temperature sensor"
+	depends on I2C
+	help
+	  Say yes here to build support for the Amphenol ChipCap 2
+	  relative humidity and temperature sensor.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called chipcap2.
+ config SENSORS_CORSAIR_CPRO tristate "Corsair Commander Pro controller" depends on HID diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index a49704cd48a4b5..5da0c4ce881b3c 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -58,6 +58,7 @@ obj-$(CONFIG_SENSORS_ASPEED) += aspeed-pwm-tacho.o obj-$(CONFIG_SENSORS_ATXP1) += atxp1.o obj-$(CONFIG_SENSORS_AXI_FAN_CONTROL) += axi-fan-control.o obj-$(CONFIG_SENSORS_BT1_PVT) += bt1-pvt.o +obj-$(CONFIG_SENSORS_CHIPCAP2) += chipcap2.o obj-$(CONFIG_SENSORS_CORETEMP) += coretemp.o obj-$(CONFIG_SENSORS_CORSAIR_CPRO) += corsair-cpro.o obj-$(CONFIG_SENSORS_CORSAIR_PSU) += corsair-psu.o diff --git a/drivers/hwmon/chipcap2.c b/drivers/hwmon/chipcap2.c new file mode 100644 index 00000000000000..a62c507b10429c --- /dev/null +++ b/drivers/hwmon/chipcap2.c @@ -0,0 +1,816 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * cc2.c - Support for the Amphenol ChipCap 2 relative humidity, temperature sensor + * + * Part numbers supported: + * CC2D23, CC2D23S, CC2D25, CC2D25S, CC2D33, CC2D33S, CC2D35, CC2D35S + * + * Author: Javier Carrasco + * + * Datasheet and application notes: + * https://www.amphenol-sensors.com/en/telaire/humidity/527-humidity-sensors/3095-chipcap-2 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CC2_START_CM 0xA0 +#define CC2_START_NOM 0x80 +#define CC2_R_ALARM_H_ON 0x18 +#define CC2_R_ALARM_H_OFF 0x19 +#define CC2_R_ALARM_L_ON 0x1A +#define CC2_R_ALARM_L_OFF 0x1B +#define CC2_RW_OFFSET 0x40 +#define CC2_W_ALARM_H_ON (CC2_R_ALARM_H_ON + CC2_RW_OFFSET) +#define CC2_W_ALARM_H_OFF (CC2_R_ALARM_H_OFF + CC2_RW_OFFSET) +#define CC2_W_ALARM_L_ON (CC2_R_ALARM_L_ON + CC2_RW_OFFSET) +#define CC2_W_ALARM_L_OFF (CC2_R_ALARM_L_OFF + CC2_RW_OFFSET) + +#define CC2_STATUS_FIELD GENMASK(7, 6) +#define CC2_STATUS_VALID_DATA 0x00 +#define CC2_STATUS_STALE_DATA 0x01 +#define CC2_STATUS_CMD_MODE 0x02 + +#define CC2_RESPONSE_FIELD GENMASK(1, 0) +#define CC2_RESPONSE_BUSY 0x00 +#define CC2_RESPONSE_ACK 0x01 +#define CC2_RESPONSE_NACK 0x02 + +#define CC2_ERR_CORR_EEPROM BIT(2) +#define CC2_ERR_UNCORR_EEPROM BIT(3) +#define CC2_ERR_RAM_PARITY BIT(4) +#define CC2_ERR_CONFIG_LOAD BIT(5) + +#define CC2_EEPROM_SIZE 10 +#define CC2_EEPROM_DATA_LEN 3 +#define CC2_MEASUREMENT_DATA_LEN 4 + +#define CC2_RH_DATA_FIELD GENMASK(13, 0) + +/* ensure clean off -> on transitions */ +#define CC2_POWER_CYCLE_MS 80 + +#define CC2_STARTUP_TO_DATA_MS 55 +#define CC2_RESP_START_CM_US 100 +#define CC2_RESP_EEPROM_R_US 100 +#define CC2_RESP_EEPROM_W_MS 12 +#define CC2_STARTUP_TIME_US 1250 + +#define CC2_RH_MAX (100 * 1000U) + +#define CC2_CM_RETRIES 5 + +struct cc2_rh_alarm_info { + bool low_alarm; + bool high_alarm; + bool low_alarm_visible; + bool high_alarm_visible; +}; + +struct cc2_data { + struct cc2_rh_alarm_info rh_alarm; + struct completion complete; + struct device *hwmon; + struct i2c_client *client; + struct mutex dev_access_lock; /* device access lock */ + struct regulator *regulator; + const char *name; + int irq_ready; + int irq_low; + int irq_high; + bool process_irqs; +}; + +enum cc2_chan_addr { + CC2_CHAN_TEMP = 0, + CC2_CHAN_HUMIDITY, +}; + +/* %RH as a per cent mille from a register value */ +static long cc2_rh_convert(u16 data) +{ + unsigned long tmp = (data & CC2_RH_DATA_FIELD) * CC2_RH_MAX; + + return tmp / ((1 << 14) - 1); +} + +/* convert %RH to a register value */ +static u16 cc2_rh_to_reg(long data) +{ + return data * ((1 << 14) - 1) / CC2_RH_MAX; +} + +/* temperature in 
milli degrees celsius from a register value */
+static long cc2_temp_convert(u16 data)
+{
+	unsigned long tmp = ((data >> 2) * 165 * 1000U) / ((1 << 14) - 1);
+
+	return tmp - 40 * 1000U;
+}
+
+static int cc2_enable(struct cc2_data *data)
+{
+	int ret;
+
+	/* exclusive regulator, check in case a disable failed */
+	if (regulator_is_enabled(data->regulator))
+		return 0;
+
+	/* clear any pending completion */
+	try_wait_for_completion(&data->complete);
+
+	ret = regulator_enable(data->regulator);
+	if (ret < 0)
+		return ret;
+
+	usleep_range(CC2_STARTUP_TIME_US, CC2_STARTUP_TIME_US + 125);
+
+	data->process_irqs = true;
+
+	return 0;
+}
+
+static void cc2_disable(struct cc2_data *data)
+{
+	int err;
+
+	/* ignore alarms triggered by voltage toggling when powering up */
+	data->process_irqs = false;
+
+	/* exclusive regulator, check in case an enable failed */
+	if (regulator_is_enabled(data->regulator)) {
+		err = regulator_disable(data->regulator);
+		if (err)
+			dev_dbg(&data->client->dev, "Failed to disable device");
+	}
+}
+
+static int cc2_cmd_response_diagnostic(struct device *dev, u8 status)
+{
+	int resp;
+
+	if (FIELD_GET(CC2_STATUS_FIELD, status) != CC2_STATUS_CMD_MODE) {
+		dev_dbg(dev, "Command sent out of command window\n");
+		return -ETIMEDOUT;
+	}
+
+	resp = FIELD_GET(CC2_RESPONSE_FIELD, status);
+	switch (resp) {
+	case CC2_RESPONSE_ACK:
+		return 0;
+	case CC2_RESPONSE_BUSY:
+		return -EBUSY;
+	case CC2_RESPONSE_NACK:
+		if (status & CC2_ERR_CORR_EEPROM)
+			dev_dbg(dev, "Command failed: corrected EEPROM\n");
+		if (status & CC2_ERR_UNCORR_EEPROM)
+			dev_dbg(dev, "Command failed: uncorrected EEPROM\n");
+		if (status & CC2_ERR_RAM_PARITY)
+			dev_dbg(dev, "Command failed: RAM parity\n");
+		if (status & CC2_ERR_CONFIG_LOAD)
+			dev_dbg(dev, "Command failed: configuration error\n");
+		return -ENODATA;
+	default:
+		dev_dbg(dev, "Unknown command reply\n");
+		return -EINVAL;
+	}
+}
+
+static int cc2_read_command_status(struct i2c_client *client)
+{
+	u8 status;
+	int ret;
+
+	ret = i2c_master_recv(client, &status, 1);
+	if (ret != 1) {
+		ret = ret < 0 ? ret : -EIO;
+		return ret;
+	}
+
+	return cc2_cmd_response_diagnostic(&client->dev, status);
+}
+
+/*
+ * The command mode is only accessible after sending the START_CM command in the
+ * first 10 ms after power-up. Only in case the command window is missed,
+ * CC2_CM_RETRIES retries are attempted before giving up and returning an error.
+ */
+static int cc2_command_mode_start(struct cc2_data *data)
+{
+	unsigned long timeout;
+	int i, ret;
+
+	for (i = 0; i < CC2_CM_RETRIES; i++) {
+		ret = cc2_enable(data);
+		if (ret < 0)
+			return ret;
+
+		ret = i2c_smbus_write_word_data(data->client, CC2_START_CM, 0);
+		if (ret < 0)
+			return ret;
+
+		if (data->irq_ready > 0) {
+			timeout = usecs_to_jiffies(2 * CC2_RESP_START_CM_US);
+			ret = wait_for_completion_timeout(&data->complete,
+							  timeout);
+			if (!ret)
+				return -ETIMEDOUT;
+		} else {
+			usleep_range(CC2_RESP_START_CM_US,
+				     2 * CC2_RESP_START_CM_US);
+		}
+		ret = cc2_read_command_status(data->client);
+		if (ret != -ETIMEDOUT || i == CC2_CM_RETRIES)
+			break;
+
+		/* command window missed, prepare for a retry */
+		cc2_disable(data);
+		msleep(CC2_POWER_CYCLE_MS);
+	}
+
+	return ret;
+}
+
+/* Sending a Start_NOM command finishes the command mode immediately with no
+ * reply and the device enters normal operation mode
+ */
+static int cc2_command_mode_finish(struct cc2_data *data)
+{
+	int ret;
+
+	ret = i2c_smbus_write_word_data(data->client, CC2_START_NOM, 0);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+static int cc2_write_reg(struct cc2_data *data, u8 reg, u16 val)
+{
+	unsigned long timeout;
+	int ret;
+
+	ret = cc2_command_mode_start(data);
+	if (ret < 0)
+		goto disable;
+
+	cpu_to_be16s(&val);
+	ret = i2c_smbus_write_word_data(data->client, reg, val);
+	if (ret < 0)
+		goto disable;
+
+	if (data->irq_ready > 0) {
+		timeout = msecs_to_jiffies(2 * CC2_RESP_EEPROM_W_MS);
+		ret = wait_for_completion_timeout(&data->complete, timeout);
+		if (!ret) {
+			ret = -ETIMEDOUT;
+			goto disable;
+		}
+	} else {
+		msleep(CC2_RESP_EEPROM_W_MS);
+	}
+
+	ret = cc2_read_command_status(data->client);
+
+disable:
+	cc2_disable(data);
+
+	return ret;
+}
+
+static int cc2_read_reg(struct cc2_data *data, u8 reg, u16 *val)
+{
+	u8 buf[CC2_EEPROM_DATA_LEN];
+	unsigned long timeout;
+	int ret;
+
+	ret = cc2_command_mode_start(data);
+	if (ret < 0)
+		return ret;
+
+	ret = i2c_smbus_write_word_data(data->client, reg, 0);
+	if (ret < 0)
+		return ret;
+
+	if (data->irq_ready > 0) {
+		timeout = usecs_to_jiffies(2 * CC2_RESP_EEPROM_R_US);
+		ret = wait_for_completion_timeout(&data->complete, timeout);
+		if (!ret)
+			return -ETIMEDOUT;
+
+	} else {
+		usleep_range(CC2_RESP_EEPROM_R_US, CC2_RESP_EEPROM_R_US + 10);
+	}
+	ret = i2c_master_recv(data->client, buf, CC2_EEPROM_DATA_LEN);
+	if (ret != CC2_EEPROM_DATA_LEN)
+		return ret < 0 ? ret : -EIO;
+
+	*val = be16_to_cpup((__be16 *)&buf[1]);
+
+	return cc2_read_command_status(data->client);
+}
+
+static int cc2_get_reg_val(struct cc2_data *data, u8 reg, long *val)
+{
+	u16 reg_val;
+	int ret;
+
+	ret = cc2_read_reg(data, reg, &reg_val);
+	*val = cc2_rh_convert(reg_val);
+	cc2_disable(data);
+
+	return ret;
+}
+
+static int cc2_data_fetch(struct i2c_client *client,
+			  enum hwmon_sensor_types type, long *val)
+{
+	u8 data[CC2_MEASUREMENT_DATA_LEN];
+	u8 status;
+	int ret;
+
+	ret = i2c_master_recv(client, data, CC2_MEASUREMENT_DATA_LEN);
+	if (ret != CC2_MEASUREMENT_DATA_LEN) {
+		ret = ret < 0 ?
ret : -EIO;
+		return ret;
+	}
+	status = FIELD_GET(CC2_STATUS_FIELD, data[0]);
+	if (status == CC2_STATUS_STALE_DATA)
+		return -EBUSY;
+
+	if (status != CC2_STATUS_VALID_DATA)
+		return -EIO;
+
+	switch (type) {
+	case hwmon_humidity:
+		*val = cc2_rh_convert(be16_to_cpup((__be16 *)&data[0]));
+		break;
+	case hwmon_temp:
+		*val = cc2_temp_convert(be16_to_cpup((__be16 *)&data[2]));
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int cc2_read_measurement(struct cc2_data *data,
+				enum hwmon_sensor_types type, long *val)
+{
+	unsigned long timeout;
+	int ret;
+
+	if (data->irq_ready > 0) {
+		timeout = msecs_to_jiffies(CC2_STARTUP_TO_DATA_MS * 2);
+		ret = wait_for_completion_timeout(&data->complete, timeout);
+		if (!ret)
+			return -ETIMEDOUT;
+
+	} else {
+		msleep(CC2_STARTUP_TO_DATA_MS);
+	}
+
+	ret = cc2_data_fetch(data->client, type, val);
+
+	return ret;
+}
+
+/*
+ * A measurement requires enabling the device, waiting for the automatic
+ * measurement to finish, reading the measurement data and disabling the device
+ * again.
+ */
+static int cc2_measurement(struct cc2_data *data, enum hwmon_sensor_types type,
+			   long *val)
+{
+	int ret;
+
+	ret = cc2_enable(data);
+	if (ret)
+		return ret;
+
+	ret = cc2_read_measurement(data, type, val);
+
+	cc2_disable(data);
+
+	return ret;
+}
+
+/*
+ * In order to check alarm status, the corresponding ALARM_OFF (hysteresis)
+ * register must be read and a new measurement must be carried out to trigger
+ * the alarm signals. Given that the device carries out a measurement after
+ * exiting the command mode, there is no need to force two power-up sequences.
+ * Instead, a NOM command is sent and the device is disabled after the
+ * measurement is read.
+ */
+static int cc2_read_hyst_and_measure(struct cc2_data *data, u8 reg,
+				     long *hyst, long *measurement)
+{
+	u16 reg_val;
+	int ret;
+
+	ret = cc2_read_reg(data, reg, &reg_val);
+	if (ret)
+		goto disable;
+
+	*hyst = cc2_rh_convert(reg_val);
+
+	ret = cc2_command_mode_finish(data);
+	if (ret)
+		goto disable;
+
+	ret = cc2_read_measurement(data, hwmon_humidity, measurement);
+
+disable:
+	cc2_disable(data);
+
+	return ret;
+}
+
+static umode_t cc2_is_visible(const void *data, enum hwmon_sensor_types type,
+			      u32 attr, int channel)
+{
+	const struct cc2_data *cc2 = data;
+
+	switch (type) {
+	case hwmon_humidity:
+		switch (attr) {
+		case hwmon_humidity_input:
+			return 0444;
+		case hwmon_humidity_min_alarm:
+			return cc2->rh_alarm.low_alarm_visible ? 0444 : 0;
+		case hwmon_humidity_max_alarm:
+			return cc2->rh_alarm.high_alarm_visible ? 0444 : 0;
+		case hwmon_humidity_min:
+		case hwmon_humidity_min_hyst:
+			return cc2->rh_alarm.low_alarm_visible ? 0644 : 0;
+		case hwmon_humidity_max:
+		case hwmon_humidity_max_hyst:
+			return cc2->rh_alarm.high_alarm_visible ?
0644 : 0; + default: + return 0; + } + case hwmon_temp: + switch (attr) { + case hwmon_temp_input: + return 0444; + default: + return 0; + } + default: + break; + } + + return 0; +} + +static irqreturn_t cc2_ready_interrupt(int irq, void *data) +{ + struct cc2_data *cc2 = data; + + if (cc2->process_irqs) + complete(&cc2->complete); + + return IRQ_HANDLED; +} + +static irqreturn_t cc2_low_interrupt(int irq, void *data) +{ + struct cc2_data *cc2 = data; + + if (cc2->process_irqs) { + hwmon_notify_event(cc2->hwmon, hwmon_humidity, + hwmon_humidity_min_alarm, CC2_CHAN_HUMIDITY); + cc2->rh_alarm.low_alarm = true; + } + + return IRQ_HANDLED; +} + +static irqreturn_t cc2_high_interrupt(int irq, void *data) +{ + struct cc2_data *cc2 = data; + + if (cc2->process_irqs) { + hwmon_notify_event(cc2->hwmon, hwmon_humidity, + hwmon_humidity_max_alarm, CC2_CHAN_HUMIDITY); + cc2->rh_alarm.high_alarm = true; + } + + return IRQ_HANDLED; +} + +static int cc2_humidity_min_alarm_status(struct cc2_data *data, long *val) +{ + long measurement, min_hyst; + int ret; + + ret = cc2_read_hyst_and_measure(data, CC2_R_ALARM_L_OFF, &min_hyst, + &measurement); + if (ret < 0) + return ret; + + if (data->rh_alarm.low_alarm) { + *val = (measurement < min_hyst) ? 1 : 0; + data->rh_alarm.low_alarm = *val; + } else { + *val = 0; + } + + return 0; +} + +static int cc2_humidity_max_alarm_status(struct cc2_data *data, long *val) +{ + long measurement, max_hyst; + int ret; + + ret = cc2_read_hyst_and_measure(data, CC2_R_ALARM_H_OFF, &max_hyst, + &measurement); + if (ret < 0) + return ret; + + if (data->rh_alarm.high_alarm) { + *val = (measurement > max_hyst) ? 1 : 0; + data->rh_alarm.high_alarm = *val; + } else { + *val = 0; + } + + return 0; +} + +static int cc2_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, + int channel, long *val) +{ + struct cc2_data *data = dev_get_drvdata(dev); + int ret = 0; + + mutex_lock(&data->dev_access_lock); + + switch (type) { + case hwmon_temp: + ret = cc2_measurement(data, type, val); + break; + case hwmon_humidity: + switch (attr) { + case hwmon_humidity_input: + ret = cc2_measurement(data, type, val); + break; + case hwmon_humidity_min: + ret = cc2_get_reg_val(data, CC2_R_ALARM_L_ON, val); + break; + case hwmon_humidity_min_hyst: + ret = cc2_get_reg_val(data, CC2_R_ALARM_L_OFF, val); + break; + case hwmon_humidity_max: + ret = cc2_get_reg_val(data, CC2_R_ALARM_H_ON, val); + break; + case hwmon_humidity_max_hyst: + ret = cc2_get_reg_val(data, CC2_R_ALARM_H_OFF, val); + break; + case hwmon_humidity_min_alarm: + ret = cc2_humidity_min_alarm_status(data, val); + break; + case hwmon_humidity_max_alarm: + ret = cc2_humidity_max_alarm_status(data, val); + break; + default: + ret = -EOPNOTSUPP; + } + break; + default: + ret = -EOPNOTSUPP; + } + + mutex_unlock(&data->dev_access_lock); + + return ret; +} + +static int cc2_write(struct device *dev, enum hwmon_sensor_types type, u32 attr, + int channel, long val) +{ + struct cc2_data *data = dev_get_drvdata(dev); + int ret; + u16 arg; + u8 cmd; + + if (type != hwmon_humidity) + return -EOPNOTSUPP; + + if (val < 0 || val > CC2_RH_MAX) + return -EINVAL; + + mutex_lock(&data->dev_access_lock); + + switch (attr) { + case hwmon_humidity_min: + cmd = CC2_W_ALARM_L_ON; + arg = cc2_rh_to_reg(val); + ret = cc2_write_reg(data, cmd, arg); + break; + + case hwmon_humidity_min_hyst: + cmd = CC2_W_ALARM_L_OFF; + arg = cc2_rh_to_reg(val); + ret = cc2_write_reg(data, cmd, arg); + break; + + case hwmon_humidity_max: + cmd = CC2_W_ALARM_H_ON; + arg = 
cc2_rh_to_reg(val);
+		ret = cc2_write_reg(data, cmd, arg);
+		break;
+
+	case hwmon_humidity_max_hyst:
+		cmd = CC2_W_ALARM_H_OFF;
+		arg = cc2_rh_to_reg(val);
+		ret = cc2_write_reg(data, cmd, arg);
+		break;
+
+	default:
+		ret = -EOPNOTSUPP;
+		break;
+	}
+
+	mutex_unlock(&data->dev_access_lock);
+
+	return ret;
+}
+
+static int cc2_request_ready_irq(struct cc2_data *data, struct device *dev)
+{
+	int ret = 0;
+
+	data->irq_ready = fwnode_irq_get_byname(dev_fwnode(dev), "ready");
+	if (data->irq_ready > 0) {
+		init_completion(&data->complete);
+		ret = devm_request_threaded_irq(dev, data->irq_ready, NULL,
+						cc2_ready_interrupt,
+						IRQF_ONESHOT |
+						IRQF_TRIGGER_RISING,
+						dev_name(dev), data);
+	}
+
+	return ret;
+}
+
+static int cc2_request_alarm_irqs(struct cc2_data *data, struct device *dev)
+{
+	int ret = 0;
+
+	data->irq_low = fwnode_irq_get_byname(dev_fwnode(dev), "low");
+	if (data->irq_low > 0) {
+		ret = devm_request_threaded_irq(dev, data->irq_low, NULL,
+						cc2_low_interrupt,
+						IRQF_ONESHOT |
+						IRQF_TRIGGER_RISING,
+						dev_name(dev), data);
+		if (!ret)
+			data->rh_alarm.low_alarm_visible = true;
+	}
+
+	data->irq_high = fwnode_irq_get_byname(dev_fwnode(dev), "high");
+	if (data->irq_high > 0) {
+		ret = devm_request_threaded_irq(dev, data->irq_high, NULL,
+						cc2_high_interrupt,
+						IRQF_ONESHOT |
+						IRQF_TRIGGER_RISING,
+						dev_name(dev), data);
+		if (!ret)
+			data->rh_alarm.high_alarm_visible = true;
+	}
+
+	return ret;
+}
+
+static const struct hwmon_channel_info *cc2_info[] = {
+	HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT),
+	HWMON_CHANNEL_INFO(humidity, HWMON_H_INPUT | HWMON_H_MIN | HWMON_H_MAX |
+			   HWMON_H_MIN_HYST | HWMON_H_MAX_HYST |
+			   HWMON_H_MIN_ALARM | HWMON_H_MAX_ALARM),
+	NULL
+};
+
+static const struct hwmon_ops cc2_hwmon_ops = {
+	.is_visible = cc2_is_visible,
+	.read = cc2_read,
+	.write = cc2_write,
+};
+
+static const struct hwmon_chip_info cc2_chip_info = {
+	.ops = &cc2_hwmon_ops,
+	.info = cc2_info,
+};
+
+static int cc2_probe(struct i2c_client *client)
+{
+	struct cc2_data *data;
+	struct device *dev = &client->dev;
+	int ret;
+
+	if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C))
+		return -EOPNOTSUPP;
+
+	data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	i2c_set_clientdata(client, data);
+
+	mutex_init(&data->dev_access_lock);
+
+	data->client = client;
+
+	data->regulator = devm_regulator_get_exclusive(dev, "vdd");
+	if (IS_ERR(data->regulator)) {
+		dev_err_probe(dev, PTR_ERR(data->regulator),
+			      "Failed to get regulator\n");
+		return PTR_ERR(data->regulator);
+	}
+
+	ret = cc2_request_ready_irq(data, dev);
+	if (ret) {
+		dev_err_probe(dev, ret, "Failed to request ready irq\n");
+		return ret;
+	}
+
+	ret = cc2_request_alarm_irqs(data, dev);
+	if (ret) {
+		dev_err_probe(dev, ret, "Failed to request alarm irqs\n");
+		goto disable;
+	}
+
+	data->hwmon = devm_hwmon_device_register_with_info(dev, client->name,
+							   data, &cc2_chip_info,
+							   NULL);
+	if (IS_ERR(data->hwmon)) {
+		dev_err_probe(dev, PTR_ERR(data->hwmon),
+			      "Failed to register hwmon device\n");
+		ret = PTR_ERR(data->hwmon);
+	}
+
+disable:
+	cc2_disable(data);
+
+	return ret;
+}
+
+static void cc2_remove(struct i2c_client *client)
+{
+	struct cc2_data *data = i2c_get_clientdata(client);
+
+	cc2_disable(data);
+}
+
+static const struct i2c_device_id cc2_id[] = {
+	{ "cc2d23" },
+	{ "cc2d23s" },
+	{ "cc2d25" },
+	{ "cc2d25s" },
+	{ "cc2d33" },
+	{ "cc2d33s" },
+	{ "cc2d35" },
+	{ "cc2d35s" },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, cc2_id);
+
+static const struct of_device_id cc2_of_match[] =
{ + { .compatible = "amphenol,cc2d23" }, + { .compatible = "amphenol,cc2d23s" }, + { .compatible = "amphenol,cc2d25" }, + { .compatible = "amphenol,cc2d25s" }, + { .compatible = "amphenol,cc2d33" }, + { .compatible = "amphenol,cc2d33s" }, + { .compatible = "amphenol,cc2d35" }, + { .compatible = "amphenol,cc2d35s" }, + { }, +}; +MODULE_DEVICE_TABLE(of, cc2_of_match); + +static struct i2c_driver cc2_driver = { + .driver = { + .name = "cc2d23", + .of_match_table = cc2_of_match, + }, + .probe = cc2_probe, + .remove = cc2_remove, + .id_table = cc2_id, +}; +module_i2c_driver(cc2_driver); + +MODULE_AUTHOR("Javier Carrasco "); +MODULE_DESCRIPTION("Amphenol ChipCap 2 humidity and temperature sensor driver"); +MODULE_LICENSE("GPL"); From 3bdd2db1122f607e108826926738a8c5929005aa Mon Sep 17 00:00:00 2001 From: Cosmo Chou Date: Mon, 15 Jan 2024 18:05:16 +0800 Subject: [PATCH 0325/1406] dt-bindings: vendor-prefixes: add asteralabs Add vendor prefix for Astera Labs, Inc. https://www.asteralabs.com Signed-off-by: Cosmo Chou Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20240115100518.2887549-2-chou.cosmo@gmail.com Signed-off-by: Guenter Roeck --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index 25158559471ca6..c734ea4de4288c 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -161,6 +161,8 @@ patternProperties: description: ASPEED Technology Inc. "^asrock,.*": description: ASRock Inc. + "^asteralabs,.*": + description: Astera Labs, Inc. "^asus,.*": description: AsusTek Computer Inc. "^atheros,.*": From 08c06fb56e52dc907a4415da27f185da6320f1b5 Mon Sep 17 00:00:00 2001 From: Cosmo Chou Date: Mon, 15 Jan 2024 18:05:17 +0800 Subject: [PATCH 0326/1406] dt-bindings: trivial-devices: add Astera Labs PT5161L Add dt-bindings for pt5161l temperature monitoring. Signed-off-by: Cosmo Chou Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20240115100518.2887549-3-chou.cosmo@gmail.com Signed-off-by: Guenter Roeck --- Documentation/devicetree/bindings/trivial-devices.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/trivial-devices.yaml b/Documentation/devicetree/bindings/trivial-devices.yaml index 088b23ed2ae6c8..842eb65e4c0317 100644 --- a/Documentation/devicetree/bindings/trivial-devices.yaml +++ b/Documentation/devicetree/bindings/trivial-devices.yaml @@ -47,6 +47,8 @@ properties: - adi,lt7182s # AMS iAQ-Core VOC Sensor - ams,iaq-core + # Temperature monitoring of Astera Labs PT5161L PCIe retimer + - asteralabs,pt5161l # i2c serial eeprom (24cxx) - at,24c08 # ATSHA204 - i2c h/w symmetric crypto module From b38ac9f445546b9fd95672ac405e033c2d40ba1b Mon Sep 17 00:00:00 2001 From: Aleksa Savic Date: Mon, 29 Jan 2024 12:19:28 +0100 Subject: [PATCH 0327/1406] hwmon: Add driver for NZXT Kraken X and Z series AIO CPU coolers This driver enables hardware monitoring support for NZXT Kraken X53/X63/X73 and Z53/Z63/Z73 all-in-one CPU liquid coolers. All models expose liquid temperature and pump speed (in RPM), as well as PWM control (natively only through a temp-PWM curve, but the driver also emulates fixed PWM control on top of that). The Z-series models additionally expose the speed and duty of an optionally connected fan, with the same PWM control capabilities. 
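For illustration, the fixed PWM emulation amounts to programming a flat
curve, which is what kraken3_write_fixed_duty() in this patch does. A
simplified sketch of that logic (not the exact driver code):

	/* Curve points are percent values, one per degree from 20 C to 59 C */
	static void fill_fixed_curve(u8 points[CUSTOM_CURVE_POINTS], u8 percent_val)
	{
		int i;

		/* Same requested duty at every temperature point... */
		for (i = 0; i < CUSTOM_CURVE_POINTS - 1; i++)
			points[i] = percent_val;
		/* ...except the critical 59 C point, forced to 100% for safety */
		points[CUSTOM_CURVE_POINTS - 1] = 100;
	}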
Pump and fan duty control mode can be set through pwm[1-2]_enable, where 1 is
for the manual control mode and 2 is for the liquid temp to PWM curve mode.
Writing a 0 disables control of the channel through the driver after setting
its duty to 100%. As it is not possible to query the device for the active
mode, the driver keeps track of it.

The temperature of the curves relates to the fixed [20-59] C range, per
device limitations, and correlates to the detected liquid temperature. Only
PWM values (ranging from 0-255) can be set.

The addressable RGB LEDs and LCD screen, included only on Z-series models,
are not supported in this driver.

Co-developed-by: Jonas Malaco
Signed-off-by: Jonas Malaco
Co-developed-by: Yury Zhuravlev
Signed-off-by: Yury Zhuravlev
Signed-off-by: Aleksa Savic
Link: https://lore.kernel.org/r/20240129111932.368232-1-savicaleksa83@gmail.com
Signed-off-by: Guenter Roeck
---
 Documentation/hwmon/index.rst | 1 +
 Documentation/hwmon/nzxt-kraken3.rst | 74 ++
 MAINTAINERS | 8 +
 drivers/hwmon/Kconfig | 10 +
 drivers/hwmon/Makefile | 1 +
 drivers/hwmon/nzxt-kraken3.c | 1008 ++++++++++++++++++++++++++
 6 files changed, 1102 insertions(+)
 create mode 100644 Documentation/hwmon/nzxt-kraken3.rst
 create mode 100644 drivers/hwmon/nzxt-kraken3.c

diff --git a/Documentation/hwmon/index.rst b/Documentation/hwmon/index.rst
index 0d12254c0f9ee8..6f8a4a7524e8b1 100644
--- a/Documentation/hwmon/index.rst
+++ b/Documentation/hwmon/index.rst
@@ -174,6 +174,7 @@ Hardware Monitoring Kernel Drivers
 nsa320
 ntc_thermistor
 nzxt-kraken2
+ nzxt-kraken3
 nzxt-smart2
 occ
 oxp-sensors
diff --git a/Documentation/hwmon/nzxt-kraken3.rst b/Documentation/hwmon/nzxt-kraken3.rst
new file mode 100644
index 00000000000000..90fd9dec15ff22
--- /dev/null
+++ b/Documentation/hwmon/nzxt-kraken3.rst
@@ -0,0 +1,74 @@
+.. SPDX-License-Identifier: GPL-2.0-or-later
+
+Kernel driver nzxt-kraken3
+==========================
+
+Supported devices:
+
+* NZXT Kraken X53
+* NZXT Kraken X63
+* NZXT Kraken X73
+* NZXT Kraken Z53
+* NZXT Kraken Z63
+* NZXT Kraken Z73
+
+Author: Jonas Malaco, Aleksa Savic
+
+Description
+-----------
+
+This driver enables hardware monitoring support for NZXT Kraken X53/X63/X73 and
+Z53/Z63/Z73 all-in-one CPU liquid coolers. All models expose liquid temperature
+and pump speed (in RPM), as well as PWM control (either as a fixed value
+or through a temp-PWM curve). The Z-series models additionally expose the speed
+and duty of an optionally connected fan, with the same PWM control capabilities.
+
+Pump and fan duty control mode can be set through pwm[1-2]_enable, where 1 is
+for the manual control mode and 2 is for the liquid temp to PWM curve mode.
+Writing a 0 disables control of the channel through the driver after setting its
+duty to 100%.
+
+The temperature of the curves relates to the fixed [20-59] C range, correlating to
+the detected liquid temperature. Only PWM values (ranging from 0-255) can be set.
+If in curve mode, setting point values should be done in moderation - the devices
+require complete curves to be sent for each change; they can lock up or discard
+the changes if they are too numerous at once. The suggestion is to set them while
+in another mode, and then apply them by switching to curve (see the example below).
+
+The devices can report if they are faulty. The driver supports that situation
+and will issue a warning. This can also happen when the USB cable is connected,
+but SATA power is not.
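+
+For example, a complete pump curve can be staged while in another mode and then
+applied by switching to curve mode. A minimal userspace sketch (error handling
+is trimmed, and the hwmon instance path is an assumption; discover it at
+runtime in practice)::
+
+  #include <stdio.h>
+
+  static void put(const char *attr, int val)
+  {
+          char path[128];
+          FILE *f;
+
+          snprintf(path, sizeof(path), "/sys/class/hwmon/hwmon3/%s", attr);
+          f = fopen(path, "w");
+          if (f) {
+                  fprintf(f, "%d\n", val);
+                  fclose(f);
+          }
+  }
+
+  int main(void)
+  {
+          char attr[32];
+          int i;
+
+          put("pwm1_enable", 1);  /* stage the points outside of curve mode */
+          for (i = 1; i <= 40; i++) {
+                  snprintf(attr, sizeof(attr), "temp1_auto_point%d_pwm", i);
+                  put(attr, 128);  /* about 50% duty at every point */
+          }
+          put("pwm1_enable", 2);  /* switch to curve mode, applying the curve */
+          return 0;
+  }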
+ +The addressable RGB LEDs and LCD screen (only on Z-series models) are not +supported in this driver, but can be controlled through existing userspace tools, +such as `liquidctl`_. + +.. _liquidctl: https://github.com/liquidctl/liquidctl + +Usage Notes +----------- + +As these are USB HIDs, the driver can be loaded automatically by the kernel and +supports hot swapping. + +Possible pwm_enable values are: + +====== ========================================================================== +0 Set fan to 100% +1 Direct PWM mode (applies value in corresponding PWM entry) +2 Curve control mode (applies the temp-PWM duty curve based on coolant temp) +====== ========================================================================== + +Sysfs entries +------------- + +============================== ================================================================ +fan1_input Pump speed (in rpm) +fan2_input Fan speed (in rpm) +temp1_input Coolant temperature (in millidegrees Celsius) +pwm1 Pump duty (value between 0-255) +pwm1_enable Pump duty control mode (0: disabled, 1: manual, 2: curve) +pwm2 Fan duty (value between 0-255) +pwm2_enable Fan duty control mode (0: disabled, 1: manual, 2: curve) +temp[1-2]_auto_point[1-40]_pwm Temp-PWM duty curves (for pump and fan), related to coolant temp +============================== ================================================================ diff --git a/MAINTAINERS b/MAINTAINERS index b00657d2536f5d..b9906f88c1b7a1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15840,6 +15840,14 @@ S: Maintained F: Documentation/hwmon/nzxt-kraken2.rst F: drivers/hwmon/nzxt-kraken2.c +NZXT-KRAKEN3 HARDWARE MONITORING DRIVER +M: Jonas Malaco +M: Aleksa Savic +L: linux-hwmon@vger.kernel.org +S: Maintained +F: Documentation/hwmon/nzxt-kraken3.rst +F: drivers/hwmon/nzxt-kraken3.c + NZXT-SMART2 HARDWARE MONITORING DRIVER M: Aleksandr Mezin L: linux-hwmon@vger.kernel.org diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index 5c85c976d795cb..3904bb297d6137 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -1695,6 +1695,16 @@ config SENSORS_NZXT_KRAKEN2 This driver can also be built as a module. If so, the module will be called nzxt-kraken2. +config SENSORS_NZXT_KRAKEN3 + tristate "NZXT Kraken X53/X63/X73, Z53/Z63/Z73 coolers" + depends on USB_HID + help + If you say yes here you get support for hardware monitoring for the + NZXT Kraken X53/X63/X73, Z53/Z63/Z73 all-in-one CPU liquid coolers. + + This driver can also be built as a module. If so, the module + will be called nzxt-kraken3. 
+ config SENSORS_NZXT_SMART2 tristate "NZXT RGB & Fan Controller/Smart Device v2" depends on USB_HID diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index 5da0c4ce881b3c..76e6dfef9f2459 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -175,6 +175,7 @@ obj-$(CONFIG_SENSORS_NPCM7XX) += npcm750-pwm-fan.o obj-$(CONFIG_SENSORS_NSA320) += nsa320-hwmon.o obj-$(CONFIG_SENSORS_NTC_THERMISTOR) += ntc_thermistor.o obj-$(CONFIG_SENSORS_NZXT_KRAKEN2) += nzxt-kraken2.o +obj-$(CONFIG_SENSORS_NZXT_KRAKEN3) += nzxt-kraken3.o obj-$(CONFIG_SENSORS_NZXT_SMART2) += nzxt-smart2.o obj-$(CONFIG_SENSORS_OXP) += oxp-sensors.o obj-$(CONFIG_SENSORS_PC87360) += pc87360.o diff --git a/drivers/hwmon/nzxt-kraken3.c b/drivers/hwmon/nzxt-kraken3.c new file mode 100644 index 00000000000000..5806a3f32bcb43 --- /dev/null +++ b/drivers/hwmon/nzxt-kraken3.c @@ -0,0 +1,1008 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * hwmon driver for NZXT Kraken X53/X63/X73 and Z53/Z63/Z73 all in one coolers. + * X53 and Z53 in code refer to all models in their respective series (shortened + * for brevity). + * + * Copyright 2021 Jonas Malaco + * Copyright 2022 Aleksa Savic + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define USB_VENDOR_ID_NZXT 0x1e71 +#define USB_PRODUCT_ID_X53 0x2007 +#define USB_PRODUCT_ID_X53_SECOND 0x2014 +#define USB_PRODUCT_ID_Z53 0x3008 + +enum kinds { X53, Z53 } __packed; +enum pwm_enable { off, manual, curve } __packed; + +static const char *const kraken3_device_names[] = { + [X53] = "x53", + [Z53] = "z53", +}; + +#define DRIVER_NAME "nzxt_kraken3" +#define STATUS_REPORT_ID 0x75 +#define FIRMWARE_REPORT_ID 0x11 +#define STATUS_VALIDITY 2000 /* In ms, equivalent to period of four status reports */ +#define CUSTOM_CURVE_POINTS 40 /* For temps from 20C to 59C (critical temp) */ +#define PUMP_DUTY_MIN 20 /* In percent */ + +/* Sensor report offsets for Kraken X53 and Z53 */ +#define TEMP_SENSOR_START_OFFSET 15 +#define TEMP_SENSOR_END_OFFSET 16 +#define PUMP_SPEED_OFFSET 17 +#define PUMP_DUTY_OFFSET 19 + +/* Firmware version report offset for Kraken X53 and Z53 */ +#define FIRMWARE_VERSION_OFFSET 17 + +/* Sensor report offsets for Kraken Z53 */ +#define Z53_FAN_SPEED_OFFSET 23 +#define Z53_FAN_DUTY_OFFSET 25 + +/* Report offsets for control commands for Kraken X53 and Z53 */ +#define SET_DUTY_ID_OFFSET 1 + +/* Control commands and their lengths for Kraken X53 and Z53 */ + +/* Last byte sets the report interval at 0.5s */ +static const u8 set_interval_cmd[] = { 0x70, 0x02, 0x01, 0xB8, 1 }; +static const u8 finish_init_cmd[] = { 0x70, 0x01 }; +static const u8 __maybe_unused get_fw_version_cmd[] = { 0x10, 0x01 }; +static const u8 set_pump_duty_cmd_header[] = { 0x72, 0x00, 0x00, 0x00 }; +static const u8 z53_get_status_cmd[] = { 0x74, 0x01 }; + +#define SET_INTERVAL_CMD_LENGTH 5 +#define FINISH_INIT_CMD_LENGTH 2 +#define GET_FW_VERSION_CMD_LENGTH 2 +#define MAX_REPORT_LENGTH 64 +#define MIN_REPORT_LENGTH 20 +#define SET_CURVE_DUTY_CMD_HEADER_LENGTH 4 +/* 4 byte header and 40 duty offsets */ +#define SET_CURVE_DUTY_CMD_LENGTH (4 + 40) +#define Z53_GET_STATUS_CMD_LENGTH 2 + +static const char *const kraken3_temp_label[] = { + "Coolant temp", +}; + +static const char *const kraken3_fan_label[] = { + "Pump speed", + "Fan speed" +}; + +struct kraken3_channel_info { + enum pwm_enable mode; + + /* Both values are PWM */ + u16 reported_duty; + u16 fixed_duty; /* Manually set fixed duty */ + + u8 pwm_points[CUSTOM_CURVE_POINTS]; +}; + 
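+/*
+ * Unit note (illustrative; see the conversion helpers further below): duties
+ * are stored here as PWM values (0-255), while the device protocol speaks
+ * percent (0-100). Conversions round to the nearest value, e.g. a PWM value
+ * of 153 maps to DIV_ROUND_CLOSEST(153 * 100, 255) = 60 percent, and 60
+ * percent maps back to DIV_ROUND_CLOSEST(60 * 255, 100) = 153.
+ */
+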
+struct kraken3_data { + struct hid_device *hdev; + struct device *hwmon_dev; + struct dentry *debugfs; + struct mutex buffer_lock; /* For locking access to buffer */ + struct mutex z53_status_request_lock; + struct completion fw_version_processed; + /* + * For X53 devices, tracks whether an initial (one) sensor report was received to + * make fancontrol not bail outright. For Z53 devices, whether a status report + * was processed after requesting one. + */ + struct completion status_report_processed; + /* For locking the above completion */ + spinlock_t status_completion_lock; + + u8 *buffer; + struct kraken3_channel_info channel_info[2]; /* Pump and fan */ + bool is_device_faulty; + + /* Sensor values */ + s32 temp_input[1]; + u16 fan_input[2]; + + enum kinds kind; + u8 firmware_version[3]; + + unsigned long updated; /* jiffies */ +}; + +static umode_t kraken3_is_visible(const void *data, enum hwmon_sensor_types type, u32 attr, + int channel) +{ + const struct kraken3_data *priv = data; + + switch (type) { + case hwmon_temp: + if (channel < 1) + return 0444; + break; + case hwmon_fan: + switch (priv->kind) { + case X53: + /* Just the pump */ + if (channel < 1) + return 0444; + break; + case Z53: + /* Pump and fan */ + if (channel < 2) + return 0444; + break; + default: + break; + } + break; + case hwmon_pwm: + switch (attr) { + case hwmon_pwm_enable: + case hwmon_pwm_input: + switch (priv->kind) { + case X53: + /* Just the pump */ + if (channel < 1) + return 0644; + break; + case Z53: + /* Pump and fan */ + if (channel < 2) + return 0644; + break; + default: + break; + } + break; + default: + break; + } + break; + default: + break; + } + + return 0; +} + +/* + * Writes the command to the device with the rest of the report (up to 64 bytes) filled + * with zeroes. + */ +static int kraken3_write_expanded(struct kraken3_data *priv, const u8 *cmd, int cmd_length) +{ + int ret; + + mutex_lock(&priv->buffer_lock); + + memcpy_and_pad(priv->buffer, MAX_REPORT_LENGTH, cmd, cmd_length, 0x00); + ret = hid_hw_output_report(priv->hdev, priv->buffer, MAX_REPORT_LENGTH); + + mutex_unlock(&priv->buffer_lock); + return ret; +} + +static int kraken3_percent_to_pwm(long val) +{ + return DIV_ROUND_CLOSEST(val * 255, 100); +} + +static int kraken3_pwm_to_percent(long val, int channel) +{ + int percent_value; + + if (val < 0 || val > 255) + return -EINVAL; + + percent_value = DIV_ROUND_CLOSEST(val * 100, 255); + + /* Bring up pump duty to min value if needed */ + if (channel == 0 && percent_value < PUMP_DUTY_MIN) + percent_value = PUMP_DUTY_MIN; + + return percent_value; +} + +static int kraken3_read_x53(struct kraken3_data *priv) +{ + int ret; + + if (completion_done(&priv->status_report_processed)) + /* + * We're here because data is stale. This means that sensor reports haven't + * been received for some time in kraken3_raw_event(). On X-series sensor data + * can't be manually requested, so return an error. + */ + return -ENODATA; + + /* + * Data needs to be read, but a sensor report wasn't yet received. It's usually + * fancontrol that requests data this early and it exits if it reads an error code. + * So, wait for the first report to be parsed (but up to STATUS_VALIDITY). + * This does not concern the Z series devices, because they send a sensor report + * only when requested. 
+ */ + ret = wait_for_completion_interruptible_timeout(&priv->status_report_processed, + msecs_to_jiffies(STATUS_VALIDITY)); + if (ret == 0) + return -ETIMEDOUT; + else if (ret < 0) + return ret; + + /* The first sensor report was parsed on time and reading can continue */ + return 0; +} + +static int kraken3_read_z53(struct kraken3_data *priv) +{ + int ret = mutex_lock_interruptible(&priv->z53_status_request_lock); + + if (ret < 0) + return ret; + + if (!time_after(jiffies, priv->updated + msecs_to_jiffies(STATUS_VALIDITY))) { + /* Data is up to date */ + goto unlock_and_return; + } + + /* + * Disable interrupts for a moment to safely reinit the completion, + * as hidraw calls could have allowed one or more readers to complete. + */ + spin_lock_bh(&priv->status_completion_lock); + reinit_completion(&priv->status_report_processed); + spin_unlock_bh(&priv->status_completion_lock); + + /* Send command for getting status */ + ret = kraken3_write_expanded(priv, z53_get_status_cmd, Z53_GET_STATUS_CMD_LENGTH); + if (ret < 0) + goto unlock_and_return; + + /* Wait for completion from kraken3_raw_event() */ + ret = wait_for_completion_interruptible_timeout(&priv->status_report_processed, + msecs_to_jiffies(STATUS_VALIDITY)); + if (ret == 0) + ret = -ETIMEDOUT; + +unlock_and_return: + mutex_unlock(&priv->z53_status_request_lock); + if (ret < 0) + return ret; + + return 0; +} + +static int kraken3_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, + long *val) +{ + struct kraken3_data *priv = dev_get_drvdata(dev); + int ret; + + if (time_after(jiffies, priv->updated + msecs_to_jiffies(STATUS_VALIDITY))) { + if (priv->kind == X53) + ret = kraken3_read_x53(priv); + else + ret = kraken3_read_z53(priv); + + if (ret < 0) + return ret; + + if (priv->is_device_faulty) + return -ENODATA; + } + + switch (type) { + case hwmon_temp: + *val = priv->temp_input[channel]; + break; + case hwmon_fan: + *val = priv->fan_input[channel]; + break; + case hwmon_pwm: + switch (attr) { + case hwmon_pwm_enable: + *val = priv->channel_info[channel].mode; + break; + case hwmon_pwm_input: + *val = priv->channel_info[channel].reported_duty; + break; + default: + return -EOPNOTSUPP; + } + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int kraken3_read_string(struct device *dev, enum hwmon_sensor_types type, u32 attr, + int channel, const char **str) +{ + switch (type) { + case hwmon_temp: + *str = kraken3_temp_label[channel]; + break; + case hwmon_fan: + *str = kraken3_fan_label[channel]; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +/* Writes custom curve to device */ +static int kraken3_write_curve(struct kraken3_data *priv, u8 *curve_array, int channel) +{ + u8 fixed_duty_cmd[SET_CURVE_DUTY_CMD_LENGTH]; + int ret; + + /* Copy command header */ + memcpy(fixed_duty_cmd, set_pump_duty_cmd_header, SET_CURVE_DUTY_CMD_HEADER_LENGTH); + + /* Set the correct ID for writing pump/fan duty (0x01 or 0x02, respectively) */ + fixed_duty_cmd[SET_DUTY_ID_OFFSET] = channel + 1; + + /* Copy curve to command */ + memcpy(fixed_duty_cmd + SET_CURVE_DUTY_CMD_HEADER_LENGTH, curve_array, CUSTOM_CURVE_POINTS); + + ret = kraken3_write_expanded(priv, fixed_duty_cmd, SET_CURVE_DUTY_CMD_LENGTH); + return ret; +} + +static int kraken3_write_fixed_duty(struct kraken3_data *priv, long val, int channel) +{ + u8 fixed_curve_points[CUSTOM_CURVE_POINTS]; + int ret, percent_val, i; + + percent_val = kraken3_pwm_to_percent(val, channel); + if (percent_val < 0) + return percent_val; + + /* + * 
The devices can only control the duty through a curve. + * Since we're setting a fixed duty here, fill the whole curve + * (ranging from 20C to 59C) with the same duty, except for + * the last point, the critical temperature, where it's maxed + * out for safety. + */ + + /* Fill the custom curve with the fixed value we're setting */ + for (i = 0; i < CUSTOM_CURVE_POINTS - 1; i++) + fixed_curve_points[i] = percent_val; + + /* Force duty to 100% at critical temp */ + fixed_curve_points[CUSTOM_CURVE_POINTS - 1] = 100; + + /* Write the fixed duty curve to the device */ + ret = kraken3_write_curve(priv, fixed_curve_points, channel); + return ret; +} + +static int kraken3_write(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, + long val) +{ + struct kraken3_data *priv = dev_get_drvdata(dev); + int ret; + + switch (type) { + case hwmon_pwm: + switch (attr) { + case hwmon_pwm_input: + /* Remember the last set fixed duty for channel */ + priv->channel_info[channel].fixed_duty = val; + + if (priv->channel_info[channel].mode == manual) { + ret = kraken3_write_fixed_duty(priv, val, channel); + if (ret < 0) + return ret; + + /* + * Lock onto this value and report it until next interrupt status + * report is received, so userspace tools can continue to work. + */ + priv->channel_info[channel].reported_duty = val; + } + break; + case hwmon_pwm_enable: + if (val < 0 || val > 2) + return -EINVAL; + + switch (val) { + case 0: + /* Set channel to 100%, direct duty value */ + ret = kraken3_write_fixed_duty(priv, 255, channel); + if (ret < 0) + return ret; + + /* We don't control anything anymore */ + priv->channel_info[channel].mode = off; + break; + case 1: + /* Apply the last known direct duty value */ + ret = + kraken3_write_fixed_duty(priv, + priv->channel_info[channel].fixed_duty, + channel); + if (ret < 0) + return ret; + + priv->channel_info[channel].mode = manual; + break; + case 2: + /* Apply the curve and note as enabled */ + ret = + kraken3_write_curve(priv, + priv->channel_info[channel].pwm_points, + channel); + if (ret < 0) + return ret; + + priv->channel_info[channel].mode = curve; + break; + default: + break; + } + break; + default: + return -EOPNOTSUPP; + } + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static ssize_t kraken3_fan_curve_pwm_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct sensor_device_attribute_2 *dev_attr = to_sensor_dev_attr_2(attr); + struct kraken3_data *priv = dev_get_drvdata(dev); + long val; + int ret; + + if (kstrtol(buf, 10, &val) < 0) + return -EINVAL; + + val = kraken3_pwm_to_percent(val, dev_attr->nr); + if (val < 0) + return val; + + priv->channel_info[dev_attr->nr].pwm_points[dev_attr->index] = val; + + if (priv->channel_info[dev_attr->nr].mode == curve) { + /* Apply the curve */ + ret = + kraken3_write_curve(priv, + priv->channel_info[dev_attr->nr].pwm_points, dev_attr->nr); + if (ret < 0) + return ret; + } + + return count; +} + +static umode_t kraken3_curve_props_are_visible(struct kobject *kobj, struct attribute *attr, + int index) +{ + struct device *dev = kobj_to_dev(kobj); + struct kraken3_data *priv = dev_get_drvdata(dev); + + /* Only Z53 has the fan curve */ + if (index >= CUSTOM_CURVE_POINTS && priv->kind != Z53) + return 0; + + return attr->mode; +} + +/* Custom pump curve from 20C to 59C (critical temp) */ +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point1_pwm, kraken3_fan_curve_pwm, 0, 0); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point2_pwm, 
kraken3_fan_curve_pwm, 0, 1); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point3_pwm, kraken3_fan_curve_pwm, 0, 2); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point4_pwm, kraken3_fan_curve_pwm, 0, 3); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point5_pwm, kraken3_fan_curve_pwm, 0, 4); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point6_pwm, kraken3_fan_curve_pwm, 0, 5); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point7_pwm, kraken3_fan_curve_pwm, 0, 6); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point8_pwm, kraken3_fan_curve_pwm, 0, 7); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point9_pwm, kraken3_fan_curve_pwm, 0, 8); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point10_pwm, kraken3_fan_curve_pwm, 0, 9); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point11_pwm, kraken3_fan_curve_pwm, 0, 10); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point12_pwm, kraken3_fan_curve_pwm, 0, 11); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point13_pwm, kraken3_fan_curve_pwm, 0, 12); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point14_pwm, kraken3_fan_curve_pwm, 0, 13); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point15_pwm, kraken3_fan_curve_pwm, 0, 14); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point16_pwm, kraken3_fan_curve_pwm, 0, 15); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point17_pwm, kraken3_fan_curve_pwm, 0, 16); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point18_pwm, kraken3_fan_curve_pwm, 0, 17); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point19_pwm, kraken3_fan_curve_pwm, 0, 18); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point20_pwm, kraken3_fan_curve_pwm, 0, 19); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point21_pwm, kraken3_fan_curve_pwm, 0, 20); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point22_pwm, kraken3_fan_curve_pwm, 0, 21); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point23_pwm, kraken3_fan_curve_pwm, 0, 22); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point24_pwm, kraken3_fan_curve_pwm, 0, 23); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point25_pwm, kraken3_fan_curve_pwm, 0, 24); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point26_pwm, kraken3_fan_curve_pwm, 0, 25); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point27_pwm, kraken3_fan_curve_pwm, 0, 26); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point28_pwm, kraken3_fan_curve_pwm, 0, 27); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point29_pwm, kraken3_fan_curve_pwm, 0, 28); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point30_pwm, kraken3_fan_curve_pwm, 0, 29); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point31_pwm, kraken3_fan_curve_pwm, 0, 30); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point32_pwm, kraken3_fan_curve_pwm, 0, 31); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point33_pwm, kraken3_fan_curve_pwm, 0, 32); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point34_pwm, kraken3_fan_curve_pwm, 0, 33); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point35_pwm, kraken3_fan_curve_pwm, 0, 34); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point36_pwm, kraken3_fan_curve_pwm, 0, 35); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point37_pwm, kraken3_fan_curve_pwm, 0, 36); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point38_pwm, kraken3_fan_curve_pwm, 0, 37); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point39_pwm, kraken3_fan_curve_pwm, 0, 38); +static SENSOR_DEVICE_ATTR_2_WO(temp1_auto_point40_pwm, kraken3_fan_curve_pwm, 0, 39); + +/* Custom fan curve from 20C to 59C (critical temp) */ +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point1_pwm, kraken3_fan_curve_pwm, 1, 0); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point2_pwm, kraken3_fan_curve_pwm, 1, 1); 
+static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point3_pwm, kraken3_fan_curve_pwm, 1, 2); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point4_pwm, kraken3_fan_curve_pwm, 1, 3); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point5_pwm, kraken3_fan_curve_pwm, 1, 4); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point6_pwm, kraken3_fan_curve_pwm, 1, 5); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point7_pwm, kraken3_fan_curve_pwm, 1, 6); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point8_pwm, kraken3_fan_curve_pwm, 1, 7); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point9_pwm, kraken3_fan_curve_pwm, 1, 8); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point10_pwm, kraken3_fan_curve_pwm, 1, 9); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point11_pwm, kraken3_fan_curve_pwm, 1, 10); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point12_pwm, kraken3_fan_curve_pwm, 1, 11); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point13_pwm, kraken3_fan_curve_pwm, 1, 12); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point14_pwm, kraken3_fan_curve_pwm, 1, 13); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point15_pwm, kraken3_fan_curve_pwm, 1, 14); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point16_pwm, kraken3_fan_curve_pwm, 1, 15); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point17_pwm, kraken3_fan_curve_pwm, 1, 16); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point18_pwm, kraken3_fan_curve_pwm, 1, 17); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point19_pwm, kraken3_fan_curve_pwm, 1, 18); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point20_pwm, kraken3_fan_curve_pwm, 1, 19); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point21_pwm, kraken3_fan_curve_pwm, 1, 20); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point22_pwm, kraken3_fan_curve_pwm, 1, 21); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point23_pwm, kraken3_fan_curve_pwm, 1, 22); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point24_pwm, kraken3_fan_curve_pwm, 1, 23); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point25_pwm, kraken3_fan_curve_pwm, 1, 24); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point26_pwm, kraken3_fan_curve_pwm, 1, 25); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point27_pwm, kraken3_fan_curve_pwm, 1, 26); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point28_pwm, kraken3_fan_curve_pwm, 1, 27); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point29_pwm, kraken3_fan_curve_pwm, 1, 28); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point30_pwm, kraken3_fan_curve_pwm, 1, 29); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point31_pwm, kraken3_fan_curve_pwm, 1, 30); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point32_pwm, kraken3_fan_curve_pwm, 1, 31); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point33_pwm, kraken3_fan_curve_pwm, 1, 32); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point34_pwm, kraken3_fan_curve_pwm, 1, 33); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point35_pwm, kraken3_fan_curve_pwm, 1, 34); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point36_pwm, kraken3_fan_curve_pwm, 1, 35); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point37_pwm, kraken3_fan_curve_pwm, 1, 36); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point38_pwm, kraken3_fan_curve_pwm, 1, 37); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point39_pwm, kraken3_fan_curve_pwm, 1, 38); +static SENSOR_DEVICE_ATTR_2_WO(temp2_auto_point40_pwm, kraken3_fan_curve_pwm, 1, 39); + +static struct attribute *kraken3_curve_attrs[] = { + /* Pump control curve */ + &sensor_dev_attr_temp1_auto_point1_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point2_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point3_pwm.dev_attr.attr, + 
&sensor_dev_attr_temp1_auto_point4_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point5_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point6_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point7_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point8_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point9_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point10_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point11_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point12_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point13_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point14_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point15_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point16_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point17_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point18_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point19_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point20_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point21_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point22_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point23_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point24_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point25_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point26_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point27_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point28_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point29_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point30_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point31_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point32_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point33_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point34_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point35_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point36_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point37_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point38_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point39_pwm.dev_attr.attr, + &sensor_dev_attr_temp1_auto_point40_pwm.dev_attr.attr, + /* Fan control curve (Z53 only) */ + &sensor_dev_attr_temp2_auto_point1_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point2_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point3_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point4_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point5_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point6_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point7_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point8_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point9_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point10_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point11_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point12_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point13_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point14_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point15_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point16_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point17_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point18_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point19_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point20_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point21_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point22_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point23_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point24_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point25_pwm.dev_attr.attr, 
+ &sensor_dev_attr_temp2_auto_point26_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point27_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point28_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point29_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point30_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point31_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point32_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point33_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point34_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point35_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point36_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point37_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point38_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point39_pwm.dev_attr.attr, + &sensor_dev_attr_temp2_auto_point40_pwm.dev_attr.attr, + NULL +}; + +static const struct attribute_group kraken3_curves_group = { + .attrs = kraken3_curve_attrs, + .is_visible = kraken3_curve_props_are_visible +}; + +static const struct attribute_group *kraken3_groups[] = { + &kraken3_curves_group, + NULL +}; + +static const struct hwmon_ops kraken3_hwmon_ops = { + .is_visible = kraken3_is_visible, + .read = kraken3_read, + .read_string = kraken3_read_string, + .write = kraken3_write +}; + +static const struct hwmon_channel_info *kraken3_info[] = { + HWMON_CHANNEL_INFO(temp, + HWMON_T_INPUT | HWMON_T_LABEL), + HWMON_CHANNEL_INFO(fan, + HWMON_F_INPUT | HWMON_F_LABEL, + HWMON_F_INPUT | HWMON_F_LABEL, + HWMON_F_INPUT | HWMON_F_LABEL, + HWMON_F_INPUT | HWMON_F_LABEL), + HWMON_CHANNEL_INFO(pwm, + HWMON_PWM_INPUT | HWMON_PWM_ENABLE, + HWMON_PWM_INPUT | HWMON_PWM_ENABLE), + NULL +}; + +static const struct hwmon_chip_info kraken3_chip_info = { + .ops = &kraken3_hwmon_ops, + .info = kraken3_info, +}; + +static int kraken3_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *data, int size) +{ + struct kraken3_data *priv = hid_get_drvdata(hdev); + int i; + + if (size < MIN_REPORT_LENGTH) + return 0; + + if (report->id == FIRMWARE_REPORT_ID) { + /* Read firmware version */ + for (i = 0; i < 3; i++) + priv->firmware_version[i] = data[FIRMWARE_VERSION_OFFSET + i]; + + if (!completion_done(&priv->fw_version_processed)) + complete_all(&priv->fw_version_processed); + + return 0; + } + + if (report->id != STATUS_REPORT_ID) + return 0; + + if (data[TEMP_SENSOR_START_OFFSET] == 0xff && data[TEMP_SENSOR_END_OFFSET] == 0xff) { + hid_err_once(hdev, + "firmware or device is possibly damaged (is SATA power connected?), not parsing reports\n"); + + /* + * Mark first X-series device report as received, + * as well as all for Z-series, if faulty. 
+ */ + spin_lock(&priv->status_completion_lock); + if (priv->kind != X53 || !completion_done(&priv->status_report_processed)) { + priv->is_device_faulty = true; + complete_all(&priv->status_report_processed); + } + spin_unlock(&priv->status_completion_lock); + + return 0; + } + + /* Received normal data */ + priv->is_device_faulty = false; + + /* Temperature and fan sensor readings */ + priv->temp_input[0] = + data[TEMP_SENSOR_START_OFFSET] * 1000 + data[TEMP_SENSOR_END_OFFSET] * 100; + + priv->fan_input[0] = get_unaligned_le16(data + PUMP_SPEED_OFFSET); + priv->channel_info[0].reported_duty = kraken3_percent_to_pwm(data[PUMP_DUTY_OFFSET]); + + spin_lock(&priv->status_completion_lock); + if (priv->kind == X53 && !completion_done(&priv->status_report_processed)) { + /* Mark first X-series device report as received */ + complete_all(&priv->status_report_processed); + } else if (priv->kind == Z53) { + /* Additional readings for Z53 */ + priv->fan_input[1] = get_unaligned_le16(data + Z53_FAN_SPEED_OFFSET); + priv->channel_info[1].reported_duty = + kraken3_percent_to_pwm(data[Z53_FAN_DUTY_OFFSET]); + + if (!completion_done(&priv->status_report_processed)) + complete_all(&priv->status_report_processed); + } + spin_unlock(&priv->status_completion_lock); + + priv->updated = jiffies; + + return 0; +} + +static int kraken3_init_device(struct hid_device *hdev) +{ + struct kraken3_data *priv = hid_get_drvdata(hdev); + int ret; + + /* Set the polling interval */ + ret = kraken3_write_expanded(priv, set_interval_cmd, SET_INTERVAL_CMD_LENGTH); + if (ret < 0) + return ret; + + /* Finalize the init process */ + ret = kraken3_write_expanded(priv, finish_init_cmd, FINISH_INIT_CMD_LENGTH); + if (ret < 0) + return ret; + + return 0; +} + +static int kraken3_get_fw_ver(struct hid_device *hdev) +{ + struct kraken3_data *priv = hid_get_drvdata(hdev); + int ret; + + ret = kraken3_write_expanded(priv, get_fw_version_cmd, GET_FW_VERSION_CMD_LENGTH); + if (ret < 0) + return ret; + + ret = wait_for_completion_interruptible_timeout(&priv->fw_version_processed, + msecs_to_jiffies(STATUS_VALIDITY)); + if (ret == 0) + return -ETIMEDOUT; + else if (ret < 0) + return ret; + + return 0; +} + +static int __maybe_unused kraken3_reset_resume(struct hid_device *hdev) +{ + int ret; + + ret = kraken3_init_device(hdev); + if (ret) + hid_err(hdev, "req init (reset_resume) failed with %d\n", ret); + + return ret; +} + +static int firmware_version_show(struct seq_file *seqf, void *unused) +{ + struct kraken3_data *priv = seqf->private; + + seq_printf(seqf, "%u.%u.%u\n", priv->firmware_version[0], priv->firmware_version[1], + priv->firmware_version[2]); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(firmware_version); + +static void kraken3_debugfs_init(struct kraken3_data *priv) +{ + char name[64]; + + if (!priv->firmware_version[0]) + return; /* Nothing to display in debugfs */ + + scnprintf(name, sizeof(name), "%s_%s-%s", DRIVER_NAME, kraken3_device_names[priv->kind], + dev_name(&priv->hdev->dev)); + + priv->debugfs = debugfs_create_dir(name, NULL); + debugfs_create_file("firmware_version", 0444, priv->debugfs, priv, &firmware_version_fops); +} + +static int kraken3_probe(struct hid_device *hdev, const struct hid_device_id *id) +{ + struct kraken3_data *priv; + int ret; + + priv = devm_kzalloc(&hdev->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->hdev = hdev; + hid_set_drvdata(hdev, priv); + + /* + * Initialize ->updated to STATUS_VALIDITY seconds in the past, making + * the initial empty data invalid for 
kraken3_read without the need for + * a special case there. + */ + priv->updated = jiffies - msecs_to_jiffies(STATUS_VALIDITY); + + ret = hid_parse(hdev); + if (ret) { + hid_err(hdev, "hid parse failed with %d\n", ret); + return ret; + } + + /* Enable hidraw so existing user-space tools can continue to work */ + ret = hid_hw_start(hdev, HID_CONNECT_HIDRAW); + if (ret) { + hid_err(hdev, "hid hw start failed with %d\n", ret); + return ret; + } + + ret = hid_hw_open(hdev); + if (ret) { + hid_err(hdev, "hid hw open failed with %d\n", ret); + goto fail_and_stop; + } + + switch (hdev->product) { + case USB_PRODUCT_ID_X53: + case USB_PRODUCT_ID_X53_SECOND: + priv->kind = X53; + break; + case USB_PRODUCT_ID_Z53: + priv->kind = Z53; + break; + default: + break; + } + + priv->buffer = devm_kzalloc(&hdev->dev, MAX_REPORT_LENGTH, GFP_KERNEL); + if (!priv->buffer) { + ret = -ENOMEM; + goto fail_and_close; + } + + mutex_init(&priv->buffer_lock); + mutex_init(&priv->z53_status_request_lock); + init_completion(&priv->fw_version_processed); + init_completion(&priv->status_report_processed); + spin_lock_init(&priv->status_completion_lock); + + hid_device_io_start(hdev); + ret = kraken3_init_device(hdev); + if (ret < 0) { + hid_err(hdev, "device init failed with %d\n", ret); + goto fail_and_close; + } + + ret = kraken3_get_fw_ver(hdev); + if (ret < 0) + hid_warn(hdev, "fw version request failed with %d\n", ret); + + priv->hwmon_dev = hwmon_device_register_with_info(&hdev->dev, + kraken3_device_names[priv->kind], priv, + &kraken3_chip_info, kraken3_groups); + if (IS_ERR(priv->hwmon_dev)) { + ret = PTR_ERR(priv->hwmon_dev); + hid_err(hdev, "hwmon registration failed with %d\n", ret); + goto fail_and_close; + } + + kraken3_debugfs_init(priv); + + return 0; + +fail_and_close: + hid_hw_close(hdev); +fail_and_stop: + hid_hw_stop(hdev); + return ret; +} + +static void kraken3_remove(struct hid_device *hdev) +{ + struct kraken3_data *priv = hid_get_drvdata(hdev); + + debugfs_remove_recursive(priv->debugfs); + hwmon_device_unregister(priv->hwmon_dev); + + hid_hw_close(hdev); + hid_hw_stop(hdev); +} + +static const struct hid_device_id kraken3_table[] = { + /* NZXT Kraken X53/X63/X73 have two possible product IDs */ + { HID_USB_DEVICE(USB_VENDOR_ID_NZXT, USB_PRODUCT_ID_X53) }, + { HID_USB_DEVICE(USB_VENDOR_ID_NZXT, USB_PRODUCT_ID_X53_SECOND) }, + { HID_USB_DEVICE(USB_VENDOR_ID_NZXT, USB_PRODUCT_ID_Z53) }, + { } +}; + +MODULE_DEVICE_TABLE(hid, kraken3_table); + +static struct hid_driver kraken3_driver = { + .name = DRIVER_NAME, + .id_table = kraken3_table, + .probe = kraken3_probe, + .remove = kraken3_remove, + .raw_event = kraken3_raw_event, +#ifdef CONFIG_PM + .reset_resume = kraken3_reset_resume, +#endif +}; + +static int __init kraken3_init(void) +{ + return hid_register_driver(&kraken3_driver); +} + +static void __exit kraken3_exit(void) +{ + hid_unregister_driver(&kraken3_driver); +} + +/* When compiled into the kernel, initialize after the HID bus */ +late_initcall(kraken3_init); +module_exit(kraken3_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jonas Malaco "); +MODULE_AUTHOR("Aleksa Savic "); +MODULE_DESCRIPTION("Hwmon driver for NZXT Kraken X53/X63/X73, Z53/Z63/Z73 coolers"); From b8fb6a85c29a7ec594d574b427abe7d062a8e323 Mon Sep 17 00:00:00 2001 From: Aleksa Savic Date: Mon, 8 Jan 2024 10:44:50 +0100 Subject: [PATCH 0328/1406] hwmon: Add driver for ASUS ROG RYUJIN II 360 AIO cooler This driver exposes hardware sensors of the ASUS ROG RYUJIN II 360 all-in-one CPU liquid cooler, which communicates through a 
proprietary USB HID protocol. Report offsets were initially discovered in [1] by Florian Freudiger. Available sensors are pump, internal and external (controller) fan speed in RPM, their duties in PWM, as well as coolant temperature. Attaching external fans to the controller is optional and allows them to be controlled from the device. If not connected, the fan-related sensors will report zeroes. The controller is a separate hardware unit that comes bundled with the AIO and connects to it to allow fan control. The addressable LCD screen is not supported in this driver and should be controlled through userspace tools. [1]: https://github.com/liquidctl/liquidctl/pull/653 Tested-by: Florian Freudiger Signed-off-by: Aleksa Savic Link: https://lore.kernel.org/r/20240108094453.22986-1-savicaleksa83@gmail.com [groeck: Add HID dependency] Signed-off-by: Guenter Roeck --- Documentation/hwmon/asus_rog_ryujin.rst | 47 ++ Documentation/hwmon/index.rst | 1 + MAINTAINERS | 6 + drivers/hwmon/Kconfig | 10 + drivers/hwmon/Makefile | 1 + drivers/hwmon/asus_rog_ryujin.c | 609 ++++++++++++++++++++++++ 6 files changed, 674 insertions(+) create mode 100644 Documentation/hwmon/asus_rog_ryujin.rst create mode 100644 drivers/hwmon/asus_rog_ryujin.c diff --git a/Documentation/hwmon/asus_rog_ryujin.rst b/Documentation/hwmon/asus_rog_ryujin.rst new file mode 100644 index 00000000000000..9f77da07002218 --- /dev/null +++ b/Documentation/hwmon/asus_rog_ryujin.rst @@ -0,0 +1,47 @@ +.. SPDX-License-Identifier: GPL-2.0-or-later + +Kernel driver asus_rog_ryujin +============================= + +Supported devices: + +* ASUS ROG RYUJIN II 360 + +Author: Aleksa Savic + +Description +----------- + +This driver enables hardware monitoring support for the listed ASUS ROG RYUJIN +all-in-one CPU liquid coolers. Available sensors are pump, internal and external +(controller) fan speed in RPM, their duties in PWM, as well as coolant temperature. + +Attaching external fans to the controller is optional and allows them to be +controlled from the device. If not connected, the fan-related sensors will +report zeroes. The controller is a separate hardware unit that comes bundled +with the AIO and connects to it to allow fan control. + +The addressable LCD screen is not supported in this driver and should +be controlled through userspace tools. + +Usage notes +----------- + +As these are USB HIDs, the driver can be loaded automatically by the kernel and +supports hot swapping. 
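+
+A minimal userspace sketch for reading the coolant temperature (the hwmon
+instance number is an assumption; discover it at runtime in practice)::
+
+  #include <stdio.h>
+
+  int main(void)
+  {
+          /* temp1_input reports millidegrees Celsius (see table below) */
+          FILE *f = fopen("/sys/class/hwmon/hwmon3/temp1_input", "r");
+          long mdeg;
+
+          if (f && fscanf(f, "%ld", &mdeg) == 1)
+                  printf("coolant: %ld.%03ld C\n", mdeg / 1000, mdeg % 1000);
+          if (f)
+                  fclose(f);
+          return 0;
+  }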
+ +Sysfs entries +------------- + +=========== ============================================= +fan1_input Pump speed (in rpm) +fan2_input Internal fan speed (in rpm) +fan3_input External (controller) fan 1 speed (in rpm) +fan4_input External (controller) fan 2 speed (in rpm) +fan5_input External (controller) fan 3 speed (in rpm) +fan6_input External (controller) fan 4 speed (in rpm) +temp1_input Coolant temperature (in millidegrees Celsius) +pwm1 Pump duty +pwm2 Internal fan duty +pwm3 External (controller) fan duty +=========== ============================================= diff --git a/Documentation/hwmon/index.rst b/Documentation/hwmon/index.rst index 6f8a4a7524e8b1..c19f53d9b3ab14 100644 --- a/Documentation/hwmon/index.rst +++ b/Documentation/hwmon/index.rst @@ -46,6 +46,7 @@ Hardware Monitoring Kernel Drivers asc7621 aspeed-pwm-tacho asus_ec_sensors + asus_rog_ryujin asus_wmi_sensors bcm54140 bel-pfe diff --git a/MAINTAINERS b/MAINTAINERS index b9906f88c1b7a1..8fc0ca8e881c97 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3181,6 +3181,12 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git F: drivers/platform/x86/asus-tf103c-dock.c +ASUS ROG RYUJIN AIO HARDWARE MONITOR DRIVER +M: Aleksa Savic +L: linux-hwmon@vger.kernel.org +S: Maintained +F: drivers/hwmon/asus_rog_ryujin.c + ASUS WIRELESS RADIO CONTROL DRIVER M: João Paulo Rechi Vita L: platform-driver-x86@vger.kernel.org diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index 3904bb297d6137..e4b24ad9396114 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -301,6 +301,16 @@ config SENSORS_ASC7621 This driver can also be built as a module. If so, the module will be called asc7621. +config SENSORS_ASUS_ROG_RYUJIN + tristate "ASUS ROG RYUJIN II 360 hardware monitoring driver" + depends on HID + help + If you say yes here you get support for the fans and sensors of + the ASUS ROG RYUJIN II 360 AIO CPU liquid cooler. + + This driver can also be built as a module. If so, the module + will be called asus_rog_ryujin. + config SENSORS_AXI_FAN_CONTROL tristate "Analog Devices FAN Control HDL Core driver" help diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index 76e6dfef9f2459..e3faee7be51a69 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -55,6 +55,7 @@ obj-$(CONFIG_SENSORS_ARM_SCPI) += scpi-hwmon.o obj-$(CONFIG_SENSORS_AS370) += as370-hwmon.o obj-$(CONFIG_SENSORS_ASC7621) += asc7621.o obj-$(CONFIG_SENSORS_ASPEED) += aspeed-pwm-tacho.o +obj-$(CONFIG_SENSORS_ASUS_ROG_RYUJIN) += asus_rog_ryujin.o obj-$(CONFIG_SENSORS_ATXP1) += atxp1.o obj-$(CONFIG_SENSORS_AXI_FAN_CONTROL) += axi-fan-control.o obj-$(CONFIG_SENSORS_BT1_PVT) += bt1-pvt.o diff --git a/drivers/hwmon/asus_rog_ryujin.c b/drivers/hwmon/asus_rog_ryujin.c new file mode 100644 index 00000000000000..f8b20346a9956f --- /dev/null +++ b/drivers/hwmon/asus_rog_ryujin.c @@ -0,0 +1,609 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * hwmon driver for Asus ROG Ryujin II 360 AIO cooler. 
+ * + * Copyright 2024 Aleksa Savic + */ + +#include +#include +#include +#include +#include +#include +#include + +#define DRIVER_NAME "asus_rog_ryujin" + +#define USB_VENDOR_ID_ASUS_ROG 0x0b05 +#define USB_PRODUCT_ID_RYUJIN_AIO 0x1988 /* ASUS ROG RYUJIN II 360 */ + +#define STATUS_VALIDITY 1500 /* ms */ +#define MAX_REPORT_LENGTH 65 + +/* Cooler status report offsets */ +#define RYUJIN_TEMP_SENSOR_1 3 +#define RYUJIN_TEMP_SENSOR_2 4 +#define RYUJIN_PUMP_SPEED 5 +#define RYUJIN_INTERNAL_FAN_SPEED 7 + +/* Cooler duty report offsets */ +#define RYUJIN_PUMP_DUTY 4 +#define RYUJIN_INTERNAL_FAN_DUTY 5 + +/* Controller status (speeds) report offsets */ +#define RYUJIN_CONTROLLER_SPEED_1 5 +#define RYUJIN_CONTROLLER_SPEED_2 7 +#define RYUJIN_CONTROLLER_SPEED_3 9 +#define RYUJIN_CONTROLLER_SPEED_4 3 + +/* Controller duty report offsets */ +#define RYUJIN_CONTROLLER_DUTY 4 + +/* Control commands and their inner offsets */ +#define RYUJIN_CMD_PREFIX 0xEC + +static const u8 get_cooler_status_cmd[] = { RYUJIN_CMD_PREFIX, 0x99 }; +static const u8 get_cooler_duty_cmd[] = { RYUJIN_CMD_PREFIX, 0x9A }; +static const u8 get_controller_speed_cmd[] = { RYUJIN_CMD_PREFIX, 0xA0 }; +static const u8 get_controller_duty_cmd[] = { RYUJIN_CMD_PREFIX, 0xA1 }; + +#define RYUJIN_SET_COOLER_PUMP_DUTY_OFFSET 3 +#define RYUJIN_SET_COOLER_FAN_DUTY_OFFSET 4 +static const u8 set_cooler_duty_cmd[] = { RYUJIN_CMD_PREFIX, 0x1A, 0x00, 0x00, 0x00 }; + +#define RYUJIN_SET_CONTROLLER_FAN_DUTY_OFFSET 4 +static const u8 set_controller_duty_cmd[] = { RYUJIN_CMD_PREFIX, 0x21, 0x00, 0x00, 0x00 }; + +/* Command lengths */ +#define GET_CMD_LENGTH 2 /* Same length for all get commands */ +#define SET_CMD_LENGTH 5 /* Same length for all set commands */ + +/* Command response headers */ +#define RYUJIN_GET_COOLER_STATUS_CMD_RESPONSE 0x19 +#define RYUJIN_GET_COOLER_DUTY_CMD_RESPONSE 0x1A +#define RYUJIN_GET_CONTROLLER_SPEED_CMD_RESPONSE 0x20 +#define RYUJIN_GET_CONTROLLER_DUTY_CMD_RESPONSE 0x21 + +static const char *const rog_ryujin_temp_label[] = { + "Coolant temp" +}; + +static const char *const rog_ryujin_speed_label[] = { + "Pump speed", + "Internal fan speed", + "Controller fan 1 speed", + "Controller fan 2 speed", + "Controller fan 3 speed", + "Controller fan 4 speed", +}; + +struct rog_ryujin_data { + struct hid_device *hdev; + struct device *hwmon_dev; + /* For locking access to buffer */ + struct mutex buffer_lock; + /* For queueing multiple readers */ + struct mutex status_report_request_mutex; + /* For reinitializing the completions below */ + spinlock_t status_report_request_lock; + struct completion cooler_status_received; + struct completion controller_status_received; + struct completion cooler_duty_received; + struct completion controller_duty_received; + struct completion cooler_duty_set; + struct completion controller_duty_set; + + /* Sensor data */ + s32 temp_input[1]; + u16 speed_input[6]; /* Pump, internal fan and four controller fan speeds in RPM */ + u8 duty_input[3]; /* Pump, internal fan and controller fan duty in PWM */ + + u8 *buffer; + unsigned long updated; /* jiffies */ +}; + +static int rog_ryujin_percent_to_pwm(u16 val) +{ + return DIV_ROUND_CLOSEST(val * 255, 100); +} + +static int rog_ryujin_pwm_to_percent(long val) +{ + return DIV_ROUND_CLOSEST(val * 100, 255); +} + +static umode_t rog_ryujin_is_visible(const void *data, + enum hwmon_sensor_types type, u32 attr, int channel) +{ + switch (type) { + case hwmon_temp: + switch (attr) { + case hwmon_temp_label: + case hwmon_temp_input: + return 0444; + default: 
+ break; + } + break; + case hwmon_fan: + switch (attr) { + case hwmon_fan_label: + case hwmon_fan_input: + return 0444; + default: + break; + } + break; + case hwmon_pwm: + switch (attr) { + case hwmon_pwm_input: + return 0644; + default: + break; + } + break; + default: + break; + } + + return 0; +} + +/* Writes the command to the device with the rest of the report filled with zeroes */ +static int rog_ryujin_write_expanded(struct rog_ryujin_data *priv, const u8 *cmd, int cmd_length) +{ + int ret; + + mutex_lock(&priv->buffer_lock); + + memcpy_and_pad(priv->buffer, MAX_REPORT_LENGTH, cmd, cmd_length, 0x00); + ret = hid_hw_output_report(priv->hdev, priv->buffer, MAX_REPORT_LENGTH); + + mutex_unlock(&priv->buffer_lock); + return ret; +} + +/* Assumes priv->status_report_request_mutex is locked */ +static int rog_ryujin_execute_cmd(struct rog_ryujin_data *priv, const u8 *cmd, int cmd_length, + struct completion *status_completion) +{ + int ret; + + /* + * Disable raw event parsing for a moment to safely reinitialize the + * completion. Reinit is done because hidraw could have triggered + * the raw event parsing and marked the passed in completion as done. + */ + spin_lock_bh(&priv->status_report_request_lock); + reinit_completion(status_completion); + spin_unlock_bh(&priv->status_report_request_lock); + + /* Send command for getting data */ + ret = rog_ryujin_write_expanded(priv, cmd, cmd_length); + if (ret < 0) + return ret; + + ret = wait_for_completion_interruptible_timeout(status_completion, + msecs_to_jiffies(STATUS_VALIDITY)); + if (ret == 0) + return -ETIMEDOUT; + else if (ret < 0) + return ret; + + return 0; +} + +static int rog_ryujin_get_status(struct rog_ryujin_data *priv) +{ + int ret = mutex_lock_interruptible(&priv->status_report_request_mutex); + + if (ret < 0) + return ret; + + if (!time_after(jiffies, priv->updated + msecs_to_jiffies(STATUS_VALIDITY))) { + /* Data is up to date */ + goto unlock_and_return; + } + + /* Retrieve cooler status */ + ret = + rog_ryujin_execute_cmd(priv, get_cooler_status_cmd, GET_CMD_LENGTH, + &priv->cooler_status_received); + if (ret < 0) + goto unlock_and_return; + + /* Retrieve controller status (speeds) */ + ret = + rog_ryujin_execute_cmd(priv, get_controller_speed_cmd, GET_CMD_LENGTH, + &priv->controller_status_received); + if (ret < 0) + goto unlock_and_return; + + /* Retrieve cooler duty */ + ret = + rog_ryujin_execute_cmd(priv, get_cooler_duty_cmd, GET_CMD_LENGTH, + &priv->cooler_duty_received); + if (ret < 0) + goto unlock_and_return; + + /* Retrieve controller duty */ + ret = + rog_ryujin_execute_cmd(priv, get_controller_duty_cmd, GET_CMD_LENGTH, + &priv->controller_duty_received); + if (ret < 0) + goto unlock_and_return; + + priv->updated = jiffies; + +unlock_and_return: + mutex_unlock(&priv->status_report_request_mutex); + if (ret < 0) + return ret; + + return 0; +} + +static int rog_ryujin_read(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, long *val) +{ + struct rog_ryujin_data *priv = dev_get_drvdata(dev); + int ret = rog_ryujin_get_status(priv); + + if (ret < 0) + return ret; + + switch (type) { + case hwmon_temp: + *val = priv->temp_input[channel]; + break; + case hwmon_fan: + *val = priv->speed_input[channel]; + break; + case hwmon_pwm: + switch (attr) { + case hwmon_pwm_input: + *val = priv->duty_input[channel]; + break; + default: + return -EOPNOTSUPP; + } + break; + default: + return -EOPNOTSUPP; /* unreachable */ + } + + return 0; +} + +static int rog_ryujin_read_string(struct device *dev, enum 
hwmon_sensor_types type, + u32 attr, int channel, const char **str) +{ + switch (type) { + case hwmon_temp: + *str = rog_ryujin_temp_label[channel]; + break; + case hwmon_fan: + *str = rog_ryujin_speed_label[channel]; + break; + default: + return -EOPNOTSUPP; /* unreachable */ + } + + return 0; +} + +static int rog_ryujin_write_fixed_duty(struct rog_ryujin_data *priv, int channel, int val) +{ + u8 set_cmd[SET_CMD_LENGTH]; + int ret; + + if (channel < 2) { + /* + * Retrieve cooler duty since both pump and internal fan are set + * together, then write back with one of them modified. + */ + ret = mutex_lock_interruptible(&priv->status_report_request_mutex); + if (ret < 0) + return ret; + ret = + rog_ryujin_execute_cmd(priv, get_cooler_duty_cmd, GET_CMD_LENGTH, + &priv->cooler_duty_received); + if (ret < 0) + goto unlock_and_return; + + memcpy(set_cmd, set_cooler_duty_cmd, SET_CMD_LENGTH); + + /* Cooler duties are set as 0-100% */ + val = rog_ryujin_pwm_to_percent(val); + + if (channel == 0) { + /* Cooler pump duty */ + set_cmd[RYUJIN_SET_COOLER_PUMP_DUTY_OFFSET] = val; + set_cmd[RYUJIN_SET_COOLER_FAN_DUTY_OFFSET] = + rog_ryujin_pwm_to_percent(priv->duty_input[1]); + } else if (channel == 1) { + /* Cooler internal fan duty */ + set_cmd[RYUJIN_SET_COOLER_PUMP_DUTY_OFFSET] = + rog_ryujin_pwm_to_percent(priv->duty_input[0]); + set_cmd[RYUJIN_SET_COOLER_FAN_DUTY_OFFSET] = val; + } + + ret = rog_ryujin_execute_cmd(priv, set_cmd, SET_CMD_LENGTH, &priv->cooler_duty_set); +unlock_and_return: + mutex_unlock(&priv->status_report_request_mutex); + if (ret < 0) + return ret; + } else { + /* + * Controller fan duty (channel == 2). No need to retrieve current + * duty, so just send the command. + */ + memcpy(set_cmd, set_controller_duty_cmd, SET_CMD_LENGTH); + set_cmd[RYUJIN_SET_CONTROLLER_FAN_DUTY_OFFSET] = val; + + ret = + rog_ryujin_execute_cmd(priv, set_cmd, SET_CMD_LENGTH, + &priv->controller_duty_set); + if (ret < 0) + return ret; + } + + /* Lock onto this value until next refresh cycle */ + priv->duty_input[channel] = val; + + return 0; +} + +static int rog_ryujin_write(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, + long val) +{ + struct rog_ryujin_data *priv = dev_get_drvdata(dev); + int ret; + + switch (type) { + case hwmon_pwm: + switch (attr) { + case hwmon_pwm_input: + if (val < 0 || val > 255) + return -EINVAL; + + ret = rog_ryujin_write_fixed_duty(priv, channel, val); + if (ret < 0) + return ret; + break; + default: + return -EOPNOTSUPP; + } + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static const struct hwmon_ops rog_ryujin_hwmon_ops = { + .is_visible = rog_ryujin_is_visible, + .read = rog_ryujin_read, + .read_string = rog_ryujin_read_string, + .write = rog_ryujin_write +}; + +static const struct hwmon_channel_info *rog_ryujin_info[] = { + HWMON_CHANNEL_INFO(temp, + HWMON_T_INPUT | HWMON_T_LABEL), + HWMON_CHANNEL_INFO(fan, + HWMON_F_INPUT | HWMON_F_LABEL, + HWMON_F_INPUT | HWMON_F_LABEL, + HWMON_F_INPUT | HWMON_F_LABEL, + HWMON_F_INPUT | HWMON_F_LABEL, + HWMON_F_INPUT | HWMON_F_LABEL, + HWMON_F_INPUT | HWMON_F_LABEL), + HWMON_CHANNEL_INFO(pwm, + HWMON_PWM_INPUT, + HWMON_PWM_INPUT, + HWMON_PWM_INPUT), + NULL +}; + +static const struct hwmon_chip_info rog_ryujin_chip_info = { + .ops = &rog_ryujin_hwmon_ops, + .info = rog_ryujin_info, +}; + +static int rog_ryujin_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *data, + int size) +{ + struct rog_ryujin_data *priv = hid_get_drvdata(hdev); + + if (data[0] != RYUJIN_CMD_PREFIX) + 
return 0; + + if (data[1] == RYUJIN_GET_COOLER_STATUS_CMD_RESPONSE) { + /* Received coolant temp and speeds of pump and internal fan */ + priv->temp_input[0] = + data[RYUJIN_TEMP_SENSOR_1] * 1000 + data[RYUJIN_TEMP_SENSOR_2] * 100; + priv->speed_input[0] = get_unaligned_le16(data + RYUJIN_PUMP_SPEED); + priv->speed_input[1] = get_unaligned_le16(data + RYUJIN_INTERNAL_FAN_SPEED); + + if (!completion_done(&priv->cooler_status_received)) + complete_all(&priv->cooler_status_received); + } else if (data[1] == RYUJIN_GET_CONTROLLER_SPEED_CMD_RESPONSE) { + /* Received speeds of four fans attached to the controller */ + priv->speed_input[2] = get_unaligned_le16(data + RYUJIN_CONTROLLER_SPEED_1); + priv->speed_input[3] = get_unaligned_le16(data + RYUJIN_CONTROLLER_SPEED_2); + priv->speed_input[4] = get_unaligned_le16(data + RYUJIN_CONTROLLER_SPEED_3); + priv->speed_input[5] = get_unaligned_le16(data + RYUJIN_CONTROLLER_SPEED_4); + + if (!completion_done(&priv->controller_status_received)) + complete_all(&priv->controller_status_received); + } else if (data[1] == RYUJIN_GET_COOLER_DUTY_CMD_RESPONSE) { + /* Received report for pump and internal fan duties (in %) */ + if (data[RYUJIN_PUMP_DUTY] == 0 && data[RYUJIN_INTERNAL_FAN_DUTY] == 0) { + /* + * We received a report with zeroes for duty in both places. + * The device returns this as a confirmation that setting values + * is successful. If we initiated a write, mark it as complete. + */ + if (!completion_done(&priv->cooler_duty_set)) + complete_all(&priv->cooler_duty_set); + else if (!completion_done(&priv->cooler_duty_received)) + /* + * We didn't initiate a write, but received both zeroes. + * This means that either both duties are actually zero, + * or that we received a success report caused by userspace. + * We're expecting a report, so parse it. + */ + goto read_cooler_duty; + return 0; + } +read_cooler_duty: + priv->duty_input[0] = rog_ryujin_percent_to_pwm(data[RYUJIN_PUMP_DUTY]); + priv->duty_input[1] = rog_ryujin_percent_to_pwm(data[RYUJIN_INTERNAL_FAN_DUTY]); + + if (!completion_done(&priv->cooler_duty_received)) + complete_all(&priv->cooler_duty_received); + } else if (data[1] == RYUJIN_GET_CONTROLLER_DUTY_CMD_RESPONSE) { + /* Received report for controller duty for fans (in PWM) */ + if (data[RYUJIN_CONTROLLER_DUTY] == 0) { + /* + * We received a report with a zero for duty. The device returns this as + * a confirmation that setting the controller duty value was successful. + * If we initiated a write, mark it as complete. + */ + if (!completion_done(&priv->controller_duty_set)) + complete_all(&priv->controller_duty_set); + else if (!completion_done(&priv->controller_duty_received)) + /* + * We didn't initiate a write, but received a zero for duty. + * This means that either the duty is actually zero, or that + * we received a success report caused by userspace. + * We're expecting a report, so parse it. 
+ */ + goto read_controller_duty; + return 0; + } +read_controller_duty: + priv->duty_input[2] = data[RYUJIN_CONTROLLER_DUTY]; + + if (!completion_done(&priv->controller_duty_received)) + complete_all(&priv->controller_duty_received); + } + + return 0; +} + +static int rog_ryujin_probe(struct hid_device *hdev, const struct hid_device_id *id) +{ + struct rog_ryujin_data *priv; + int ret; + + priv = devm_kzalloc(&hdev->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->hdev = hdev; + hid_set_drvdata(hdev, priv); + + /* + * Initialize priv->updated to STATUS_VALIDITY seconds in the past, making + * the initial empty data invalid for rog_ryujin_read() without the need for + * a special case there. + */ + priv->updated = jiffies - msecs_to_jiffies(STATUS_VALIDITY); + + ret = hid_parse(hdev); + if (ret) { + hid_err(hdev, "hid parse failed with %d\n", ret); + return ret; + } + + /* Enable hidraw so existing user-space tools can continue to work */ + ret = hid_hw_start(hdev, HID_CONNECT_HIDRAW); + if (ret) { + hid_err(hdev, "hid hw start failed with %d\n", ret); + return ret; + } + + ret = hid_hw_open(hdev); + if (ret) { + hid_err(hdev, "hid hw open failed with %d\n", ret); + goto fail_and_stop; + } + + priv->buffer = devm_kzalloc(&hdev->dev, MAX_REPORT_LENGTH, GFP_KERNEL); + if (!priv->buffer) { + ret = -ENOMEM; + goto fail_and_close; + } + + mutex_init(&priv->status_report_request_mutex); + mutex_init(&priv->buffer_lock); + spin_lock_init(&priv->status_report_request_lock); + init_completion(&priv->cooler_status_received); + init_completion(&priv->controller_status_received); + init_completion(&priv->cooler_duty_received); + init_completion(&priv->controller_duty_received); + init_completion(&priv->cooler_duty_set); + init_completion(&priv->controller_duty_set); + + priv->hwmon_dev = hwmon_device_register_with_info(&hdev->dev, "rog_ryujin", + priv, &rog_ryujin_chip_info, NULL); + if (IS_ERR(priv->hwmon_dev)) { + ret = PTR_ERR(priv->hwmon_dev); + hid_err(hdev, "hwmon registration failed with %d\n", ret); + goto fail_and_close; + } + + return 0; + +fail_and_close: + hid_hw_close(hdev); +fail_and_stop: + hid_hw_stop(hdev); + return ret; +} + +static void rog_ryujin_remove(struct hid_device *hdev) +{ + struct rog_ryujin_data *priv = hid_get_drvdata(hdev); + + hwmon_device_unregister(priv->hwmon_dev); + + hid_hw_close(hdev); + hid_hw_stop(hdev); +} + +static const struct hid_device_id rog_ryujin_table[] = { + { HID_USB_DEVICE(USB_VENDOR_ID_ASUS_ROG, USB_PRODUCT_ID_RYUJIN_AIO) }, + { } +}; + +MODULE_DEVICE_TABLE(hid, rog_ryujin_table); + +static struct hid_driver rog_ryujin_driver = { + .name = "rog_ryujin", + .id_table = rog_ryujin_table, + .probe = rog_ryujin_probe, + .remove = rog_ryujin_remove, + .raw_event = rog_ryujin_raw_event, +}; + +static int __init rog_ryujin_init(void) +{ + return hid_register_driver(&rog_ryujin_driver); +} + +static void __exit rog_ryujin_exit(void) +{ + hid_unregister_driver(&rog_ryujin_driver); +} + +/* When compiled into the kernel, initialize after the HID bus */ +late_initcall(rog_ryujin_init); +module_exit(rog_ryujin_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Aleksa Savic "); +MODULE_DESCRIPTION("Hwmon driver for Asus ROG Ryujin II 360 AIO cooler"); From d534f90d68bdab606c3545afb5d558594a50f32e Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 2 Feb 2024 17:21:37 +0800 Subject: [PATCH 0329/1406] hwmon: (coretemp) Introduce enum for attr index Introduce enum coretemp_attr_index to better describe the index of each sensor attribute. 
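The trick of deriving the attribute counts from the last enumerators keeps the limits in step with the list automatically. A minimal standalone C sketch of the same idiom, with names invented for the example:

#include <stdio.h>

enum attr_index {
	ATTR_LABEL,
	ATTR_CRIT_ALARM,
	ATTR_TEMP,
	ATTR_TJMAX,
	ATTR_TTARGET,
	MAX_BASIC_ATTRS = ATTR_TJMAX + 1,	/* always-present attrs */
	TOTAL_ATTRS = ATTR_TTARGET + 1		/* including optional ttarget */
};

int main(void)
{
	/* Adding an enumerator before ATTR_TJMAX updates both counts. */
	printf("basic=%d total=%d\n", MAX_BASIC_ATTRS, TOTAL_ATTRS);
	return 0;
}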
No functional change. Signed-off-by: Zhang Rui Link: https://lore.kernel.org/r/20240202092144.71180-5-rui.zhang@intel.com Signed-off-by: Guenter Roeck --- drivers/hwmon/coretemp.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c index b8fc8d1ef20dfc..32f99cf6308b29 100644 --- a/drivers/hwmon/coretemp.c +++ b/drivers/hwmon/coretemp.c @@ -43,10 +43,18 @@ MODULE_PARM_DESC(tjmax, "TjMax value in degrees Celsius"); #define BASE_SYSFS_ATTR_NO 2 /* Sysfs Base attr no for coretemp */ #define NUM_REAL_CORES 512 /* Number of Real cores per cpu */ #define CORETEMP_NAME_LENGTH 28 /* String Length of attrs */ -#define MAX_CORE_ATTRS 4 /* Maximum no of basic attrs */ -#define TOTAL_ATTRS (MAX_CORE_ATTRS + 1) #define MAX_CORE_DATA (NUM_REAL_CORES + BASE_SYSFS_ATTR_NO) +enum coretemp_attr_index { + ATTR_LABEL, + ATTR_CRIT_ALARM, + ATTR_TEMP, + ATTR_TJMAX, + ATTR_TTARGET, + MAX_CORE_ATTRS = ATTR_TJMAX + 1, /* Maximum no of basic attrs */ + TOTAL_ATTRS = ATTR_TTARGET + 1 /* Maximum no of possible attrs */ +}; + #ifdef CONFIG_SMP #define for_each_sibling(i, cpu) \ for_each_cpu(i, topology_sibling_cpumask(cpu)) From 819ec33a3d80b0a57d0c73060eb68a3abfc95c89 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 2 Feb 2024 17:21:38 +0800 Subject: [PATCH 0330/1406] hwmon: (coretemp) Remove unnecessary dependency of array index When sensor_device_attribute pointer is available, use container_of() to get the temp_data address. This removes the unnecessary dependency of cached index in pdata->core_data[]. No functional change. Signed-off-by: Zhang Rui Link: https://lore.kernel.org/r/20240202092144.71180-6-rui.zhang@intel.com Signed-off-by: Guenter Roeck --- drivers/hwmon/coretemp.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c index 32f99cf6308b29..9a7bfc046c7254 100644 --- a/drivers/hwmon/coretemp.c +++ b/drivers/hwmon/coretemp.c @@ -342,7 +342,7 @@ static ssize_t show_label(struct device *dev, { struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); struct platform_data *pdata = dev_get_drvdata(dev); - struct temp_data *tdata = pdata->core_data[attr->index]; + struct temp_data *tdata = container_of(attr, struct temp_data, sd_attrs[ATTR_LABEL]); if (tdata->is_pkg_data) return sprintf(buf, "Package id %u\n", pdata->pkg_id); @@ -355,8 +355,7 @@ static ssize_t show_crit_alarm(struct device *dev, { u32 eax, edx; struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); - struct platform_data *pdata = dev_get_drvdata(dev); - struct temp_data *tdata = pdata->core_data[attr->index]; + struct temp_data *tdata = container_of(attr, struct temp_data, sd_attrs[ATTR_CRIT_ALARM]); mutex_lock(&tdata->update_lock); rdmsr_on_cpu(tdata->cpu, tdata->status_reg, &eax, &edx); @@ -369,8 +368,7 @@ static ssize_t show_tjmax(struct device *dev, struct device_attribute *devattr, char *buf) { struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); - struct platform_data *pdata = dev_get_drvdata(dev); - struct temp_data *tdata = pdata->core_data[attr->index]; + struct temp_data *tdata = container_of(attr, struct temp_data, sd_attrs[ATTR_TJMAX]); int tjmax; mutex_lock(&tdata->update_lock); @@ -384,8 +382,7 @@ static ssize_t show_ttarget(struct device *dev, struct device_attribute *devattr, char *buf) { struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); - struct platform_data *pdata = dev_get_drvdata(dev); - struct 
temp_data *tdata = pdata->core_data[attr->index]; + struct temp_data *tdata = container_of(attr, struct temp_data, sd_attrs[ATTR_TTARGET]); int ttarget; mutex_lock(&tdata->update_lock); @@ -402,8 +399,7 @@ static ssize_t show_temp(struct device *dev, { u32 eax, edx; struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); - struct platform_data *pdata = dev_get_drvdata(dev); - struct temp_data *tdata = pdata->core_data[attr->index]; + struct temp_data *tdata = container_of(attr, struct temp_data, sd_attrs[ATTR_TEMP]); int tjmax; mutex_lock(&tdata->update_lock); @@ -426,8 +422,7 @@ static ssize_t show_temp(struct device *dev, return sprintf(buf, "%d\n", tdata->temp); } -static int create_core_attrs(struct temp_data *tdata, struct device *dev, - int index) +static int create_core_attrs(struct temp_data *tdata, struct device *dev) { int i; static ssize_t (*const rd_ptr[TOTAL_ATTRS]) (struct device *dev, @@ -452,7 +447,6 @@ static int create_core_attrs(struct temp_data *tdata, struct device *dev, tdata->sd_attrs[i].dev_attr.attr.name = tdata->attr_name[i]; tdata->sd_attrs[i].dev_attr.attr.mode = 0444; tdata->sd_attrs[i].dev_attr.show = rd_ptr[i]; - tdata->sd_attrs[i].index = index; tdata->attrs[i] = &tdata->sd_attrs[i].dev_attr.attr; } tdata->attr_group.attrs = tdata->attrs; @@ -557,7 +551,7 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu, pdata->core_data[index] = tdata; /* Create sysfs interfaces */ - err = create_core_attrs(tdata, pdata->hwmon_dev, index); + err = create_core_attrs(tdata, pdata->hwmon_dev); if (err) goto exit_free; From 2179f654c6479c8298d0cafb7ae83b2910a95770 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 2 Feb 2024 17:21:39 +0800 Subject: [PATCH 0331/1406] hwmon: (coretemp) Replace sensor_device_attribute with device_attribute Replace sensor_device_attribute with device_attribute because sensor_device_attribute->index is no longer used. No functional change. 
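The container_of() pattern relied on here recovers the outer structure from a pointer to one of its embedded members, so no side-table index is needed. A minimal userspace sketch, with offsetof() standing in for the kernel macro and all names invented:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct dev_attr { const char *name; };

struct temp_data {
	int temp;
	struct dev_attr sd_attrs[5];
};

int main(void)
{
	struct temp_data t = { .temp = 42 };
	struct dev_attr *a = &t.sd_attrs[2];	/* callback receives this */

	/* Walk back from the embedded member to the containing object. */
	struct temp_data *td = container_of(a, struct temp_data, sd_attrs[2]);
	printf("temp=%d\n", td->temp);		/* prints temp=42 */
	return 0;
}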
Signed-off-by: Zhang Rui Link: https://lore.kernel.org/r/20240202092144.71180-7-rui.zhang@intel.com Signed-off-by: Guenter Roeck --- drivers/hwmon/coretemp.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c index 9a7bfc046c7254..cdd1e069d5c1d9 100644 --- a/drivers/hwmon/coretemp.c +++ b/drivers/hwmon/coretemp.c @@ -85,7 +85,7 @@ struct temp_data { u32 status_reg; int attr_size; bool is_pkg_data; - struct sensor_device_attribute sd_attrs[TOTAL_ATTRS]; + struct device_attribute sd_attrs[TOTAL_ATTRS]; char attr_name[TOTAL_ATTRS][CORETEMP_NAME_LENGTH]; struct attribute *attrs[TOTAL_ATTRS + 1]; struct attribute_group attr_group; @@ -340,9 +340,8 @@ static struct platform_device **zone_devices; static ssize_t show_label(struct device *dev, struct device_attribute *devattr, char *buf) { - struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); struct platform_data *pdata = dev_get_drvdata(dev); - struct temp_data *tdata = container_of(attr, struct temp_data, sd_attrs[ATTR_LABEL]); + struct temp_data *tdata = container_of(devattr, struct temp_data, sd_attrs[ATTR_LABEL]); if (tdata->is_pkg_data) return sprintf(buf, "Package id %u\n", pdata->pkg_id); @@ -354,8 +353,8 @@ static ssize_t show_crit_alarm(struct device *dev, struct device_attribute *devattr, char *buf) { u32 eax, edx; - struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); - struct temp_data *tdata = container_of(attr, struct temp_data, sd_attrs[ATTR_CRIT_ALARM]); + struct temp_data *tdata = container_of(devattr, struct temp_data, + sd_attrs[ATTR_CRIT_ALARM]); mutex_lock(&tdata->update_lock); rdmsr_on_cpu(tdata->cpu, tdata->status_reg, &eax, &edx); @@ -367,8 +366,7 @@ static ssize_t show_crit_alarm(struct device *dev, static ssize_t show_tjmax(struct device *dev, struct device_attribute *devattr, char *buf) { - struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); - struct temp_data *tdata = container_of(attr, struct temp_data, sd_attrs[ATTR_TJMAX]); + struct temp_data *tdata = container_of(devattr, struct temp_data, sd_attrs[ATTR_TJMAX]); int tjmax; mutex_lock(&tdata->update_lock); @@ -381,8 +379,7 @@ static ssize_t show_tjmax(struct device *dev, static ssize_t show_ttarget(struct device *dev, struct device_attribute *devattr, char *buf) { - struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); - struct temp_data *tdata = container_of(attr, struct temp_data, sd_attrs[ATTR_TTARGET]); + struct temp_data *tdata = container_of(devattr, struct temp_data, sd_attrs[ATTR_TTARGET]); int ttarget; mutex_lock(&tdata->update_lock); @@ -398,8 +395,7 @@ static ssize_t show_temp(struct device *dev, struct device_attribute *devattr, char *buf) { u32 eax, edx; - struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); - struct temp_data *tdata = container_of(attr, struct temp_data, sd_attrs[ATTR_TEMP]); + struct temp_data *tdata = container_of(devattr, struct temp_data, sd_attrs[ATTR_TEMP]); int tjmax; mutex_lock(&tdata->update_lock); @@ -443,11 +439,11 @@ static int create_core_attrs(struct temp_data *tdata, struct device *dev) snprintf(tdata->attr_name[i], CORETEMP_NAME_LENGTH, "temp%d_%s", attr_no, suffixes[i]); - sysfs_attr_init(&tdata->sd_attrs[i].dev_attr.attr); - tdata->sd_attrs[i].dev_attr.attr.name = tdata->attr_name[i]; - tdata->sd_attrs[i].dev_attr.attr.mode = 0444; - tdata->sd_attrs[i].dev_attr.show = rd_ptr[i]; - tdata->attrs[i] = &tdata->sd_attrs[i].dev_attr.attr; + 
sysfs_attr_init(&tdata->sd_attrs[i].attr); + tdata->sd_attrs[i].attr.name = tdata->attr_name[i]; + tdata->sd_attrs[i].attr.mode = 0444; + tdata->sd_attrs[i].show = rd_ptr[i]; + tdata->attrs[i] = &tdata->sd_attrs[i].attr; } tdata->attr_group.attrs = tdata->attrs; return sysfs_create_group(&dev->kobj, &tdata->attr_group); From b54d847d36d80d4c594b92d8ec35a991024b684f Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 2 Feb 2024 17:21:40 +0800 Subject: [PATCH 0332/1406] hwmon: (coretemp) Remove redundant pdata->cpu_map[] pdata->cpu_map[] saves the mapping between cpu core id and the index in pdata->core_data[]. This is used to find the temp_data structure using cpu_core_id, by traversing the pdata->cpu_map[] array. But the same goal can be achieved by traversing the pdata->core_temp[] array directly. Remove redundant pdata->cpu_map[]. No functional change. Signed-off-by: Zhang Rui Link: https://lore.kernel.org/r/20240202092144.71180-8-rui.zhang@intel.com Signed-off-by: Guenter Roeck --- drivers/hwmon/coretemp.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c index cdd1e069d5c1d9..29ee8e0c0fe92e 100644 --- a/drivers/hwmon/coretemp.c +++ b/drivers/hwmon/coretemp.c @@ -96,7 +96,6 @@ struct temp_data { struct platform_data { struct device *hwmon_dev; u16 pkg_id; - u16 cpu_map[NUM_REAL_CORES]; struct ida ida; struct cpumask cpumask; struct temp_data *core_data[MAX_CORE_DATA]; @@ -517,7 +516,6 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu, if (index < 0) return index; - pdata->cpu_map[index] = topology_core_id(cpu); index += BASE_SYSFS_ATTR_NO; } @@ -696,7 +694,7 @@ static int coretemp_cpu_offline(unsigned int cpu) struct platform_device *pdev = coretemp_get_pdev(cpu); struct platform_data *pd; struct temp_data *tdata; - int i, indx = -1, target; + int i, target; /* No need to tear down any interfaces for suspend */ if (cpuhp_tasks_frozen) @@ -707,18 +705,16 @@ static int coretemp_cpu_offline(unsigned int cpu) if (!pd->hwmon_dev) return 0; - for (i = 0; i < NUM_REAL_CORES; i++) { - if (pd->cpu_map[i] == topology_core_id(cpu)) { - indx = i + BASE_SYSFS_ATTR_NO; + for (i = BASE_SYSFS_ATTR_NO; i < MAX_CORE_DATA; i++) { + if (pd->core_data[i] && pd->core_data[i]->cpu_core_id == topology_core_id(cpu)) break; - } } /* Too many cores and this core is not populated, just return */ - if (indx < 0) + if (i == MAX_CORE_DATA) return 0; - tdata = pd->core_data[indx]; + tdata = pd->core_data[i]; cpumask_clear_cpu(cpu, &pd->cpumask); @@ -729,7 +725,7 @@ static int coretemp_cpu_offline(unsigned int cpu) */ target = cpumask_any_and(&pd->cpumask, topology_sibling_cpumask(cpu)); if (target >= nr_cpu_ids) { - coretemp_remove_core(pd, indx); + coretemp_remove_core(pd, i); } else if (tdata && tdata->cpu == cpu) { mutex_lock(&tdata->update_lock); tdata->cpu = target; From a749c25cd63fdf8f584bb37395de72aa5741eb4b Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 2 Feb 2024 17:21:41 +0800 Subject: [PATCH 0333/1406] hwmon: (coretemp) Abstract core_temp helpers coretemp driver has an obscure and fragile logic for handling package and core temperature data. Place the logic in newly introduced helpers for further optimizations. No functional change. 
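The lookup helper introduced by this patch replaces cached array indices with a scan keyed on the core id, and a negative cpu selects the package entry. A compact userspace sketch of that shape, with all names invented:

#include <stdio.h>

#define NR_SLOTS 8

struct temp_data { unsigned int core_id; int temp; };

static struct temp_data *get_temp_data(struct temp_data *pkg,
				       struct temp_data *cores[], int core_id)
{
	int i;

	if (core_id < 0)	/* negative id selects the package sensor */
		return pkg;
	for (i = 0; i < NR_SLOTS; i++)
		if (cores[i] && cores[i]->core_id == (unsigned int)core_id)
			return cores[i];
	return NULL;		/* core not populated */
}

int main(void)
{
	struct temp_data pkg = { 0, 50000 }, core3 = { 3, 61000 };
	struct temp_data *cores[NR_SLOTS] = { [5] = &core3 };

	printf("%d %d\n", get_temp_data(&pkg, cores, -1)->temp,
	       get_temp_data(&pkg, cores, 3)->temp);	/* 50000 61000 */
	return 0;
}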
Signed-off-by: Zhang Rui Link: https://lore.kernel.org/r/20240202092144.71180-9-rui.zhang@intel.com Signed-off-by: Guenter Roeck --- drivers/hwmon/coretemp.c | 118 +++++++++++++++++++++------------------ 1 file changed, 64 insertions(+), 54 deletions(-) diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c index 29ee8e0c0fe92e..a19799a302a2a9 100644 --- a/drivers/hwmon/coretemp.c +++ b/drivers/hwmon/coretemp.c @@ -81,6 +81,7 @@ struct temp_data { int tjmax; unsigned long last_updated; unsigned int cpu; + unsigned int index; u32 cpu_core_id; u32 status_reg; int attr_size; @@ -474,14 +475,36 @@ static struct platform_device *coretemp_get_pdev(unsigned int cpu) return NULL; } -static struct temp_data *init_temp_data(unsigned int cpu, int pkg_flag) +static struct temp_data * +init_temp_data(struct platform_data *pdata, unsigned int cpu, int pkg_flag) { struct temp_data *tdata; + int index; tdata = kzalloc(sizeof(struct temp_data), GFP_KERNEL); if (!tdata) return NULL; + /* + * Get the index of tdata in pdata->core_data[] + * tdata for package: pdata->core_data[1] + * tdata for core: pdata->core_data[2] .. pdata->core_data[NUM_REAL_CORES + 1] + */ + if (pkg_flag) { + index = PKG_SYSFS_ATTR_NO; + } else { + index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL); + if (index < 0) { + kfree(tdata); + return NULL; + } + index += BASE_SYSFS_ATTR_NO; + } + /* Index in pdata->core_data[] */ + tdata->index = index; + + pdata->core_data[index] = tdata; + tdata->status_reg = pkg_flag ? MSR_IA32_PACKAGE_THERM_STATUS : MSR_IA32_THERM_STATUS; tdata->is_pkg_data = pkg_flag; @@ -492,6 +515,30 @@ static struct temp_data *init_temp_data(unsigned int cpu, int pkg_flag) return tdata; } +static void destroy_temp_data(struct platform_data *pdata, struct temp_data *tdata) +{ + pdata->core_data[tdata->index] = NULL; + if (!tdata->is_pkg_data) + ida_free(&pdata->ida, tdata->index - BASE_SYSFS_ATTR_NO); + kfree(tdata); +} + +static struct temp_data *get_temp_data(struct platform_data *pdata, int cpu) +{ + int i; + + /* cpu < 0 means get pkg temp_data */ + if (cpu < 0) + return pdata->core_data[PKG_SYSFS_ATTR_NO]; + + for (i = BASE_SYSFS_ATTR_NO; i < MAX_CORE_DATA; i++) { + if (pdata->core_data[i] && + pdata->core_data[i]->cpu_core_id == topology_core_id(cpu)) + return pdata->core_data[i]; + } + return NULL; +} + static int create_core_data(struct platform_device *pdev, unsigned int cpu, int pkg_flag) { @@ -499,36 +546,19 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu, struct platform_data *pdata = platform_get_drvdata(pdev); struct cpuinfo_x86 *c = &cpu_data(cpu); u32 eax, edx; - int err, index; + int err; if (!housekeeping_cpu(cpu, HK_TYPE_MISC)) return 0; - /* - * Get the index of tdata in pdata->core_data[] - * tdata for package: pdata->core_data[1] - * tdata for core: pdata->core_data[2] .. 
pdata->core_data[NUM_REAL_CORES + 1] - */ - if (pkg_flag) { - index = PKG_SYSFS_ATTR_NO; - } else { - index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL); - if (index < 0) - return index; - - index += BASE_SYSFS_ATTR_NO; - } - - tdata = init_temp_data(cpu, pkg_flag); - if (!tdata) { - err = -ENOMEM; - goto ida_free; - } + tdata = init_temp_data(pdata, cpu, pkg_flag); + if (!tdata) + return -ENOMEM; /* Test if we can access the status register */ err = rdmsr_safe_on_cpu(cpu, tdata->status_reg, &eax, &edx); if (err) - goto exit_free; + goto err; /* Make sure tdata->tjmax is a valid indicator for dynamic/static tjmax */ get_tjmax(tdata, &pdev->dev); @@ -542,20 +572,15 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu, if (get_ttarget(tdata, &pdev->dev) >= 0) tdata->attr_size++; - pdata->core_data[index] = tdata; - /* Create sysfs interfaces */ err = create_core_attrs(tdata, pdata->hwmon_dev); if (err) - goto exit_free; + goto err; return 0; -exit_free: - pdata->core_data[index] = NULL; - kfree(tdata); -ida_free: - if (!pkg_flag) - ida_free(&pdata->ida, index - BASE_SYSFS_ATTR_NO); + +err: + destroy_temp_data(pdata, tdata); return err; } @@ -566,10 +591,8 @@ coretemp_add_core(struct platform_device *pdev, unsigned int cpu, int pkg_flag) dev_err(&pdev->dev, "Adding Core %u failed\n", cpu); } -static void coretemp_remove_core(struct platform_data *pdata, int indx) +static void coretemp_remove_core(struct platform_data *pdata, struct temp_data *tdata) { - struct temp_data *tdata = pdata->core_data[indx]; - /* if we errored on add then this is already gone */ if (!tdata) return; @@ -577,11 +600,7 @@ static void coretemp_remove_core(struct platform_data *pdata, int indx) /* Remove the sysfs attributes */ sysfs_remove_group(&pdata->hwmon_dev->kobj, &tdata->attr_group); - kfree(pdata->core_data[indx]); - pdata->core_data[indx] = NULL; - - if (indx >= BASE_SYSFS_ATTR_NO) - ida_free(&pdata->ida, indx - BASE_SYSFS_ATTR_NO); + destroy_temp_data(pdata, tdata); } static int coretemp_device_add(int zoneid) @@ -694,7 +713,7 @@ static int coretemp_cpu_offline(unsigned int cpu) struct platform_device *pdev = coretemp_get_pdev(cpu); struct platform_data *pd; struct temp_data *tdata; - int i, target; + int target; /* No need to tear down any interfaces for suspend */ if (cpuhp_tasks_frozen) @@ -705,16 +724,7 @@ static int coretemp_cpu_offline(unsigned int cpu) if (!pd->hwmon_dev) return 0; - for (i = BASE_SYSFS_ATTR_NO; i < MAX_CORE_DATA; i++) { - if (pd->core_data[i] && pd->core_data[i]->cpu_core_id == topology_core_id(cpu)) - break; - } - - /* Too many cores and this core is not populated, just return */ - if (i == MAX_CORE_DATA) - return 0; - - tdata = pd->core_data[i]; + tdata = get_temp_data(pd, cpu); cpumask_clear_cpu(cpu, &pd->cpumask); @@ -725,7 +735,7 @@ static int coretemp_cpu_offline(unsigned int cpu) */ target = cpumask_any_and(&pd->cpumask, topology_sibling_cpumask(cpu)); if (target >= nr_cpu_ids) { - coretemp_remove_core(pd, i); + coretemp_remove_core(pd, tdata); } else if (tdata && tdata->cpu == cpu) { mutex_lock(&tdata->update_lock); tdata->cpu = target; @@ -735,10 +745,10 @@ static int coretemp_cpu_offline(unsigned int cpu) /* * If all cores in this pkg are offline, remove the interface. 
*/ - tdata = pd->core_data[PKG_SYSFS_ATTR_NO]; + tdata = get_temp_data(pd, -1); if (cpumask_empty(&pd->cpumask)) { if (tdata) - coretemp_remove_core(pd, PKG_SYSFS_ATTR_NO); + coretemp_remove_core(pd, tdata); hwmon_device_unregister(pd->hwmon_dev); pd->hwmon_dev = NULL; return 0; From c24bc938beb01a9f744e979cb7d92657a0aa9718 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 2 Feb 2024 17:21:42 +0800 Subject: [PATCH 0334/1406] hwmon: (coretemp) Split package temp_data and core temp_data Saving package temp_data and core temp_data in one array with different offsets is fragile. Split them and clean up crabbed maths and macros. This also fixes a problem that pdata->core_data[0] was never used. Signed-off-by: Zhang Rui Link: https://lore.kernel.org/r/20240202092144.71180-10-rui.zhang@intel.com Signed-off-by: Guenter Roeck --- drivers/hwmon/coretemp.c | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c index a19799a302a2a9..1a3b5ae0bacaff 100644 --- a/drivers/hwmon/coretemp.c +++ b/drivers/hwmon/coretemp.c @@ -39,11 +39,8 @@ static int force_tjmax; module_param_named(tjmax, force_tjmax, int, 0444); MODULE_PARM_DESC(tjmax, "TjMax value in degrees Celsius"); -#define PKG_SYSFS_ATTR_NO 1 /* Sysfs attribute for package temp */ -#define BASE_SYSFS_ATTR_NO 2 /* Sysfs Base attr no for coretemp */ #define NUM_REAL_CORES 512 /* Number of Real cores per cpu */ #define CORETEMP_NAME_LENGTH 28 /* String Length of attrs */ -#define MAX_CORE_DATA (NUM_REAL_CORES + BASE_SYSFS_ATTR_NO) enum coretemp_attr_index { ATTR_LABEL, @@ -99,7 +96,8 @@ struct platform_data { u16 pkg_id; struct ida ida; struct cpumask cpumask; - struct temp_data *core_data[MAX_CORE_DATA]; + struct temp_data *pkg_data; + struct temp_data *core_data[NUM_REAL_CORES]; struct device_attribute name_attr; }; @@ -479,31 +477,21 @@ static struct temp_data * init_temp_data(struct platform_data *pdata, unsigned int cpu, int pkg_flag) { struct temp_data *tdata; - int index; tdata = kzalloc(sizeof(struct temp_data), GFP_KERNEL); if (!tdata) return NULL; - /* - * Get the index of tdata in pdata->core_data[] - * tdata for package: pdata->core_data[1] - * tdata for core: pdata->core_data[2] .. pdata->core_data[NUM_REAL_CORES + 1] - */ if (pkg_flag) { - index = PKG_SYSFS_ATTR_NO; + pdata->pkg_data = tdata; } else { - index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL); - if (index < 0) { + tdata->index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL); + if (tdata->index < 0) { kfree(tdata); return NULL; } - index += BASE_SYSFS_ATTR_NO; + pdata->core_data[tdata->index] = tdata; } - /* Index in pdata->core_data[] */ - tdata->index = index; - - pdata->core_data[index] = tdata; tdata->status_reg = pkg_flag ? 
MSR_IA32_PACKAGE_THERM_STATUS : MSR_IA32_THERM_STATUS; @@ -517,9 +505,12 @@ init_temp_data(struct platform_data *pdata, unsigned int cpu, int pkg_flag) static void destroy_temp_data(struct platform_data *pdata, struct temp_data *tdata) { - pdata->core_data[tdata->index] = NULL; - if (!tdata->is_pkg_data) - ida_free(&pdata->ida, tdata->index - BASE_SYSFS_ATTR_NO); + if (tdata->is_pkg_data) { + pdata->pkg_data = NULL; + } else { + pdata->core_data[tdata->index] = NULL; + ida_free(&pdata->ida, tdata->index); + } kfree(tdata); } @@ -529,9 +520,9 @@ static struct temp_data *get_temp_data(struct platform_data *pdata, int cpu) /* cpu < 0 means get pkg temp_data */ if (cpu < 0) - return pdata->core_data[PKG_SYSFS_ATTR_NO]; + return pdata->pkg_data; - for (i = BASE_SYSFS_ATTR_NO; i < MAX_CORE_DATA; i++) { + for (i = 0; i < NUM_REAL_CORES; i++) { if (pdata->core_data[i] && pdata->core_data[i]->cpu_core_id == topology_core_id(cpu)) return pdata->core_data[i]; From 1a9ed43879bdb54c65d932cfd6514753c097ca70 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 2 Feb 2024 17:21:43 +0800 Subject: [PATCH 0335/1406] hwmon: (coretemp) Remove redundant temp_data->is_pkg_data temp_data->index saves the index in pdata->core_data[]. It is not used by package temp_data. Use temp_data->index as the indicator of package temp_data and remove redundant temp_data->is_pkg_data. No functional change. Signed-off-by: Zhang Rui Link: https://lore.kernel.org/r/20240202092144.71180-11-rui.zhang@intel.com Signed-off-by: Guenter Roeck --- drivers/hwmon/coretemp.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c index 1a3b5ae0bacaff..e548f2145449b8 100644 --- a/drivers/hwmon/coretemp.c +++ b/drivers/hwmon/coretemp.c @@ -70,19 +70,16 @@ enum coretemp_attr_index { * @status_reg: One of IA32_THERM_STATUS or IA32_PACKAGE_THERM_STATUS, * from where the temperature values should be read. * @attr_size: Total number of pre-core attrs displayed in the sysfs. - * @is_pkg_data: If this is 1, the temp_data holds pkgtemp data. - * Otherwise, temp_data holds coretemp data. */ struct temp_data { int temp; int tjmax; unsigned long last_updated; unsigned int cpu; - unsigned int index; + int index; u32 cpu_core_id; u32 status_reg; int attr_size; - bool is_pkg_data; struct device_attribute sd_attrs[TOTAL_ATTRS]; char attr_name[TOTAL_ATTRS][CORETEMP_NAME_LENGTH]; struct attribute *attrs[TOTAL_ATTRS + 1]; @@ -149,6 +146,11 @@ static const struct tjmax_model tjmax_model_table[] = { */ }; +static bool is_pkg_temp_data(struct temp_data *tdata) +{ + return tdata->index < 0; +} + static int adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *dev) { /* The 100C is default for both mobile and non mobile CPUs */ @@ -341,7 +343,7 @@ static ssize_t show_label(struct device *dev, struct platform_data *pdata = dev_get_drvdata(dev); struct temp_data *tdata = container_of(devattr, struct temp_data, sd_attrs[ATTR_LABEL]); - if (tdata->is_pkg_data) + if (is_pkg_temp_data(tdata)) return sprintf(buf, "Package id %u\n", pdata->pkg_id); return sprintf(buf, "Core %u\n", tdata->cpu_core_id); @@ -433,7 +435,7 @@ static int create_core_attrs(struct temp_data *tdata, struct device *dev) * The attr number is always core id + 2 * The Pkgtemp will always show up as temp1_*, if available */ - int attr_no = tdata->is_pkg_data ? 1 : tdata->cpu_core_id + 2; + int attr_no = is_pkg_temp_data(tdata) ? 
1 : tdata->cpu_core_id + 2; snprintf(tdata->attr_name[i], CORETEMP_NAME_LENGTH, "temp%d_%s", attr_no, suffixes[i]); @@ -484,6 +486,8 @@ init_temp_data(struct platform_data *pdata, unsigned int cpu, int pkg_flag) if (pkg_flag) { pdata->pkg_data = tdata; + /* Use tdata->index as indicator of package temp data */ + tdata->index = -1; } else { tdata->index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL); if (tdata->index < 0) { @@ -495,7 +499,6 @@ init_temp_data(struct platform_data *pdata, unsigned int cpu, int pkg_flag) tdata->status_reg = pkg_flag ? MSR_IA32_PACKAGE_THERM_STATUS : MSR_IA32_THERM_STATUS; - tdata->is_pkg_data = pkg_flag; tdata->cpu = cpu; tdata->cpu_core_id = topology_core_id(cpu); tdata->attr_size = MAX_CORE_ATTRS; @@ -505,7 +508,7 @@ init_temp_data(struct platform_data *pdata, unsigned int cpu, int pkg_flag) static void destroy_temp_data(struct platform_data *pdata, struct temp_data *tdata) { - if (tdata->is_pkg_data) { + if (is_pkg_temp_data(tdata)) { pdata->pkg_data = NULL; } else { pdata->core_data[tdata->index] = NULL; From 1ec93e101fa617b666c7fbe9c26e9813335160b9 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 2 Feb 2024 17:21:44 +0800 Subject: [PATCH 0336/1406] hwmon: (coretemp) Use dynamic allocated memory for core temp_data The total memory needed for saving per core temperature data depends on the number of cores in a package. Using static allocated memory wastes memories on systems with low per package core count. Improve the code to use dynamic allocated memory so that it can be improved further when per package core count information becomes available. No functional change intended. Signed-off-by: Zhang Rui Link: https://lore.kernel.org/r/20240202092144.71180-12-rui.zhang@intel.com [groeck: Fixed continuation line alignment] Signed-off-by: Guenter Roeck --- drivers/hwmon/coretemp.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c index e548f2145449b8..30402de2c88907 100644 --- a/drivers/hwmon/coretemp.c +++ b/drivers/hwmon/coretemp.c @@ -91,10 +91,11 @@ struct temp_data { struct platform_data { struct device *hwmon_dev; u16 pkg_id; + int nr_cores; struct ida ida; struct cpumask cpumask; struct temp_data *pkg_data; - struct temp_data *core_data[NUM_REAL_CORES]; + struct temp_data **core_data; struct device_attribute name_attr; }; @@ -480,6 +481,20 @@ init_temp_data(struct platform_data *pdata, unsigned int cpu, int pkg_flag) { struct temp_data *tdata; + if (!pdata->core_data) { + /* + * TODO: + * The information of actual possible cores in a package is broken for now. + * Will replace hardcoded NUM_REAL_CORES with actual per package core count + * when this information becomes available. 
+ */ + pdata->nr_cores = NUM_REAL_CORES; + pdata->core_data = kcalloc(pdata->nr_cores, sizeof(struct temp_data *), + GFP_KERNEL); + if (!pdata->core_data) + return NULL; + } + tdata = kzalloc(sizeof(struct temp_data), GFP_KERNEL); if (!tdata) return NULL; @@ -489,7 +504,7 @@ init_temp_data(struct platform_data *pdata, unsigned int cpu, int pkg_flag) /* Use tdata->index as indicator of package temp data */ tdata->index = -1; } else { - tdata->index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL); + tdata->index = ida_alloc_max(&pdata->ida, pdata->nr_cores - 1, GFP_KERNEL); if (tdata->index < 0) { kfree(tdata); return NULL; @@ -510,6 +525,9 @@ static void destroy_temp_data(struct platform_data *pdata, struct temp_data *tda { if (is_pkg_temp_data(tdata)) { pdata->pkg_data = NULL; + kfree(pdata->core_data); + pdata->core_data = NULL; + pdata->nr_cores = 0; } else { pdata->core_data[tdata->index] = NULL; ida_free(&pdata->ida, tdata->index); @@ -525,7 +543,7 @@ static struct temp_data *get_temp_data(struct platform_data *pdata, int cpu) if (cpu < 0) return pdata->pkg_data; - for (i = 0; i < NUM_REAL_CORES; i++) { + for (i = 0; i < pdata->nr_cores; i++) { if (pdata->core_data[i] && pdata->core_data[i]->cpu_core_id == topology_core_id(cpu)) return pdata->core_data[i]; From f576d12cf2a0154b2f962bb34bb457682ba38283 Mon Sep 17 00:00:00 2001 From: Cosmo Chou Date: Tue, 6 Feb 2024 20:54:20 +0800 Subject: [PATCH 0337/1406] hwmon: Add driver for Astera Labs PT5161L retimer This driver implements support for temperature monitoring of Astera Labs PT5161L series PCIe retimer chips. This driver implementation originates from the CSDK available at Link: https://github.com/facebook/openbmc/tree/helium/common/recipes-lib/retimer-v2.14 The communication protocol utilized is based on the I2C/SMBus standard. Signed-off-by: Cosmo Chou Link: https://lore.kernel.org/r/20240206125420.3884300-2-chou.cosmo@gmail.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/index.rst | 1 + Documentation/hwmon/pt5161l.rst | 42 ++ MAINTAINERS | 7 + drivers/hwmon/Kconfig | 10 + drivers/hwmon/Makefile | 1 + drivers/hwmon/pt5161l.c | 667 ++++++++++++++++++++++++++++++++ 6 files changed, 728 insertions(+) create mode 100644 Documentation/hwmon/pt5161l.rst create mode 100644 drivers/hwmon/pt5161l.c diff --git a/Documentation/hwmon/index.rst b/Documentation/hwmon/index.rst index c19f53d9b3ab14..c4af2a894c42dd 100644 --- a/Documentation/hwmon/index.rst +++ b/Documentation/hwmon/index.rst @@ -190,6 +190,7 @@ Hardware Monitoring Kernel Drivers pmbus powerz powr1220 + pt5161l pxe1610 pwm-fan q54sj108a2 diff --git a/Documentation/hwmon/pt5161l.rst b/Documentation/hwmon/pt5161l.rst new file mode 100644 index 00000000000000..1b97336991ea5f --- /dev/null +++ b/Documentation/hwmon/pt5161l.rst @@ -0,0 +1,42 @@ +.. SPDX-License-Identifier: GPL-2.0-or-later + +Kernel driver pt5161l +===================== + +Supported chips: + + * Astera Labs PT5161L + + Prefix: 'pt5161l' + + Addresses scanned: I2C 0x20 - 0x27 + + Datasheet: Not publicly available. + +Authors: Cosmo Chou + +Description +----------- + +This driver implements support for temperature monitoring of Astera Labs +PT5161L series PCIe retimer chips. + +This driver implementation originates from the CSDK available at +https://github.com/facebook/openbmc/tree/helium/common/recipes-lib/retimer-v2.14 +The communication protocol utilized is based on the I2C/SMBus standard. 
+ +Sysfs entries +---------------- + +================ ============================================== +temp1_input Measured temperature (in millidegrees Celsius) +================ ============================================== + +Debugfs entries +---------------- + +================ =============================== +fw_load_status Firmware load status +fw_ver Firmware version of the retimer +heartbeat_status Heartbeat status +================ =============================== diff --git a/MAINTAINERS b/MAINTAINERS index 8fc0ca8e881c97..afe08a63f7a968 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17698,6 +17698,13 @@ F: fs/pstore/ F: include/linux/pstore* K: \b(pstore|ramoops) +PT5161L HARDWARE MONITOR DRIVER +M: Cosmo Chou +L: linux-hwmon@vger.kernel.org +S: Maintained +F: Documentation/hwmon/pt5161l.rst +F: drivers/hwmon/pt5161l.c + PTP HARDWARE CLOCK SUPPORT M: Richard Cochran L: netdev@vger.kernel.org diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index e4b24ad9396114..56260821d658d7 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -1755,6 +1755,16 @@ source "drivers/hwmon/peci/Kconfig" source "drivers/hwmon/pmbus/Kconfig" +config SENSORS_PT5161L + tristate "Astera Labs PT5161L PCIe retimer hardware monitoring" + depends on I2C + help + If you say yes here you get support for temperature monitoring + on the Astera Labs PT5161L PCIe retimer. + + This driver can also be built as a module. If so, the module + will be called pt5161l. + config SENSORS_PWM_FAN tristate "PWM fan" depends on (PWM && OF) || COMPILE_TEST diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index e3faee7be51a69..f45c31aff009c0 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -184,6 +184,7 @@ obj-$(CONFIG_SENSORS_PC87427) += pc87427.o obj-$(CONFIG_SENSORS_PCF8591) += pcf8591.o obj-$(CONFIG_SENSORS_POWERZ) += powerz.o obj-$(CONFIG_SENSORS_POWR1220) += powr1220.o +obj-$(CONFIG_SENSORS_PT5161L) += pt5161l.o obj-$(CONFIG_SENSORS_PWM_FAN) += pwm-fan.o obj-$(CONFIG_SENSORS_RASPBERRYPI_HWMON) += raspberrypi-hwmon.o obj-$(CONFIG_SENSORS_SBTSI) += sbtsi_temp.o diff --git a/drivers/hwmon/pt5161l.c b/drivers/hwmon/pt5161l.c new file mode 100644 index 00000000000000..60361e39c47454 --- /dev/null +++ b/drivers/hwmon/pt5161l.c @@ -0,0 +1,667 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Aries current average temp ADC code CSR */ +#define ARIES_CURRENT_AVG_TEMP_ADC_CSR 0x42c + +/* Device Load check register */ +#define ARIES_CODE_LOAD_REG 0x605 +/* Value indicating FW was loaded properly, [3:1] = 3'b111 */ +#define ARIES_LOAD_CODE 0xe + +/* Main Micro Heartbeat register */ +#define ARIES_MM_HEARTBEAT_ADDR 0x923 + +/* Reg offset to specify Address for MM assisted accesses */ +#define ARIES_MM_ASSIST_REG_ADDR_OFFSET 0xd99 +/* Reg offset to specify Command for MM assisted accesses */ +#define ARIES_MM_ASSIST_CMD_OFFSET 0xd9d +/* Reg offset to MM SPARE 0 used specify Address[7:0] */ +#define ARIES_MM_ASSIST_SPARE_0_OFFSET 0xd9f +/* Reg offset to MM SPARE 3 used specify Data Byte 0 */ +#define ARIES_MM_ASSIST_SPARE_3_OFFSET 0xda2 +/* Wide register reads */ +#define ARIES_MM_RD_WIDE_REG_2B 0x1d +#define ARIES_MM_RD_WIDE_REG_3B 0x1e +#define ARIES_MM_RD_WIDE_REG_4B 0x1f +#define ARIES_MM_RD_WIDE_REG_5B 0x20 + +/* Time delay between checking MM status of EEPROM write (microseconds) */ +#define ARIES_MM_STATUS_TIME 5000 + +/* AL Main SRAM DMEM offset (A0) */ +#define AL_MAIN_SRAM_DMEM_OFFSET (64 
* 1024) +/* SRAM read command */ +#define AL_TG_RD_LOC_IND_SRAM 0x16 + +/* Offset for main micro FW info */ +#define ARIES_MAIN_MICRO_FW_INFO (96 * 1024 - 128) +/* FW Info (Major) offset location in struct */ +#define ARIES_MM_FW_VERSION_MAJOR 0 +/* FW Info (Minor) offset location in struct */ +#define ARIES_MM_FW_VERSION_MINOR 1 +/* FW Info (Build no.) offset location in struct */ +#define ARIES_MM_FW_VERSION_BUILD 2 + +#define ARIES_TEMP_CAL_CODE_DEFAULT 84 + +/* Struct defining FW version loaded on an Aries device */ +struct pt5161l_fw_ver { + u8 major; + u8 minor; + u16 build; +}; + +/* Each client has this additional data */ +struct pt5161l_data { + struct i2c_client *client; + struct dentry *debugfs; + struct pt5161l_fw_ver fw_ver; + struct mutex lock; /* for atomic I2C transactions */ + bool init_done; + bool code_load_okay; /* indicate if code load reg value is expected */ + bool mm_heartbeat_okay; /* indicate if Main Micro heartbeat is good */ + bool mm_wide_reg_access; /* MM assisted wide register access */ +}; + +static struct dentry *pt5161l_debugfs_dir; + +/* + * Write multiple data bytes to Aries over I2C + */ +static int pt5161l_write_block_data(struct pt5161l_data *data, u32 address, + u8 len, u8 *val) +{ + struct i2c_client *client = data->client; + int ret; + u8 remain_len = len; + u8 xfer_len, curr_len; + u8 buf[16]; + u8 cmd = 0x0F; /* [7]:pec_en, [4:2]:func, [1]:start, [0]:end */ + u8 config = 0x40; /* [6]:cfg_type, [4:1]:burst_len, [0]:address bit16 */ + + while (remain_len > 0) { + if (remain_len > 4) { + curr_len = 4; + remain_len -= 4; + } else { + curr_len = remain_len; + remain_len = 0; + } + + buf[0] = config | (curr_len - 1) << 1 | ((address >> 16) & 0x1); + buf[1] = (address >> 8) & 0xff; + buf[2] = address & 0xff; + memcpy(&buf[3], val, curr_len); + + xfer_len = 3 + curr_len; + ret = i2c_smbus_write_block_data(client, cmd, xfer_len, buf); + if (ret) + return ret; + + val += curr_len; + address += curr_len; + } + + return 0; +} + +/* + * Read multiple data bytes from Aries over I2C + */ +static int pt5161l_read_block_data(struct pt5161l_data *data, u32 address, + u8 len, u8 *val) +{ + struct i2c_client *client = data->client; + int ret, tries; + u8 remain_len = len; + u8 curr_len; + u8 wbuf[16], rbuf[24]; + u8 cmd = 0x08; /* [7]:pec_en, [4:2]:func, [1]:start, [0]:end */ + u8 config = 0x00; /* [6]:cfg_type, [4:1]:burst_len, [0]:address bit16 */ + + while (remain_len > 0) { + if (remain_len > 16) { + curr_len = 16; + remain_len -= 16; + } else { + curr_len = remain_len; + remain_len = 0; + } + + wbuf[0] = config | (curr_len - 1) << 1 | + ((address >> 16) & 0x1); + wbuf[1] = (address >> 8) & 0xff; + wbuf[2] = address & 0xff; + + for (tries = 0; tries < 3; tries++) { + ret = i2c_smbus_write_block_data(client, (cmd | 0x2), 3, + wbuf); + if (ret) + return ret; + + ret = i2c_smbus_read_block_data(client, (cmd | 0x1), + rbuf); + if (ret == curr_len) + break; + } + if (tries >= 3) + return ret; + + memcpy(val, rbuf, curr_len); + val += curr_len; + address += curr_len; + } + + return 0; +} + +static int pt5161l_read_wide_reg(struct pt5161l_data *data, u32 address, + u8 width, u8 *val) +{ + int ret, tries; + u8 buf[8]; + u8 status; + + /* + * Safely access wide registers using mailbox method to prevent + * risking conflict with Aries firmware; otherwise fallback to + * legacy, less secure method. 
+ */ + if (data->mm_wide_reg_access) { + buf[0] = address & 0xff; + buf[1] = (address >> 8) & 0xff; + buf[2] = (address >> 16) & 0x1; + ret = pt5161l_write_block_data(data, + ARIES_MM_ASSIST_SPARE_0_OFFSET, + 3, buf); + if (ret) + return ret; + + /* Set command based on width */ + switch (width) { + case 2: + buf[0] = ARIES_MM_RD_WIDE_REG_2B; + break; + case 3: + buf[0] = ARIES_MM_RD_WIDE_REG_3B; + break; + case 4: + buf[0] = ARIES_MM_RD_WIDE_REG_4B; + break; + case 5: + buf[0] = ARIES_MM_RD_WIDE_REG_5B; + break; + default: + return -EINVAL; + } + ret = pt5161l_write_block_data(data, ARIES_MM_ASSIST_CMD_OFFSET, + 1, buf); + if (ret) + return ret; + + status = 0xff; + for (tries = 0; tries < 100; tries++) { + ret = pt5161l_read_block_data(data, + ARIES_MM_ASSIST_CMD_OFFSET, + 1, &status); + if (ret) + return ret; + + if (status == 0) + break; + + usleep_range(ARIES_MM_STATUS_TIME, + ARIES_MM_STATUS_TIME + 1000); + } + if (status != 0) + return -ETIMEDOUT; + + ret = pt5161l_read_block_data(data, + ARIES_MM_ASSIST_SPARE_3_OFFSET, + width, val); + if (ret) + return ret; + } else { + return pt5161l_read_block_data(data, address, width, val); + } + + return 0; +} + +/* + * Read multiple (up to eight) data bytes from micro SRAM over I2C + */ +static int +pt5161l_read_block_data_main_micro_indirect(struct pt5161l_data *data, + u32 address, u8 len, u8 *val) +{ + int ret, tries; + u8 buf[8]; + u8 i, status; + u32 uind_offs = ARIES_MM_ASSIST_REG_ADDR_OFFSET; + u32 eeprom_base, eeprom_addr; + + /* No multi-byte indirect support here. Hence read a byte at a time */ + eeprom_base = address - AL_MAIN_SRAM_DMEM_OFFSET; + for (i = 0; i < len; i++) { + eeprom_addr = eeprom_base + i; + buf[0] = eeprom_addr & 0xff; + buf[1] = (eeprom_addr >> 8) & 0xff; + buf[2] = (eeprom_addr >> 16) & 0xff; + ret = pt5161l_write_block_data(data, uind_offs, 3, buf); + if (ret) + return ret; + + buf[0] = AL_TG_RD_LOC_IND_SRAM; + ret = pt5161l_write_block_data(data, uind_offs + 4, 1, buf); + if (ret) + return ret; + + status = 0xff; + for (tries = 0; tries < 255; tries++) { + ret = pt5161l_read_block_data(data, uind_offs + 4, 1, + &status); + if (ret) + return ret; + + if (status == 0) + break; + } + if (status != 0) + return -ETIMEDOUT; + + ret = pt5161l_read_block_data(data, uind_offs + 3, 1, buf); + if (ret) + return ret; + + val[i] = buf[0]; + } + + return 0; +} + +/* + * Check firmware load status + */ +static int pt5161l_fw_load_check(struct pt5161l_data *data) +{ + int ret; + u8 buf[8]; + + ret = pt5161l_read_block_data(data, ARIES_CODE_LOAD_REG, 1, buf); + if (ret) + return ret; + + if (buf[0] < ARIES_LOAD_CODE) { + dev_dbg(&data->client->dev, + "Code Load reg unexpected. 
Not all modules are loaded %x\n", + buf[0]); + data->code_load_okay = false; + } else { + data->code_load_okay = true; + } + + return 0; +} + +/* + * Check main micro heartbeat + */ +static int pt5161l_heartbeat_check(struct pt5161l_data *data) +{ + int ret, tries; + u8 buf[8]; + u8 heartbeat; + bool hb_changed = false; + + ret = pt5161l_read_block_data(data, ARIES_MM_HEARTBEAT_ADDR, 1, buf); + if (ret) + return ret; + + heartbeat = buf[0]; + for (tries = 0; tries < 100; tries++) { + ret = pt5161l_read_block_data(data, ARIES_MM_HEARTBEAT_ADDR, 1, + buf); + if (ret) + return ret; + + if (buf[0] != heartbeat) { + hb_changed = true; + break; + } + } + data->mm_heartbeat_okay = hb_changed; + + return 0; +} + +/* + * Check the status of firmware + */ +static int pt5161l_fwsts_check(struct pt5161l_data *data) +{ + int ret; + u8 buf[8]; + u8 major = 0, minor = 0; + u16 build = 0; + + ret = pt5161l_fw_load_check(data); + if (ret) + return ret; + + ret = pt5161l_heartbeat_check(data); + if (ret) + return ret; + + if (data->code_load_okay && data->mm_heartbeat_okay) { + ret = pt5161l_read_block_data_main_micro_indirect(data, ARIES_MAIN_MICRO_FW_INFO + + ARIES_MM_FW_VERSION_MAJOR, + 1, &major); + if (ret) + return ret; + + ret = pt5161l_read_block_data_main_micro_indirect(data, ARIES_MAIN_MICRO_FW_INFO + + ARIES_MM_FW_VERSION_MINOR, + 1, &minor); + if (ret) + return ret; + + ret = pt5161l_read_block_data_main_micro_indirect(data, ARIES_MAIN_MICRO_FW_INFO + + ARIES_MM_FW_VERSION_BUILD, + 2, buf); + if (ret) + return ret; + build = buf[1] << 8 | buf[0]; + } + data->fw_ver.major = major; + data->fw_ver.minor = minor; + data->fw_ver.build = build; + + return 0; +} + +static int pt5161l_fw_is_at_least(struct pt5161l_data *data, u8 major, u8 minor, + u16 build) +{ + u32 ver = major << 24 | minor << 16 | build; + u32 curr_ver = data->fw_ver.major << 24 | data->fw_ver.minor << 16 | + data->fw_ver.build; + + if (curr_ver >= ver) + return true; + + return false; +} + +static int pt5161l_init_dev(struct pt5161l_data *data) +{ + int ret; + + mutex_lock(&data->lock); + ret = pt5161l_fwsts_check(data); + mutex_unlock(&data->lock); + if (ret) + return ret; + + /* Firmware 2.2.0 enables safe access to wide registers */ + if (pt5161l_fw_is_at_least(data, 2, 2, 0)) + data->mm_wide_reg_access = true; + + data->init_done = true; + + return 0; +} + +static int pt5161l_read(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, long *val) +{ + struct pt5161l_data *data = dev_get_drvdata(dev); + int ret; + u8 buf[8]; + long adc_code; + + switch (attr) { + case hwmon_temp_input: + if (!data->init_done) { + ret = pt5161l_init_dev(data); + if (ret) + return ret; + } + + mutex_lock(&data->lock); + ret = pt5161l_read_wide_reg(data, + ARIES_CURRENT_AVG_TEMP_ADC_CSR, 4, + buf); + mutex_unlock(&data->lock); + if (ret) { + dev_dbg(dev, "Read adc_code failed %d\n", ret); + return ret; + } + + adc_code = buf[3] << 24 | buf[2] << 16 | buf[1] << 8 | buf[0]; + if (adc_code == 0 || adc_code >= 0x3ff) { + dev_dbg(dev, "Invalid adc_code %lx\n", adc_code); + return -EIO; + } + + *val = 110000 + + ((adc_code - (ARIES_TEMP_CAL_CODE_DEFAULT + 250)) * + -320); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static umode_t pt5161l_is_visible(const void *data, + enum hwmon_sensor_types type, u32 attr, + int channel) +{ + switch (attr) { + case hwmon_temp_input: + return 0444; + default: + break; + } + + return 0; +} + +static const struct hwmon_channel_info *pt5161l_info[] = { + HWMON_CHANNEL_INFO(temp, 
HWMON_T_INPUT), + NULL +}; + +static const struct hwmon_ops pt5161l_hwmon_ops = { + .is_visible = pt5161l_is_visible, + .read = pt5161l_read, +}; + +static const struct hwmon_chip_info pt5161l_chip_info = { + .ops = &pt5161l_hwmon_ops, + .info = pt5161l_info, +}; + +static ssize_t pt5161l_debugfs_read_fw_ver(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct pt5161l_data *data = file->private_data; + int ret; + char ver[32]; + + mutex_lock(&data->lock); + ret = pt5161l_fwsts_check(data); + mutex_unlock(&data->lock); + if (ret) + return ret; + + ret = snprintf(ver, sizeof(ver), "%u.%u.%u\n", data->fw_ver.major, + data->fw_ver.minor, data->fw_ver.build); + + return simple_read_from_buffer(buf, count, ppos, ver, ret); +} + +static const struct file_operations pt5161l_debugfs_ops_fw_ver = { + .read = pt5161l_debugfs_read_fw_ver, + .open = simple_open, +}; + +static ssize_t pt5161l_debugfs_read_fw_load_sts(struct file *file, + char __user *buf, size_t count, + loff_t *ppos) +{ + struct pt5161l_data *data = file->private_data; + int ret; + bool status = false; + char health[16]; + + mutex_lock(&data->lock); + ret = pt5161l_fw_load_check(data); + mutex_unlock(&data->lock); + if (ret == 0) + status = data->code_load_okay; + + ret = snprintf(health, sizeof(health), "%s\n", + status ? "normal" : "abnormal"); + + return simple_read_from_buffer(buf, count, ppos, health, ret); +} + +static const struct file_operations pt5161l_debugfs_ops_fw_load_sts = { + .read = pt5161l_debugfs_read_fw_load_sts, + .open = simple_open, +}; + +static ssize_t pt5161l_debugfs_read_hb_sts(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct pt5161l_data *data = file->private_data; + int ret; + bool status = false; + char health[16]; + + mutex_lock(&data->lock); + ret = pt5161l_heartbeat_check(data); + mutex_unlock(&data->lock); + if (ret == 0) + status = data->mm_heartbeat_okay; + + ret = snprintf(health, sizeof(health), "%s\n", + status ? 
"normal" : "abnormal"); + + return simple_read_from_buffer(buf, count, ppos, health, ret); +} + +static const struct file_operations pt5161l_debugfs_ops_hb_sts = { + .read = pt5161l_debugfs_read_hb_sts, + .open = simple_open, +}; + +static int pt5161l_init_debugfs(struct pt5161l_data *data) +{ + data->debugfs = debugfs_create_dir(dev_name(&data->client->dev), + pt5161l_debugfs_dir); + + debugfs_create_file("fw_ver", 0444, data->debugfs, data, + &pt5161l_debugfs_ops_fw_ver); + + debugfs_create_file("fw_load_status", 0444, data->debugfs, data, + &pt5161l_debugfs_ops_fw_load_sts); + + debugfs_create_file("heartbeat_status", 0444, data->debugfs, data, + &pt5161l_debugfs_ops_hb_sts); + + return 0; +} + +static int pt5161l_probe(struct i2c_client *client) +{ + struct device *dev = &client->dev; + struct device *hwmon_dev; + struct pt5161l_data *data; + + data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->client = client; + mutex_init(&data->lock); + pt5161l_init_dev(data); + dev_set_drvdata(dev, data); + + hwmon_dev = devm_hwmon_device_register_with_info(dev, client->name, + data, + &pt5161l_chip_info, + NULL); + + pt5161l_init_debugfs(data); + + return PTR_ERR_OR_ZERO(hwmon_dev); +} + +static void pt5161l_remove(struct i2c_client *client) +{ + struct pt5161l_data *data = i2c_get_clientdata(client); + + debugfs_remove_recursive(data->debugfs); +} + +static const struct of_device_id __maybe_unused pt5161l_of_match[] = { + { .compatible = "asteralabs,pt5161l" }, + {}, +}; +MODULE_DEVICE_TABLE(of, pt5161l_of_match); + +static const struct acpi_device_id __maybe_unused pt5161l_acpi_match[] = { + { "PT5161L", 0 }, + {}, +}; +MODULE_DEVICE_TABLE(acpi, pt5161l_acpi_match); + +static const struct i2c_device_id pt5161l_id[] = { + { "pt5161l", 0 }, + {} +}; +MODULE_DEVICE_TABLE(i2c, pt5161l_id); + +static struct i2c_driver pt5161l_driver = { + .class = I2C_CLASS_HWMON, + .driver = { + .name = "pt5161l", + .of_match_table = of_match_ptr(pt5161l_of_match), + .acpi_match_table = ACPI_PTR(pt5161l_acpi_match), + }, + .probe = pt5161l_probe, + .remove = pt5161l_remove, + .id_table = pt5161l_id, +}; + +static int __init pt5161l_init(void) +{ + pt5161l_debugfs_dir = debugfs_create_dir("pt5161l", NULL); + return i2c_add_driver(&pt5161l_driver); +} + +static void __exit pt5161l_exit(void) +{ + i2c_del_driver(&pt5161l_driver); + debugfs_remove_recursive(pt5161l_debugfs_dir); +} + +module_init(pt5161l_init); +module_exit(pt5161l_exit); + +MODULE_AUTHOR("Cosmo Chou "); +MODULE_DESCRIPTION("Hwmon driver for Astera Labs Aries PCIe retimer"); +MODULE_LICENSE("GPL"); From f1d14873cb31bfbf2783490bf87a1c2483ac53b6 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Wed, 7 Feb 2024 22:17:08 +0100 Subject: [PATCH 0338/1406] hwmon: chipcap2: fix uninitialized variable in cc2_get_reg_val() The reg_val variable in cc2_get_reg_val() might be used without a known value if cc2_read_reg() fails. That leads to a useless data conversion because the returned error means the read operation failed and the data is not relevant. That makes its initial value irrelevant as well, so skip the data conversion instead. If no error happens, a value is assigned to reg_val and the data conversion is required. 
Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-hwmon/294e4634-89d4-415e-a723-b208d8770d7c@gmail.com/T/#t Signed-off-by: Javier Carrasco Link: https://lore.kernel.org/r/20240207-chipcap2_init_vars-v1-1-08cafe43e20e@gmail.com Signed-off-by: Guenter Roeck --- drivers/hwmon/chipcap2.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hwmon/chipcap2.c b/drivers/hwmon/chipcap2.c index a62c507b10429c..3b604fc5d8aeac 100644 --- a/drivers/hwmon/chipcap2.c +++ b/drivers/hwmon/chipcap2.c @@ -324,7 +324,9 @@ static int cc2_get_reg_val(struct cc2_data *data, u8 reg, long *val) int ret; ret = cc2_read_reg(data, reg, ®_val); - *val = cc2_rh_convert(reg_val); + if (!ret) + *val = cc2_rh_convert(reg_val); + cc2_disable(data); return ret; From 7c9ac35977761ea3eee2f3aab4b7339ce497c009 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Wed, 7 Feb 2024 22:17:09 +0100 Subject: [PATCH 0339/1406] hwmon: chipcap2: fix return path in cc2_request_alarm_irqs() The return path can be improved by returning upon first failure. The current implementation would try to register the second interrupt even if the first one failed, which is unnecessary. Moreover, if no irqs are available, the return value should be zero (the driver supports the use case with no interrupts). Currently the initial value is unassigned and that may lead to returning an unknown value if stack variables are not automatically set to zero and no irqs were provided. Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-hwmon/294e4634-89d4-415e-a723-b208d8770d7c@gmail.com/T/#t Signed-off-by: Javier Carrasco Link: https://lore.kernel.org/r/20240207-chipcap2_init_vars-v1-2-08cafe43e20e@gmail.com Signed-off-by: Guenter Roeck --- drivers/hwmon/chipcap2.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/hwmon/chipcap2.c b/drivers/hwmon/chipcap2.c index 3b604fc5d8aeac..6ccceae21f701a 100644 --- a/drivers/hwmon/chipcap2.c +++ b/drivers/hwmon/chipcap2.c @@ -670,7 +670,7 @@ static int cc2_request_ready_irq(struct cc2_data *data, struct device *dev) static int cc2_request_alarm_irqs(struct cc2_data *data, struct device *dev) { - int ret; + int ret = 0; data->irq_low = fwnode_irq_get_byname(dev_fwnode(dev), "low"); if (data->irq_low > 0) { @@ -679,8 +679,10 @@ static int cc2_request_alarm_irqs(struct cc2_data *data, struct device *dev) IRQF_ONESHOT | IRQF_TRIGGER_RISING, dev_name(dev), data); - if (!ret) - data->rh_alarm.low_alarm_visible = true; + if (ret) + return ret; + + data->rh_alarm.low_alarm_visible = true; } data->irq_high = fwnode_irq_get_byname(dev_fwnode(dev), "high"); @@ -690,8 +692,10 @@ static int cc2_request_alarm_irqs(struct cc2_data *data, struct device *dev) IRQF_ONESHOT | IRQF_TRIGGER_RISING, dev_name(dev), data); - if (!ret) - data->rh_alarm.high_alarm_visible = true; + if (ret) + return ret; + + data->rh_alarm.high_alarm_visible = true; } return ret; From 1c365b5017d017260161de296bf0b35fb1f0dbb9 Mon Sep 17 00:00:00 2001 From: Sebastian Kranz Date: Fri, 9 Feb 2024 10:01:23 +0100 Subject: [PATCH 0340/1406] hwmon: (oxp-sensors) Add support for Ayaneo Air Plus 7320u. Add support for handheld AYANEO AIR Plus with the same EC registers to add proper fan control. Functionality was tested successfully. 
Signed-off-by: Sebastian Kranz Link: https://lore.kernel.org/r/20240209090157.3232-1-tklightforce@googlemail.com [groeck: Fixed up commit message] Signed-off-by: Guenter Roeck --- Documentation/hwmon/oxp-sensors.rst | 1 + drivers/hwmon/oxp-sensors.c | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/Documentation/hwmon/oxp-sensors.rst b/Documentation/hwmon/oxp-sensors.rst index 3adeb7406243fb..55b1ef61625ebc 100644 --- a/Documentation/hwmon/oxp-sensors.rst +++ b/Documentation/hwmon/oxp-sensors.rst @@ -33,6 +33,7 @@ Currently the driver supports the following handhelds: - AOK ZOE A1 PRO - Aya Neo 2 - Aya Neo AIR + - Aya Neo AIR Plus (Mendocino) - Aya Neo AIR Pro - Aya Neo Geek - OneXPlayer AMD diff --git a/drivers/hwmon/oxp-sensors.c b/drivers/hwmon/oxp-sensors.c index ea9602063eabc7..8d3b0f86cc57a9 100644 --- a/drivers/hwmon/oxp-sensors.c +++ b/drivers/hwmon/oxp-sensors.c @@ -43,6 +43,7 @@ enum oxp_board { aok_zoe_a1 = 1, aya_neo_2, aya_neo_air, + aya_neo_air_plus_mendo, aya_neo_air_pro, aya_neo_geek, oxp_mini_amd, @@ -98,6 +99,13 @@ static const struct dmi_system_id dmi_table[] = { }, .driver_data = (void *)aya_neo_air, }, + { + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "AYANEO"), + DMI_EXACT_MATCH(DMI_BOARD_NAME, "AB05-Mendocino"), + }, + .driver_data = (void *)aya_neo_air_plus_mendo, + }, { .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "AYANEO"), @@ -332,6 +340,7 @@ static int oxp_platform_read(struct device *dev, enum hwmon_sensor_types type, switch (board) { case aya_neo_2: case aya_neo_air: + case aya_neo_air_plus_mendo: case aya_neo_air_pro: case aya_neo_geek: case oxp_mini_amd: @@ -374,6 +383,7 @@ static int oxp_platform_write(struct device *dev, enum hwmon_sensor_types type, switch (board) { case aya_neo_2: case aya_neo_air: + case aya_neo_air_plus_mendo: case aya_neo_air_pro: case aya_neo_geek: case oxp_mini_amd: From c3f5a7d4ff0d0368a73ed932efd8045ae59fc2dd Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Mon, 22 Jan 2024 13:39:41 +0800 Subject: [PATCH 0341/1406] firewire: Kill unnecessary buf check in device_attribute.show Per Documentation/filesystems/sysfs.rst: > sysfs allocates a buffer of size (PAGE_SIZE) and passes it to the > method. So we can kill the unnecessary buf check safely. Signed-off-by: Li Zhijian Link: https://lore.kernel.org/r/20240122053942.80648-1-lizhijian@fujitsu.com Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-device.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index 7d3346b3a2bf32..3a1a2bf1717ccf 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -322,7 +322,7 @@ static ssize_t show_immediate(struct device *dev, if (value < 0) return -ENOENT; - return snprintf(buf, buf ? 
PAGE_SIZE : 0, "0x%06x\n", value); + return snprintf(buf, PAGE_SIZE, "0x%06x\n", value); } #define IMMEDIATE_ATTR(name, key) \ @@ -334,8 +334,6 @@ static ssize_t show_text_leaf(struct device *dev, struct config_rom_attribute *attr = container_of(dattr, struct config_rom_attribute, attr); const u32 *directories[] = {NULL, NULL}; - size_t bufsize; - char dummy_buf[2]; int i, ret = -ENOENT; down_read(&fw_device_rwsem); @@ -357,15 +355,9 @@ static ssize_t show_text_leaf(struct device *dev, } } - if (buf) { - bufsize = PAGE_SIZE - 1; - } else { - buf = dummy_buf; - bufsize = 1; - } - for (i = 0; i < ARRAY_SIZE(directories) && !!directories[i]; ++i) { + for (i = 0; i < ARRAY_SIZE(directories) && !!directories[i]; ++i) { - int result = fw_csr_string(directories[i], attr->key, buf, bufsize); + int result = fw_csr_string(directories[i], attr->key, buf, + PAGE_SIZE - 1); if (result >= 0) { ret = result; From 7c6eed4d6161a0a930884ffb324dee230a1e1cd6 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Mon, 22 Jan 2024 13:39:42 +0800 Subject: [PATCH 0342/1406] firewire: Convert snprintf/sprintf to sysfs_emit Per filesystems/sysfs.rst, show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. coccinelle complains that there are still a couple of functions that use snprintf(). Convert them to sysfs_emit(). > drivers/firewire/core-device.c:326:8-16: WARNING: please use sysfs_emit or sysfs_emit_at No functional change intended Signed-off-by: Li Zhijian Link: https://lore.kernel.org/r/20240122053942.80648-2-lizhijian@fujitsu.com Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index 3a1a2bf1717ccf..a802c6d4f4fdf0 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -322,7 +322,7 @@ static ssize_t show_immediate(struct device *dev, if (value < 0) return -ENOENT; - return snprintf(buf, PAGE_SIZE, "0x%06x\n", value); + return sysfs_emit(buf, "0x%06x\n", value); } #define IMMEDIATE_ATTR(name, key) \ @@ -482,7 +482,7 @@ static ssize_t is_local_show(struct device *dev, { struct fw_device *device = fw_device(dev); - return sprintf(buf, "%u\n", device->is_local); + return sysfs_emit(buf, "%u\n", device->is_local); } static int units_sprintf(char *buf, const u32 *directory) From 41ebb53b1bffb24547e21015ea53f382f922a099 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 5 Feb 2024 15:04:48 +0900 Subject: [PATCH 0343/1406] firewire: core: fix build failure due to the caller of fw_csr_string() Commit 47dc55181dcb ("firewire: core: search descriptor leaf just after vendor directory entry in root directory") for v6.8-rc3 and commit 67a5a58c0443 ("firewire: Kill unnecessary buf check in device_attribute.show") for v6.9 bring a build failure in the for-next tree due to a change in the name of a local variable. This commit fixes it.
Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/lkml/20240202111602.6f6e2c1a@canb.auug.org.au/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202402022343.NkgsMITA-lkp@intel.com/ Link: https://lore.kernel.org/r/20240205060448.13881-1-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index a802c6d4f4fdf0..c0976f6268d329 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -366,7 +366,7 @@ static ssize_t show_text_leaf(struct device *dev, // in the root directory follows to the directory entry for vendor ID // instead of the immediate value for vendor ID. result = fw_csr_string(directories[i], CSR_DIRECTORY | attr->key, buf, - bufsize); + PAGE_SIZE - 1); if (result >= 0) ret = result; } From 878c391f74d6a730a600e021bffc92d13c791c1b Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Fri, 9 Feb 2024 15:52:19 +0300 Subject: [PATCH 0344/1406] fs: prefer kfree_rcu() in fasync_remove_entry() In 'fasync_remove_entry()', prefer 'kfree_rcu()' over 'call_rcu()' with a dummy 'fasync_free_rcu()' callback. This is mostly intended as an attempt to fix the weird https://syzkaller.appspot.com/bug?id=6a64ad907e361e49e92d1c4c114128a1bda2ed7f, where kmemleak may consider 'fa' as unreferenced during the RCU grace period. See https://lore.kernel.org/stable/20230930174657.800551-1-joel@joelfernandes.org as well. Comments are highly appreciated. Ever since ae65a5211d90 ("mm/slab: document kfree() as allowed for kmem_cache_alloc() objects") kfree() can be used for both kmalloc() and kmem_cache_alloc(), so this is now safe. Do not backport this to stable, please. Link: ae65a5211d90 ("mm/slab: document kfree() as allowed for kmem_cache_alloc() objects") Signed-off-by: Dmitry Antipov Link: https://lore.kernel.org/r/20240209125220.330383-1-dmantipov@yandex.ru Signed-off-by: Christian Brauner --- fs/fcntl.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index c80a6acad742fb..c3e342eb74afa6 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -846,12 +846,6 @@ int send_sigurg(struct fown_struct *fown) static DEFINE_SPINLOCK(fasync_lock); static struct kmem_cache *fasync_cache __ro_after_init; -static void fasync_free_rcu(struct rcu_head *head) -{ - kmem_cache_free(fasync_cache, - container_of(head, struct fasync_struct, fa_rcu)); -} - /* * Remove a fasync entry. If successfully removed, return * positive and clear the FASYNC flag. If no entry exists, @@ -877,7 +871,7 @@ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) write_unlock_irq(&fa->fa_lock); *fp = fa->fa_next; - call_rcu(&fa->fa_rcu, fasync_free_rcu); + kfree_rcu(fa, fa_rcu); filp->f_flags &= ~FASYNC; result = 1; break; From b97afc4b8574a3c798fae307235b3e487e1dabd4 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 8 Jan 2024 13:50:20 +1030 Subject: [PATCH 0345/1406] btrfs: remove the pg_offset parameter from btrfs_get_extent() The parameter @pg_offset of btrfs_get_extent() is only utilized for inline extents, and we already have an ASSERT() and the tree-checker to make sure we can only get an inline extent at file offset 0. Any invalid inline extent with a non-zero file offset would be rejected by the tree-checker in the first place. Thus the @pg_offset parameter is not really necessary, so just remove it.
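To make the invariant explicit, here is a condensed, illustrative form of the assertions visible in the diff below; an inline extent may only describe data at file offset 0, so any page offset into it is necessarily 0 as well:

	/* Enforced by the tree-checker; checked again when mapping the extent: */
	ASSERT(extent_start == 0);
	ASSERT(em->start == 0);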
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 3 +-- fs/btrfs/extent_io.c | 10 ++++----- fs/btrfs/file.c | 11 +++++----- fs/btrfs/inode.c | 16 ++++++--------- fs/btrfs/tests/inode-tests.c | 40 ++++++++++++++++++------------------ 5 files changed, 36 insertions(+), 44 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 7f7c5a92d2b879..83d78a6f3aa2f3 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -490,8 +490,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino, struct btrfs_root *root, struct btrfs_path *path); struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len); + struct page *page, u64 start, u64 len); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cfd2967f04a293..70a1ad6da50b48 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -970,8 +970,7 @@ void clear_page_extent_mapped(struct page *page) folio_detach_private(folio); } -static struct extent_map * -__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, +static struct extent_map *__get_extent_map(struct inode *inode, struct page *page, u64 start, u64 len, struct extent_map **em_cached) { struct extent_map *em; @@ -988,7 +987,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, *em_cached = NULL; } - em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); + em = btrfs_get_extent(BTRFS_I(inode), page, start, len); if (em_cached && !IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); @@ -1051,8 +1050,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, end_page_read(page, true, cur, iosize); break; } - em = __get_extent_map(inode, page, pg_offset, cur, - end - cur + 1, em_cached); + em = __get_extent_map(inode, page, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { unlock_extent(tree, cur, end, NULL); end_page_read(page, false, cur, end + 1 - cur); @@ -1371,7 +1369,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, continue; } - em = btrfs_get_extent(inode, NULL, 0, cur, len); + em = btrfs_get_extent(inode, NULL, cur, len); if (IS_ERR(em)) { ret = PTR_ERR_OR_ZERO(em); goto out_error; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 38dfcac4760990..f8e1a7ce3d39ae 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2176,7 +2176,7 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) struct extent_map *em; int ret = 0; - em = btrfs_get_extent(inode, NULL, 0, + em = btrfs_get_extent(inode, NULL, round_down(*start, fs_info->sectorsize), round_up(*len, fs_info->sectorsize)); if (IS_ERR(em)) @@ -2835,7 +2835,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, int ret; offset = round_down(offset, sectorsize); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize); + em = btrfs_get_extent(inode, NULL, offset, sectorsize); if (IS_ERR(em)) return PTR_ERR(em); @@ -2866,7 +2866,7 @@ static int btrfs_zero_range(struct inode *inode, u64 bytes_to_reserve = 0; bool space_reserved = false; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, + em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, 
alloc_end - alloc_start); if (IS_ERR(em)) { ret = PTR_ERR(em); @@ -2909,8 +2909,7 @@ static int btrfs_zero_range(struct inode *inode, if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, - sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -3126,7 +3125,7 @@ static long btrfs_fallocate(struct file *file, int mode, /* First, check if we exceed the qgroup limit */ while (cur_offset < alloc_end) { - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, + em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset, alloc_end - cur_offset); if (IS_ERR(em)) { ret = PTR_ERR(em); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1eb93d3962aac4..da7946c44d1eda 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2632,7 +2632,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, u64 em_len; int ret = 0; - em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); + em = btrfs_get_extent(inode, NULL, search_start, search_len); if (IS_ERR(em)) return PTR_ERR(em); @@ -4892,8 +4892,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) &cached_state); cur_offset = hole_start; while (1) { - em = btrfs_get_extent(inode, NULL, 0, cur_offset, - block_end - cur_offset); + em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset); if (IS_ERR(em)) { err = PTR_ERR(em); em = NULL; @@ -6741,7 +6740,6 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path * * @inode: file to search in * @page: page to read extent data into if the extent is inline - * @pg_offset: offset into @page to copy to * @start: file offset * @len: length of range starting at @start * @@ -6755,8 +6753,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path * Return: ERR_PTR on error, non-NULL extent_map on success. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len) + struct page *page, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; @@ -6899,7 +6896,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, * ensured by tree-checker and inline extent creation path. * Thus all members representing file offsets should be zero. 
*/ - ASSERT(pg_offset == 0); ASSERT(extent_start == 0); ASSERT(em->start == 0); @@ -7540,7 +7536,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (ret < 0) goto err; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); + em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto unlock_err; @@ -10129,7 +10125,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, cond_resched(); } - em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1); + em = btrfs_get_extent(inode, NULL, start, lockend - start + 1); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_unlock_extent; @@ -10702,7 +10698,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_block_group *bg; u64 len = isize - start; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); + em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 9957de9f7806d1..99da9d34b77aed 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -258,7 +258,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* First with no extents */ BTRFS_I(inode)->root = root; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize); if (IS_ERR(em)) { em = NULL; test_err("got an error when we shouldn't have"); @@ -278,7 +278,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) */ setup_file_extents(root, sectorsize); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, (u64)-1); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -316,7 +316,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -339,7 +339,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Regular extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -367,7 +367,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* The next 3 are split extents */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -396,7 +396,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -418,7 +418,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), 
NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -452,7 +452,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -481,7 +481,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* The next 3 are a half written prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -511,7 +511,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -544,7 +544,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -579,7 +579,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Now for the compressed extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -613,7 +613,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Split compressed extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -648,7 +648,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -675,7 +675,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -710,7 +710,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* A hole between regular extents but no hole extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset + 6, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -737,7 +737,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), 
NULL, 0, offset, SZ_4M); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -770,7 +770,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -850,7 +850,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) insert_inode_item_key(root); insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize, sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -872,7 +872,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) } free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, sectorsize, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; From f8ac489e129259780b8fde310c9b31b529c6e485 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 8 Jan 2024 12:30:44 +1030 Subject: [PATCH 0346/1406] btrfs: remove unused variable bio_offset from end_bbio_data_read() The variable @bio_offset was introduced in commit 7ffd27e378d2 ("btrfs: pass bio_offset to check_data_csum() directly"), when we are still using the same endio function for both data and metadata. Later we had several changes to data and metadata endio functions: - Data verification is handled by btrfs bio layer - Split data and metadata endio paths Now for data path we no longer do any verification in end_bbio_data_read(), as the verification is handled by btrfs bio layer already. Thus there is no need for such bio_offset variable. Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 70a1ad6da50b48..ff8d5c4ac169ca 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -596,11 +596,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) struct bio *bio = &bbio->bio; struct processed_extent processed = { 0 }; struct folio_iter fi; - /* - * The offset to the beginning of a bio, since one bio can never be - * larger than UINT_MAX, u32 here is enough. - */ - u32 bio_offset = 0; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_folio_all(fi, &bbio->bio) { @@ -667,10 +662,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) end_page_read(folio_page(folio, 0), uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), start, end, uptodate); - - ASSERT(bio_offset + len > bio_offset); - bio_offset += len; - } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); From ceabfd5ef4b22b2e96025066cfe995ab311add7d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 5 Jan 2024 16:05:55 +1030 Subject: [PATCH 0347/1406] btrfs: cache folio size and shift in extent_buffer After the conversion to folio interfaces (but without the patch to enable larger folio allocation), there is an LTP report about observable performance drop on metadata heavy operations. 
https://lore.kernel.org/linux-btrfs/202312221750.571925bd-oliver.sang@intel.com/ This drop is caused by the extra code of calculating the folio_size()/folio_shift(), instead of the old hard coded PAGE_SIZE/PAGE_SHIFT. To slightly reduce the overhead, just cache both folio_size and folio_shift in extent_buffer. The two new members (u32 folio_size and u8 folio_shift) are stored inside the holes of extent_buffer. folio_size is shared with len, which is reduced to u32. The size of eb does not change. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/accessors.c | 12 ++++++------ fs/btrfs/ctree.c | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 38 +++++++++++++++++++++----------------- fs/btrfs/extent_io.h | 16 +++++++++++++--- 5 files changed, 42 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c index 1925a0919ca62f..6eb850ad37d2ae 100644 --- a/fs/btrfs/accessors.c +++ b/fs/btrfs/accessors.c @@ -63,8 +63,8 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \ const unsigned long oil = get_eb_offset_in_folio(token->eb, \ member_offset);\ - const int unit_size = folio_size(token->eb->folios[0]); \ - const int unit_shift = folio_shift(token->eb->folios[0]); \ + const int unit_size = token->eb->folio_size; \ + const int unit_shift = token->eb->folio_shift; \ const int size = sizeof(u##bits); \ u8 lebytes[sizeof(u##bits)]; \ const int part = unit_size - oil; \ @@ -94,7 +94,7 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const unsigned long idx = get_eb_folio_index(eb, member_offset);\ const unsigned long oil = get_eb_offset_in_folio(eb, \ member_offset);\ - const int unit_size = folio_size(eb->folios[0]); \ + const int unit_size = eb->folio_size; \ char *kaddr = folio_address(eb->folios[idx]); \ const int size = sizeof(u##bits); \ const int part = unit_size - oil; \ @@ -117,8 +117,8 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \ const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \ const unsigned long oil = get_eb_offset_in_folio(token->eb, \ member_offset);\ - const int unit_size = folio_size(token->eb->folios[0]); \ - const int unit_shift = folio_shift(token->eb->folios[0]); \ + const int unit_size = token->eb->folio_size; \ + const int unit_shift = token->eb->folio_shift; \ const int size = sizeof(u##bits); \ u8 lebytes[sizeof(u##bits)]; \ const int part = unit_size - oil; \ @@ -151,7 +151,7 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ const unsigned long idx = get_eb_folio_index(eb, member_offset);\ const unsigned long oil = get_eb_offset_in_folio(eb, \ member_offset);\ - const int unit_size = folio_size(eb->folios[0]); \ + const int unit_size = eb->folio_size; \ char *kaddr = folio_address(eb->folios[idx]); \ const int size = sizeof(u##bits); \ const int part = unit_size - oil; \ diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e65e012bac5531..33145da449cc8d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -820,7 +820,7 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, } while (low < high) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; unsigned long oil; unsigned long offset; struct btrfs_disk_key *tmp; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e71ef97d0a7cab..57b44c1b85a89c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -193,7 +193,7 @@ 
static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, struct folio *folio = eb->folios[i]; u64 start = max_t(u64, eb->start, folio_pos(folio)); u64 end = min_t(u64, eb->start + eb->len, - folio_pos(folio) + folio_size(folio)); + folio_pos(folio) + eb->folio_size); u32 len = end - start; ret = btrfs_repair_io_failure(fs_info, 0, start, len, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ff8d5c4ac169ca..a9f95d66cb696d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -78,7 +78,7 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) eb = list_first_entry(&fs_info->allocated_ebs, struct extent_buffer, leak_list); pr_err( - "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n", + "BTRFS: buffer leak start %llu len %u refs %d bflags %lu owner %llu\n", eb->start, eb->len, atomic_read(&eb->refs), eb->bflags, btrfs_header_owner(eb)); list_del(&eb->leak_list); @@ -729,6 +729,8 @@ static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp) for (int i = 0; i < num_pages; i++) eb->folios[i] = page_folio(page_array[i]); + eb->folio_size = PAGE_SIZE; + eb->folio_shift = PAGE_SHIFT; return 0; } @@ -1728,10 +1730,10 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, folio_lock(folio); folio_clear_dirty_for_io(folio); folio_start_writeback(folio); - ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0); + ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); ASSERT(ret); wbc_account_cgroup_owner(wbc, folio_page(folio, 0), - folio_size(folio)); + eb->folio_size); wbc->nr_to_write -= folio_nr_pages(folio); folio_unlock(folio); } @@ -3523,7 +3525,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, /* For now, we should only have single-page folios for btree inode. */ ASSERT(folio_nr_pages(existing_folio) == 1); - if (folio_size(existing_folio) != folio_size(eb->folios[0])) { + if (folio_size(existing_folio) != eb->folio_size) { folio_unlock(existing_folio); folio_put(existing_folio); return -EAGAIN; @@ -3666,6 +3668,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * and free the allocated page. 
*/ folio = eb->folios[i]; + eb->folio_size = folio_size(folio); + eb->folio_shift = folio_shift(folio); spin_lock(&mapping->i_private_lock); /* Should not fail, as we have preallocated the memory */ ret = attach_extent_buffer_folio(eb, folio, prealloc); @@ -4115,7 +4119,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, for (int i = 0; i < num_folios; i++) { struct folio *folio = eb->folios[i]; - ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0); + ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); ASSERT(ret); } } @@ -4135,7 +4139,7 @@ static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, unsigned long len) { btrfs_warn(eb->fs_info, - "access to eb bytenr %llu len %lu out of range start %lu len %lu", + "access to eb bytenr %llu len %u out of range start %lu len %lu", eb->start, eb->len, start, len); WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); @@ -4164,7 +4168,7 @@ static inline int check_eb_range(const struct extent_buffer *eb, void read_extent_buffer(const struct extent_buffer *eb, void *dstv, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char *dst = (char *)dstv; @@ -4204,7 +4208,7 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, void __user *dstv, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char __user *dst = (char __user *)dstv; @@ -4244,7 +4248,7 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char *kaddr; @@ -4315,7 +4319,7 @@ static void __write_extent_buffer(const struct extent_buffer *eb, const void *srcv, unsigned long start, unsigned long len, bool use_memmove) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char *kaddr; @@ -4364,7 +4368,7 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, static void memset_extent_buffer(const struct extent_buffer *eb, int c, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; unsigned long cur = start; if (eb->addr) { @@ -4395,7 +4399,7 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, void copy_extent_buffer_full(const struct extent_buffer *dst, const struct extent_buffer *src) { - const int unit_size = folio_size(src->folios[0]); + const int unit_size = src->folio_size; unsigned long cur = 0; ASSERT(dst->len == src->len); @@ -4417,7 +4421,7 @@ void copy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - const int unit_size = folio_size(dst->folios[0]); + const int unit_size = dst->folio_size; u64 dst_len = dst->len; size_t cur; size_t offset; @@ -4473,10 +4477,10 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, * the bitmap item in the extent buffer + the offset of the byte in the * bitmap item. 
*/ - offset = start + offset_in_folio(eb->folios[0], eb->start) + byte_offset; + offset = start + offset_in_eb_folio(eb, eb->start) + byte_offset; - *folio_index = offset >> folio_shift(eb->folios[0]); - *folio_offset = offset_in_folio(eb->folios[0], offset); + *folio_index = offset >> eb->folio_shift; + *folio_offset = offset_in_eb_folio(eb, offset); } /* @@ -4590,7 +4594,7 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - const int unit_size = folio_size(dst->folios[0]); + const int unit_size = dst->folio_size; unsigned long cur_off = 0; if (check_eb_range(dst, dst_offset, len) || diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 46050500529bff..8e5639597800a7 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -8,6 +8,7 @@ #include #include #include "compression.h" +#include "messages.h" #include "ulist.h" #include "misc.h" @@ -75,7 +76,8 @@ void __cold extent_buffer_free_cachep(void); #define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE) struct extent_buffer { u64 start; - unsigned long len; + u32 len; + u32 folio_size; unsigned long bflags; struct btrfs_fs_info *fs_info; @@ -90,6 +92,7 @@ struct extent_buffer { int read_mirror; /* >= 0 if eb belongs to a log tree, -1 otherwise */ s8 log_index; + u8 folio_shift; struct rcu_head rcu_head; struct rw_semaphore lock; @@ -113,6 +116,13 @@ struct btrfs_eb_write_context { struct btrfs_block_group *zoned_bg; }; +static inline unsigned long offset_in_eb_folio(const struct extent_buffer *eb, + u64 start) +{ + ASSERT(eb->folio_size); + return start & (eb->folio_size - 1); +} + /* * Get the correct offset inside the page of extent buffer. * @@ -151,13 +161,13 @@ static inline unsigned long get_eb_folio_index(const struct extent_buffer *eb, * the folio_shift would be large enough to always make us * return 0 as index. * 1.2) Several page sized folios - * The folio_shift() would be PAGE_SHIFT, giving us the correct + * The folio_shift would be PAGE_SHIFT, giving us the correct * index. * * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case * The folio would only be page sized, and always give us 0 as index. */ - return offset >> folio_shift(eb->folios[0]); + return offset >> eb->folio_shift; } /* From 82f275a092f1ced7bb5eec018f26ea08986554aa Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 9 Jan 2024 15:46:25 +0000 Subject: [PATCH 0348/1406] btrfs: remove extent_map_tree forward declaration at extent_io.h There's no need to do a forward declaration of struct extent_map_tree at extent_io.h, as there are no function prototypes, inline functions or data structures that refer to struct extent_map_tree. So remove that forward declaration, which is not needed since commit 477a30ba5f8d ("btrfs: Sink extent_tree arguments in try_release_extent_mapping"). 
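As a general C illustration (not code from this patch), a forward declaration in a header only earns its keep when a later declaration refers to the type without needing its full definition, e.g.:

	struct extent_map_tree;	/* opaque declaration suffices... */
	int release_mapping(struct extent_map_tree *tree);	/* ...for a pointer parameter (hypothetical prototype) */

Since nothing of that kind remains in extent_io.h, the declaration is dead and can go.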
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 8e5639597800a7..3cbececc8c6ddd 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -215,8 +215,6 @@ static inline void extent_changeset_free(struct extent_changeset *changeset) kfree(changeset); } -struct extent_map_tree; - int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); From e36e3767884f9df7cc30c14043af2fdf15c1226a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 2 Jan 2024 15:18:07 -0500 Subject: [PATCH 0349/1406] btrfs: WARN_ON_ONCE() in our leak detection code fstests looks for WARN_ON's in dmesg. Add WARN_ON_ONCE() to our leak detection code (enabled only in debug builds) so that fstests will fail if these things trip at all. This will allow us to easily catch problems with our reference counting that may otherwise go unnoticed. Reviewed-by: Neal Gompa Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-io-tree.c | 1 + fs/btrfs/extent_io.c | 1 + 3 files changed, 3 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 57b44c1b85a89c..3befabaaaa1896 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1244,6 +1244,7 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "leaked root %s refcount %d", btrfs_root_name(&root->root_key, buf), refcount_read(&root->refs)); + WARN_ON_ONCE(1); while (refcount_read(&root->refs) > 1) btrfs_put_root(root); btrfs_put_root(root); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index e3ee5449cc4af7..1544e7b1eaed30 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -48,6 +48,7 @@ static inline void btrfs_extent_state_leak_debug_check(void) extent_state_in_tree(state), refcount_read(&state->refs)); list_del(&state->leak_list); + WARN_ON_ONCE(1); kmem_cache_free(extent_state_cache, state); } } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a9f95d66cb696d..33a5f17e86b2c0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -82,6 +82,7 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) eb->start, eb->len, atomic_read(&eb->refs), eb->bflags, btrfs_header_owner(eb)); list_del(&eb->leak_list); + WARN_ON_ONCE(1); kmem_cache_free(extent_buffer_cache, eb); } spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); From 8c3ea568565698b2e4fbbc50e15211293625acad Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Dec 2023 16:13:29 +0000 Subject: [PATCH 0350/1406] btrfs: add set_folio_extent_mapped() helper Turn set_page_extent_mapped() into a wrapper around this version. Saves a call to compound_head() for callers who already have a folio and removes a couple of users of page->mapping. 
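The effect on call sites can be sketched as follows (illustrative lines, not hunks from this patch): a caller that already holds a folio no longer has to go through the page-based wrapper and the page_folio()/compound_head() work hidden inside it:

	/* before: only a page-based entry point existed */
	ret = set_page_extent_mapped(&folio->page);

	/* after: folio-aware callers use the new helper directly */
	ret = set_folio_extent_mapped(folio);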
Reviewed-by: Johannes Thumshirn Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 12 ++++++++---- fs/btrfs/extent_io.h | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 33a5f17e86b2c0..b02acecc433b44 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -930,17 +930,21 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, int set_page_extent_mapped(struct page *page) { - struct folio *folio = page_folio(page); + return set_folio_extent_mapped(page_folio(page)); +} + +int set_folio_extent_mapped(struct folio *folio) +{ struct btrfs_fs_info *fs_info; - ASSERT(page->mapping); + ASSERT(folio->mapping); if (folio_test_private(folio)) return 0; - fs_info = btrfs_sb(page->mapping->host->i_sb); + fs_info = btrfs_sb(folio->mapping->host->i_sb); - if (btrfs_is_subpage(fs_info, page->mapping)) + if (btrfs_is_subpage(fs_info, folio->mapping)) return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 3cbececc8c6ddd..4437607f2b0601 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -229,6 +229,7 @@ int btree_write_cache_pages(struct address_space *mapping, void extent_readahead(struct readahead_control *rac); int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); +int set_folio_extent_mapped(struct folio *folio); int set_page_extent_mapped(struct page *page); void clear_page_extent_mapped(struct page *page); From 683a1784c24839d6be057a1e6fcff119074a75e8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Dec 2023 16:13:30 +0000 Subject: [PATCH 0351/1406] btrfs: convert defrag_prepare_one_page() to use a folio Use a folio throughout defrag_prepare_one_page() to remove dozens of hidden calls to compound_head(). There is no support here for large folios; indeed, turn the existing check for PageCompound into a check for large folios. Reviewed-by: Johannes Thumshirn Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 53 ++++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index c276b136ab63a1..04229a029d6fe1 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -868,13 +868,14 @@ static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t i u64 page_start = (u64)index << PAGE_SHIFT; u64 page_end = page_start + PAGE_SIZE - 1; struct extent_state *cached_state = NULL; - struct page *page; + struct folio *folio; int ret; again: - page = find_or_create_page(mapping, index, mask); - if (!page) - return ERR_PTR(-ENOMEM); + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); + if (IS_ERR(folio)) + return &folio->page; /* * Since we can defragment files opened read-only, we can encounter @@ -884,16 +885,16 @@ static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t i * executables that explicitly enable them, so this isn't very * restrictive. 
*/ - if (PageCompound(page)) { - unlock_page(page); - put_page(page); + if (folio_test_large(folio)) { + folio_unlock(folio); + folio_put(folio); return ERR_PTR(-ETXTBSY); } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return ERR_PTR(ret); } @@ -908,17 +909,17 @@ static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t i if (!ordered) break; - unlock_page(page); + folio_unlock(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); - lock_page(page); + folio_lock(folio); /* - * We unlocked the page above, so we need check if it was + * We unlocked the folio above, so we need check if it was * released or not. */ - if (page->mapping != mapping || !PagePrivate(page)) { - unlock_page(page); - put_page(page); + if (folio->mapping != mapping || !folio->private) { + folio_unlock(folio); + folio_put(folio); goto again; } } @@ -927,21 +928,21 @@ static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t i * Now the page range has no ordered extent any more. Read the page to * make it uptodate. */ - if (!PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (page->mapping != mapping || !PagePrivate(page)) { - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != mapping || !folio->private) { + folio_unlock(folio); + folio_put(folio); goto again; } - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); + folio_put(folio); return ERR_PTR(-EIO); } } - return page; + return &folio->page; } struct defrag_target_range { From 23d64e6784d0a5879d3c4694f31223c1e536ef3e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Dec 2023 16:13:31 +0000 Subject: [PATCH 0352/1406] btrfs: use a folio array throughout the defrag process Remove more hidden calls to compound_head() by using an array of folios instead of pages. Also neaten the error path in defrag_one_range() by adjusting the length of the array instead of checking for NULL. Reviewed-by: Johannes Thumshirn Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 04229a029d6fe1..dd1b5a060366f9 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -861,7 +861,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, * NOTE: Caller should also wait for page writeback after the cluster is * prepared, here we don't do writeback wait for each page. 
*/ -static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index) +static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t index) { struct address_space *mapping = inode->vfs_inode.i_mapping; gfp_t mask = btrfs_alloc_write_mask(mapping); @@ -875,7 +875,7 @@ static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t i folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); if (IS_ERR(folio)) - return &folio->page; + return folio; /* * Since we can defragment files opened read-only, we can encounter @@ -942,7 +942,7 @@ static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t i return ERR_PTR(-EIO); } } - return &folio->page; + return folio; } struct defrag_target_range { @@ -1163,7 +1163,7 @@ static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); */ static int defrag_one_locked_target(struct btrfs_inode *inode, struct defrag_target_range *target, - struct page **pages, int nr_pages, + struct folio **folios, int nr_pages, struct extent_state **cached_state) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1172,7 +1172,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, const u64 len = target->len; unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; unsigned long start_index = start >> PAGE_SHIFT; - unsigned long first_index = page_index(pages[0]); + unsigned long first_index = folios[0]->index; int ret = 0; int i; @@ -1189,8 +1189,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, /* Update the page status */ for (i = start_index - first_index; i <= last_index - first_index; i++) { - ClearPageChecked(pages[i]); - btrfs_folio_clamp_set_dirty(fs_info, page_folio(pages[i]), start, len); + folio_clear_checked(folios[i]); + btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len); } btrfs_delalloc_release_extents(inode, len); extent_changeset_free(data_reserved); @@ -1206,7 +1206,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, struct defrag_target_range *entry; struct defrag_target_range *tmp; LIST_HEAD(target_list); - struct page **pages; + struct folio **folios; const u32 sectorsize = inode->root->fs_info->sectorsize; u64 last_index = (start + len - 1) >> PAGE_SHIFT; u64 start_index = start >> PAGE_SHIFT; @@ -1217,21 +1217,21 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) + folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_NOFS); + if (!folios) return -ENOMEM; /* Prepare all pages */ for (i = 0; i < nr_pages; i++) { - pages[i] = defrag_prepare_one_page(inode, start_index + i); - if (IS_ERR(pages[i])) { - ret = PTR_ERR(pages[i]); - pages[i] = NULL; - goto free_pages; + folios[i] = defrag_prepare_one_folio(inode, start_index + i); + if (IS_ERR(folios[i])) { + ret = PTR_ERR(folios[i]); + nr_pages = i; + goto free_folios; } } for (i = 0; i < nr_pages; i++) - wait_on_page_writeback(pages[i]); + folio_wait_writeback(folios[i]); /* Lock the pages range */ lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, @@ -1251,7 +1251,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, goto unlock_extent; list_for_each_entry(entry, &target_list, list) { - ret = defrag_one_locked_target(inode, entry, pages, nr_pages, + ret = defrag_one_locked_target(inode, entry, 
folios, nr_pages, &cached_state); if (ret < 0) break; @@ -1265,14 +1265,12 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT, (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, &cached_state); -free_pages: +free_folios: for (i = 0; i < nr_pages; i++) { - if (pages[i]) { - unlock_page(pages[i]); - put_page(pages[i]); - } + folio_unlock(folios[i]); + folio_put(folios[i]); } - kfree(pages); + kfree(folios); return ret; } From 6bfddf2ed8423d1ccae403c1dd6d63136c27265a Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 10 Jan 2024 19:56:13 -0600 Subject: [PATCH 0353/1406] btrfs: page to folio conversion in btrfs_truncate_block() Convert use of struct page to struct folio inside btrfs_truncate_block(). The only page based function is set_page_extent_mapped(). All other functions have folio equivalents. Had to use __filemap_get_folio() because filemap_grab_folio() does not allow passing allocation mask as a parameter. Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Reviewed-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/inode.c | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index da7946c44d1eda..25090d23834bdf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4680,7 +4680,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; unsigned offset = from & (blocksize - 1); - struct page *page; + struct folio *folio; gfp_t mask = btrfs_alloc_write_mask(mapping); size_t write_bytes = blocksize; int ret = 0; @@ -4712,8 +4712,9 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, goto out; } again: - page = find_or_create_page(mapping, index, mask); - if (!page) { + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); + if (IS_ERR(folio)) { btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); btrfs_delalloc_release_extents(inode, blocksize); @@ -4721,15 +4722,15 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, goto out; } - if (!PageUptodate(page)) { - ret = btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (page->mapping != mapping) { - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) { + ret = btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); goto again; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto out_unlock; } @@ -4741,19 +4742,19 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, * folio private, but left the page in the mapping. Set the page mapped * here to make sure it's properly set for the subpage stuff. 
*/ - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) goto out_unlock; - wait_on_page_writeback(page); + folio_wait_writeback(folio); lock_extent(io_tree, block_start, block_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { unlock_extent(io_tree, block_start, block_end, &cached_state); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; @@ -4774,15 +4775,16 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, if (!len) len = blocksize - offset; if (front) - memzero_page(page, (block_start - page_offset(page)), - offset); + folio_zero_range(folio, block_start - folio_pos(folio), + offset); else - memzero_page(page, (block_start - page_offset(page)) + offset, - len); + folio_zero_range(folio, + (block_start - folio_pos(folio)) + offset, + len); } - btrfs_folio_clear_checked(fs_info, page_folio(page), block_start, + btrfs_folio_clear_checked(fs_info, folio, block_start, block_end + 1 - block_start); - btrfs_folio_set_dirty(fs_info, page_folio(page), block_start, + btrfs_folio_set_dirty(fs_info, folio, block_start, block_end + 1 - block_start); unlock_extent(io_tree, block_start, block_end, &cached_state); @@ -4799,8 +4801,8 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, block_start, blocksize, true); } btrfs_delalloc_release_extents(inode, blocksize); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out: if (only_release_metadata) btrfs_check_nocow_unlock(inode); From 670454f618b7774a3dbfa1fcf940959adf0ca174 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 17 Jan 2024 02:25:42 -0800 Subject: [PATCH 0354/1406] btrfs: remove duplicate recording of physical address Remove the duplicate physical recording of the original write physical address in case of a single device write. This duplicated code is most likely present due to a rebase error. Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 928f512cdb4a74..2d20215548db58 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -509,8 +509,6 @@ static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, if (!bioc) { /* Single mirror read/write fast path. */ btrfs_bio(bio)->mirror_num = mirror_num; - if (bio_op(bio) != REQ_OP_READ) - btrfs_bio(bio)->orig_physical = smap->physical; bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; if (bio_op(bio) != REQ_OP_READ) btrfs_bio(bio)->orig_physical = smap->physical; From 242f1c34d013606f4f6febaf50519d4890136c29 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 16 Jan 2024 17:33:20 +0100 Subject: [PATCH 0355/1406] btrfs: replace sb::s_blocksize by fs_info::sectorsize The block size stored in the super block is used by subsystems outside of btrfs and it's a copy of fs_info::sectorsize. Unify that to always use our sectorsize, with the exception of mount where we first need to use fixed values (4K) until we read the super block and can set the sectorsize. Replace all uses, in most cases it's fewer pointer indirections. 
Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 ++ fs/btrfs/extent_io.c | 4 ++-- fs/btrfs/inode.c | 2 +- fs/btrfs/ioctl.c | 2 +- fs/btrfs/reflink.c | 6 +++--- fs/btrfs/send.c | 2 +- fs/btrfs/super.c | 2 +- 7 files changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3befabaaaa1896..132d7b846b4ac3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2840,6 +2840,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block int ret; fs_info->sb = sb; + /* Temporary fixed values for block size until we read the superblock. */ sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); @@ -3357,6 +3358,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); + /* Update the values for the current filesystem. */ sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b02acecc433b44..c1b15a8efef5ee 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1015,7 +1015,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, int ret = 0; size_t pg_offset = 0; size_t iosize; - size_t blocksize = inode->i_sb->s_blocksize; + size_t blocksize = fs_info->sectorsize; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; ret = set_page_extent_mapped(page); @@ -2305,7 +2305,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, struct extent_state *cached_state = NULL; u64 start = folio_pos(folio); u64 end = start + folio_size(folio) - 1; - size_t blocksize = folio->mapping->host->i_sb->s_blocksize; + size_t blocksize = btrfs_sb(folio->mapping->host->i_sb)->sectorsize; /* This function is only called for the btree inode */ ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 25090d23834bdf..bedd8703bfa615 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8686,7 +8686,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap, u64 delalloc_bytes; u64 inode_bytes; struct inode *inode = d_inode(path->dentry); - u32 blocksize = inode->i_sb->s_blocksize; + u32 blocksize = btrfs_sb(inode->i_sb)->sectorsize; u32 bi_flags = BTRFS_I(inode)->flags; u32 bi_ro_flags = BTRFS_I(inode)->ro_flags; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ac3316e0d11c3a..3d476decde52cf 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -528,7 +528,7 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, * block group is in the logical address space, which can be any * sectorsize aligned bytenr in the range [0, U64_MAX]. 
*/ - if (range.len < fs_info->sb->s_blocksize) + if (range.len < fs_info->sectorsize) return -EINVAL; range.minlen = max(range.minlen, minlen); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index ae90894dc7dc7d..e38cb40e150c96 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -663,7 +663,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, struct inode *dst, u64 dst_loff) { struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info; - const u64 bs = fs_info->sb->s_blocksize; + const u64 bs = fs_info->sectorsize; int ret; /* @@ -730,7 +730,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, int ret; int wb_ret; u64 len = olen; - u64 bs = fs_info->sb->s_blocksize; + u64 bs = fs_info->sectorsize; /* * VFS's generic_remap_file_range_prep() protects us from cloning the @@ -796,7 +796,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); - u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; + u64 bs = BTRFS_I(inode_out)->root->fs_info->sectorsize; u64 wb_len; int ret; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7902298c1f25bb..141ab89fb63ee8 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6140,7 +6140,7 @@ static int send_write_or_clone(struct send_ctx *sctx, int ret = 0; u64 offset = key->offset; u64 end; - u64 bs = sctx->send_root->fs_info->sb->s_blocksize; + u64 bs = sctx->send_root->fs_info->sectorsize; end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size); if (offset >= end) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 101f786963d4d7..c45fdaf24cd1c2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1767,7 +1767,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; - buf->f_bsize = dentry->d_sb->s_blocksize; + buf->f_bsize = fs_info->sectorsize; buf->f_namelen = BTRFS_NAME_LEN; /* We treat it as constant endianness (it doesn't matter _which_) From 02372039509bfde161174e32df00e14d63262e45 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 16 Jan 2024 18:17:14 +0100 Subject: [PATCH 0356/1406] btrfs: replace i_blocksize by fs_info::sectorsize The block size calculated by i_blocksize from inode is the same as what we have in fs_info, initialized in inode_init_always(). Unify that to use the fs_info value everywhere.
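For reference, a sketch of why the two values are guaranteed to match, based on the generic VFS definitions (paraphrased here for illustration, not part of the patch):

/* include/linux/fs.h */
static inline unsigned int i_blocksize(const struct inode *node)
{
	return (1 << node->i_blkbits);
}

/*
 * inode_init_always() sets inode->i_blkbits = sb->s_blocksize_bits, and btrfs
 * sets sb->s_blocksize_bits = blksize_bits(sectorsize) at mount time, so
 * i_blocksize(inode) == fs_info->sectorsize for every btrfs inode.
 */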
Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f8e1a7ce3d39ae..bd8d13740f41fa 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3004,7 +3004,7 @@ static int btrfs_zero_range(struct inode *inode, } ret = btrfs_prealloc_file_range(inode, mode, alloc_start, alloc_end - alloc_start, - i_blocksize(inode), + fs_info->sectorsize, offset + len, &alloc_hint); unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); @@ -3176,7 +3176,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (!ret) { ret = btrfs_prealloc_file_range(inode, mode, range->start, - range->len, i_blocksize(inode), + range->len, blocksize, offset + len, &alloc_hint); /* * btrfs_prealloc_file_range() releases space even From a670951217d1143789a63aeca5c955361e879b52 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 25 Jan 2024 17:44:47 +0100 Subject: [PATCH 0357/1406] btrfs: remove unused included headers With help of neovim, LSP and clangd we can identify header files that are not actually needed to be included in the .c files. This is focused only on removal (with minor fixups), further cleanups are possible but will require doing the header files properly with forward declarations, minimized includes and include-what-you-use care. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.c | 3 ++- fs/btrfs/acl.c | 1 - fs/btrfs/async-thread.c | 1 - fs/btrfs/bio.c | 1 - fs/btrfs/block-rsv.c | 1 - fs/btrfs/compression.c | 5 +---- fs/btrfs/defrag.c | 1 - fs/btrfs/delalloc-space.c | 2 -- fs/btrfs/dev-replace.c | 2 -- fs/btrfs/disk-io.c | 1 - fs/btrfs/export.c | 1 - fs/btrfs/extent-io-tree.c | 1 - fs/btrfs/extent-tree.c | 5 +---- fs/btrfs/extent_io.c | 2 -- fs/btrfs/extent_map.c | 1 - fs/btrfs/file-item.c | 3 --- fs/btrfs/file-item.h | 2 ++ fs/btrfs/file.c | 2 -- fs/btrfs/free-space-cache.c | 2 -- fs/btrfs/fs.h | 1 - fs/btrfs/inode-item.c | 1 - fs/btrfs/inode.c | 2 -- fs/btrfs/ioctl.c | 4 ---- fs/btrfs/locking.c | 1 - fs/btrfs/messages.c | 2 -- fs/btrfs/ordered-data.c | 1 - fs/btrfs/orphan.c | 1 - fs/btrfs/raid-stripe-tree.c | 1 - fs/btrfs/raid56.c | 1 - fs/btrfs/root-tree.c | 1 - fs/btrfs/send.c | 1 - fs/btrfs/space-info.c | 1 - fs/btrfs/super.c | 2 -- fs/btrfs/transaction.c | 2 -- fs/btrfs/tree-checker.c | 2 -- fs/btrfs/tree-log.c | 2 -- fs/btrfs/ulist.c | 1 - fs/btrfs/uuid-tree.c | 1 - fs/btrfs/verity.c | 1 - fs/btrfs/volumes.c | 2 -- fs/btrfs/zoned.c | 2 -- fs/btrfs/zstd.c | 1 - 42 files changed, 6 insertions(+), 65 deletions(-) diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c index 6eb850ad37d2ae..79026917db19dc 100644 --- a/fs/btrfs/accessors.c +++ b/fs/btrfs/accessors.c @@ -5,7 +5,8 @@ #include #include "messages.h" -#include "ctree.h" +#include "extent_io.h" +#include "fs.h" #include "accessors.h" static bool check_setget_bounds(const struct extent_buffer *eb, diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 7427449a04a3f2..e0ba00d64ea0bf 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -12,7 +12,6 @@ #include #include #include "ctree.h" -#include "btrfs_inode.h" #include "xattr.h" #include "acl.h" diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 9e261aac671e62..361a866c19955a 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -11,7 +11,6 @@ #include #include #include "async-thread.h" -#include "ctree.h" enum { WORK_DONE_BIT, diff --git a/fs/btrfs/bio.c 
b/fs/btrfs/bio.c index 2d20215548db58..960b81718e2958 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -11,7 +11,6 @@ #include "raid56.h" #include "async-thread.h" #include "dev-replace.h" -#include "rcu-string.h" #include "zoned.h" #include "file-item.h" #include "raid-stripe-tree.h" diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index ceb5f586a2d555..27207dad27c29b 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -6,7 +6,6 @@ #include "space-info.h" #include "transaction.h" #include "block-group.h" -#include "disk-io.h" #include "fs.h" #include "accessors.h" diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 68345f73d429aa..488089acd49f13 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -25,8 +25,6 @@ #include "misc.h" #include "ctree.h" #include "fs.h" -#include "disk-io.h" -#include "transaction.h" #include "btrfs_inode.h" #include "bio.h" #include "ordered-data.h" @@ -34,8 +32,7 @@ #include "extent_io.h" #include "extent_map.h" #include "subpage.h" -#include "zoned.h" -#include "file-item.h" +#include "messages.h" #include "super.h" static struct bio_set btrfs_compressed_bioset; diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index dd1b5a060366f9..8fc8118c322501 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -6,7 +6,6 @@ #include #include "ctree.h" #include "disk-io.h" -#include "print-tree.h" #include "transaction.h" #include "locking.h" #include "accessors.h" diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 2833e8ef4c098f..4a60a679d7b445 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -6,9 +6,7 @@ #include "block-rsv.h" #include "btrfs_inode.h" #include "space-info.h" -#include "transaction.h" #include "qgroup.h" -#include "block-group.h" #include "fs.h" /* diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 1502d664c89273..a13e1a91870e93 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -11,10 +11,8 @@ #include #include "misc.h" #include "ctree.h" -#include "extent_map.h" #include "disk-io.h" #include "transaction.h" -#include "print-tree.h" #include "volumes.h" #include "async-thread.h" #include "dev-replace.h" diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 132d7b846b4ac3..26c11fce5e4e00 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -29,7 +29,6 @@ #include "tree-log.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "rcu-string.h" #include "dev-replace.h" #include "raid56.h" #include "sysfs.h" diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 744a02b7fd6717..3f2e8fb9e3e950 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -5,7 +5,6 @@ #include "ctree.h" #include "disk-io.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "export.h" #include "accessors.h" #include "super.h" diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 1544e7b1eaed30..6b923c0ef4ea57 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -6,7 +6,6 @@ #include "ctree.h" #include "extent-io-tree.h" #include "btrfs_inode.h" -#include "misc.h" static struct kmem_cache *extent_state_cache; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8e8cc11112772d..f4ab437d4160b7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -18,7 +18,7 @@ #include #include "ctree.h" #include "extent-tree.h" -#include "tree-log.h" +#include "transaction.h" #include "disk-io.h" #include "print-tree.h" #include "volumes.h" @@ -26,14 
+26,11 @@ #include "locking.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "sysfs.h" #include "qgroup.h" #include "ref-verify.h" #include "space-info.h" #include "block-rsv.h" -#include "delalloc-space.h" #include "discard.h" -#include "rcu-string.h" #include "zoned.h" #include "dev-replace.h" #include "fs.h" diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c1b15a8efef5ee..8648ea9b5fb53a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -14,7 +14,6 @@ #include #include #include -#include "misc.h" #include "extent_io.h" #include "extent-io-tree.h" #include "extent_map.h" @@ -22,7 +21,6 @@ #include "btrfs_inode.h" #include "bio.h" #include "locking.h" -#include "rcu-string.h" #include "backref.h" #include "disk-io.h" #include "subpage.h" diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index b61099bf97a824..e9b20fbbdfcad7 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -5,7 +5,6 @@ #include #include "messages.h" #include "ctree.h" -#include "volumes.h" #include "extent_map.h" #include "compression.h" #include "btrfs_inode.h" diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 81ac1d474bf183..f7ef9fa469b946 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -10,17 +10,14 @@ #include #include #include "messages.h" -#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "bio.h" -#include "print-tree.h" #include "compression.h" #include "fs.h" #include "accessors.h" #include "file-item.h" -#include "super.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 04bd2d34efb14b..606731bef247b0 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -5,6 +5,8 @@ #include "accessors.h" +struct extent_map; + #define BTRFS_FILE_EXTENT_INLINE_DATA_START \ (offsetof(struct btrfs_file_extent_item, disk_bytenr)) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index bd8d13740f41fa..4bca37fd6833ad 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -22,10 +22,8 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "tree-log.h" #include "locking.h" -#include "volumes.h" #include "qgroup.h" #include "compression.h" #include "delalloc-space.h" diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d372c7ce0e6b43..f74b13f9b19343 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -19,9 +19,7 @@ #include "transaction.h" #include "disk-io.h" #include "extent_io.h" -#include "volumes.h" #include "space-info.h" -#include "delalloc-space.h" #include "block-group.h" #include "discard.h" #include "subpage.h" diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index f8bb73d6ab68c4..b747134fac7742 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -8,7 +8,6 @@ #include #include #include "extent-io-tree.h" -#include "extent_map.h" #include "async-thread.h" #include "block-rsv.h" diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 7d734830e514eb..9c1394c0a6d72d 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -9,7 +9,6 @@ #include "inode-item.h" #include "disk-io.h" #include "transaction.h" -#include "print-tree.h" #include "space-info.h" #include "accessors.h" #include "extent-tree.h" diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bedd8703bfa615..6734717350e35a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -39,14 
+39,12 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" #include "bio.h" #include "compression.h" #include "locking.h" -#include "free-space-cache.h" #include "props.h" #include "qgroup.h" #include "delalloc-space.h" diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3d476decde52cf..46f9a6645bf6d1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -34,11 +34,9 @@ #include "export.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "volumes.h" #include "locking.h" #include "backref.h" -#include "rcu-string.h" #include "send.h" #include "dev-replace.h" #include "props.h" @@ -47,9 +45,7 @@ #include "tree-log.h" #include "compression.h" #include "space-info.h" -#include "delalloc-space.h" #include "block-group.h" -#include "subpage.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 74d8e2003f58c0..286e6aa721c73d 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -13,7 +13,6 @@ #include "ctree.h" #include "extent_io.h" #include "locking.h" -#include "accessors.h" /* * Lockdep class keys for extent_buffer->lock's in this root. For a given diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index cdada4865837fc..c96dd66fd0f722 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -3,8 +3,6 @@ #include "fs.h" #include "messages.h" #include "discard.h" -#include "transaction.h" -#include "space-info.h" #include "super.h" #ifdef CONFIG_PRINTK diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 59850dc17b22f0..de12c282e69bcf 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -19,7 +19,6 @@ #include "qgroup.h" #include "subpage.h" #include "file.h" -#include "super.h" static struct kmem_cache *btrfs_ordered_extent_cache; diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 7a1b021b5669d2..6195a2215b8fee 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -4,7 +4,6 @@ */ #include "ctree.h" -#include "disk-io.h" #include "orphan.h" int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 9589362acfbf9e..6af6b4b9a32ef9 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -11,7 +11,6 @@ #include "disk-io.h" #include "raid-stripe-tree.h" #include "volumes.h" -#include "misc.h" #include "print-tree.h" int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 792c8e17c31d76..5c4bf3f907c1a1 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -14,7 +14,6 @@ #include #include #include "messages.h" -#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "volumes.h" diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 603ad1459368c3..3f6d10eb1aafca 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -10,7 +10,6 @@ #include "messages.h" #include "transaction.h" #include "disk-io.h" -#include "print-tree.h" #include "qgroup.h" #include "space-info.h" #include "accessors.h" diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 141ab89fb63ee8..14ea3085073905 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -25,7 +25,6 @@ #include "btrfs_inode.h" #include "transaction.h" #include "compression.h" -#include "xattr.h" #include "print-tree.h" #include "accessors.h" #include "dir-item.h" diff 
--git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 571bb13587d5e7..a5b652c1650ad8 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -9,7 +9,6 @@ #include "ordered-data.h" #include "transaction.h" #include "block-group.h" -#include "zoned.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index c45fdaf24cd1c2..40ae264fd3ed50 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -34,13 +34,11 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "props.h" #include "xattr.h" #include "bio.h" #include "export.h" #include "compression.h" -#include "rcu-string.h" #include "dev-replace.h" #include "free-space-cache.h" #include "backref.h" diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5b3333ceef0481..70d7abd1f772f1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -23,12 +23,10 @@ #include "qgroup.h" #include "block-group.h" #include "space-info.h" -#include "zoned.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" -#include "defrag.h" #include "dir-item.h" #include "uuid-tree.h" #include "ioctl.h" diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 6eccf8496486c0..4fa95eca285ec1 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -21,7 +21,6 @@ #include "messages.h" #include "ctree.h" #include "tree-checker.h" -#include "disk-io.h" #include "compression.h" #include "volumes.h" #include "misc.h" @@ -30,7 +29,6 @@ #include "file-item.h" #include "inode-item.h" #include "dir-item.h" -#include "raid-stripe-tree.h" #include "extent-tree.h" /* diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 331fc7429952fd..043b8df5665ff7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -13,13 +13,11 @@ #include "tree-log.h" #include "disk-io.h" #include "locking.h" -#include "print-tree.h" #include "backref.h" #include "compression.h" #include "qgroup.h" #include "block-group.h" #include "space-info.h" -#include "zoned.h" #include "inode-item.h" #include "fs.h" #include "accessors.h" diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index b4ac2b0cd2359a..183863f4bfa417 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -7,7 +7,6 @@ #include #include "messages.h" #include "ulist.h" -#include "ctree.h" /* * ulist is a generic data structure to hold a collection of unique u64 diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 5be74f9e47ebf3..b8c6e46dd499a5 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -9,7 +9,6 @@ #include "ctree.h" #include "transaction.h" #include "disk-io.h" -#include "print-tree.h" #include "fs.h" #include "accessors.h" #include "uuid-tree.h" diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 66e2270b0dae9f..4042dd6437aefa 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -14,7 +14,6 @@ #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" -#include "disk-io.h" #include "locking.h" #include "fs.h" #include "accessors.h" diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d67785be2c778c..474ab7ed65ea92 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -14,10 +14,8 @@ #include #include "misc.h" #include "ctree.h" -#include "extent_map.h" #include "disk-io.h" #include "transaction.h" -#include "print-tree.h" #include "volumes.h" #include "raid56.h" #include "rcu-string.h" diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 
168af9d000d168..d9716456bce03a 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -12,10 +12,8 @@ #include "rcu-string.h" #include "disk-io.h" #include "block-group.h" -#include "transaction.h" #include "dev-replace.h" #include "space-info.h" -#include "super.h" #include "fs.h" #include "accessors.h" #include "bio.h" diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 0d66db8bc1d477..4cba8176b074a8 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -19,7 +19,6 @@ #include #include "misc.h" #include "compression.h" -#include "ctree.h" #define ZSTD_BTRFS_MAX_WINDOWLOG 17 #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) From d6d1fdfaa36c740272d73a84116e791b8db46c33 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 23 Jan 2024 13:33:30 +1030 Subject: [PATCH 0358/1406] btrfs: zstd: fix and simplify the inline extent decompression [BUG] If we have a filesystem with 4k sectorsize, and an inlined compressed extent created like this: item 4 key (257 INODE_ITEM 0) itemoff 15863 itemsize 160 generation 8 transid 8 size 4096 nbytes 4096 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 1 flags 0x0(none) item 5 key (257 INODE_REF 256) itemoff 15839 itemsize 24 index 2 namelen 14 name: source_inlined item 6 key (257 EXTENT_DATA 0) itemoff 15770 itemsize 69 generation 8 type 0 (inline) inline extent data size 48 ram_bytes 4096 compression 3 (zstd) Then trying to reflink that extent in an aarch64 system with 64K page size, the reflink would just fail: # xfs_io -f -c "reflink $mnt/source_inlined 0 60k 4k" $mnt/dest XFS_IOC_CLONE_RANGE: Input/output error [CAUSE] In zstd_decompress(), we didn't treat @start_byte as just a page offset, but also used it as an indicator on whether we should error out, without any proper explanation (this is copied from other decompression code). In reality, for subpage cases, although @start_byte can be non-zero, we should never switch input/output buffer nor error out, since the whole input/output buffer should never exceed one sector, thus we should not need to do any buffer switch. Thus the current code using @start_byte as a condition to switch input/output buffer or finish the decompression is completely incorrect. [FIX] The fix involves several modifications: - Rename @start_byte to @dest_pgoff to properly express its meaning - Use @sectorsize rather than PAGE_SIZE to properly initialize the output buffer size - Use correct destination offset inside the destination page - Simplify the main loop Since the input/output buffer should never switch, we only need one zstd_decompress_stream() call.
- Consider early end as an error After the fix, even on 64K page sized aarch64, the above reflink now works as expected: # xfs_io -f -c "reflink $mnt/source_inlined 0 60k 4k" $mnt/dest linked 4096/4096 bytes at offset 61440 And results in the correct file layout: item 9 key (258 INODE_ITEM 0) itemoff 15542 itemsize 160 generation 10 transid 10 size 65536 nbytes 4096 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 1 flags 0x0(none) item 10 key (258 INODE_REF 256) itemoff 15528 itemsize 14 index 3 namelen 4 name: dest item 11 key (258 XATTR_ITEM 3817753667) itemoff 15445 itemsize 83 location key (0 UNKNOWN.0 0) type XATTR transid 10 data_len 37 name_len 16 name: security.selinux data unconfined_u:object_r:unlabeled_t:s0 item 12 key (258 EXTENT_DATA 61440) itemoff 15392 itemsize 53 generation 10 type 1 (regular) extent data disk byte 13631488 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression 0 (none) Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/compression.h | 2 +- fs/btrfs/zstd.c | 76 +++++++++++++----------------------------- 2 files changed, 24 insertions(+), 54 deletions(-) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index afd7e50d073d4a..97fe3ebf11a223 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -169,7 +169,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); void zstd_init_workspace_manager(void); void zstd_cleanup_workspace_manager(void); diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 4cba8176b074a8..92b3744b819bd4 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -18,7 +18,9 @@ #include #include #include "misc.h" +#include "fs.h" #include "compression.h" +#include "super.h" #define ZSTD_BTRFS_MAX_WINDOWLOG 17 #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) @@ -617,80 +619,48 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); + struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + const u32 sectorsize = fs_info->sectorsize; zstd_dstream *stream; int ret = 0; - size_t ret2; - unsigned long total_out = 0; - unsigned long pg_offset = 0; + unsigned long to_copy = 0; stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (!stream) { pr_warn("BTRFS: zstd_init_dstream failed\n"); - ret = -EIO; goto finish; } - destlen = min_t(size_t, destlen, PAGE_SIZE); - workspace->in_buf.src = data_in; workspace->in_buf.pos = 0; workspace->in_buf.size = srclen; workspace->out_buf.dst = workspace->buf; workspace->out_buf.pos = 0; - workspace->out_buf.size = PAGE_SIZE; - - ret2 = 1; - while (pg_offset < destlen - && workspace->in_buf.pos < workspace->in_buf.size) { - unsigned long buf_start; - unsigned long buf_offset; - unsigned long bytes; - - /* Check if the frame is over and we still need more input */ - if (ret2 == 0) { - pr_debug("BTRFS: zstd_decompress_stream ended early\n");
ret = -EIO; - goto finish; - } - ret2 = zstd_decompress_stream(stream, &workspace->out_buf, - &workspace->in_buf); - if (zstd_is_error(ret2)) { - pr_debug("BTRFS: zstd_decompress_stream returned %d\n", - zstd_get_error_code(ret2)); - ret = -EIO; - goto finish; - } - - buf_start = total_out; - total_out += workspace->out_buf.pos; - workspace->out_buf.pos = 0; - - if (total_out <= start_byte) - continue; - - if (total_out > start_byte && buf_start < start_byte) - buf_offset = start_byte - buf_start; - else - buf_offset = 0; - - bytes = min_t(unsigned long, destlen - pg_offset, - workspace->out_buf.size - buf_offset); - - memcpy_to_page(dest_page, pg_offset, - workspace->out_buf.dst + buf_offset, bytes); - - pg_offset += bytes; + workspace->out_buf.size = sectorsize; + + /* + * Since both input and output buffers should not exceed one sector, + * one call should end the decompression. + */ + ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); + if (zstd_is_error(ret)) { + pr_warn_ratelimited("BTRFS: zstd_decompress_stream return %d\n", + zstd_get_error_code(ret)); + goto finish; } - ret = 0; + to_copy = workspace->out_buf.pos; + memcpy_to_page(dest_page, dest_pgoff, workspace->out_buf.dst, to_copy); finish: - if (pg_offset < destlen) { - memzero_page(dest_page, pg_offset, destlen - pg_offset); + /* Error or early end. */ + if (unlikely(to_copy < destlen)) { + ret = -EIO; + memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); } return ret; } From 6f17c6cab662fd4f4130e7e3f6b4dcbf81fe81f5 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 22 Jan 2024 13:01:02 +0000 Subject: [PATCH 0359/1406] btrfs: zlib: Fix spelling mistake "infalte" -> "inflate" There is a spelling mistake in a warning message. Fix it. Signed-off-by: Colin Ian King Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zlib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 8da66ea699e8fe..e5b3f20038962f 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -398,7 +398,7 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, out: if (unlikely(to_copy != destlen)) { - pr_warn_ratelimited("BTRFS: infalte failed, decompressed=%lu expected=%zu\n", + pr_warn_ratelimited("BTRFS: inflate failed, decompressed=%lu expected=%zu\n", to_copy, destlen); ret = -EIO; } else { From 837672267e26cc8abb978462f0a3aab78bc92de3 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 12 Jan 2024 18:31:40 +0100 Subject: [PATCH 0360/1406] btrfs: handle errors returned from unpin_extent_cache() We've had numerous attempts to let function unpin_extent_cache() return void as it only returns 0. There are still error cases to handle so do that, in addition to the verbose messages. The only caller btrfs_finish_one_ordered() will now abort the transaction, previously it let it continue which could lead to further problems. Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 10 +++++++++- fs/btrfs/inode.c | 9 +++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index e9b20fbbdfcad7..1bf4d0319f6cbe 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -290,6 +290,10 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) * Called after an extent has been written to disk properly. Set the generation * to the generation that actually added the file item to the inode so we know * we need to sync this extent when we call fsync(). 
+ * + * Returns: 0 on success + * -ENOENT when the extent is not found in the tree + * -EUCLEAN if the found extent does not match the expected start */ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) { @@ -307,14 +311,18 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) "no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu", btrfs_ino(inode), btrfs_root_id(inode->root), start, len, gen); + ret = -ENOENT; goto out; } - if (WARN_ON(em->start != start)) + if (WARN_ON(em->start != start)) { btrfs_warn(fs_info, "found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu", btrfs_ino(inode), btrfs_root_id(inode->root), em->start, start, len, gen); + ret = -EUCLEAN; + goto out; + } em->generation = gen; em->flags &= ~EXTENT_FLAG_PINNED; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6734717350e35a..b2d348c9c93b18 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3125,8 +3125,13 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) ordered_extent->disk_num_bytes); } } - unpin_extent_cache(inode, ordered_extent->file_offset, - ordered_extent->num_bytes, trans->transid); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + ret = unpin_extent_cache(inode, ordered_extent->file_offset, + ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; From 62bcb604ba4f42f227cf941f54e069607bf3db06 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 12 Jan 2024 18:45:24 +0100 Subject: [PATCH 0361/1406] btrfs: return errors from unpin_extent_range() Handle the lookup failure of the block group to unpin, this is a logic error as the block group must exist at this point. If not, something else must have freed it, like clean_pinned_extents() would do without locking the unused_bg_unpin_mutex. Push the errors to the callers, proper handling will be done in followup patches. Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 +- fs/btrfs/extent-tree.c | 19 +++++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index a9be9ac9922225..1905d76772a905 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1429,7 +1429,7 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans, * group in pinned_extents before we were able to clear the whole block * group range from pinned_extents. This means that task can lookup for * the block group after we unpinned it from pinned_extents and removed - * it, leading to a BUG_ON() at unpin_extent_range(). + * it, leading to an error at unpin_extent_range(). */ mutex_lock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f4ab437d4160b7..73905a6519844d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2777,6 +2777,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, u64 total_unpinned = 0; u64 empty_cluster = 0; bool readonly; + int ret = 0; while (start <= end) { readonly = false; @@ -2786,7 +2787,11 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, btrfs_put_block_group(cache); total_unpinned = 0; cache = btrfs_lookup_block_group(fs_info, start); - BUG_ON(!cache); /* Logic error */ + if (cache == NULL) { + /* Logic error, something removed the block group. 
*/ + ret = -EUCLEAN; + goto out; + } cluster = fetch_cluster_info(fs_info, cache->space_info, @@ -2855,7 +2860,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, if (cache) btrfs_put_block_group(cache); - return 0; +out: + return ret; } int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) @@ -2885,7 +2891,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) end + 1 - start, NULL); clear_extent_dirty(unpin, start, end, &cached_state); - unpin_extent_range(fs_info, start, end, true); + ret = unpin_extent_range(fs_info, start, end, true); + BUG_ON(ret); mutex_unlock(&fs_info->unused_bg_unpin_mutex); free_extent_state(cached_state); cond_resched(); @@ -6167,7 +6174,11 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { - return unpin_extent_range(fs_info, start, end, false); + int ret; + + ret = unpin_extent_range(fs_info, start, end, false); + BUG_ON(ret); + return ret; } /* From f67aa6d837d4e42371ff7c9079e46ee8eb967248 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 12 Jan 2024 19:06:16 +0100 Subject: [PATCH 0362/1406] btrfs: make btrfs_error_unpin_extent_range() return void This helper is used in transaction abort or cleanup context and the callers cannot handle all errors, only do best effort. btrfs_cleanup_one_transaction btrfs_destroy_delayed_refs btrfs_error_unpin_extent_range btrfs_destroy_pinned_extent btrfs_error_unpin_extent_range Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 +-- fs/btrfs/extent-tree.c | 13 ++++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 70e828d33177d6..eede8128819686 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -478,8 +478,7 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) return mapping_gfp_constraint(mapping, ~__GFP_FS); } -int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, - u64 start, u64 end); +void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes); int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 73905a6519844d..49437ad7248de3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6171,14 +6171,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return ret; } -int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, - u64 start, u64 end) +/* + * Unpin the extent range in an error context and don't add the space back. + * Errors are not propagated further. + */ +void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { - int ret; - - ret = unpin_extent_range(fs_info, start, end, false); - BUG_ON(ret); - return ret; + unpin_extent_range(fs_info, start, end, false); } /* From 69f02feeea08b2ef22ead5c4ef7162a5c986ec50 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 31 Jan 2024 14:27:25 -0500 Subject: [PATCH 0363/1406] btrfs: don't drop extent_map for free space inode on write error While running the CI for an unrelated change I hit the following panic with generic/648 on btrfs_holes_spacecache. assertion failed: block_start != EXTENT_MAP_HOLE, in fs/btrfs/extent_io.c:1385 ------------[ cut here ]------------ kernel BUG at fs/btrfs/extent_io.c:1385! 
invalid opcode: 0000 [#1] PREEMPT SMP NOPTI CPU: 1 PID: 2695096 Comm: fsstress Kdump: loaded Tainted: G W 6.8.0-rc2+ #1 RIP: 0010:__extent_writepage_io.constprop.0+0x4c1/0x5c0 Call Trace: extent_write_cache_pages+0x2ac/0x8f0 extent_writepages+0x87/0x110 do_writepages+0xd5/0x1f0 filemap_fdatawrite_wbc+0x63/0x90 __filemap_fdatawrite_range+0x5c/0x80 btrfs_fdatawrite_range+0x1f/0x50 btrfs_write_out_cache+0x507/0x560 btrfs_write_dirty_block_groups+0x32a/0x420 commit_cowonly_roots+0x21b/0x290 btrfs_commit_transaction+0x813/0x1360 btrfs_sync_file+0x51a/0x640 __x64_sys_fdatasync+0x52/0x90 do_syscall_64+0x9c/0x190 entry_SYSCALL_64_after_hwframe+0x6e/0x76 This happens because we fail to write out the free space cache in one instance, come back around and attempt to write it again. However on the second pass through we go to call btrfs_get_extent() on the inode to get the extent mapping. Because this is a new block group, and with the free space inode we always search the commit root to avoid deadlocking with the tree, we find nothing and return a EXTENT_MAP_HOLE for the requested range. This happens because the first time we try to write the space cache out we hit an error, and on an error we drop the extent mapping. This is normal for normal files, but the free space cache inode is special. We always expect the extent map to be correct. Thus the second time through we end up with a bogus extent map. Since we're deprecating this feature, the most straightforward way to fix this is to simply skip dropping the extent map range for this failed range. I shortened the test by using error injection to stress the area to make it easier to reproduce. With this patch in place we no longer panic with my error injection test. CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/inode.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b2d348c9c93b18..50aea888d9773e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3187,8 +3187,23 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) unwritten_start += logical_len; clear_extent_uptodate(io_tree, unwritten_start, end, NULL); - /* Drop extent maps for the part of the extent we didn't write. */ - btrfs_drop_extent_map_range(inode, unwritten_start, end, false); + /* + * Drop extent maps for the part of the extent we didn't write. + * + * We have an exception here for the free_space_inode, this is + * because when we do btrfs_get_extent() on the free space inode + * we will search the commit root. If this is a new block group + * we won't find anything, and we will trip over the assert in + * writepage where we do ASSERT(em->block_start != + * EXTENT_MAP_HOLE). + * + * Theoretically we could also skip this for any NOCOW extent as + * we don't mess with the extent map tree in the NOCOW case, but + * for now simply skip this if we are the free space inode. + */ + if (!btrfs_is_free_space_inode(inode)) + btrfs_drop_extent_map_range(inode, unwritten_start, + end, false); /* * If the ordered extent had an IOERR or something else went From 1a8ac59445068688cc4ffa37bb99709bc007b7c4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 25 Jan 2024 09:53:06 +0000 Subject: [PATCH 0364/1406] btrfs: add and use helper to check if block group is used Add a helper function to determine if a block group is being used and make use of it at btrfs_delete_unused_bgs(). 
This helper will also be used in future code changes. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 3 +-- fs/btrfs/block-group.h | 7 +++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 1905d76772a905..c2063afc57ad7b 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1512,8 +1512,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) } spin_lock(&block_group->lock); - if (block_group->reserved || block_group->pinned || - block_group->used || block_group->ro || + if (btrfs_is_block_group_used(block_group) || block_group->ro || list_is_singular(&block_group->list)) { /* * We want to bail if we made new allocations or have diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index c4a1f01cc1c240..962b11983901a8 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -257,6 +257,13 @@ static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) { return (block_group->start + block_group->length); } +static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg) +{ + lockdep_assert_held(&bg->lock); + + return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0); +} + static inline bool btrfs_is_block_group_data_only( struct btrfs_block_group *block_group) { From 049941652da9200e080cf584a0a799677cf82852 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 25 Jan 2024 09:53:14 +0000 Subject: [PATCH 0365/1406] btrfs: do not delete unused block group if it may be used soon Before deleting a block group that is in the list of unused block groups (fs_info->unused_bgs), we check if the block group became used before deleting it, as extents from it may have been allocated after it was added to the list. However even if the block group was not yet used, there may be tasks that have only reserved space and have not yet allocated extents, and they might be relying on the availability of the unused block group in order to allocate extents. The reservation works first by increasing the "bytes_may_use" field of the corresponding space_info object (which may first require flushing delayed items, allocating a new block group, etc), and only later a task does the actual allocation of extents. For metadata we usually don't end up using all reserved space, as we are pessimistic and typically account for the worst cases (need to COW every single node in a path of a tree at maximum possible height, etc). For data we usually reserve the exact amount of space we're going to allocate later, except when using compression where we always reserve space based on the uncompressed size, as compression is only triggered when writeback starts so we don't know in advance how much space we'll actually need, or if the data is compressible. So don't delete an unused block group if the total size of its space_info object minus the block group's size is less than the sum of used space and space that may be used (space_info->bytes_may_use), as that means we have tasks that reserved space and may need to allocate extents from the block group. In this case, besides skipping the deletion, re-add the block group to the list of unused block groups so that it may be reconsidered later, in case the tasks that reserved space end up not needing to allocate extents from it.
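As a concrete illustration of the condition above (all numbers hypothetical): if the metadata space_info has total_bytes = 10 GiB, the candidate unused block group is 1 GiB, and used space plus bytes_may_use add up to 9.5 GiB, then total_bytes - length = 9 GiB, which is less than 9.5 GiB, so pending reservations may still depend on this block group and it must be kept and re-added to the list rather than deleted.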
Allowing the deletion of the block group while we have reserved space, can result in tasks failing to allocate metadata extents (-ENOSPC) while under a transaction handle, resulting in a transaction abort, or failure during writeback for the case of data extents. CC: stable@vger.kernel.org # 6.0+ Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 46 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index c2063afc57ad7b..ce732db07ef6ed 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1455,6 +1455,7 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans, */ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) { + LIST_HEAD(retry_list); struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; @@ -1476,6 +1477,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->unused_bgs_lock); while (!list_empty(&fs_info->unused_bgs)) { + u64 used; int trimming; block_group = list_first_entry(&fs_info->unused_bgs, @@ -1511,6 +1513,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) goto next; } + spin_lock(&space_info->lock); spin_lock(&block_group->lock); if (btrfs_is_block_group_used(block_group) || block_group->ro || list_is_singular(&block_group->list)) { @@ -1522,10 +1525,49 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) */ trace_btrfs_skip_unused_block_group(block_group); spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + up_write(&space_info->groups_sem); + goto next; + } + + /* + * The block group may be unused but there may be space reserved + * accounting with the existence of that block group, that is, + * space_info->bytes_may_use was incremented by a task but no + * space was yet allocated from the block group by the task. + * That space may or may not be allocated, as we are generally + * pessimistic about space reservation for metadata as well as + * for data when using compression (as we reserve space based on + * the worst case, when data can't be compressed, and before + * actually attempting compression, before starting writeback). + * + * So check if the total space of the space_info minus the size + * of this block group is less than the used space of the + * space_info - if that's the case, then it means we have tasks + * that might be relying on the block group in order to allocate + * extents, and add back the block group to the unused list when + * we finish, so that we retry later in case no tasks ended up + * needing to allocate extents from the block group. + */ + used = btrfs_space_info_used(space_info, true); + if (space_info->total_bytes - block_group->length < used) { + /* + * Add a reference for the list, compensate for the ref + * drop under the "next" label for the + * fs_info->unused_bgs list. + */ + btrfs_get_block_group(block_group); + list_add_tail(&block_group->bg_list, &retry_list); + + trace_btrfs_skip_unused_block_group(block_group); + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); up_write(&space_info->groups_sem); goto next; } + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); /* We don't want to force the issue, only flip if it's ok. 
*/ ret = inc_block_group_ro(block_group, 0); @@ -1649,12 +1691,16 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) btrfs_put_block_group(block_group); spin_lock(&fs_info->unused_bgs_lock); } + list_splice_tail(&retry_list, &fs_info->unused_bgs); spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); return; flip_async: btrfs_end_transaction(trans); + spin_lock(&fs_info->unused_bgs_lock); + list_splice_tail(&retry_list, &fs_info->unused_bgs); + spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_put_block_group(block_group); btrfs_discard_punt_unused_bgs_list(fs_info); From 205fc6964cb5dee6f752250b12d15e67eaa81b1d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 25 Jan 2024 09:53:19 +0000 Subject: [PATCH 0366/1406] btrfs: add new unused block groups to the list of unused block groups Space reservations for metadata are, most of the time, pessimistic as we reserve space for worst possible cases - where tree heights are at the maximum possible height (8), we need to COW every extent buffer in a tree path, need to split extent buffers, etc. For data, we generally reserve the exact amount of space we are going to allocate. The exception here is when using compression, in which case we reserve space matching the uncompressed size, as the compression only happens at writeback time and in the worst possible case we need that amount of space in case the data is not compressible. This means that when there's no available space in the corresponding space_info object, we may need to allocate a new block group, and then that block group might not be used after all. In this case the block group is never added to the list of unused block groups and ends up never being deleted - except if we unmount and mount again the fs, as when reading block groups from disk we add unused ones to the list of unused block groups (fs_info->unused_bgs). Otherwise a block group is only added to the list of unused block groups when we deallocate the last extent from it, so if no extent is ever allocated, the block group is kept around forever. This also means that if we have a bunch of tasks reserving space in parallel we can end up allocating many block groups that end up never being used or kept around for too long without being used, which has the potential to result in ENOSPC failures in case for example we over allocate too many metadata block groups and then end up in a state without enough unallocated space to allocate a new data block group. This is more likely to happen with metadata reservations as of kernel 6.7, namely since commit 28270e25c69a ("btrfs: always reserve space for delayed refs when starting transaction"), because we started to always reserve space for delayed references when starting a transaction handle for a non-zero number of items, and also to try to reserve space to fill the gap between the delayed block reserve's reserved space and its size. So to avoid this, when finishing the creation of a new block group, add the block group to the list of unused block groups if it's still unused at that time. This way the next time the cleaner kthread runs, it will delete the block group if it's still unused and not needed to satisfy existing space reservations.
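A hypothetical timeline of the scenario being fixed (illustrative only):

1) A task reserves metadata space, increasing space_info->bytes_may_use.
2) No existing block group can back the reservation, so a new block group is allocated.
3) The task ends up not allocating any extents (no COW was needed) and releases its reservation.
4) The new block group now has used == 0, but it was never put on fs_info->unused_bgs, so the cleaner kthread never considers deleting it until the next mount.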
Reported-by: Ivan Shapovalov Link: https://lore.kernel.org/linux-btrfs/9cdbf0ca9cdda1b4c84e15e548af7d7f9f926382.camel@intelfx.name/ CC: stable@vger.kernel.org # 6.7+ Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index ce732db07ef6ed..e9e455fd528a1f 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2729,6 +2729,37 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); list_del_init(&block_group->bg_list); clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); + + /* + * If the block group is still unused, add it to the list of + * unused block groups. The block group may have been created in + * order to satisfy a space reservation, in which case the + * extent allocation only happens later. But often we don't + * actually need to allocate space that we previously reserved, + * so the block group may become unused for a long time. For + * example for metadata we generally reserve space for a worst + * possible scenario, but then don't end up allocating all that + * space or none at all (due to no need to COW, extent buffers + * were already COWed in the current transaction and still + * unwritten, tree heights lower than the maximum possible + * height, etc). For data we generally reserve the exact amount + * of space we are going to allocate later, the exception is + * when using compression, as we must reserve space based on the + * uncompressed data size, because the compression is only done + * when writeback triggered and we don't know how much space we + * are actually going to need, so we reserve the uncompressed + * size because the data may be uncompressible in the worst case. + */ + if (ret == 0) { + bool used; + + spin_lock(&block_group->lock); + used = btrfs_is_block_group_used(block_group); + spin_unlock(&block_group->lock); + + if (!used) + btrfs_mark_bg_unused(block_group); + } } btrfs_trans_release_chunk_metadata(trans); } From 00191e4896a3463d8a8cbdc02823ad8902ed1b8c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 25 Jan 2024 09:53:23 +0000 Subject: [PATCH 0367/1406] btrfs: document what the spinlock unused_bgs_lock protects Add some comments to struct btrfs_fs_info to explicitly document which members are protected by the spinlock unused_bgs_lock. It is currently used to protect two linked lists, the reclaim_bgs and unused_bgs lists. So add an explicit comment on top of each list to mention it's protected by unused_bgs_lock, as well as a comment on top of unused_bgs_lock to mention the lists it protects. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/fs.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index b747134fac7742..97d10ed4b2c12c 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -731,10 +731,13 @@ struct btrfs_fs_info { /* Reclaim partially filled block groups in the background */ struct work_struct reclaim_bgs_work; + /* Protected by unused_bgs_lock. */ struct list_head reclaim_bgs; int bg_reclaim_threshold; + /* Protects the lists unused_bgs and reclaim_bgs. */ spinlock_t unused_bgs_lock; + /* Protected by unused_bgs_lock.
*/ struct list_head unused_bgs; struct mutex unused_bg_unpin_mutex; /* Protect block groups that are going to be deleted */

From a59e75f3a85d9863818d215c09e4849fc1f5ee8b Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Thu, 25 Jan 2024 09:53:26 +0000
Subject: [PATCH 0368/1406] btrfs: add comment about list_is_singular() use at btrfs_delete_unused_bgs()

At btrfs_delete_unused_bgs(), the use of the list_is_singular() check on a block group may not be immediately obvious. It is there to prevent losing raid profile information for a block group type (data, metadata or system), as that information is removed from fs_info->avail_[data|metadata|system]_alloc_bits when the last block group of a given type is deleted. So deleting the block group would later result in creating block groups of that type with a single profile (because fs_info->avail_*_alloc_bits would have a value of 0).

This check was added in commit aefbe9a633b5 ("btrfs: Fix lost-data-profile caused by auto removing bg"). So add a comment mentioning the need for the check.

Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 7 +++++++ 1 file changed, 7 insertions(+)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index e9e455fd528a1f..78cae9f9deca91 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1522,6 +1522,13 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * outstanding allocations in this block group. We do * the ro check in case balance is currently acting on * this block group. + * + * Also bail out if this is the only block group for its + * type, because otherwise we would lose profile + * information from fs_info->avail_*_alloc_bits and the + * next block group of this type would be created with a + * "single" profile (even if we're in a raid fs) because + * fs_info->avail_*_alloc_bits would be 0. */ trace_btrfs_skip_unused_block_group(block_group); spin_unlock(&block_group->lock);

From 3fb2a0b94749aeac23b47afdbdf3ef1b57cd8c00 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Fri, 26 Jan 2024 12:59:23 +0000
Subject: [PATCH 0369/1406] btrfs: preallocate temporary extent buffer for inode logging when needed

When logging an inode and we need to copy items from subvolume leaves to the log tree, we clone each subvolume leaf and then use that clone to copy items to the log tree. This is required to avoid possible deadlocks as stated in commit 796787c978ef ("btrfs: do not modify log tree while holding a leaf from fs tree locked").

The cloning requires allocating an extent buffer (struct extent_buffer) and then allocating pages (folios) to attach to the extent buffer. This may be slow in case we are under memory pressure, and since we are doing the cloning while holding a read lock on a subvolume leaf, it means we can be blocking other operations on that leaf for significant periods of time, which can increase latency on operations like creating other files, renaming files, etc.

Similarly, because we're under a log transaction, we may also cause extra delay on other tasks doing an fsync, because syncing the log requires waiting for tasks that joined a log transaction to exit the transaction.
So to improve this, for any inode logging operation that needs to copy items from a subvolume leaf ("full sync" or "copy everything" bit set in the inode), preallocate a dummy extent buffer before locking any extent buffer from the subvolume tree, and even before joining a log transaction, add it to the log context and then use it when we need to copy items from a subvolume leaf to the log tree. This avoids making other operations get extra latency when waiting to lock a subvolume leaf that is used during inode logging and we are under heavy memory pressure.

The following test script with bonnie++ was used to test this:

$ cat test.sh
#!/bin/bash

DEV=/dev/sdh
MNT=/mnt/sdh
MOUNT_OPTIONS="-o ssd"

MEMTOTAL_BYTES=`free -b | grep Mem: | awk '{ print $2 }'`
NR_DIRECTORIES=20
NR_FILES=20480
DATASET_SIZE=$((MEMTOTAL_BYTES * 2 / 1048576))
DIRECTORY_SIZE=$((MEMTOTAL_BYTES * 2 / NR_FILES))
NR_FILES=$((NR_FILES / 1024))

echo "performance" | \
    tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

umount $DEV &> /dev/null
mkfs.btrfs -f $MKFS_OPTIONS $DEV
mount $MOUNT_OPTIONS $DEV $MNT

bonnie++ -u root -d $MNT \
    -n $NR_FILES:$DIRECTORY_SIZE:$DIRECTORY_SIZE:$NR_DIRECTORIES \
    -r 0 -s $DATASET_SIZE -b

umount $MNT

The results of this test on an 8G VM running a non-debug kernel (Debian's default kernel config) were the following.

Before this change:

Version 2.00a       ------Sequential Output------ --Sequential Input- --Random-
                    -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
Name:Size etc        /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP
debian0      7501M   376k  99  1.4g  96  117m  14 1510k  99  2.5g  95 +++++ +++
Latency             35068us   24976us    2944ms   30725us   71770us   26152us
Version 2.00a       ------Sequential Create------ --------Random Create--------
debian0             -Create-- --Read--- -Delete-- -Create-- --Read--- -Delete--
files:max:min        /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP
20:384100:384100/20 20480  32 20480  58 20480  48 20480  39 20480  56 20480  61
Latency               411ms   11914us     119ms     617ms   10296us     110ms

After this change:

Version 2.00a       ------Sequential Output------ --Sequential Input- --Random-
                    -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
Name:Size etc        /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP
debian0      7501M   375k  99  1.4g  97  117m  14 1546k  99  2.3g  98 +++++ +++
Latency             35975us   20945us    2144ms   10297us    2217us    6004us
Version 2.00a       ------Sequential Create------ --------Random Create--------
debian0             -Create-- --Read--- -Delete-- -Create-- --Read--- -Delete--
files:max:min        /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP
20:384100:384100/20 20480  35 20480  58 20480  48 20480  40 20480  57 20480  59
Latency               320ms   11237us   77779us     518ms    6470us   86389us

Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 12 ++++++ fs/btrfs/tree-log.c | 93 +++++++++++++++++++++++++++------------------ fs/btrfs/tree-log.h | 25 ++++++++++++ 3 files changed, 94 insertions(+), 36 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4bca37fd6833ad..78c3ef68caa3bf 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1910,6 +1910,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out_release_extents; } + btrfs_init_log_ctx_scratch_eb(&ctx); + /* * We use start here because we will need to wait on the IO to complete * in btrfs_sync_log, which could require joining a transaction (for @@ -1929,6 +1931,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trans->in_fsync =
true; ret = btrfs_log_dentry_safe(trans, dentry, &ctx); + /* + * Scratch eb no longer needed, release before syncing log or commit + * transaction, to avoid holding unnecessary memory during such long + * operations. + */ + if (ctx.scratch_eb) { + free_extent_buffer(ctx.scratch_eb); + ctx.scratch_eb = NULL; + } btrfs_release_log_ctx_extents(&ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ @@ -2004,6 +2015,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = btrfs_commit_transaction(trans); out: + free_extent_buffer(ctx.scratch_eb); ASSERT(list_empty(&ctx.list)); ASSERT(list_empty(&ctx.conflict_inodes)); err = file_check_and_advance_wb_err(file); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 043b8df5665ff7..d7693368f34f10 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3617,6 +3617,30 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, return ret; } +static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx) +{ + const int slot = path->slots[0]; + + if (ctx->scratch_eb) { + copy_extent_buffer_full(ctx->scratch_eb, path->nodes[0]); + } else { + ctx->scratch_eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!ctx->scratch_eb) + return -ENOMEM; + } + + btrfs_release_path(path); + path->nodes[0] = ctx->scratch_eb; + path->slots[0] = slot; + /* + * Add extra ref to scratch eb so that it is not freed when callers + * release the path, so we can reuse it later if needed. + */ + atomic_inc(&ctx->scratch_eb->refs); + + return 0; +} + static int process_dir_items_leaf(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, @@ -3631,23 +3655,20 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, bool last_found = false; int batch_start = 0; int batch_size = 0; - int i; + int ret; /* * We need to clone the leaf, release the read lock on it, and use the * clone before modifying the log tree. See the comment at copy_items() * about why we need to do this. */ - src = btrfs_clone_extent_buffer(path->nodes[0]); - if (!src) - return -ENOMEM; + ret = clone_leaf(path, ctx); + if (ret < 0) + return ret; - i = path->slots[0]; - btrfs_release_path(path); - path->nodes[0] = src; - path->slots[0] = i; + src = path->nodes[0]; - for (; i < nritems; i++) { + for (int i = path->slots[0]; i < nritems; i++) { struct btrfs_dir_item *di; struct btrfs_key key; int ret; @@ -4257,17 +4278,16 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_path *dst_path, struct btrfs_path *src_path, int start_slot, int nr, int inode_only, - u64 logged_isize) + u64 logged_isize, struct btrfs_log_ctx *ctx) { struct btrfs_root *log = inode->root->log_root; struct btrfs_file_extent_item *extent; struct extent_buffer *src; - int ret = 0; + int ret; struct btrfs_key *ins_keys; u32 *ins_sizes; struct btrfs_item_batch batch; char *ins_data; - int i; int dst_index; const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); const u64 i_size = i_size_read(&inode->vfs_inode); @@ -4300,14 +4320,11 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * while the other is holding the delayed node's mutex and wants to * write lock the same subvolume leaf for flushing delayed items. 
*/ - src = btrfs_clone_extent_buffer(src_path->nodes[0]); - if (!src) - return -ENOMEM; + ret = clone_leaf(src_path, ctx); + if (ret < 0) + return ret; - i = src_path->slots[0]; - btrfs_release_path(src_path); - src_path->nodes[0] = src; - src_path->slots[0] = i; + src = src_path->nodes[0]; ins_data = kmalloc(nr * sizeof(struct btrfs_key) + nr * sizeof(u32), GFP_NOFS); @@ -4322,7 +4339,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, batch.nr = 0; dst_index = 0; - for (i = 0; i < nr; i++) { + for (int i = 0; i < nr; i++) { const int src_slot = start_slot + i; struct btrfs_root *csum_root; struct btrfs_ordered_sum *sums; @@ -4429,7 +4446,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, goto out; dst_index = 0; - for (i = 0; i < nr; i++) { + for (int i = 0; i < nr; i++) { const int src_slot = start_slot + i; const int dst_slot = dst_path->slots[0] + dst_index; struct btrfs_key key; @@ -4702,7 +4719,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans, */ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, - struct btrfs_path *path) + struct btrfs_path *path, + struct btrfs_log_ctx *ctx) { struct btrfs_root *root = inode->root; struct btrfs_key key; @@ -4768,7 +4786,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, if (slot >= btrfs_header_nritems(leaf)) { if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); if (ret < 0) goto out; ins_nr = 0; @@ -4818,7 +4836,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, } if (ins_nr > 0) ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); out: btrfs_release_path(path); btrfs_free_path(dst_path); @@ -4897,7 +4915,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, write_unlock(&tree->lock); if (!ret) - ret = btrfs_log_prealloc_extents(trans, inode, path); + ret = btrfs_log_prealloc_extents(trans, inode, path, ctx); if (ret) return ret; @@ -4978,7 +4996,8 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_path *dst_path) + struct btrfs_path *dst_path, + struct btrfs_log_ctx *ctx) { struct btrfs_root *root = inode->root; int ret; @@ -5007,7 +5026,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, if (slot >= nritems) { if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5033,7 +5052,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, } if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); if (ret < 0) return ret; } @@ -5845,7 +5864,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, } ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, - inode_only, logged_isize); + inode_only, logged_isize, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5864,7 +5883,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, goto next_slot; ret = copy_items(trans, inode, dst_path, path, ins_start_slot, - ins_nr, inode_only, logged_isize); + ins_nr, inode_only, logged_isize, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5881,7 
+5900,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, } ret = copy_items(trans, inode, dst_path, path, ins_start_slot, - ins_nr, inode_only, logged_isize); + ins_nr, inode_only, logged_isize, ctx); if (ret < 0) return ret; ins_nr = 1; @@ -5896,7 +5915,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, inode_only, - logged_isize); + logged_isize, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5921,7 +5940,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, } if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, ins_start_slot, - ins_nr, inode_only, logged_isize); + ins_nr, inode_only, logged_isize, ctx); if (ret) return ret; } @@ -5932,7 +5951,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, * lock the same leaf with btrfs_log_prealloc_extents() below. */ btrfs_release_path(path); - ret = btrfs_log_prealloc_extents(trans, inode, dst_path); + ret = btrfs_log_prealloc_extents(trans, inode, dst_path, ctx); } return ret; @@ -6524,7 +6543,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, btrfs_release_path(path); btrfs_release_path(dst_path); - ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx); if (ret) goto out_unlock; xattrs_logged = true; @@ -6551,7 +6570,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * BTRFS_INODE_COPY_EVERYTHING set. */ if (!xattrs_logged && inode->logged_trans < trans->transid) { - ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx); if (ret) goto out_unlock; btrfs_release_path(path); @@ -7500,6 +7519,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, btrfs_init_log_ctx(&ctx, &inode->vfs_inode); ctx.logging_new_name = true; + btrfs_init_log_ctx_scratch_eb(&ctx); /* * We don't care about the return value. If we fail to log the new name * then we know the next attempt to sync the log will fallback to a full @@ -7508,6 +7528,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * inconsistent state after a rename operation. */ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); + free_extent_buffer(ctx.scratch_eb); ASSERT(list_empty(&ctx.conflict_inodes)); out: /* diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index a550a8a375cd15..af219e8840d285 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -36,6 +36,15 @@ struct btrfs_log_ctx { struct list_head conflict_inodes; int num_conflict_inodes; bool logging_conflict_inodes; + /* + * Used for fsyncs that need to copy items from the subvolume tree to + * the log tree (full sync flag set or copy everything flag set) to + * avoid allocating a temporary extent buffer while holding a lock on + * an extent buffer of the subvolume tree and under the log transaction. + * Also helps to avoid allocating and freeing a temporary extent buffer + * in case we need to process multiple leaves from the subvolume tree. 
+ */ + struct extent_buffer *scratch_eb; }; static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, @@ -53,6 +62,22 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, INIT_LIST_HEAD(&ctx->conflict_inodes); ctx->num_conflict_inodes = 0; ctx->logging_conflict_inodes = false; + ctx->scratch_eb = NULL; +} + +static inline void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx) +{ + struct btrfs_inode *inode = BTRFS_I(ctx->inode); + + if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && + !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) + return; + + /* + * Don't care about allocation failure. This is just for optimization, + * if we fail to allocate here, we will try again later if needed. + */ + ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0); } static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) From c70e41f5a8f255cfd5ece97907e88fc7f39303c2 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 2 Feb 2024 13:23:28 +0900 Subject: [PATCH 0370/1406] btrfs: use READ/WRITE_ONCE for fs_devices->read_policy Since we can read/modify the value from the sysfs interface concurrently, it would be better to protect it from compiler optimizations. Currently, there is only one read policy BTRFS_READ_POLICY_PID available, so no actual problem can happen now. This is a preparation for the future expansion. Reviewed-by: Anand Jain Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 7 ++++--- fs/btrfs/volumes.c | 10 +++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 84c05246ffd8ad..21586ecc35bf9c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1228,11 +1228,12 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + const enum btrfs_read_policy policy = READ_ONCE(fs_devices->read_policy); ssize_t ret = 0; int i; for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (fs_devices->read_policy == i) + if (policy == i) ret += sysfs_emit_at(buf, ret, "%s[%s]", (ret == 0 ? 
"" : " "), btrfs_read_policy_name[i]); @@ -1256,8 +1257,8 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { if (sysfs_streq(buf, btrfs_read_policy_name[i])) { - if (i != fs_devices->read_policy) { - fs_devices->read_policy = i; + if (i != READ_ONCE(fs_devices->read_policy)) { + WRITE_ONCE(fs_devices->read_policy, i); btrfs_info(fs_devices->fs_info, "read policy set to '%s'", btrfs_read_policy_name[i]); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 474ab7ed65ea92..224345658ea556 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5942,6 +5942,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) { + const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy); int i; int num_stripes; int preferred_mirror; @@ -5956,13 +5957,12 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, else num_stripes = map->num_stripes; - switch (fs_info->fs_devices->read_policy) { + switch (policy) { default: /* Shouldn't happen, just warn and use pid instead of failing */ - btrfs_warn_rl(fs_info, - "unknown read_policy type %u, reset to pid", - fs_info->fs_devices->read_policy); - fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; + btrfs_warn_rl(fs_info, "unknown read_policy type %u, reset to pid", + policy); + WRITE_ONCE(fs_info->fs_devices->read_policy, BTRFS_READ_POLICY_PID); fallthrough; case BTRFS_READ_POLICY_PID: preferred_mirror = first + (current->pid % num_stripes); From 539aaa8211e32ad10244f77db9e413ab32fe2bf7 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 19 Jan 2024 20:23:56 +0100 Subject: [PATCH 0371/1406] btrfs: handle directory and dentry mismatch in btrfs_may_delete() The helper btrfs_may_delete() is a copy of generic fs/namei.c:may_delete() to verify various conditions before deletion. There's a BUG_ON added before linux.git started, we can turn it to a proper error handling at least in our local helper. A mistmatch between directory and the deleted dentry is clearly invalid. This won't be probably ever hit due to the way how the parameters are set from the caller btrfs_ioctl_snap_destroy(), using a VFS helper lookup_one(). Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 46f9a6645bf6d1..b3f931f915333f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -906,7 +906,9 @@ static int btrfs_may_delete(struct mnt_idmap *idmap, if (d_really_is_negative(victim)) return -ENOENT; - BUG_ON(d_inode(victim->d_parent) != dir); + /* The @victim is not inside @dir. */ + if (d_inode(victim->d_parent) != dir) + return -EINVAL; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); From c71a7787340b9d0d912fab6675c29ad8ba947b40 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 19 Jan 2024 20:44:57 +0100 Subject: [PATCH 0372/1406] btrfs: handle invalid range and start in merge_extent_mapping() Turn a BUG_ON to a properly handled error and update the error message in the caller. It is expected that @em_in and @start passed to btrfs_add_extent_mapping() overlap. Besides tests, the only caller btrfs_get_extent() makes sure this is true. 
Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 1bf4d0319f6cbe..ea08601988debb 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -538,7 +538,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, u64 end; u64 start_diff; - BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); + if (map_start < em->start || map_start >= extent_map_end(em)) + return -EINVAL; if (existing->start > map_start) { next = existing; @@ -633,9 +634,9 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, free_extent_map(em); *em_in = NULL; WARN_ONCE(ret, -"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n", - ret, existing->start, existing->len, - orig_start, orig_len); +"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu\n", + existing->start, existing->len, + orig_start, orig_len, start); } free_extent_map(existing); }

From 2ebb97ad3c79e0128c59e35081d38f71ce343849 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Sat, 20 Jan 2024 02:17:03 +0100
Subject: [PATCH 0373/1406] btrfs: handle block group lookup error when it's being removed

The unlikely case of a lookup error in btrfs_remove_block_group() can be handled properly; in its caller this would lead to a transaction abort. We can't do anything else, as a block group must have been loaded first.

Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 78cae9f9deca91..58f2e8951dbf23 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1063,7 +1063,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, bool remove_rsv = false; block_group = btrfs_lookup_block_group(fs_info, map->start); - BUG_ON(!block_group); + if (!block_group) + return -ENOENT; + BUG_ON(!block_group->ro); trace_btrfs_remove_block_group(block_group);

From 6811e3d50a1a2549ee450b13eab0b8de9f00676f Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Tue, 23 Jan 2024 23:19:19 +0100
Subject: [PATCH 0374/1406] btrfs: handle root deletion lookup error in btrfs_del_root()

We're deleting a root, and if looking it up by key does not succeed, this is an inconsistent state and we can't do anything. All callers handle errors and abort a transaction.

Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/root-tree.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 3f6d10eb1aafca..ce831660550bfd 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -322,8 +322,11 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, key, path, -1, 1); if (ret < 0) goto out; - - BUG_ON(ret != 0); + if (ret != 0) { + /* The root must exist but we did not find it by the key. */ + ret = -EUCLEAN; + goto out; + } ret = btrfs_del_item(trans, root, path); out:

From 23b5c73a31bbedf952d570e7c5d204230053ea5c Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Tue, 23 Jan 2024 23:28:24 +0100
Subject: [PATCH 0375/1406] btrfs: handle invalid root reference found in btrfs_find_root()

The btrfs_find_root() looks up a root by a key, allowing an inexact search when key->offset is -1.
It's never expected to find such an item, as it would break the allowed range of a root id.

Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/root-tree.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index ce831660550bfd..4bb538a372ce56 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -81,7 +81,14 @@ int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, if (ret > 0) goto out; } else { - BUG_ON(ret == 0); /* Logical error */ + /* + * Key with offset -1 found, there would have to exist a root + * with such id, but this is out of the valid range. + */ + if (ret == 0) { + ret = -EUCLEAN; + goto out; + } if (path->slots[0] == 0) goto out; path->slots[0]--;

From 2e5490ce34e98e1c3f0fa33ae6ad97f684abd43e Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Tue, 23 Jan 2024 23:34:57 +0100
Subject: [PATCH 0376/1406] btrfs: handle invalid root reference found in btrfs_init_root_free_objectid()

The btrfs_init_root_free_objectid() looks up a root by a key, allowing an inexact search when key->offset is -1. It's never expected to find such an item, as it would break the allowed range of a root id.

Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 26c11fce5e4e00..0f38620a58da79 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4927,7 +4927,14 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) goto error; - BUG_ON(ret == 0); /* Corruption */ + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist a root + * with such id, but this is out of valid range. + */ + ret = -EUCLEAN; + goto error; + } if (path->slots[0] > 0) { slot = path->slots[0] - 1; l = path->nodes[0];

From 8fd791871d9e441662e0398b804674bd20bf92e7 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Tue, 23 Jan 2024 23:42:29 +0100
Subject: [PATCH 0377/1406] btrfs: handle chunk tree lookup error in btrfs_relocate_sys_chunks()

The unhandled case in the btrfs_relocate_sys_chunks() loop is a corruption, as it could be caused only by two impossible conditions:

- at first the search key is set up to look for a chunk tree item, with offset -1; this is an inexact search and the key->offset will contain the correct offset upon a successful search; a valid chunk tree item cannot have an offset of -1

- after the first successful search, the found_key corresponds to a chunk item, and the offset is decremented by 1 before the next loop; it's impossible to find a chunk item there due to alignment and size constraints

Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 224345658ea556..56b0b167e3867b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3391,7 +3391,17 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } - BUG_ON(ret == 0); /* Corruption */ + if (ret == 0) { + /* + * On the first search we would find chunk tree with + * offset -1, which is not possible.
On subsequent + * loops this would find an existing item on an invalid + * offset (one less than the previous one, wrong + * alignment and size). + */ + ret = -EUCLEAN; + goto error; + } ret = btrfs_previous_item(chunk_root, path, key.objectid, key.type);

From a91b186e27219d3a809f7fd697a9a2f72de45eb1 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 24 Jan 2024 15:37:59 +0100
Subject: [PATCH 0378/1406] btrfs: handle invalid extent item reference found in check_committed_ref()

The check_committed_ref() helper looks up an extent item by a key, allowing an inexact search when key->offset is -1. It's never expected to find such an item, as it would break the allowed range of an extent item offset.

Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 49437ad7248de3..bd1645089d4996 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2396,7 +2396,14 @@ static noinline int check_committed_ref(struct btrfs_root *root, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret == 0); /* Corruption */ + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ + ret = -EUCLEAN; + goto out; + } ret = -ENOENT; if (path->slots[0] == 0)

From 17e83a9bde9f15e0c29bc01d2b8b641676384321 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 19 Jan 2024 21:19:18 +0100
Subject: [PATCH 0379/1406] btrfs: export: handle invalid inode or root reference in btrfs_get_parent()

The get_parent handler looks up a parent of a given dentry; this can be either a subvolume or a directory. The search is set up with offset -1 but it's never expected to find such an item, as it would break the allowed range of an inode number or a root id. This means it's a corruption (ext4 also returns this error code).

Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/export.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 3f2e8fb9e3e950..d710339ca4f391 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -173,8 +173,15 @@ struct dentry *btrfs_get_parent(struct dentry *child) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto fail; + if (ret == 0) { + /* + * Key with offset of -1 found, there would have to exist an + * inode with such number or a root with such id. + */ + ret = -EUCLEAN; + goto fail; + } - BUG_ON(ret == 0); /* Key with offset of -1 found */ if (path->slots[0] == 0) { ret = -ENOENT; goto fail;

From 1f1900ea74e9cd6cfa621b2bd4ebee1571fb2a43 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Sat, 20 Jan 2024 02:22:37 +0100
Subject: [PATCH 0380/1406] btrfs: delayed-inode: drop pointless BUG_ON in __btrfs_remove_delayed_item()

There's a BUG_ON checking for a valid pointer of fs_info::delayed_root but it is valid since init_mount_fs_info() and has the same lifetime as fs_info.
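The preceding offset -1 conversions (btrfs_find_root(), btrfs_init_root_free_objectid(), check_committed_ref(), btrfs_get_parent()) all lean on the btrfs_search_slot() contract: a negative return is a hard error, 0 means the exact key was found, and a positive value means the path points at the slot where the key would be inserted. Because the searches use an offset of -1, which no valid item can carry, an exact match can only mean corruption. A sketch of the shared pattern (a hypothetical wrapper around the real calls, for illustration only):

/* Position the path near the impossible (objectid, type, -1ULL) key. */
static int search_highest(struct btrfs_root *root, struct btrfs_key *key,
			  struct btrfs_path *path)
{
	int ret;

	key->offset = (u64)-1;			/* inexact search target */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;			/* I/O or similar hard error */
	if (ret == 0)
		return -EUCLEAN;		/* exact match is impossible */
	/* ret > 0: the slot points just past the highest existing key. */
	return 0;
}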
Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 2 -- 1 file changed, 2 deletions(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 08102883f560a3..0b1701f1b8c9e1 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -430,8 +430,6 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) delayed_root = delayed_node->root->fs_info->delayed_root; - BUG_ON(!delayed_root); - if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; else

From b412b4572927052bcefb9a7189d89061ede4ca4c Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Sat, 20 Jan 2024 02:26:32 +0100
Subject: [PATCH 0381/1406] btrfs: change BUG_ON to assertion when checking for delayed_node root

The pointer to root is initialized in btrfs_init_delayed_node(), so there is no need to check for it again. Change the BUG_ON to an assertion.

Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0b1701f1b8c9e1..efe435403b77a0 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -978,7 +978,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) if (delayed_node && test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { - BUG_ON(!delayed_node->root); + ASSERT(delayed_node->root); clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count--;

From 391cb8d2a3b0aa2e63b52f072be9d023b7624e55 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 19 Jan 2024 20:15:41 +0100
Subject: [PATCH 0382/1406] btrfs: defrag: change BUG_ON to assertion in btrfs_defrag_leaves()

The BUG_ON verifies a condition that should be guaranteed by the correct use of the path search (with keep_locks and lowest_level set); an assertion is the suitable check.

Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 8fc8118c322501..664b1177702c95 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -520,7 +520,7 @@ static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, * keep_locks set and lowest_level is 1, regardless of the value of * path->slots[1]. */ - BUG_ON(path->locks[1] == 0); + ASSERT(path->locks[1] != 0); ret = btrfs_realloc_node(trans, root, path->nodes[1], 0, &last_ret,

From c4095828b88ae9b637031e35ee38f5091aac0b97 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 24 Jan 2024 01:09:46 +0100
Subject: [PATCH 0383/1406] btrfs: change BUG_ON to assertion in btrfs_read_roots()

There's one caller of btrfs_read_roots() and it already uses the tree_root pointer, so it's pointless to BUG_ON on it. As it's an assumption of the initialization helpers, make it an assert instead.
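The motivation for this whole group of changes is the different cost of the two checks: BUG_ON() is compiled into every build and halts the machine, while btrfs's ASSERT() is only active with CONFIG_BTRFS_ASSERT enabled and disappears otherwise. A rough userspace analogue of the assumed semantics, not the kernel's actual definitions:

#include <stdio.h>
#include <stdlib.h>

/* BUG_ON-like: always present, always fatal. */
#define BUG_ON(cond) \
	do { if (cond) { fprintf(stderr, "BUG at %s:%d\n", \
				 __FILE__, __LINE__); abort(); } } while (0)

/* ASSERT-like: checked only in debug builds, free in production. */
#ifdef DEBUG
#define ASSERT(cond) BUG_ON(!(cond))
#else
#define ASSERT(cond) ((void)0)
#endif

So an invariant that is already guaranteed by the callers costs nothing in production once it becomes an assertion.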
Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0f38620a58da79..41bf4a18968e9c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2239,7 +2239,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) struct btrfs_key location; int ret; - BUG_ON(!fs_info->tree_root); + ASSERT(fs_info->tree_root); ret = load_global_roots(tree_root); if (ret)

From 1bd094058e112cb84c9ccadeef7186f67ce8088b Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Tue, 23 Jan 2024 23:09:18 +0100
Subject: [PATCH 0384/1406] btrfs: change BUG_ON to assertion when verifying lockdep class setup

The BUG_ON in btrfs_set_buffer_lockdep_class() is a sanity check of the level which is verified in callers, e.g. when initializing an extent buffer or reading from an eb header. Change it to an assertion as this would not happen unless things are really bad and would fail elsewhere too.

Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/locking.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 286e6aa721c73d..99ccab86bb8656 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -84,7 +84,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int { struct btrfs_lockdep_keyset *ks; - BUG_ON(level >= ARRAY_SIZE(ks->keys)); + ASSERT(level < ARRAY_SIZE(ks->keys)); /* Find the matching keyset, id 0 is the default entry */ for (ks = btrfs_lockdep_keysets; ks->id; ks++)

From 1076e14cf1f01500f58e055f978a8c015ae93c6a Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 24 Jan 2024 16:18:11 +0100
Subject: [PATCH 0385/1406] btrfs: change BUG_ON to assertion when verifying root in btrfs_alloc_reserved_file_extent()

The file extents are normally reserved in subvolume roots but could also be in the data reloc tree. Change the BUG_ON to an assertion as this verifies the usage assumptions.

Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index bd1645089d4996..0d72d0f7cefcad 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4961,7 +4961,7 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 root_objectid = root->root_key.objectid; u64 owning_root = root_objectid; - BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); + ASSERT(root_objectid != BTRFS_TREE_LOG_OBJECTID); if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root)) owning_root = root->relocation_src_root;

From c0febf1edd9f39d99cd0d99d5b3e62dcf6fef559 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 24 Jan 2024 17:23:11 +0100
Subject: [PATCH 0386/1406] btrfs: change BUG_ON to assertion in reset_balance_state()

The balance state machine is complex so it's good to verify the assumptions in helpers; however, reset_balance_state() is used at the end of balance and fs_info::balance_ctl is properly set up before and protected by the exclusive op ownership in btrfs_balance().
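For context, the exclusive op ownership mentioned above is btrfs's way of serializing balance against resize, device add/remove and similar operations. A sketch of that guard (the named calls exist in the kernel, but this simplified flow is an assumption of the example, not the full btrfs_balance()):

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
		return -EBUSY;	/* another exclusive op is running */
	/* ... balance runs; fs_info->balance_ctl belongs to us here ... */
	btrfs_exclop_finish(fs_info);

Within that window no other task can tear down balance_ctl, which is why a plain assertion is enough.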
Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 56b0b167e3867b..4ad9eca9b46c4a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3632,7 +3632,7 @@ static void reset_balance_state(struct btrfs_fs_info *fs_info) struct btrfs_balance_control *bctl = fs_info->balance_ctl; int ret; - BUG_ON(!fs_info->balance_ctl); + ASSERT(fs_info->balance_ctl); spin_lock(&fs_info->balance_lock); fs_info->balance_ctl = NULL;

From 47c03f7d9fef3681cc43a58e6c9fcc1c7f57799d Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 24 Jan 2024 15:59:36 +0100
Subject: [PATCH 0387/1406] btrfs: unify handling of return values of btrfs_insert_empty_items()

The error values returned by btrfs_insert_empty_items() follow the common pattern of 0/-errno, but some callers check for a value > 0, which can't happen. Document that and update calls to not expect positive values.

Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 4 ++++ fs/btrfs/file-item.c | 3 --- fs/btrfs/uuid-tree.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 33145da449cc8d..c878ca466b7c87 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4280,6 +4280,10 @@ void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans, /* * Given a key and some data, insert items into the tree. * This does all the path init required, making room in the tree if needed. + * + * Returns: 0 on success + * -EEXIST if the first key already exists + * < 0 on other errors */ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index f7ef9fa469b946..e58fb5347e65ee 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -176,7 +176,6 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, sizeof(*item)); if (ret < 0) goto out; - BUG_ON(ret); /* Can't happen */ leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -1226,8 +1225,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, ins_size); if (ret < 0) goto out; - if (WARN_ON(ret != 0)) - goto out; leaf = path->nodes[0]; csum: item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index b8c6e46dd499a5..b0aff297d67d23 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -113,7 +113,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, ret = btrfs_insert_empty_item(trans, uuid_root, path, &key, sizeof(subid_le)); - if (ret >= 0) { + if (ret == 0) { /* Add an item for the type for the first time */ eb = path->nodes[0]; slot = path->slots[0];

From a5985dc8625c3263253d7d2476a97b636ebdb28c Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 24 Jan 2024 00:23:49 +0100
Subject: [PATCH 0388/1406] btrfs: move transaction abort to the error site in btrfs_delete_free_space_tree()

The recommended pattern for transaction abort after error is to place it right after the error is handled. That way it's easier to locate where it failed and help debugging.
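The shape of the transformation, in the abstract: btrfs_abort_transaction() records the function and line where it is invoked, so placing it at the failure site makes the abort report point at the step that failed. A sketch contrasting the two styles, with placeholder step functions rather than the actual callees:

	/* Before: one shared label, far away from any failure point. */
	ret = first_step(trans);
	if (ret)
		goto abort;
	ret = second_step(trans);
	if (ret)
		goto abort;
	...
abort:
	btrfs_abort_transaction(trans, ret);
	btrfs_end_transaction(trans);
	return ret;

	/* After: abort exactly where the error happened. */
	ret = first_step(trans);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		return ret;
	}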
Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/free-space-tree.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 7b598b070700e7..888185265f4b4c 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1273,12 +1273,18 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); ret = clear_free_space_tree(trans, free_space_root); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } ret = btrfs_del_root(trans, &free_space_root->root_key); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } btrfs_global_root_delete(free_space_root); @@ -1295,11 +1301,6 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_put_root(free_space_root); return btrfs_commit_transaction(trans); - -abort: - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; } int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)

From 4c91c1efccb2a25b646fd0b0506f64635b7ea1ec Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 24 Jan 2024 00:23:49 +0100
Subject: [PATCH 0389/1406] btrfs: move transaction abort to the error site in btrfs_create_free_space_tree()

The recommended pattern for transaction abort after error is to place it right after the error is handled. That way it's easier to locate where it failed and help debugging.

Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/free-space-tree.c | 19 ++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 888185265f4b4c..bdc2341c43e4a9 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1176,12 +1176,16 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) BTRFS_FREE_SPACE_TREE_OBJECTID); if (IS_ERR(free_space_root)) { ret = PTR_ERR(free_space_root); - goto abort; + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out_clear; } ret = btrfs_global_root_insert(free_space_root); if (ret) { btrfs_put_root(free_space_root); - goto abort; + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out_clear; } node = rb_first_cached(&fs_info->block_group_cache_tree); @@ -1189,8 +1193,11 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = populate_free_space_tree(trans, block_group); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out_clear; + } node = rb_next(node); } @@ -1206,11 +1213,9 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); return ret; -abort: +out_clear: clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); return ret; }

From 10667d0e7baf39fe8d377821ada2779aeefe6420 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 24 Jan 2024 00:23:49 +0100
Subject: [PATCH 0390/1406] btrfs: move transaction abort to the error site in btrfs_rebuild_free_space_tree() The
recommended pattern for transaction abort after error is to place it right after the error is handled. That way it's easier to locate where it failed and help debugging. Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/free-space-tree.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index bdc2341c43e4a9..90f2938bd743d3 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1328,8 +1328,11 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); ret = clear_free_space_tree(trans, free_space_root); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } node = rb_first_cached(&fs_info->block_group_cache_tree); while (node) { @@ -1338,8 +1341,11 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = populate_free_space_tree(trans, block_group); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } node = rb_next(node); } @@ -1350,10 +1356,6 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) ret = btrfs_commit_transaction(trans); clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); return ret; -abort: - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; } static int __add_block_group_free_space(struct btrfs_trans_handle *trans, From 38528da75b9b9497fa5e4b2450e8c11c7ebe11e5 Mon Sep 17 00:00:00 2001 From: Lijuan Li Date: Tue, 6 Feb 2024 09:56:00 +0800 Subject: [PATCH 0391/1406] btrfs: mark __btrfs_add_free_space static __btrfs_add_free_space is only used in free-space-cache.c, so mark it static. 
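As a quick refresher on what the annotation buys (generic C, not the btrfs code): static gives a function internal linkage, so the symbol no longer leaks into the kernel-wide namespace, tools like sparse stop warning about a missing declaration, and the compiler is free to inline or drop it.

	/* Visible only within this translation unit. */
	static int helper(int x)
	{
		return x * 2;
	}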
Reviewed-by: Johannes Thumshirn Signed-off-by: Lijuan Li Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/free-space-cache.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f74b13f9b19343..deadf5e6258a6b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2619,7 +2619,7 @@ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, } } -int __btrfs_add_free_space(struct btrfs_block_group *block_group, +static int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 offset, u64 bytes, enum btrfs_trim_state trim_state) { diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 33b4da3271b1be..d9b7fbc2008a53 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -114,8 +114,6 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans, void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, struct btrfs_free_space_ctl *ctl); -int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, - u64 size, enum btrfs_trim_state trim_state); int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, From aa20901b07b6caddc64340bf16f0fd877d6a5660 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 29 Jan 2024 19:04:33 +0100 Subject: [PATCH 0392/1406] btrfs: tests: allocate dummy fs_info and root in test_find_delalloc() Allocate fs_info and root to have a valid fs_info pointer in case it's dereferenced by a helper outside of tests, like find_lock_delalloc_range(). Signed-off-by: David Sterba --- fs/btrfs/tests/extent-io-tests.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 25b3349595e005..865d4af4b30356 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -11,6 +11,7 @@ #include "btrfs-tests.h" #include "../ctree.h" #include "../extent_io.h" +#include "../disk-io.h" #include "../btrfs_inode.h" #define PROCESS_UNLOCK (1 << 0) @@ -105,9 +106,11 @@ static void dump_extent_io_tree(const struct extent_io_tree *tree) } } -static int test_find_delalloc(u32 sectorsize) +static int test_find_delalloc(u32 sectorsize, u32 nodesize) { - struct inode *inode; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root = NULL; + struct inode *inode = NULL; struct extent_io_tree *tmp; struct page *page; struct page *locked_page = NULL; @@ -121,12 +124,27 @@ static int test_find_delalloc(u32 sectorsize) test_msg("running find delalloc tests"); + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + + root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); + ret = PTR_ERR(root); + goto out; + } + inode = btrfs_new_test_inode(); if (!inode) { test_std_err(TEST_ALLOC_INODE); - return -ENOMEM; + ret = -ENOMEM; + goto out; } tmp = &BTRFS_I(inode)->io_tree; + BTRFS_I(inode)->root = root; /* * Passing NULL as we don't have fs_info but tracepoints are not used @@ -316,6 +334,8 @@ static int test_find_delalloc(u32 sectorsize) process_page_range(inode, 0, total_dirty - 1, PROCESS_UNLOCK | PROCESS_RELEASE); iput(inode); + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } @@ 
-794,7 +814,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) test_msg("running extent I/O tests"); - ret = test_find_delalloc(sectorsize); + ret = test_find_delalloc(sectorsize, nodesize); if (ret) goto out; From 20fa34922b57ffcbbcfd06bade71cb5933a72443 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 13 Sep 2023 16:11:29 +0200 Subject: [PATCH 0393/1406] btrfs: add helpers to get inode from page/folio pointers Add convenience helpers to get a struct btrfs_inode from a page or folio pointer instead of open coding the chain or intermediate BTRFS_I. This is implemented as a macro (still with type checking) so we don't need full definitions of struct page or address_space. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 ++- fs/btrfs/extent_io.c | 8 ++++---- fs/btrfs/fs.h | 5 +++++ fs/btrfs/inode.c | 2 +- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 41bf4a18968e9c..15ab1baa0630ac 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -528,7 +528,8 @@ static void btree_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct extent_io_tree *tree; - tree = &BTRFS_I(folio->mapping->host)->io_tree; + + tree = &folio_to_inode(folio)->io_tree; extent_invalidate_folio(tree, folio, offset); btree_release_folio(folio, GFP_NOFS); if (folio_get_private(folio)) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8648ea9b5fb53a..7f9eaffbf433a6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -819,7 +819,7 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, u64 disk_bytenr, struct page *page, size_t size, unsigned long pg_offset) { - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_inode *inode = page_to_inode(page); ASSERT(pg_offset + size <= PAGE_SIZE); ASSERT(bio_ctrl->end_io_func); @@ -1151,7 +1151,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, int btrfs_read_folio(struct file *file, struct folio *folio) { struct page *page = &folio->page; - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_inode *inode = page_to_inode(page); u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; @@ -1174,7 +1174,7 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) { - struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); + struct btrfs_inode *inode = page_to_inode(pages[0]); int index; btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); @@ -2372,7 +2372,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) struct extent_map *em; u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; - struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host); + struct btrfs_inode *btrfs_inode = page_to_inode(page); struct extent_io_tree *tree = &btrfs_inode->io_tree; struct extent_map_tree *map = &btrfs_inode->extent_tree; diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 97d10ed4b2c12c..d9a8290152742f 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -831,6 +831,11 @@ struct btrfs_fs_info { #endif }; +#define page_to_inode(_page) (BTRFS_I(_Generic((_page), \ + struct page *: (_page))->mapping->host)) +#define folio_to_inode(_folio) (BTRFS_I(_Generic((_folio), \ + struct folio *: (_folio))->mapping->host)) + static inline u64 btrfs_get_fs_generation(const 
struct btrfs_fs_info *fs_info) { return READ_ONCE(fs_info->generation); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 50aea888d9773e..d3db3f9438b989 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7951,7 +7951,7 @@ static int btrfs_migrate_folio(struct address_space *mapping, static void btrfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { - struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); + struct btrfs_inode *inode = folio_to_inode(folio); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; struct extent_state *cached_state = NULL; From cc6a271673dc23c6fd4120861fdc58305be76f85 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 14 Sep 2023 16:24:43 +0200 Subject: [PATCH 0394/1406] btrfs: add helpers to get fs_info from page/folio pointers Add convenience helpers to get a fs_info from a page or folio pointer instead of open coding the chain or using btrfs_sb() that in some cases does one more pointer hop. This is implemented as a macro (still with type checking) so we don't need full definitions of struct page, folio, btrfs_root and btrfs_fs_info. The latter can't be static inlines as this would create loop between ctree.h <-> fs.h, or the headers would have to be restructured. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/compression.c | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 16 ++++++++-------- fs/btrfs/fs.h | 3 +++ fs/btrfs/inode.c | 2 +- fs/btrfs/lzo.c | 2 +- 6 files changed, 15 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 488089acd49f13..9cae8542c7e025 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1036,7 +1036,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { - struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = page_to_fs_info(dest_page); struct list_head *workspace; const u32 sectorsize = fs_info->sectorsize; int ret; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 15ab1baa0630ac..f46ebb346e0447 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -533,7 +533,7 @@ static void btree_invalidate_folio(struct folio *folio, size_t offset, extent_invalidate_folio(tree, folio, offset); btree_release_folio(folio, GFP_NOFS); if (folio_get_private(folio)) { - btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info, + btrfs_warn(folio_to_fs_info(folio), "folio private not zero on folio %llu", (unsigned long long)folio_pos(folio)); folio_detach_private(folio); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7f9eaffbf433a6..420054ad9acb16 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -432,7 +432,7 @@ static bool btrfs_verify_page(struct page *page, u64 start) static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) { - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = page_to_fs_info(page); struct folio *folio = page_folio(page); ASSERT(page_offset(page) <= start && @@ -940,7 +940,7 @@ int set_folio_extent_mapped(struct folio *folio) if (folio_test_private(folio)) return 0; - fs_info = btrfs_sb(folio->mapping->host->i_sb); + fs_info = folio_to_fs_info(folio); if (btrfs_is_subpage(fs_info, folio->mapping)) return 
btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); @@ -959,7 +959,7 @@ void clear_page_extent_mapped(struct page *page) if (!folio_test_private(folio)) return; - fs_info = btrfs_sb(page->mapping->host->i_sb); + fs_info = page_to_fs_info(page); if (btrfs_is_subpage(fs_info, page->mapping)) return btrfs_detach_subpage(fs_info, folio); @@ -1760,7 +1760,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, */ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) { - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = page_to_fs_info(page); struct folio *folio = page_folio(page); int submitted = 0; u64 page_start = page_offset(page); @@ -1851,7 +1851,7 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) if (!folio_test_private(folio)) return 0; - if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) + if (page_to_fs_info(page)->nodesize < PAGE_SIZE) return submit_eb_subpage(page, wbc); spin_lock(&mapping->i_private_lock); @@ -2303,7 +2303,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, struct extent_state *cached_state = NULL; u64 start = folio_pos(folio); u64 end = start + folio_size(folio) - 1; - size_t blocksize = btrfs_sb(folio->mapping->host->i_sb)->sectorsize; + size_t blocksize = folio_to_fs_info(folio)->sectorsize; /* This function is only called for the btree inode */ ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); @@ -4721,7 +4721,7 @@ static struct extent_buffer *get_next_extent_buffer( static int try_release_subpage_extent_buffer(struct page *page) { - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = page_to_fs_info(page); u64 cur = page_offset(page); const u64 end = page_offset(page) + PAGE_SIZE; int ret; @@ -4794,7 +4794,7 @@ int try_release_extent_buffer(struct page *page) struct folio *folio = page_folio(page); struct extent_buffer *eb; - if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) + if (page_to_fs_info(page)->nodesize < PAGE_SIZE) return try_release_subpage_extent_buffer(page); /* diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index d9a8290152742f..60ec557a7e3a00 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -836,6 +836,9 @@ struct btrfs_fs_info { #define folio_to_inode(_folio) (BTRFS_I(_Generic((_folio), \ struct folio *: (_folio))->mapping->host)) +#define page_to_fs_info(_page) (page_to_inode(_page)->root->fs_info) +#define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info) + static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info) { return READ_ONCE(fs_info->generation); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d3db3f9438b989..b048867c28fbc2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7884,7 +7884,7 @@ static void btrfs_readahead(struct readahead_control *rac) */ static void wait_subpage_spinlock(struct page *page) { - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = page_to_fs_info(page); struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index e43bc0fdc74ec9..110a2c304bdc7a 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -429,7 +429,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); - struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + 
struct btrfs_fs_info *fs_info = page_to_fs_info(dest_page); const u32 sectorsize = fs_info->sectorsize; size_t in_len; size_t out_len; From 2e85b25200a8e87cea6e248381e41eb66e425a89 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 14 Sep 2023 16:45:41 +0200 Subject: [PATCH 0395/1406] btrfs: add helper to get fs_info from struct inode pointer Add a convenience helper to get a fs_info from a VFS inode pointer instead of open coding the chain or using btrfs_sb() that in some cases does one more pointer hop. This is implemented as a macro (still with type checking) so we don't need full definitions of struct btrfs_inode, btrfs_root or btrfs_fs_info. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/compression.c | 6 +++--- fs/btrfs/defrag.c | 4 ++-- fs/btrfs/disk-io.c | 6 +++--- fs/btrfs/export.c | 2 +- fs/btrfs/extent_io.c | 12 +++++------ fs/btrfs/file.c | 14 ++++++------- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/fs.h | 3 +++ fs/btrfs/inode.c | 39 ++++++++++++++++++------------------ fs/btrfs/ioctl.c | 40 ++++++++++++++++++------------------- fs/btrfs/lzo.c | 2 +- fs/btrfs/props.c | 2 +- fs/btrfs/reflink.c | 6 +++--- fs/btrfs/relocation.c | 2 +- 14 files changed, 72 insertions(+), 68 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 9cae8542c7e025..0b8833baf40403 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -281,7 +281,7 @@ static void end_bbio_comprssed_read(struct btrfs_bio *bbio) static noinline void end_compressed_writeback(const struct compressed_bio *cb) { struct inode *inode = &cb->bbio.inode->vfs_inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct folio_batch fbatch; @@ -412,7 +412,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, struct compressed_bio *cb, int *memstall, unsigned long *pflags) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); unsigned long end_index; struct bio *orig_bio = &cb->orig_bbio->bio; u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size; @@ -438,7 +438,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, * This makes readahead less effective, so here disable readahead for * subpage for now, until full compressed write is supported. 
*/ - if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE) + if (fs_info->sectorsize < PAGE_SIZE) return 0; end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 664b1177702c95..786905731146b9 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -809,7 +809,7 @@ static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, u32 extent_thresh, u64 newer_than, bool locked) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_map *next; bool ret = false; @@ -1364,7 +1364,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); unsigned long sectors_defragged = 0; u64 isize = i_size_read(inode); u64 cur; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f46ebb346e0447..4280f8e23461a3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -497,15 +497,15 @@ static int btree_migrate_folio(struct address_space *mapping, static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct btrfs_fs_info *fs_info; int ret; if (wbc->sync_mode == WB_SYNC_NONE) { + struct btrfs_fs_info *fs_info; if (wbc->for_kupdate) return 0; - fs_info = BTRFS_I(mapping->host)->root->fs_info; + fs_info = inode_to_fs_info(mapping->host); /* this is a bit racy, but that's ok */ ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes, BTRFS_DIRTY_METADATA_THRESH, @@ -544,7 +544,7 @@ static void btree_invalidate_folio(struct folio *folio, size_t offset, static bool btree_dirty_folio(struct address_space *mapping, struct folio *folio) { - struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); struct btrfs_subpage_info *spi = fs_info->subpage_info; struct btrfs_subpage *subpage; struct extent_buffer *eb; diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index d710339ca4f391..8398d345ec5b91 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -221,7 +221,7 @@ static int btrfs_get_name(struct dentry *parent, char *name, { struct inode *inode = d_inode(child); struct inode *dir = d_inode(parent); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_inode_ref *iref; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 420054ad9acb16..ac6b5d8895aab9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -206,7 +206,7 @@ static void __process_pages_contig(struct address_space *mapping, struct page *locked_page, u64 start, u64 end, unsigned long page_ops) { - struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); pgoff_t start_index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; pgoff_t index = start_index; @@ -250,7 +250,7 @@ static noinline int lock_delalloc_pages(struct inode *inode, u64 start, u64 end) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct address_space *mapping = inode->i_mapping; pgoff_t start_index = start >> 
PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; @@ -322,7 +322,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, u64 *end) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; const u64 orig_start = *start; const u64 orig_end = *end; @@ -1002,7 +1002,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) { struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 start = page_offset(page); const u64 end = start + PAGE_SIZE - 1; u64 cur = start; @@ -1909,7 +1909,7 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { struct btrfs_eb_write_context ctx = { .wbc = wbc }; - struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; + struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); int ret = 0; int done = 0; int nr_to_write_done = 0; @@ -2197,7 +2197,7 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page, bool found_error = false; int ret = 0; struct address_space *mapping = inode->i_mapping; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); const u32 sectorsize = fs_info->sectorsize; loff_t i_size = i_size_read(inode); u64 cur = start; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 78c3ef68caa3bf..f9d76072398da5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1135,7 +1135,7 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); loff_t pos = iocb->ki_pos; int ret; loff_t oldsize; @@ -1183,7 +1183,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, struct file *file = iocb->ki_filp; loff_t pos; struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct page **pages = NULL; struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; @@ -1459,7 +1459,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); loff_t pos; ssize_t written = 0; ssize_t written_buffered; @@ -1785,7 +1785,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; @@ -2603,7 +2603,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = 
BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; struct btrfs_path *path; @@ -3058,7 +3058,7 @@ static long btrfs_fallocate(struct file *file, int mode, int ret; /* Do not allow fallocate in ZONED mode */ - if (btrfs_is_zoned(btrfs_sb(inode->i_sb))) + if (btrfs_is_zoned(inode_to_fs_info(inode))) return -EOPNOTSUPP; alloc_start = round_down(offset, blocksize); @@ -3763,7 +3763,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) if (fsverity_active(inode)) return 0; - if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos)) + if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos)) return 0; btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index deadf5e6258a6b..ca9f837daa9e3b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -397,7 +397,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, return -ENOMEM; io_ctl->num_pages = num_pages; - io_ctl->fs_info = btrfs_sb(inode->i_sb); + io_ctl->fs_info = inode_to_fs_info(inode); io_ctl->inode = inode; return 0; diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 60ec557a7e3a00..a83c5152215272 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -839,6 +839,9 @@ struct btrfs_fs_info { #define page_to_fs_info(_page) (page_to_inode(_page)->root->fs_info) #define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info) +#define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \ + struct inode *: (_inode)))->root->fs_info) + static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info) { return READ_ONCE(fs_info->generation); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b048867c28fbc2..adf11936a47e22 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2827,7 +2827,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) int btrfs_writepage_cow_fixup(struct page *page) { struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_writepage_fixup *fixup; /* This page has ordered extent covering it already */ @@ -3257,7 +3257,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) { - if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) && + if (btrfs_is_zoned(inode_to_fs_info(ordered->inode)) && !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && list_empty(&ordered->bioc_list)) btrfs_finish_ordered_zoned(ordered); @@ -3742,7 +3742,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *in_path) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_path *path = in_path; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; @@ -4467,8 +4467,8 @@ static void btrfs_prune_dentries(struct btrfs_root *root) int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) { - struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); struct btrfs_root *root = dir->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct inode *inode = d_inode(dentry); struct btrfs_root *dest = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -5023,7 +5023,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) 
btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); if (btrfs_is_zoned(fs_info)) { ret = btrfs_wait_ordered_range(inode, @@ -5226,7 +5226,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, void btrfs_evict_inode(struct inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info; struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv = NULL; @@ -5240,6 +5240,7 @@ void btrfs_evict_inode(struct inode *inode) return; } + fs_info = inode_to_fs_info(inode); evict_inode_truncate_pages(inode); if (inode->i_nlink && @@ -5665,7 +5666,7 @@ static inline u8 btrfs_inode_type(struct inode *inode) struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; @@ -6204,7 +6205,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct inode *dir = args->dir; struct inode *inode = args->inode; const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name; - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_root *root; struct btrfs_inode_item *inode_item; struct btrfs_key *location; @@ -6526,7 +6527,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, static int btrfs_create_common(struct inode *dir, struct dentry *dentry, struct inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_new_inode_args new_inode_args = { .dir = dir, @@ -6596,7 +6597,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct btrfs_trans_handle *trans = NULL; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = d_inode(old_dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct fscrypt_name fname; u64 index; int err; @@ -7079,7 +7080,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes, bool nowait, bool strict) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct can_nocow_file_extent_args nocow_args = { 0 }; struct btrfs_path *path; int ret; @@ -7318,7 +7319,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, unsigned int iomap_flags) { const bool nowait = (iomap_flags & IOMAP_NOWAIT); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_map *em = *map; int type; u64 block_start, orig_start, orig_block_len, ram_bytes; @@ -7458,7 +7459,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, struct iomap *srcmap) { struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_map *em; struct extent_state *cached_state = NULL; struct btrfs_dio_data *dio_data = iter->private; @@ 
-8135,7 +8136,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) struct page *page = vmf->page; struct folio *folio = page_folio(page); struct inode *inode = file_inode(vmf->vma->vm_file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; @@ -8744,7 +8745,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct inode *new_dir, struct dentry *new_dentry) { - struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir); struct btrfs_trans_handle *trans; unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; @@ -8996,7 +8997,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir); struct btrfs_new_inode_args whiteout_args = { .dir = old_dir, .dentry = old_dentry, @@ -9438,7 +9439,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; @@ -9619,7 +9620,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, loff_t actual_len, u64 *alloc_hint, struct btrfs_trans_handle *trans) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; @@ -9771,7 +9772,7 @@ static int btrfs_permission(struct mnt_idmap *idmap, static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b3f931f915333f..43a85b07f656cb 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -243,7 +243,7 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_inode *binode = BTRFS_I(inode); struct btrfs_root *root = binode->root; struct btrfs_trans_handle *trans; @@ -580,7 +580,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct btrfs_qgroup_inherit *inherit) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_key key; struct btrfs_root_item *root_item; @@ -772,7 +772,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct dentry *dentry, bool readonly, struct btrfs_qgroup_inherit *inherit) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct inode *inode; struct 
btrfs_pending_snapshot *pending_snapshot; unsigned int trans_num_items; @@ -960,7 +960,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, struct btrfs_qgroup_inherit *inherit) { struct inode *dir = d_inode(parent->dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct dentry *dentry; struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen); int error; @@ -1095,7 +1095,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 new_size; u64 old_size; u64 devid = 1; @@ -1403,7 +1403,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode, void __user *arg) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; u64 flags = 0; @@ -1426,7 +1426,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; u64 root_flags; @@ -1673,7 +1673,7 @@ static noinline int search_ioctl(struct inode *inode, u64 *buf_size, char __user *ubuf) { - struct btrfs_fs_info *info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *info = inode_to_fs_info(inode); struct btrfs_root *root; struct btrfs_key key; struct btrfs_path *path; @@ -2344,9 +2344,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, bool destroy_v2) { struct dentry *parent = file->f_path.dentry; - struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb); struct dentry *dentry; struct inode *dir = d_inode(parent); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *dest = NULL; @@ -2694,7 +2694,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_vol_args_v2 *vol_args; struct bdev_handle *bdev_handle = NULL; int ret; @@ -2759,7 +2759,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_vol_args *vol_args; struct bdev_handle *bdev_handle = NULL; int ret; @@ -2902,7 +2902,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root *new_root; struct btrfs_dir_item *di; @@ -3176,7 +3176,7 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info, static long btrfs_ioctl_scrub(struct file *file, void __user *arg) { - struct 
btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file)); struct btrfs_ioctl_scrub_args *sa; int ret; @@ -3694,7 +3694,7 @@ static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info, static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_quota_ctl_args *sa; int ret; @@ -3736,7 +3736,7 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_qgroup_assign_args *sa; struct btrfs_trans_handle *trans; @@ -3892,7 +3892,7 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_quota_rescan_args *qsa; int ret; @@ -3956,7 +3956,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, struct btrfs_ioctl_received_subvol_args *sa) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root_item *root_item = &root->root_item; struct btrfs_trans_handle *trans; @@ -4144,7 +4144,7 @@ static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info, static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_trans_handle *trans; @@ -4287,7 +4287,7 @@ check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags, \ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_ioctl_feature_flags flags[2]; @@ -4578,7 +4578,7 @@ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; void __user *argp = (void __user *)arg; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 110a2c304bdc7a..3e5d3b7028e8ba 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -214,7 +214,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); - const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize; + const u32 sectorsize = 
inode_to_fs_info(mapping->host)->sectorsize; struct page *page_in = NULL; char *sizes_ptr; const unsigned long max_nr_page = *out_pages; diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index f9bf591a07187a..ac4a0af2b55439 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -302,7 +302,7 @@ static int prop_compression_validate(const struct btrfs_inode *inode, static int prop_compression_apply(struct inode *inode, const char *value, size_t len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); int type; /* Reset to defaults */ diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index e38cb40e150c96..08d0fb46ceec4d 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -174,7 +174,7 @@ static int clone_copy_inline_extent(struct inode *dst, char *inline_data, struct btrfs_trans_handle **trans_out) { - struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dst); struct btrfs_root *root = BTRFS_I(dst)->root; const u64 aligned_end = ALIGN(new_key->offset + datal, fs_info->sectorsize); @@ -337,7 +337,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, const u64 destoff, int no_time_update) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_path *path = NULL; struct extent_buffer *leaf; struct btrfs_trans_handle *trans; @@ -726,7 +726,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, { struct inode *inode = file_inode(file); struct inode *src = file_inode(file_src); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); int ret; int wb_ret; u64 len = olen; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index abe594f77f99c0..2fca67f2b39b9c 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2987,7 +2987,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, const struct file_extent_cluster *cluster, int *cluster_nr, unsigned long page_index) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 offset = BTRFS_I(inode)->index_cnt; const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); From 7f2c4f406aab890e056fb8f96d16ae0684fa0253 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 24 Jan 2024 23:24:03 +0100 Subject: [PATCH 0396/1406] btrfs: hoist fs_info out of loops in end_bbio_data_write and end_bbio_data_read The fs_info and sectorsize remain the same during the loops, no need to set them on each iteration. 
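The transformation is the classic loop-invariant hoist. As a minimal, self-contained sketch (the struct and function names below are illustrative stand-ins, not the btrfs types), instead of re-deriving the same context from every element, fetch it once before the loop:

  /* Hypothetical example: ctx plays the role of fs_info. */
  struct ctx { unsigned int sectorsize; };
  struct item { struct ctx *ctx; unsigned int len; };

  /* Before: chases item->ctx->sectorsize on every iteration. */
  static unsigned int count_sectors(const struct item *items, int n)
  {
          unsigned int total = 0;

          for (int i = 0; i < n; i++)
                  total += items[i].len / items[i].ctx->sectorsize;
          return total;
  }

  /* After: the context is known up front, so hoist it out of the loop. */
  static unsigned int count_sectors_hoisted(const struct ctx *ctx,
                                            const struct item *items, int n)
  {
          const unsigned int sectorsize = ctx->sectorsize;
          unsigned int total = 0;

          for (int i = 0; i < n; i++)
                  total += items[i].len / sectorsize;
          return total;
  }

In the patched functions the context comes from bbio->fs_info, so the loop body also no longer has to go through folio->mapping->host to find it.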
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ac6b5d8895aab9..197b9f50e75cfa 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -461,16 +461,15 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) */ static void end_bbio_data_write(struct btrfs_bio *bbio) { + struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); struct folio_iter fi; + const u32 sectorsize = fs_info->sectorsize; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_folio_all(fi, bio) { struct folio *folio = fi.folio; - struct inode *inode = folio->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const u32 sectorsize = fs_info->sectorsize; u64 start = folio_pos(folio) + fi.offset; u32 len = fi.length; @@ -592,17 +591,17 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) */ static void end_bbio_data_read(struct btrfs_bio *bbio) { + struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; struct processed_extent processed = { 0 }; struct folio_iter fi; + const u32 sectorsize = fs_info->sectorsize; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_folio_all(fi, &bbio->bio) { bool uptodate = !bio->bi_status; struct folio *folio = fi.folio; struct inode *inode = folio->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const u32 sectorsize = fs_info->sectorsize; u64 start; u64 end; u32 len; From 7113deccba990ccd00493db22df7f36203879bf7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 31 Jan 2024 17:18:04 +0000 Subject: [PATCH 0397/1406] btrfs: don't reserve space for checksums when writing to nocow files Currently when doing a write to a file we always reserve metadata space for inserting data checksums. However we don't need to do it if we have a nodatacow file (-o nodatacow mount option or chattr +C) or if checksums are disabled (-o nodatasum mount option), as in that case we are only adding unnecessary pressure to metadata reservations. For example on x86_64, with the default node size of 16K, a 4K buffered write into a nodatacow file is reserving 655360 bytes of metadata space, as it's accounting for checksums. After this change, which stops reserving space for checksums if we have a nodatacow file or checksums are disabled, we only need to reserve 393216 bytes of metadata. 
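The quoted numbers can be reproduced by hand, assuming the usual definitions of the reservation helpers (btrfs_calc_insert_metadata_size() = nodesize * BTRFS_MAX_LEVEL * 2 * num_items, btrfs_calc_metadata_size() = nodesize * BTRFS_MAX_LEVEL * num_items, with BTRFS_MAX_LEVEL = 8) and that a 4K write counts as one outstanding extent plus one checksum leaf:

  before: insert(extent item + csum leaf) + inode update
        = 16384 * 8 * 2 * 2 + 16384 * 8
        = 524288 + 131072 = 655360 bytes

  after:  insert(extent item only) + inode update
        = 16384 * 8 * 2 + 16384 * 8
        = 262144 + 131072 = 393216 bytes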
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delalloc-space.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 4a60a679d7b445..b3527efd0b4b52 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -243,7 +243,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv = &inode->block_rsv; u64 reserve_size = 0; u64 qgroup_rsv_size = 0; - u64 csum_leaves; unsigned outstanding_extents; lockdep_assert_held(&inode->lock); @@ -258,10 +257,12 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, outstanding_extents); reserve_size += btrfs_calc_metadata_size(fs_info, 1); } - csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, - inode->csum_bytes); - reserve_size += btrfs_calc_insert_metadata_size(fs_info, - csum_leaves); + if (!(inode->flags & BTRFS_INODE_NODATASUM)) { + u64 csum_leaves; + + csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); + reserve_size += btrfs_calc_insert_metadata_size(fs_info, csum_leaves); + } /* * For qgroup rsv, the calculation is very simple: * account one nodesize for each outstanding extent @@ -276,14 +277,20 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, spin_unlock(&block_rsv->lock); } -static void calc_inode_reservations(struct btrfs_fs_info *fs_info, +static void calc_inode_reservations(struct btrfs_inode *inode, u64 num_bytes, u64 disk_num_bytes, u64 *meta_reserve, u64 *qgroup_reserve) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 nr_extents = count_max_extents(fs_info, num_bytes); - u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); + u64 csum_leaves; u64 inode_update = btrfs_calc_metadata_size(fs_info, 1); + if (inode->flags & BTRFS_INODE_NODATASUM) + csum_leaves = 0; + else + csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); + *meta_reserve = btrfs_calc_insert_metadata_size(fs_info, nr_extents + csum_leaves); @@ -335,7 +342,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, * everything out and try again, which is bad. This way we just * over-reserve slightly, and clean up the mess when we are done. 
*/ - calc_inode_reservations(fs_info, num_bytes, disk_num_bytes, + calc_inode_reservations(inode, num_bytes, disk_num_bytes, &meta_reserve, &qgroup_reserve); ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true, noflush); @@ -357,7 +364,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, nr_extents = count_max_extents(fs_info, num_bytes); spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, nr_extents); - inode->csum_bytes += disk_num_bytes; + if (!(inode->flags & BTRFS_INODE_NODATASUM)) + inode->csum_bytes += disk_num_bytes; btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); @@ -391,7 +399,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, num_bytes = ALIGN(num_bytes, fs_info->sectorsize); spin_lock(&inode->lock); - inode->csum_bytes -= num_bytes; + if (!(inode->flags & BTRFS_INODE_NODATASUM)) + inode->csum_bytes -= num_bytes; btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); From 6a1ac55313eee32228e65f1db1636779eb420af9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 2 Feb 2024 12:09:22 +0000 Subject: [PATCH 0398/1406] btrfs: reject encoded write if inode has nodatasum flag set Currently we allow an encoded write against inodes that have the NODATASUM flag set, either because they are NOCOW files or they were created while the filesystem was mounted with "-o nodatasum". This results in having compressed extents without corresponding checksums, which is a filesystem inconsistency reported by 'btrfs check'. For example, running btrfs/281 with MOUNT_OPTIONS="-o nodatacow" triggers this and 'btrfs check' errors out with: [1/7] checking root items [2/7] checking extents [3/7] checking free space tree [4/7] checking fs roots root 256 inode 257 errors 1040, bad file extent, some csum missing root 256 inode 258 errors 1040, bad file extent, some csum missing ERROR: errors found in fs roots (...) So reject encoded writes if the target inode has NODATASUM set. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index adf11936a47e22..2d16bb08e90512 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10290,6 +10290,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE) return -EINVAL; + /* + * Compressed extents should always have checksums, so error out if we + * have a NOCOW file or inode was created while mounted with NODATASUM. + */ + if (inode->flags & BTRFS_INODE_NODATASUM) + return -EINVAL; + orig_count = iov_iter_count(from); /* The extent size must be sane. */ From e95cd684bedafdb791f72ade3c79e9df5b1cb285 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sat, 27 Jan 2024 00:53:06 +0100 Subject: [PATCH 0399/1406] btrfs: add forward declarations and headers, part 1 Do a cleanup in the short headers: - add forward declarations for types referenced by pointers - add includes when types need them This fixes potential compilation problems if the headers are reordered or the missing includes are not provided indirectly. 
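The rule of thumb the series applies: a type that is only ever referenced through a pointer needs just a forward declaration in the header; only code that dereferences it, embeds it by value, or needs its size must see the full definition. A minimal sketch with a hypothetical header (not one of the files touched here):

  /* example.h -- hypothetical, for illustration only */
  #ifndef EXAMPLE_H
  #define EXAMPLE_H

  #include <linux/types.h>           /* u64 is used by value, so include it */

  struct btrfs_trans_handle;         /* used via pointer only: declare */
  struct btrfs_root;                 /* likewise */

  int example_insert_item(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, u64 offset);

  #endif

The corresponding .c file still includes the headers with the real definitions; the header itself now compiles regardless of what was (or was not) included before it.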
Signed-off-by: David Sterba --- fs/btrfs/acl.h | 11 +++++++++++ fs/btrfs/async-thread.h | 3 +++ fs/btrfs/defrag.h | 10 ++++++++++ fs/btrfs/delalloc-space.h | 4 ++++ fs/btrfs/dev-replace.h | 4 ++++ fs/btrfs/dir-item.h | 6 ++++++ fs/btrfs/disk-io.h | 4 ++++ fs/btrfs/export.h | 4 ++++ fs/btrfs/extent_map.h | 1 + fs/btrfs/file.h | 15 +++++++++++++++ fs/btrfs/ioctl.h | 9 +++++++++ fs/btrfs/ordered-data.h | 2 ++ fs/btrfs/orphan.h | 5 +++++ fs/btrfs/print-tree.h | 3 +++ fs/btrfs/props.c | 1 + fs/btrfs/props.h | 7 ++++++- fs/btrfs/raid-stripe-tree.h | 5 +++++ fs/btrfs/rcu-string.h | 6 ++++++ fs/btrfs/ref-verify.h | 9 +++++++++ fs/btrfs/reflink.h | 4 +++- fs/btrfs/relocation.h | 9 +++++++++ fs/btrfs/root-tree.h | 10 ++++++++++ fs/btrfs/scrub.h | 6 ++++++ fs/btrfs/super.h | 7 +++++++ fs/btrfs/sysfs.h | 9 +++++++++ fs/btrfs/tree-mod-log.h | 8 +++++++- fs/btrfs/uuid-tree.h | 5 +++++ fs/btrfs/verity.h | 7 +++++++ fs/btrfs/xattr.h | 6 +++++- 29 files changed, 176 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h index a270e71ec05f91..48b9ddae4a46a7 100644 --- a/fs/btrfs/acl.h +++ b/fs/btrfs/acl.h @@ -3,8 +3,15 @@ #ifndef BTRFS_ACL_H #define BTRFS_ACL_H +struct posix_acl; +struct inode; +struct btrfs_trans_handle; + #ifdef CONFIG_BTRFS_FS_POSIX_ACL +struct mnt_idmap; +struct dentry; + struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); @@ -13,6 +20,10 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, #else +#include + +struct btrfs_trans_handle; + #define btrfs_get_acl NULL #define btrfs_set_acl NULL static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 62b8a0d5789865..04c2f3175828bb 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -7,11 +7,14 @@ #ifndef BTRFS_ASYNC_THREAD_H #define BTRFS_ASYNC_THREAD_H +#include #include +#include struct btrfs_fs_info; struct btrfs_workqueue; struct btrfs_work; + typedef void (*btrfs_func_t)(struct btrfs_work *arg); typedef void (*btrfs_ordered_func_t)(struct btrfs_work *arg, bool); diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h index 5a62763528d1b5..878528e086fbe8 100644 --- a/fs/btrfs/defrag.h +++ b/fs/btrfs/defrag.h @@ -3,6 +3,16 @@ #ifndef BTRFS_DEFRAG_H #define BTRFS_DEFRAG_H +#include +#include + +struct inode; +struct file_ra_state; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_trans_handle; +struct btrfs_ioctl_defrag_range_args; + int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag); diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index c5d573f2366e37..ce4f889e4f17b7 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -3,7 +3,11 @@ #ifndef BTRFS_DELALLOC_SPACE_H #define BTRFS_DELALLOC_SPACE_H +#include + struct extent_changeset; +struct btrfs_inode; +struct btrfs_fs_info; int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); int btrfs_check_data_free_space(struct btrfs_inode *inode, diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 675082ccec89f8..23e480efe5e6e4 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -6,11 +6,15 @@ #ifndef BTRFS_DEV_REPLACE_H #define BTRFS_DEV_REPLACE_H +#include +#include + struct btrfs_ioctl_dev_replace_args; struct btrfs_fs_info; struct 
btrfs_trans_handle; struct btrfs_dev_replace; struct btrfs_block_group; +struct btrfs_device; int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); int btrfs_run_dev_replace(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index e40a226373d7ec..00b3d83d7569e5 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -3,9 +3,15 @@ #ifndef BTRFS_DIR_ITEM_H #define BTRFS_DIR_ITEM_H +#include #include struct fscrypt_str; +struct btrfs_fs_info; +struct btrfs_key; +struct btrfs_path; +struct btrfs_root; +struct btrfs_trans_handle; int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, const struct fscrypt_str *name); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 9413726b329bb1..21ff41bfe2b556 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -6,6 +6,10 @@ #ifndef BTRFS_DISK_IO_H #define BTRFS_DISK_IO_H +#include +#include "ctree.h" +#include "fs.h" + #define BTRFS_SUPER_MIRROR_MAX 3 #define BTRFS_SUPER_MIRROR_SHIFT 12 diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index eba6bc4f5a619f..464582273af926 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -4,6 +4,10 @@ #define BTRFS_EXPORT_H #include +#include + +struct dentry; +struct super_block; extern const struct export_operations btrfs_export_ops; diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index e380fc08bbe453..7fd55cf91f5372 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -5,6 +5,7 @@ #include #include +#include "misc.h" #include "compression.h" #define EXTENT_MAP_LAST_BYTE ((u64)-4) diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index 82b34fbb295f27..77aaca208c7bce 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -3,6 +3,21 @@ #ifndef BTRFS_FILE_H #define BTRFS_FILE_H +#include + +struct file; +struct extent_state; +struct kiocb; +struct iov_iter; +struct page; +struct btrfs_ioctl_encoded_io_args; +struct btrfs_drop_extents_args; +struct btrfs_inode; +struct btrfs_root; +struct btrfs_path; +struct btrfs_replace_extent_info; +struct btrfs_trans_handle; + extern const struct file_operations btrfs_file_operations; int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index d51b9a2f2f6e88..2c5dc25ec67011 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -3,6 +3,15 @@ #ifndef BTRFS_IOCTL_H #define BTRFS_IOCTL_H +#include + +struct file; +struct dentry; +struct mnt_idmap; +struct fileattr; +struct btrfs_fs_info; +struct btrfs_ioctl_balance_args; + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 127ef8bf0ffd78..6fc0521000ac85 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -6,6 +6,8 @@ #ifndef BTRFS_ORDERED_DATA_H #define BTRFS_ORDERED_DATA_H +#include "async-thread.h" + struct btrfs_ordered_sum { /* * Logical start address and length for of the blocks covered by diff --git a/fs/btrfs/orphan.h b/fs/btrfs/orphan.h index 3faab5cbb59ac9..aa54a88a60de1e 100644 --- a/fs/btrfs/orphan.h +++ b/fs/btrfs/orphan.h @@ -3,6 +3,11 @@ #ifndef BTRFS_ORPHAN_H #define BTRFS_ORPHAN_H +#include + +struct btrfs_trans_handle; +struct btrfs_root; + int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); int btrfs_del_orphan_item(struct 
btrfs_trans_handle *trans, diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index c42bc666d5eeab..8504bf1702c7a2 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -9,6 +9,9 @@ /* Buffer size to contain tree name and possibly additional data (offset) */ #define BTRFS_ROOT_NAME_BUF_LEN 48 +struct extent_buffer; +struct btrfs_key; + void btrfs_print_leaf(const struct extent_buffer *l); void btrfs_print_tree(const struct extent_buffer *c, bool follow); const char *btrfs_root_name(const struct btrfs_key *key, char *buf); diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index ac4a0af2b55439..2a9b7b029eeba3 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -4,6 +4,7 @@ */ #include +#include #include "messages.h" #include "props.h" #include "btrfs_inode.h" diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 6e283196e38aba..f60cd89feb2930 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -6,7 +6,12 @@ #ifndef BTRFS_PROPS_H #define BTRFS_PROPS_H -#include "ctree.h" +#include + +struct inode; +struct btrfs_inode; +struct btrfs_path; +struct btrfs_trans_handle; int __init btrfs_props_init(void); diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h index cdb58b38fcb5ea..c9c258f8490374 100644 --- a/fs/btrfs/raid-stripe-tree.h +++ b/fs/btrfs/raid-stripe-tree.h @@ -6,6 +6,10 @@ #ifndef BTRFS_RAID_STRIPE_TREE_H #define BTRFS_RAID_STRIPE_TREE_H +#include +#include +#include "fs.h" + #define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \ BTRFS_BLOCK_GROUP_RAID1_MASK | \ BTRFS_BLOCK_GROUP_RAID0 | \ @@ -13,6 +17,7 @@ struct btrfs_io_context; struct btrfs_io_stripe; +struct btrfs_fs_info; struct btrfs_ordered_extent; struct btrfs_trans_handle; diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h index 5c2b66d155ef72..1c2d7cb1fe6f63 100644 --- a/fs/btrfs/rcu-string.h +++ b/fs/btrfs/rcu-string.h @@ -6,6 +6,12 @@ #ifndef BTRFS_RCU_STRING_H #define BTRFS_RCU_STRING_H +#include +#include +#include +#include +#include + struct rcu_string { struct rcu_head rcu; char str[]; diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h index 855de37719b546..3511e1a5c96ba9 100644 --- a/fs/btrfs/ref-verify.h +++ b/fs/btrfs/ref-verify.h @@ -6,7 +6,16 @@ #ifndef BTRFS_REF_VERIFY_H #define BTRFS_REF_VERIFY_H +#include +#include + +struct btrfs_fs_info; +struct btrfs_ref; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY + +#include + int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info); void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info); int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/reflink.h b/fs/btrfs/reflink.h index ecb309b4dad0fc..1e291f7d85c428 100644 --- a/fs/btrfs/reflink.h +++ b/fs/btrfs/reflink.h @@ -3,7 +3,9 @@ #ifndef BTRFS_REFLINK_H #define BTRFS_REFLINK_H -#include +#include + +struct file; loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h index 5fb60f2deb5305..788c86d8633aff 100644 --- a/fs/btrfs/relocation.h +++ b/fs/btrfs/relocation.h @@ -3,6 +3,15 @@ #ifndef BTRFS_RELOCATION_H #define BTRFS_RELOCATION_H +#include + +struct extent_buffer; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_trans_handle; +struct btrfs_ordered_extent; +struct btrfs_pending_snapshot; + int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, diff 
--git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h index 8b2c3859e4647a..6f929cf3bd4967 100644 --- a/fs/btrfs/root-tree.h +++ b/fs/btrfs/root-tree.h @@ -3,7 +3,17 @@ #ifndef BTRFS_ROOT_TREE_H #define BTRFS_ROOT_TREE_H +#include + struct fscrypt_str; +struct extent_buffer; +struct btrfs_key; +struct btrfs_root; +struct btrfs_root_item; +struct btrfs_path; +struct btrfs_fs_info; +struct btrfs_block_rsv; +struct btrfs_trans_handle; int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h index 7639103ebf9df3..f0df597b75c7c7 100644 --- a/fs/btrfs/scrub.h +++ b/fs/btrfs/scrub.h @@ -3,6 +3,12 @@ #ifndef BTRFS_SCRUB_H #define BTRFS_SCRUB_H +#include + +struct btrfs_fs_info; +struct btrfs_device; +struct btrfs_scrub_progress; + int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly, int is_dev_replace); diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h index f18253ca280d3e..cbcab434b5ecb9 100644 --- a/fs/btrfs/super.h +++ b/fs/btrfs/super.h @@ -3,6 +3,13 @@ #ifndef BTRFS_SUPER_H #define BTRFS_SUPER_H +#include +#include +#include "fs.h" + +struct super_block; +struct btrfs_fs_info; + bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt, unsigned long flags); int btrfs_sync_fs(struct super_block *sb, int wait); diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 86c7eef128731e..e6a284c59809c9 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -3,8 +3,17 @@ #ifndef BTRFS_SYSFS_H #define BTRFS_SYSFS_H +#include +#include #include +struct btrfs_fs_info; +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_block_group; +struct btrfs_space_info; +struct btrfs_qgroup; + enum btrfs_feature_set { FEAT_COMPAT, FEAT_COMPAT_RO, diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index 94f10afeee9725..ff00c8e8a393cb 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -3,7 +3,13 @@ #ifndef BTRFS_TREE_MOD_LOG_H #define BTRFS_TREE_MOD_LOG_H -#include "ctree.h" +#include + +struct extent_buffer; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_root; +struct btrfs_seq_list; /* Represents a tree mod log user. 
*/ struct btrfs_seq_list { diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h index 5350c87fe2caf3..080ede0227aee0 100644 --- a/fs/btrfs/uuid-tree.h +++ b/fs/btrfs/uuid-tree.h @@ -3,6 +3,11 @@ #ifndef BTRFS_UUID_TREE_H #define BTRFS_UUID_TREE_H +#include + +struct btrfs_trans_handle; +struct btrfs_fs_info; + int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, diff --git a/fs/btrfs/verity.h b/fs/btrfs/verity.h index 91c10f7d0a4654..d696659e43e43d 100644 --- a/fs/btrfs/verity.h +++ b/fs/btrfs/verity.h @@ -3,8 +3,13 @@ #ifndef BTRFS_VERITY_H #define BTRFS_VERITY_H +struct inode; +struct btrfs_inode; + #ifdef CONFIG_FS_VERITY +#include + extern const struct fsverity_operations btrfs_verityops; int btrfs_drop_verity_items(struct btrfs_inode *inode); @@ -12,6 +17,8 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size) #else +#include + static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) { return 0; diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 118118ca3e1de7..b9376ea258ff32 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -6,7 +6,11 @@ #ifndef BTRFS_XATTR_H #define BTRFS_XATTR_H -#include +struct dentry; +struct inode; +struct qstr; +struct xattr_handler; +struct btrfs_trans_handle; extern const struct xattr_handler * const btrfs_xattr_handlers[]; From 0c2c9b152176a9d0f19a8fac725a5fce39b73f69 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sat, 27 Jan 2024 03:19:56 +0100 Subject: [PATCH 0400/1406] btrfs: add forward declarations and headers, part 2 Do a cleanup in more headers: - add forward declarations for types referenced by pointers - add includes when types need them This fixes potential compilation problems if the headers are reordered or the missing includes are not provided indirectly. 
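The failure mode both parts guard against is easy to reproduce: a header that silently relies on one of its users having already pulled in a definition breaks as soon as include order changes. A hypothetical self-contained header (not a real btrfs file) looks like this:

  /* thing.h -- hypothetical example. Before this kind of cleanup it
   * would start at "struct thing" and only compile if the consumer
   * had already included <linux/list.h>.
   */
  #ifndef THING_H
  #define THING_H

  #include <linux/list.h>            /* struct list_head is embedded by value */

  struct btrfs_fs_info;              /* used via pointer only: declare, don't include */

  struct thing {
          struct list_head link;
          struct btrfs_fs_info *fs_info;
  };

  #endif

A cheap way to verify a header is self-contained is a dummy translation unit whose first line is #include "thing.h" with nothing before it.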
Signed-off-by: David Sterba --- fs/btrfs/bio.h | 2 ++ fs/btrfs/block-rsv.h | 7 +++++++ fs/btrfs/compression.h | 10 ++++++++-- fs/btrfs/delayed-inode.h | 8 ++++++++ fs/btrfs/disk-io.h | 16 ++++++++++++---- fs/btrfs/extent-io-tree.h | 7 +++++++ fs/btrfs/extent-tree.h | 9 +++++++++ fs/btrfs/extent_io.h | 25 ++++++++++++++++++++----- fs/btrfs/extent_map.h | 7 +++++++ fs/btrfs/file-item.h | 11 +++++++++++ fs/btrfs/free-space-cache.h | 13 +++++++++++++ fs/btrfs/free-space-tree.h | 6 ++++++ fs/btrfs/inode-item.h | 5 +++-- fs/btrfs/locking.h | 8 ++++++-- fs/btrfs/lru_cache.h | 2 ++ fs/btrfs/misc.h | 2 ++ fs/btrfs/ordered-data.h | 13 +++++++++++++ fs/btrfs/raid56.h | 9 +++++++++ fs/btrfs/send.h | 8 +++++--- fs/btrfs/space-info.h | 9 +++++++++ fs/btrfs/subpage.h | 5 +++++ fs/btrfs/transaction.h | 17 ++++++++++++++++- fs/btrfs/tree-checker.h | 2 ++ fs/btrfs/tree-log.h | 8 ++++++++ fs/btrfs/ulist.h | 1 + fs/btrfs/zoned.h | 15 +++++++++++++++ 26 files changed, 206 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index bbaed317161a4c..d9dd5276093df0 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -7,12 +7,14 @@ #ifndef BTRFS_BIO_H #define BTRFS_BIO_H +#include #include #include #include "tree-checker.h" struct btrfs_bio; struct btrfs_fs_info; +struct btrfs_inode; #define BTRFS_BIO_INLINE_CSUM_SIZE 64 diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index b0bd12b8652f4f..b621199b01308c 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -3,8 +3,15 @@ #ifndef BTRFS_BLOCK_RSV_H #define BTRFS_BLOCK_RSV_H +#include +#include +#include + struct btrfs_trans_handle; struct btrfs_root; +struct btrfs_space_info; +struct btrfs_block_rsv; +struct btrfs_fs_info; enum btrfs_reserve_flush_enum; /* diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 97fe3ebf11a223..4691a84ca83831 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -7,10 +7,18 @@ #define BTRFS_COMPRESSION_H #include +#include +#include +#include +#include #include "bio.h" +struct address_space; +struct page; +struct inode; struct btrfs_inode; struct btrfs_ordered_extent; +struct btrfs_bio; /* * We want to make sure that amount of RAM required to uncompress an extent is @@ -32,8 +40,6 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); #define BTRFS_ZLIB_DEFAULT_LEVEL 3 -struct page; - struct compressed_bio { /* Number of compressed pages in the array */ unsigned int nr_pages; diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 5cceb31bbd16b2..3870a4bf718977 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -7,15 +7,23 @@ #ifndef BTRFS_DELAYED_INODE_H #define BTRFS_DELAYED_INODE_H +#include #include #include #include #include #include +#include #include #include #include "ctree.h" +struct btrfs_disk_key; +struct btrfs_fs_info; +struct btrfs_inode; +struct btrfs_root; +struct btrfs_trans_handle; + enum btrfs_delayed_item_type { BTRFS_DELAYED_INSERTION_ITEM, BTRFS_DELAYED_DELETION_ITEM diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 21ff41bfe2b556..375f62ae370920 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -7,9 +7,21 @@ #define BTRFS_DISK_IO_H #include +#include #include "ctree.h" #include "fs.h" +struct block_device; +struct super_block; +struct extent_buffer; +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_fs_info; +struct btrfs_super_block; +struct btrfs_trans_handle; +struct btrfs_tree_parent_check; +struct btrfs_transaction; + #define BTRFS_SUPER_MIRROR_MAX 3 #define 
BTRFS_SUPER_MIRROR_SHIFT 12 @@ -29,10 +41,6 @@ static inline u64 btrfs_sb_offset(int mirror) return BTRFS_SUPER_INFO_OFFSET; } -struct btrfs_device; -struct btrfs_fs_devices; -struct btrfs_tree_parent_check; - void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info); void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index ebe6390d65e9dd..9d3a52d8f59a80 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -3,9 +3,16 @@ #ifndef BTRFS_EXTENT_IO_TREE_H #define BTRFS_EXTENT_IO_TREE_H +#include +#include +#include +#include +#include #include "misc.h" struct extent_changeset; +struct btrfs_fs_info; +struct btrfs_inode; /* Bits for the extent state */ enum { diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 2e066035cceeea..3fbcb7776a03d0 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -3,11 +3,20 @@ #ifndef BTRFS_EXTENT_TREE_H #define BTRFS_EXTENT_TREE_H +#include #include "misc.h" #include "block-group.h" +#include "locking.h" +struct extent_buffer; struct btrfs_free_cluster; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_path; +struct btrfs_disk_key; struct btrfs_delayed_ref_head; +struct btrfs_delayed_ref_root; +struct btrfs_extent_inline_ref; enum btrfs_extent_allocation_policy { BTRFS_EXTENT_ALLOC_CLUSTERED, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4437607f2b0601..e3530d427e1f9f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -7,12 +7,32 @@ #include #include #include +#include +#include +#include +#include +#include #include "compression.h" #include "messages.h" #include "ulist.h" #include "misc.h" +struct page; +struct file; +struct folio; +struct inode; +struct fiemap_extent_info; +struct readahead_control; +struct address_space; +struct writeback_control; +struct extent_io_tree; +struct extent_map_tree; +struct btrfs_block_group; +struct btrfs_fs_info; +struct btrfs_inode; +struct btrfs_root; struct btrfs_trans_handle; +struct btrfs_tree_parent_check; enum { EXTENT_BUFFER_UPTODATE, @@ -64,11 +84,6 @@ enum { #define BITMAP_LAST_BYTE_MASK(nbits) \ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) -struct btrfs_root; -struct btrfs_inode; -struct btrfs_fs_info; -struct extent_io_tree; -struct btrfs_tree_parent_check; int __init extent_buffer_init_cachep(void); void __cold extent_buffer_free_cachep(void); diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 7fd55cf91f5372..c5a098c99cc6e2 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -3,11 +3,18 @@ #ifndef BTRFS_EXTENT_MAP_H #define BTRFS_EXTENT_MAP_H +#include +#include #include +#include #include #include "misc.h" +#include "extent_map.h" #include "compression.h" +struct btrfs_inode; +struct btrfs_fs_info; + #define EXTENT_MAP_LAST_BYTE ((u64)-4) #define EXTENT_MAP_HOLE ((u64)-3) #define EXTENT_MAP_INLINE ((u64)-2) diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 606731bef247b0..15c05cc0fce60e 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -3,9 +3,20 @@ #ifndef BTRFS_FILE_ITEM_H #define BTRFS_FILE_ITEM_H +#include +#include #include "accessors.h" struct extent_map; +struct btrfs_file_extent_item; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_bio; +struct btrfs_trans_handle; +struct btrfs_root; +struct btrfs_ordered_sum; +struct btrfs_path; +struct btrfs_inode; #define BTRFS_FILE_EXTENT_INLINE_DATA_START \ 
(offsetof(struct btrfs_file_extent_item, disk_bytenr)) diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index d9b7fbc2008a53..83774bfd7b3bb0 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -6,6 +6,19 @@ #ifndef BTRFS_FREE_SPACE_CACHE_H #define BTRFS_FREE_SPACE_CACHE_H +#include +#include +#include +#include +#include "fs.h" + +struct inode; +struct page; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_trans_handle; +struct btrfs_trim_block_group; + /* * This is the trim state of an extent or bitmap. * diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index 6d5551d0ced810..e6c6d6f4f2210a 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -6,7 +6,13 @@ #ifndef BTRFS_FREE_SPACE_TREE_H #define BTRFS_FREE_SPACE_TREE_H +#include + struct btrfs_caching_control; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_block_group; +struct btrfs_trans_handle; /* * The default size for new free space bitmap items. The last bitmap in a block diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index 4337bb26f419b7..c4aded82709b1a 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -6,14 +6,15 @@ #include #include +struct fscrypt_str; +struct extent_buffer; struct btrfs_trans_handle; struct btrfs_root; struct btrfs_path; struct btrfs_key; struct btrfs_inode_extref; struct btrfs_inode; -struct extent_buffer; -struct fscrypt_str; +struct btrfs_truncate_control; /* * Return this if we need to call truncate_block for the last bit of the diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 7d6ee1e609bf2b..9576f485a300b1 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -8,8 +8,14 @@ #include #include +#include #include #include "extent_io.h" +#include "locking.h" + +struct extent_buffer; +struct btrfs_path; +struct btrfs_root; #define BTRFS_WRITE_LOCK 1 #define BTRFS_READ_LOCK 2 @@ -157,8 +163,6 @@ enum btrfs_lockdep_trans_states { static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES, "too many lock subclasses defined"); -struct btrfs_path; - void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h index 00328c856be6ef..390a12b61fd28e 100644 --- a/fs/btrfs/lru_cache.h +++ b/fs/btrfs/lru_cache.h @@ -3,8 +3,10 @@ #ifndef BTRFS_LRU_CACHE_H #define BTRFS_LRU_CACHE_H +#include #include #include +#include "lru_cache.h" /* * A cache entry. 
This is meant to be embedded in a structure of a user of diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 40f2d9f1a17a9c..dde4904aead9bc 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -3,6 +3,8 @@ #ifndef BTRFS_MISC_H #define BTRFS_MISC_H +#include +#include #include #include #include diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 6fc0521000ac85..34413fc5b4bd2b 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -6,8 +6,21 @@ #ifndef BTRFS_ORDERED_DATA_H #define BTRFS_ORDERED_DATA_H +#include +#include +#include +#include +#include +#include #include "async-thread.h" +struct inode; +struct page; +struct extent_state; +struct btrfs_inode; +struct btrfs_root; +struct btrfs_fs_info; + struct btrfs_ordered_sum { /* * Logical start address and length for of the blocks covered by diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 470213688872ec..0d7b4c2fb6ae80 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -7,9 +7,18 @@ #ifndef BTRFS_RAID56_H #define BTRFS_RAID56_H +#include +#include +#include +#include +#include #include #include "volumes.h" +struct page; +struct sector_ptr; +struct btrfs_fs_info; + enum btrfs_rbio_ops { BTRFS_RBIO_WRITE, BTRFS_RBIO_READ_REBUILD, diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 4f5509cb180358..dd1c9f02b01118 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -8,6 +8,11 @@ #define BTRFS_SEND_H #include +#include +#include + +struct inode; +struct btrfs_ioctl_send_args; #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" /* Conditional support for the upcoming protocol version. */ @@ -25,9 +30,6 @@ #define BTRFS_SEND_BUF_SIZE_V1 SZ_64K #define BTRFS_SEND_BUF_SIZE_V2 ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE) -struct inode; -struct btrfs_ioctl_send_args; - enum btrfs_tlv_type { BTRFS_TLV_U8, BTRFS_TLV_U16, diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 92c595fed1b0a6..a733458fd13b35 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -4,8 +4,17 @@ #define BTRFS_SPACE_INFO_H #include +#include +#include +#include +#include +#include +#include #include "volumes.h" +struct btrfs_fs_info; +struct btrfs_block_group; + /* * Different levels for to flush space when doing space reservations. * diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 793c2b314a583a..55fc42db707e43 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -4,6 +4,11 @@ #define BTRFS_SUBPAGE_H #include +#include + +struct address_space; +struct folio; +struct btrfs_fs_info; /* * Extra info for subpapge bitmap. diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 2bf8bbdfd0b38b..681109c5f441fa 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -6,12 +6,27 @@ #ifndef BTRFS_TRANSACTION_H #define BTRFS_TRANSACTION_H +#include #include +#include +#include +#include +#include #include "btrfs_inode.h" #include "delayed-ref.h" -#include "ctree.h" +#include "extent-io-tree.h" +#include "block-rsv.h" +#include "messages.h" #include "misc.h" +struct dentry; +struct inode; +struct btrfs_pending_snapshot; +struct btrfs_fs_info; +struct btrfs_root_item; +struct btrfs_root; +struct btrfs_path; + /* Radix-tree tag for roots that are part of the trasaction. 
*/ #define BTRFS_ROOT_TRANS_TAG 0 diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 14b9fbe82da474..5c809b50b2d09b 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -6,10 +6,12 @@ #ifndef BTRFS_TREE_CHECKER_H #define BTRFS_TREE_CHECKER_H +#include #include struct extent_buffer; struct btrfs_chunk; +struct btrfs_key; /* All the extra info needed to verify the parentness of a tree block. */ struct btrfs_tree_parent_check { diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index af219e8840d285..254082a189c323 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -6,10 +6,18 @@ #ifndef BTRFS_TREE_LOG_H #define BTRFS_TREE_LOG_H +#include +#include #include "messages.h" #include "ctree.h" #include "transaction.h" +struct inode; +struct dentry; +struct btrfs_ordered_extent; +struct btrfs_root; +struct btrfs_trans_handle; + /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index b2cef187ea8efc..8e200fe1a2dd3c 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -7,6 +7,7 @@ #ifndef BTRFS_ULIST_H #define BTRFS_ULIST_H +#include #include #include diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index f573bda496fbd1..77c4321e331f37 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -4,12 +4,27 @@ #define BTRFS_ZONED_H #include +#include #include +#include +#include +#include +#include #include "messages.h" #include "volumes.h" #include "disk-io.h" #include "block-group.h" #include "btrfs_inode.h" +#include "fs.h" + +struct block_device; +struct extent_buffer; +struct btrfs_bio; +struct btrfs_ordered_extent; +struct btrfs_fs_info; +struct btrfs_space_info; +struct btrfs_eb_write_context; +struct btrfs_fs_devices; #define BTRFS_DEFAULT_RECLAIM_THRESH (75) From 48e69b630abe7afa0625939a9886fc88e401b99b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sat, 27 Jan 2024 04:31:30 +0100 Subject: [PATCH 0401/1406] btrfs: add forward declarations and headers, part 3 Do a cleanup in the rest of the headers: - add forward declarations for types referenced by pointers - add includes when types need them This fixes potential compilation problems if the headers are reordered or the missing includes are not provided indirectly. 
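To make the failure mode concrete, consider a contrived example (hypothetical files, not an actual breakage from the tree) of a header that only compiles by luck of include order, which is exactly what the added includes guard against:

    /* bad.h - hypothetical: uses u64 without including <linux/types.h> */
    int bad_sum(u64 a, u64 b);

    /* user_a.c - builds, because <linux/types.h> happens to come first */
    #include <linux/types.h>
    #include "bad.h"

    /* user_b.c - the same two headers in the opposite order fail with
     * "unknown type name 'u64'" until bad.h includes <linux/types.h>
     * itself. */
    #include "bad.h"
    #include <linux/types.h>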
Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 11 ++++++++++- fs/btrfs/backref.h | 16 ++++++++++++++-- fs/btrfs/block-group.h | 13 +++++++++++++ fs/btrfs/btrfs_inode.h | 19 +++++++++++++++++++ fs/btrfs/ctree.h | 25 ++++++++++++------------- fs/btrfs/delayed-ref.h | 10 ++++++++++ fs/btrfs/extent-tree.h | 1 + fs/btrfs/fs.h | 42 ++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/qgroup.h | 17 +++++++++++++---- fs/btrfs/volumes.h | 25 +++++++++++++++++++------ 10 files changed, 151 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index ed7aa32972add9..fa099f61fc8ce6 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -3,8 +3,17 @@ #ifndef BTRFS_ACCESSORS_H #define BTRFS_ACCESSORS_H -#include #include +#include +#include +#include +#include +#include +#include +#include +#include + +struct extent_buffer; struct btrfs_map_token { struct extent_buffer *eb; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index ab4ca0eda60557..523e594ac75356 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -6,11 +6,23 @@ #ifndef BTRFS_BACKREF_H #define BTRFS_BACKREF_H -#include +#include +#include +#include +#include +#include +#include #include "messages.h" -#include "ulist.h" +#include "locking.h" #include "disk-io.h" #include "extent_io.h" +#include "ctree.h" + +struct extent_inode_elem; +struct ulist; +struct btrfs_extent_item; +struct btrfs_trans_handle; +struct btrfs_fs_info; /* * Used by implementations of iterate_extent_inodes_t (see definition below) to diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 962b11983901a8..5ef52b9ea37176 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -3,9 +3,22 @@ #ifndef BTRFS_BLOCK_GROUP_H #define BTRFS_BLOCK_GROUP_H +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "free-space-cache.h" struct btrfs_chunk_map; +struct btrfs_fs_info; +struct btrfs_inode; +struct btrfs_trans_handle; enum btrfs_disk_cache_state { BTRFS_DC_WRITTEN, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 83d78a6f3aa2f3..397371472c1c5e 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -8,13 +8,32 @@ #include #include +#include +#include +#include +#include +#include +#include #include +#include +#include #include +#include "block-rsv.h" +#include "btrfs_inode.h" #include "extent_map.h" #include "extent_io.h" +#include "extent-io-tree.h" #include "ordered-data.h" #include "delayed-inode.h" +struct extent_state; +struct posix_acl; +struct iov_iter; +struct writeback_control; +struct btrfs_root; +struct btrfs_fs_info; +struct btrfs_trans_handle; + /* * Since we search a directory based on f_pos (struct dir_context::pos) we have * to start at 2 since '.' and '..' 
have f_pos of 0 and 1 respectively, so diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index eede8128819686..c03c58246033bf 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -7,25 +7,24 @@ #define BTRFS_CTREE_H #include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "locking.h" #include "fs.h" #include "accessors.h" +#include "extent-io-tree.h" +struct extent_buffer; +struct btrfs_block_rsv; struct btrfs_trans_handle; -struct btrfs_transaction; -struct btrfs_pending_snapshot; -struct btrfs_delayed_ref_root; -struct btrfs_space_info; struct btrfs_block_group; -struct btrfs_ordered_sum; -struct btrfs_ref; -struct btrfs_bio; -struct btrfs_ioctl_encoded_io_args; -struct btrfs_device; -struct btrfs_fs_devices; -struct btrfs_balance_control; -struct btrfs_delayed_root; -struct reloc_control; /* Read ahead values for struct btrfs_path.reada */ enum { diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 62d679d40f4f91..cbd632f145f062 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -6,7 +6,17 @@ #ifndef BTRFS_DELAYED_REF_H #define BTRFS_DELAYED_REF_H +#include #include +#include +#include +#include +#include +#include +#include + +struct btrfs_trans_handle; +struct btrfs_fs_info; /* these are the possible values of struct btrfs_delayed_ref_node->action */ enum btrfs_delayed_ref_action { diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 3fbcb7776a03d0..af9f8800d5aca5 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -13,6 +13,7 @@ struct btrfs_free_cluster; struct btrfs_fs_info; struct btrfs_root; struct btrfs_path; +struct btrfs_ref; struct btrfs_disk_key; struct btrfs_delayed_ref_head; struct btrfs_delayed_ref_root; diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index a83c5152215272..a7c3f9abc53a51 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -4,12 +4,50 @@ #define BTRFS_FS_H #include -#include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "extent-io-tree.h" #include "async-thread.h" #include "block-rsv.h" +#include "fs.h" + +struct inode; +struct super_block; +struct kobject; +struct reloc_control; +struct crypto_shash; +struct ulist; +struct btrfs_device; +struct btrfs_block_group; +struct btrfs_root; +struct btrfs_fs_devices; +struct btrfs_transaction; +struct btrfs_delayed_root; +struct btrfs_balance_control; +struct btrfs_subpage_info; +struct btrfs_stripe_hash_table; +struct btrfs_space_info; #define BTRFS_MAX_EXTENT_SIZE SZ_128M diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index be18c862e64ede..1f664261c064e6 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -6,12 +6,22 @@ #ifndef BTRFS_QGROUP_H #define BTRFS_QGROUP_H +#include #include #include #include -#include "ulist.h" -#include "delayed-ref.h" -#include "misc.h" +#include +#include + +struct extent_buffer; +struct extent_changeset; +struct btrfs_delayed_extent_op; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_ioctl_quota_ctl_args; +struct btrfs_trans_handle; +struct btrfs_delayed_ref_root; +struct btrfs_inode; /* * Btrfs qgroup overview @@ -321,7 +331,6 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, struct btrfs_qgroup_limit *limit); int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); 
-struct btrfs_delayed_extent_op; int btrfs_qgroup_trace_extent_nolock( struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 53f87f398da779..21d4de0e3f1f5b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -6,13 +6,28 @@ #ifndef BTRFS_VOLUMES_H #define BTRFS_VOLUMES_H +#include +#include +#include #include -#include -#include "async-thread.h" +#include +#include +#include +#include +#include +#include +#include +#include #include "messages.h" -#include "tree-checker.h" #include "rcu-string.h" +struct block_device; +struct bdev_handle; +struct btrfs_fs_info; +struct btrfs_block_group; +struct btrfs_trans_handle; +struct btrfs_zoned_device_info; + #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) extern struct mutex uuid_mutex; @@ -77,7 +92,7 @@ enum btrfs_raid_types { #define BTRFS_DEV_STATE_FLUSH_SENT (4) #define BTRFS_DEV_STATE_NO_READA (5) -struct btrfs_zoned_device_info; +struct btrfs_fs_devices; struct btrfs_device { struct list_head dev_list; /* device_list_mutex */ @@ -557,8 +572,6 @@ static inline void btrfs_free_chunk_map(struct btrfs_chunk_map *map) } } -struct btrfs_balance_args; -struct btrfs_balance_progress; struct btrfs_balance_control { struct btrfs_balance_args data; struct btrfs_balance_args meta; From c4effe1da938b0dcb1e45782d5e5b49eb49bf197 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 9 Feb 2024 21:06:06 -0800 Subject: [PATCH 0402/1406] fs/hfsplus: use better @opf description Use a more descriptive explanation of the @opf function parameter, more in line with . Fixes: 02105f18a26c ("fs/hfsplus: wrapper.c: fix kernel-doc warnings") Suggested-by: Bart Van Assche Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20240210050606.9182-1-rdunlap@infradead.org Reviewed-by: Bart Van Assche Cc: Alexander Viro Cc: Christian Brauner Cc: Jens Axboe Signed-off-by: Christian Brauner --- fs/hfsplus/wrapper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index b0cb704009963c..ce9346099c72dc 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -30,7 +30,7 @@ struct hfsplus_wd { * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes * @buf: buffer for I/O * @data: output pointer for location of requested data - * @opf: request op flags + * @opf: I/O operation type and flags * * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads From 32e9b680de4b46cfe835cbc7ee3dc721f7cc65fb Mon Sep 17 00:00:00 2001 From: Viken Dadhaniya Date: Mon, 12 Feb 2024 18:22:39 +0530 Subject: [PATCH 0403/1406] i2c: i2c-qcom-geni: Correct I2C TRE sequence For an i2c read operation in GSI mode, we are getting a timeout due to a malformed TRE, basically an incorrect TRE sequence, in the gpi (drivers/dma/qcom/gpi.c) driver. The I2C driver's geni_i2c_gpi(I2C_WRITE) function generates the GO TRE and geni_i2c_gpi(I2C_READ) generates the DMA TRE. Hence, to generate the GO TRE before the DMA TRE, we should move geni_i2c_gpi(I2C_WRITE) before geni_i2c_gpi(I2C_READ) inside the I2C GSI mode transfer function, i.e. geni_i2c_gpi_xfer(). TRE stands for Transfer Ring Element - it is basically an element with a size of 4 words, containing all information such as the slave address, clock divider, DMA address value, data size, etc. Mainly we have 3 TREs (CONFIG, GO and DMA TREs). - CONFIG TRE : consists of the internal register configuration which is required before the start of the transfer.
- DMA TRE : contains the DDR/memory address, called a DMA descriptor. - GO TRE : contains the transfer direction, slave ID, delay flags and length of the transfer. The I2C driver calls the GPI driver API to configure each TRE depending on the protocol. For a read operation, the TRE sequence was as below, which is not aligned with the hardware programming guide. - CONFIG tre - DMA tre - GO tre As per Qualcomm's internal Hardware Programming Guide, we should configure the TREs in the below sequence for any RX-only transfer. - CONFIG tre - GO tre - DMA tre Fixes: d8703554f4de ("i2c: qcom-geni: Add support for GPI DMA") Reviewed-by: Andi Shyti Reviewed-by: Bryan O'Donoghue Tested-by: Bryan O'Donoghue # qrb5165-rb5 Co-developed-by: Mukesh Kumar Savaliya Signed-off-by: Mukesh Kumar Savaliya Signed-off-by: Viken Dadhaniya Reviewed-by: Dmitry Baryshkov Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-qcom-geni.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/i2c/busses/i2c-qcom-geni.c b/drivers/i2c/busses/i2c-qcom-geni.c index 0d2e7171e3a6f9..da94df466e83c9 100644 --- a/drivers/i2c/busses/i2c-qcom-geni.c +++ b/drivers/i2c/busses/i2c-qcom-geni.c @@ -613,20 +613,20 @@ static int geni_i2c_gpi_xfer(struct geni_i2c_dev *gi2c, struct i2c_msg msgs[], i peripheral.addr = msgs[i].addr; + ret = geni_i2c_gpi(gi2c, &msgs[i], &config, + &tx_addr, &tx_buf, I2C_WRITE, gi2c->tx_c); + if (ret) + goto err; + if (msgs[i].flags & I2C_M_RD) { ret = geni_i2c_gpi(gi2c, &msgs[i], &config, &rx_addr, &rx_buf, I2C_READ, gi2c->rx_c); if (ret) goto err; - } - - ret = geni_i2c_gpi(gi2c, &msgs[i], &config, - &tx_addr, &tx_buf, I2C_WRITE, gi2c->tx_c); - if (ret) - goto err; - if (msgs[i].flags & I2C_M_RD) dma_async_issue_pending(gi2c->rx_c); + } + dma_async_issue_pending(gi2c->tx_c); timeout = wait_for_completion_timeout(&gi2c->done, XFER_TIMEOUT); From 1f82cb2f3859540120e990a79abfee8151ea6b93 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 12 Feb 2024 14:01:09 +0100 Subject: [PATCH 0404/1406] hardening: Enable KFENCE in the hardening config KFENCE is not a security mitigation mechanism (due to sampling), but has the performance characteristics of unintrusive hardening techniques. When used at scale, however, it improves overall security by allowing kernel developers to detect heap memory-safety bugs cheaply. Link: https://lkml.kernel.org/r/79B9A832-B3DE-4229-9D87-748B2CFB7D12@kernel.org Cc: Matthieu Baerts Cc: Jakub Kicinski Signed-off-by: Marco Elver Link: https://lore.kernel.org/r/20240212130116.997627-1-elver@google.com Signed-off-by: Kees Cook --- kernel/configs/hardening.config | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index ed126d7b5e83b7..7a5bbfc024b7d0 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -45,6 +45,9 @@ CONFIG_UBSAN_BOUNDS=y # CONFIG_UBSAN_ENUM # CONFIG_UBSAN_ALIGNMENT +# Sampling-based heap out-of-bounds and use-after-free detection. +CONFIG_KFENCE=y + # Linked list integrity checking. CONFIG_LIST_HARDENED=y From f23e1dee9e3edcf3d66eab3e7019f81af2ebd05f Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Sun, 11 Feb 2024 16:17:38 +0800 Subject: [PATCH 0405/1406] dt-bindings: arm: sunxi: Add Sipeed Longan Module 3H and Longan Pi 3H Add name & compatible for the Sipeed Longan Module 3H and Longan Pi 3H board.
Signed-off-by: Jisheng Zhang Reviewed-by: Andre Przywara Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20240211081739.395-2-jszhang@kernel.org Signed-off-by: Jernej Skrabec --- Documentation/devicetree/bindings/arm/sunxi.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/arm/sunxi.yaml b/Documentation/devicetree/bindings/arm/sunxi.yaml index dab7a248c88da2..09d835db6db57a 100644 --- a/Documentation/devicetree/bindings/arm/sunxi.yaml +++ b/Documentation/devicetree/bindings/arm/sunxi.yaml @@ -841,6 +841,12 @@ properties: - const: sinlinx,sina33 - const: allwinner,sun8i-a33 + - description: Sipeed Longan Pi 3H board for the Sipeed Longan Module 3H + items: + - const: sipeed,longan-pi-3h + - const: sipeed,longan-module-3h + - const: allwinner,sun50i-h618 + - description: SourceParts PopStick v1.1 items: - const: sourceparts,popstick-v1.1 From ec9fec09419482c756db9bb626aa9da921e04525 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Sun, 11 Feb 2024 16:17:39 +0800 Subject: [PATCH 0406/1406] arm64: dts: allwinner: h616: Add Sipeed Longan SoM 3H and Pi 3H board support The Sipeed Longan SoM 3H is a system on module based on the Allwinner H618 SoC. The SoM features: - Four ARM Cortex-A53 cores, Mali-G31 MP2 GPU - 2/4 GiB LPDDR4 DRAM SoMs - AXP313a PMIC - eMMC The Sipeed Longan PI 3H is a development board based on the above SoM. The board features: - Longan SoM 3H - Raspberry-Pi-1 compatible GPIO header - 2 USB 2.0 host port - 1 USB 2.0 type C port (power supply + OTG) - MicroSD slot - 1Gbps Ethernet port (via RTL8211 PHY) - HDMI port - WiFi/BT chip Add the devicetree file describing the currently supported features, namely PMIC, LEDs, UART, SD card, eMMC, USB and Ethernet. Signed-off-by: Jisheng Zhang Reviewed-by: Andre Przywara Reviewed-by: Jernej Skrabec Link: https://lore.kernel.org/r/20240211081739.395-3-jszhang@kernel.org Signed-off-by: Jernej Skrabec --- arch/arm64/boot/dts/allwinner/Makefile | 1 + .../sun50i-h618-longan-module-3h.dtsi | 75 +++++++++ .../dts/allwinner/sun50i-h618-longanpi-3h.dts | 144 ++++++++++++++++++ 3 files changed, 220 insertions(+) create mode 100644 arch/arm64/boot/dts/allwinner/sun50i-h618-longan-module-3h.dtsi create mode 100644 arch/arm64/boot/dts/allwinner/sun50i-h618-longanpi-3h.dts diff --git a/arch/arm64/boot/dts/allwinner/Makefile b/arch/arm64/boot/dts/allwinner/Makefile index 2db3b15ad09f2c..62ac7cd2953316 100644 --- a/arch/arm64/boot/dts/allwinner/Makefile +++ b/arch/arm64/boot/dts/allwinner/Makefile @@ -43,5 +43,6 @@ dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h616-bigtreetech-cb1-manta.dtb dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h616-bigtreetech-pi.dtb dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h616-orangepi-zero2.dtb dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h616-x96-mate.dtb +dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h618-longanpi-3h.dtb dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h618-orangepi-zero3.dtb dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h618-transpeed-8k618-t.dtb diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h618-longan-module-3h.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h618-longan-module-3h.dtsi new file mode 100644 index 00000000000000..8c1263a3939e76 --- /dev/null +++ b/arch/arm64/boot/dts/allwinner/sun50i-h618-longan-module-3h.dtsi @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +/* + * Copyright (C) Jisheng Zhang + */ + +#include "sun50i-h616.dtsi" + +&mmc2 { + pinctrl-names = "default"; + pinctrl-0 = <&mmc2_pins>; + vmmc-supply = <®_dldo1>; + vqmmc-supply = <®_aldo1>; + bus-width = <8>; + non-removable; + 
cap-mmc-hw-reset; + mmc-ddr-1_8v; + mmc-hs200-1_8v; + status = "okay"; +}; + +&r_i2c { + status = "okay"; + + axp313: pmic@36 { + compatible = "x-powers,axp313a"; + reg = <0x36>; + #interrupt-cells = <1>; + interrupt-controller; + + regulators { + reg_aldo1: aldo1 { + regulator-always-on; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + regulator-name = "vcc-1v8-pll"; + }; + + reg_dldo1: dldo1 { + regulator-always-on; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + regulator-name = "vcc-3v3-io"; + }; + + reg_dcdc1: dcdc1 { + regulator-always-on; + regulator-min-microvolt = <810000>; + regulator-max-microvolt = <990000>; + regulator-name = "vdd-gpu-sys"; + }; + + reg_dcdc2: dcdc2 { + regulator-always-on; + regulator-min-microvolt = <810000>; + regulator-max-microvolt = <1100000>; + regulator-name = "vdd-cpu"; + }; + + reg_dcdc3: dcdc3 { + regulator-always-on; + regulator-min-microvolt = <1100000>; + regulator-max-microvolt = <1100000>; + regulator-name = "vdd-dram"; + }; + }; + }; +}; + +&pio { + vcc-pc-supply = <®_dldo1>; + vcc-pf-supply = <®_dldo1>; + vcc-pg-supply = <®_aldo1>; + vcc-ph-supply = <®_dldo1>; + vcc-pi-supply = <®_dldo1>; +}; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h618-longanpi-3h.dts b/arch/arm64/boot/dts/allwinner/sun50i-h618-longanpi-3h.dts new file mode 100644 index 00000000000000..18b29c6b867f3f --- /dev/null +++ b/arch/arm64/boot/dts/allwinner/sun50i-h618-longanpi-3h.dts @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +/* + * Copyright (C) Jisheng Zhang + */ + +/dts-v1/; + +#include "sun50i-h618-longan-module-3h.dtsi" + +#include +#include +#include + +/ { + model = "Sipeed Longan Pi 3H"; + compatible = "sipeed,longan-pi-3h", "sipeed,longan-module-3h", "allwinner,sun50i-h618"; + + aliases { + ethernet0 = &emac0; + serial0 = &uart0; + }; + + chosen { + stdout-path = "serial0:115200n8"; + }; + + leds { + compatible = "gpio-leds"; + + led-0 { + color = ; + function = LED_FUNCTION_INDICATOR; + function-enumerator = <0>; + gpios = <&pio 6 2 GPIO_ACTIVE_LOW>; /* PG2 */ + }; + + led-1 { + color = ; + function = LED_FUNCTION_INDICATOR; + function-enumerator = <1>; + gpios = <&pio 6 4 GPIO_ACTIVE_LOW>; /* PG4 */ + }; + }; + + reg_vcc5v: regulator-vcc5v { + /* board wide 5V supply directly from the USB-C socket */ + compatible = "regulator-fixed"; + regulator-name = "vcc-5v"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + regulator-always-on; + }; + + reg_vcc3v3: regulator-vcc3v3 { + compatible = "regulator-fixed"; + regulator-name = "vcc-3v3"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + regulator-always-on; + vin-supply = <®_vcc5v>; + }; +}; + +&axp313 { + vin1-supply = <®_vcc5v>; + vin2-supply = <®_vcc5v>; + vin3-supply = <®_vcc5v>; +}; + +&ehci1 { + status = "okay"; +}; + +&ohci1 { + status = "okay"; +}; + +&ehci2 { + status = "okay"; +}; + +&ohci2 { + status = "okay"; +}; + +/* WiFi & BT combo module is connected to this Host */ +&ehci3 { + status = "okay"; +}; + +&ohci3 { + status = "okay"; +}; + +&emac0 { + pinctrl-names = "default"; + pinctrl-0 = <&ext_rgmii_pins>; + phy-mode = "rgmii"; + phy-handle = <&ext_rgmii_phy>; + allwinner,rx-delay-ps = <3100>; + allwinner,tx-delay-ps = <700>; + phy-supply = <®_vcc3v3>; + status = "okay"; +}; + +&mdio0 { + ext_rgmii_phy: ethernet-phy@1 { + compatible = "ethernet-phy-ieee802.3-c22"; + reg = <1>; + }; +}; + +&mmc0 { + bus-width = <4>; + cd-gpios = <&pio 5 6 GPIO_ACTIVE_HIGH>; 
/* PF6 */ + vmmc-supply = <®_vcc3v3>; + status = "okay"; +}; + +&uart0 { + status = "okay"; +}; + +&usbotg { + /* + * PHY0 pins are connected to a USB-C socket, but a role switch + * is not implemented: both CC pins are pulled to GND. + * The VBUS pins power the device, so a fixed peripheral mode + * is the best choice. + * The board can be powered via GPIOs, in this case port0 *can* + * act as a host (with a cable/adapter ignoring CC), as VBUS is + * then provided by the GPIOs. Any user of this setup would + * need to adjust the DT accordingly: dr_mode set to "host", + * enabling OHCI0 and EHCI0. + */ + dr_mode = "peripheral"; + status = "okay"; +}; + +&usbphy { + usb1_vbus-supply = <®_vcc5v>; + usb2_vbus-supply = <®_vcc5v>; + status = "okay"; +}; From fca901387e5dc2cafeb99084fa2379cdd09f3401 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 29 Jan 2024 14:58:40 +0100 Subject: [PATCH 0407/1406] arm64: dts: renesas: ulcb-kf: Add regulators for PCIe ch1 Without them, no power, so cards do not get recognized. Signed-off-by: Wolfram Sang Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240129135840.28988-1-wsa+renesas@sang-engineering.com Signed-off-by: Geert Uytterhoeven --- arch/arm64/boot/dts/renesas/ulcb-kf.dtsi | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/ulcb-kf.dtsi b/arch/arm64/boot/dts/renesas/ulcb-kf.dtsi index e3cc0e0e73cc33..c8dc06f0dfed71 100644 --- a/arch/arm64/boot/dts/renesas/ulcb-kf.dtsi +++ b/arch/arm64/boot/dts/renesas/ulcb-kf.dtsi @@ -39,6 +39,24 @@ regulator-max-microvolt = <1800000>; }; + pcie_1v5: regulator-pcie-1v5 { + compatible = "regulator-fixed"; + regulator-name = "pcie-1v5"; + regulator-min-microvolt = <1500000>; + regulator-max-microvolt = <1500000>; + gpio = <&gpio_exp_77 15 GPIO_ACTIVE_HIGH>; + enable-active-high; + }; + + pcie_3v3: regulator-pcie-3v3 { + compatible = "regulator-fixed"; + regulator-name = "pcie-3v3"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + gpio = <&gpio_exp_77 14 GPIO_ACTIVE_HIGH>; + enable-active-high; + }; + snd_vcc5v: regulator-snd_vcc5v { compatible = "regulator-fixed"; regulator-name = "snd-vcc5v"; @@ -323,6 +341,9 @@ &pciec1 { status = "okay"; + + vpcie1v5-supply = <&pcie_1v5>; + vpcie3v3-supply = <&pcie_3v3>; }; &pfc { From 57a49af7e5aee2486a24846ce33937fd9aca3ea7 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 29 Jan 2024 22:23:49 +0100 Subject: [PATCH 0408/1406] arm64: dts: renesas: ulcb-kf: Adapt 1.8V HDMI regulator to schematics It is named T1.8V in the schematics. Also add properties documenting it is always on, also during boot. 
Signed-off-by: Wolfram Sang Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240129212350.33370-2-wsa+renesas@sang-engineering.com Signed-off-by: Geert Uytterhoeven --- arch/arm64/boot/dts/renesas/ulcb-kf.dtsi | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/arm64/boot/dts/renesas/ulcb-kf.dtsi b/arch/arm64/boot/dts/renesas/ulcb-kf.dtsi index c8dc06f0dfed71..52249c3e5509a7 100644 --- a/arch/arm64/boot/dts/renesas/ulcb-kf.dtsi +++ b/arch/arm64/boot/dts/renesas/ulcb-kf.dtsi @@ -32,11 +32,13 @@ }; }; - hdmi_1v8: regulator-hdmi-1v8 { + reg_t1p8v: regulator-t1p8v { compatible = "regulator-fixed"; - regulator-name = "hdmi-1v8"; + regulator-name = "T1.8V"; regulator-min-microvolt = <1800000>; regulator-max-microvolt = <1800000>; + regulator-boot-on; + regulator-always-on; }; pcie_1v5: regulator-pcie-1v5 { @@ -154,11 +156,11 @@ pd-gpios = <&gpio_exp_75 5 GPIO_ACTIVE_LOW>; - avdd-supply = <&hdmi_1v8>; - dvdd-supply = <&hdmi_1v8>; - pvdd-supply = <&hdmi_1v8>; + avdd-supply = <®_t1p8v>; + dvdd-supply = <®_t1p8v>; + pvdd-supply = <®_t1p8v>; dvdd-3v-supply = <®_3p3v>; - bgvdd-supply = <&hdmi_1v8>; + bgvdd-supply = <®_t1p8v>; adi,input-depth = <8>; adi,input-colorspace = "rgb"; From 2a1c27371d080857a08939e1fd595a8427d585fc Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 29 Jan 2024 22:23:50 +0100 Subject: [PATCH 0409/1406] arm64: dts: renesas: ulcb-kf: Adapt sound 5v regulator to schematics Sound uses the standard 5V supply, so rename the fixed regulator as such. Also add properties documenting it is always on, also during boot. Signed-off-by: Wolfram Sang Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240129212350.33370-3-wsa+renesas@sang-engineering.com Signed-off-by: Geert Uytterhoeven --- arch/arm64/boot/dts/renesas/ulcb-kf.dtsi | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/arm64/boot/dts/renesas/ulcb-kf.dtsi b/arch/arm64/boot/dts/renesas/ulcb-kf.dtsi index 52249c3e5509a7..d655aeb69725a1 100644 --- a/arch/arm64/boot/dts/renesas/ulcb-kf.dtsi +++ b/arch/arm64/boot/dts/renesas/ulcb-kf.dtsi @@ -59,11 +59,13 @@ enable-active-high; }; - snd_vcc5v: regulator-snd_vcc5v { + reg_5v: regulator-5v { compatible = "regulator-fixed"; - regulator-name = "snd-vcc5v"; + regulator-name = "fixed-5V"; regulator-min-microvolt = <5000000>; regulator-max-microvolt = <5000000>; + regulator-boot-on; + regulator-always-on; }; wlan_en: regulator-wlan_en { @@ -210,10 +212,10 @@ VDD1-supply = <®_3p3v>; VDD2-supply = <®_3p3v>; - VCCAD1-supply = <&snd_vcc5v>; - VCCAD2-supply = <&snd_vcc5v>; - VCCDA1-supply = <&snd_vcc5v>; - VCCDA2-supply = <&snd_vcc5v>; + VCCAD1-supply = <®_5v>; + VCCAD2-supply = <®_5v>; + VCCDA1-supply = <®_5v>; + VCCDA2-supply = <®_5v>; }; gyroscope@6b { From 48a9cfe6d9f2754eb1709d9658ab309cefa621cb Mon Sep 17 00:00:00 2001 From: Duy Nguyen Date: Thu, 1 Feb 2024 15:19:16 +0100 Subject: [PATCH 0410/1406] arm64: dts: renesas: r8a779h0: Add L3 cache controller Describe the cache configuration for the first Cortex-A76 CPU core on the Renesas R-Car V4M (R8A779H0) SoC. 
Signed-off-by: Duy Nguyen Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/9d56a46892c5e0957d244370e6809013cf815905.1706796979.git.geert+renesas@glider.be --- arch/arm64/boot/dts/renesas/r8a779h0.dtsi | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi index 9ad53e85cf6068..d8a0f093387327 100644 --- a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi @@ -23,6 +23,14 @@ reg = <0>; device_type = "cpu"; power-domains = <&sysc R8A779H0_PD_A1E0D0C0>; + next-level-cache = <&L3_CA76>; + }; + + L3_CA76: cache-controller { + compatible = "cache"; + power-domains = <&sysc R8A779H0_PD_A2E0D0>; + cache-unified; + cache-level = <3>; }; }; From 7db43283be5364e07f8d2571a11789a2af086b12 Mon Sep 17 00:00:00 2001 From: Duy Nguyen Date: Thu, 1 Feb 2024 15:19:17 +0100 Subject: [PATCH 0411/1406] arm64: dts: renesas: r8a779h0: Add secondary CA76 CPU cores Complete the description of the Cortex-A76 CPU cores and L3 cache controllers on the Renesas R-Car V4M (R8A779H0) SoC, including CPU topology and PSCI support for enabling CPU cores. Signed-off-by: Duy Nguyen Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/c2a38a0da74915bf2a9171e53886c83a1c732934.1706796979.git.geert+renesas@glider.be --- arch/arm64/boot/dts/renesas/r8a779h0.dtsi | 50 +++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi index d8a0f093387327..24e162d09c46a2 100644 --- a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi @@ -18,12 +18,57 @@ #address-cells = <1>; #size-cells = <0>; + cpu-map { + cluster0 { + core0 { + cpu = <&a76_0>; + }; + core1 { + cpu = <&a76_1>; + }; + core2 { + cpu = <&a76_2>; + }; + core3 { + cpu = <&a76_3>; + }; + }; + }; + a76_0: cpu@0 { compatible = "arm,cortex-a76"; reg = <0>; device_type = "cpu"; power-domains = <&sysc R8A779H0_PD_A1E0D0C0>; next-level-cache = <&L3_CA76>; + enable-method = "psci"; + }; + + a76_1: cpu@100 { + compatible = "arm,cortex-a76"; + reg = <0x100>; + device_type = "cpu"; + power-domains = <&sysc R8A779H0_PD_A1E0D0C1>; + next-level-cache = <&L3_CA76>; + enable-method = "psci"; + }; + + a76_2: cpu@200 { + compatible = "arm,cortex-a76"; + reg = <0x200>; + device_type = "cpu"; + power-domains = <&sysc R8A779H0_PD_A1E0D0C2>; + next-level-cache = <&L3_CA76>; + enable-method = "psci"; + }; + + a76_3: cpu@300 { + compatible = "arm,cortex-a76"; + reg = <0x300>; + device_type = "cpu"; + power-domains = <&sysc R8A779H0_PD_A1E0D0C3>; + next-level-cache = <&L3_CA76>; + enable-method = "psci"; }; L3_CA76: cache-controller { @@ -53,6 +98,11 @@ interrupts-extended = <&gic GIC_PPI 7 IRQ_TYPE_LEVEL_LOW>; }; + psci { + compatible = "arm,psci-1.0", "arm,psci-0.2"; + method = "smc"; + }; + /* External SCIF clock - to be overridden by boards that provide it */ scif_clk: scif-clk { compatible = "fixed-clock"; From ad802e18165a809d15c2b7a0e7f47c56031edb08 Mon Sep 17 00:00:00 2001 From: Duy Nguyen Date: Thu, 1 Feb 2024 15:19:18 +0100 Subject: [PATCH 0412/1406] arm64: dts: renesas: r8a779h0: Add CPUIdle support Support CPUIdle for ARM Cortex-A76 on R-Car V4M. 
Signed-off-by: Duy Nguyen Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/848d176bdbcaf3bc44e5dae555afa9c812a19fd1.1706796979.git.geert+renesas@glider.be --- arch/arm64/boot/dts/renesas/r8a779h0.dtsi | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi index 24e162d09c46a2..9ef61ff61740b9 100644 --- a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi @@ -42,6 +42,7 @@ power-domains = <&sysc R8A779H0_PD_A1E0D0C0>; next-level-cache = <&L3_CA76>; enable-method = "psci"; + cpu-idle-states = <&CPU_SLEEP_0>; }; a76_1: cpu@100 { @@ -51,6 +52,7 @@ power-domains = <&sysc R8A779H0_PD_A1E0D0C1>; next-level-cache = <&L3_CA76>; enable-method = "psci"; + cpu-idle-states = <&CPU_SLEEP_0>; }; a76_2: cpu@200 { @@ -60,6 +62,7 @@ power-domains = <&sysc R8A779H0_PD_A1E0D0C2>; next-level-cache = <&L3_CA76>; enable-method = "psci"; + cpu-idle-states = <&CPU_SLEEP_0>; }; a76_3: cpu@300 { @@ -69,6 +72,20 @@ power-domains = <&sysc R8A779H0_PD_A1E0D0C3>; next-level-cache = <&L3_CA76>; enable-method = "psci"; + cpu-idle-states = <&CPU_SLEEP_0>; + }; + + idle-states { + entry-method = "psci"; + + CPU_SLEEP_0: cpu-sleep-0 { + compatible = "arm,idle-state"; + arm,psci-suspend-param = <0x0010000>; + local-timer-stop; + entry-latency-us = <400>; + exit-latency-us = <500>; + min-residency-us = <4000>; + }; }; L3_CA76: cache-controller { From 6061af689099d61e95a76f9d890594a1dd848931 Mon Sep 17 00:00:00 2001 From: Duy Nguyen Date: Thu, 1 Feb 2024 15:19:19 +0100 Subject: [PATCH 0413/1406] arm64: dts: renesas: r8a779h0: Add CPU core clocks Describe the clocks for the four Cortex-A76 CPU cores. CA76 CPU cores 0,1,2,3 are clocked by ZC0,ZC1,ZC2,ZC3. Signed-off-by: Duy Nguyen Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/c64cf6ca1590fa1a36b90a18fd70c831d5b8318e.1706796979.git.geert+renesas@glider.be --- arch/arm64/boot/dts/renesas/r8a779h0.dtsi | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi index 9ef61ff61740b9..63e08fc1e6a8d5 100644 --- a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi @@ -43,6 +43,7 @@ next-level-cache = <&L3_CA76>; enable-method = "psci"; cpu-idle-states = <&CPU_SLEEP_0>; + clocks = <&cpg CPG_CORE R8A779H0_CLK_ZC0>; }; a76_1: cpu@100 { @@ -53,6 +54,7 @@ next-level-cache = <&L3_CA76>; enable-method = "psci"; cpu-idle-states = <&CPU_SLEEP_0>; + clocks = <&cpg CPG_CORE R8A779H0_CLK_ZC1>; }; a76_2: cpu@200 { @@ -63,6 +65,7 @@ next-level-cache = <&L3_CA76>; enable-method = "psci"; cpu-idle-states = <&CPU_SLEEP_0>; + clocks = <&cpg CPG_CORE R8A779H0_CLK_ZC2>; }; a76_3: cpu@300 { @@ -73,6 +76,7 @@ next-level-cache = <&L3_CA76>; enable-method = "psci"; cpu-idle-states = <&CPU_SLEEP_0>; + clocks = <&cpg CPG_CORE R8A779H0_CLK_ZC3>; }; idle-states { From a80974199d10fd538d18eb27515139e494fa9a6e Mon Sep 17 00:00:00 2001 From: Duy Nguyen Date: Thu, 1 Feb 2024 15:19:20 +0100 Subject: [PATCH 0414/1406] arm64: dts: renesas: r8a779h0: Add CA76 operating points Add operating points for running the Cortex-A76 CPU cores on R-Car V4M at various speeds, up to the Normal (1.0 GHz). 
Signed-off-by: Duy Nguyen Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/736b5836ec2b54e8b36712866309dc1b7ee1fc48.1706796979.git.geert+renesas@glider.be --- arch/arm64/boot/dts/renesas/r8a779h0.dtsi | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi index 63e08fc1e6a8d5..09cf4fe97d81a1 100644 --- a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi @@ -14,6 +14,22 @@ #address-cells = <2>; #size-cells = <2>; + cluster0_opp: opp-table-0 { + compatible = "operating-points-v2"; + opp-shared; + + opp-500000000 { + opp-hz = /bits/ 64 <500000000>; + opp-microvolt = <825000>; + clock-latency-ns = <500000>; + }; + opp-1000000000 { + opp-hz = /bits/ 64 <1000000000>; + opp-microvolt = <825000>; + clock-latency-ns = <500000>; + }; + }; + cpus { #address-cells = <1>; #size-cells = <0>; @@ -44,6 +60,7 @@ enable-method = "psci"; cpu-idle-states = <&CPU_SLEEP_0>; clocks = <&cpg CPG_CORE R8A779H0_CLK_ZC0>; + operating-points-v2 = <&cluster0_opp>; }; a76_1: cpu@100 { @@ -55,6 +72,7 @@ enable-method = "psci"; cpu-idle-states = <&CPU_SLEEP_0>; clocks = <&cpg CPG_CORE R8A779H0_CLK_ZC1>; + operating-points-v2 = <&cluster0_opp>; }; a76_2: cpu@200 { @@ -66,6 +84,7 @@ enable-method = "psci"; cpu-idle-states = <&CPU_SLEEP_0>; clocks = <&cpg CPG_CORE R8A779H0_CLK_ZC2>; + operating-points-v2 = <&cluster0_opp>; }; a76_3: cpu@300 { @@ -77,6 +96,7 @@ enable-method = "psci"; cpu-idle-states = <&CPU_SLEEP_0>; clocks = <&cpg CPG_CORE R8A779H0_CLK_ZC3>; + operating-points-v2 = <&cluster0_opp>; }; idle-states { From dad3078f8695d4db88f5ecbb47fd88b69047bbf8 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Thu, 8 Feb 2024 15:56:29 +0200 Subject: [PATCH 0415/1406] arm64: dts: renesas: r9a08g045: Add PSCI support Add PSCI support to enable suspend/resume with the help of TF-A. Signed-off-by: Claudiu Beznea Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240208135629.2840932-3-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- arch/arm64/boot/dts/renesas/r9a08g045.dtsi | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r9a08g045.dtsi b/arch/arm64/boot/dts/renesas/r9a08g045.dtsi index dfee878c0f4922..19bbcae01d8087 100644 --- a/arch/arm64/boot/dts/renesas/r9a08g045.dtsi +++ b/arch/arm64/boot/dts/renesas/r9a08g045.dtsi @@ -42,6 +42,11 @@ clock-frequency = <0>; }; + psci { + compatible = "arm,psci-1.0", "arm,psci-0.2"; + method = "smc"; + }; + soc: soc { compatible = "simple-bus"; interrupt-parent = <&gic>; From 53c51d96e3c98a23bdcd52a0e5099510435bcb0a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Feb 2024 15:32:36 +0000 Subject: [PATCH 0416/1406] btrfs: stop passing root argument to btrfs_add_delalloc_inodes() There's no need to pass a root argument to btrfs_add_delalloc_inodes(), we can just pass the inode, since the root is always the one associated with the inode in the context where it's called. So remove it and have the single caller pass only the inode.
Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2d16bb08e90512..e3d12d8cf088bc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2385,10 +2385,10 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state spin_unlock(&inode->lock); } -static void btrfs_add_delalloc_inodes(struct btrfs_root *root, - struct btrfs_inode *inode) +static void btrfs_add_delalloc_inodes(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; spin_lock(&root->delalloc_lock); if (list_empty(&inode->delalloc_inodes)) { @@ -2451,7 +2451,6 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s * bit, which is only set or cleared with irqs on */ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { - struct btrfs_root *root = inode->root; u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); bool do_list = !btrfs_is_free_space_inode(inode); @@ -2472,7 +2471,7 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s inode->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags)) - btrfs_add_delalloc_inodes(root, inode); + btrfs_add_delalloc_inodes(inode); spin_unlock(&inode->lock); } From 2844cd16bc0b6c84e0a7d11701d1b287a112f738 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Feb 2024 21:55:42 +0000 Subject: [PATCH 0417/1406] btrfs: stop passing root argument to __btrfs_del_delalloc_inode() There's no need to pass a root argument to __btrfs_del_delalloc_inode() and btrfs_del_delalloc_inode(), we can just pass the inode, since the root is always the one associated with that inode. So remove the root argument from these functions.
Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/inode.c | 15 +++++++-------- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 397371472c1c5e..4d8c2c5ece0171 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -447,7 +447,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes, bool nowait, bool strict); -void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode); +void __btrfs_del_delalloc_inode(struct btrfs_inode *inode); struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4280f8e23461a3..8ab185182c30fe 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4629,7 +4629,7 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) struct inode *inode = NULL; btrfs_inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes); - __btrfs_del_delalloc_inode(root, btrfs_inode); + __btrfs_del_delalloc_inode(btrfs_inode); spin_unlock(&root->delalloc_lock); /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e3d12d8cf088bc..ec8af7d0f16643 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2406,9 +2406,9 @@ static void btrfs_add_delalloc_inodes(struct btrfs_inode *inode) spin_unlock(&root->delalloc_lock); } -void __btrfs_del_delalloc_inode(struct btrfs_root *root, - struct btrfs_inode *inode) +void __btrfs_del_delalloc_inode(struct btrfs_inode *inode) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; if (!list_empty(&inode->delalloc_inodes)) { @@ -2426,12 +2426,11 @@ void __btrfs_del_delalloc_inode(struct btrfs_root *root, } } -static void btrfs_del_delalloc_inode(struct btrfs_root *root, - struct btrfs_inode *inode) +static void btrfs_del_delalloc_inode(struct btrfs_inode *inode) { - spin_lock(&root->delalloc_lock); - __btrfs_del_delalloc_inode(root, inode); - spin_unlock(&root->delalloc_lock); + spin_lock(&inode->root->delalloc_lock); + __btrfs_del_delalloc_inode(inode); + spin_unlock(&inode->root->delalloc_lock); } /* @@ -2538,7 +2537,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, if (do_list && inode->delalloc_bytes == 0 && test_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags)) - btrfs_del_delalloc_inode(root, inode); + btrfs_del_delalloc_inode(inode); spin_unlock(&inode->lock); } From 37b6c0ea2a3ae78274cc2f4ffd3711d0262cd694 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Feb 2024 22:03:31 +0000 Subject: [PATCH 0418/1406] btrfs: assert root delalloc lock is held at __btrfs_del_delalloc_inode() This function requires the delalloc lock of the inode's root to be held, so assert it's held. 
Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ec8af7d0f16643..3a19e30676e821 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2411,6 +2411,8 @@ void __btrfs_del_delalloc_inode(struct btrfs_inode *inode) struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; + lockdep_assert_held(&root->delalloc_lock); + if (!list_empty(&inode->delalloc_inodes)) { list_del_init(&inode->delalloc_inodes); clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, From 2ad1f1a1af29e42845380ef39652dfe4301cb356 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Feb 2024 22:08:34 +0000 Subject: [PATCH 0419/1406] btrfs: rename btrfs_add_delalloc_inodes() to singular form The function btrfs_add_delalloc_inodes() adds a single inode to its root's list of delalloc inodes, so it doesn't make any sense at all for the function's name to be plural. Rename it to the singular form btrfs_add_delalloc_inode() to avoid any confusion. Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3a19e30676e821..b273fdbd63cd44 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2385,7 +2385,7 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state spin_unlock(&inode->lock); } -static void btrfs_add_delalloc_inodes(struct btrfs_inode *inode) +static void btrfs_add_delalloc_inode(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -2472,7 +2472,7 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s inode->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags)) - btrfs_add_delalloc_inodes(inode); + btrfs_add_delalloc_inode(inode); spin_unlock(&inode->lock); } From 9defc1f949c63d1acaf0a1f1d14494ef470f1d7e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 9 Feb 2024 10:37:10 +0000 Subject: [PATCH 0420/1406] btrfs: reduce inode lock critical section when setting and clearing delalloc When setting and clearing a delalloc range, at btrfs_set_delalloc_extent() and btrfs_clear_delalloc_extent(), we are adding/removing the inode to/from the root's list of delalloc inodes while under the protection of the inode's lock. This, however, is not needed: we can add the inode to and remove it from the root's list without holding the inode's lock, because here we are under the protection of the io tree's lock, reducing the size of the critical section delimited by the inode's lock. The inode's lock is used in many other places such as when finishing an ordered extent (when calling btrfs_update_inode_bytes() or btrfs_delalloc_release_metadata(), or decreasing the number of outstanding extents) or when reserving space when doing a buffered or direct IO write (calls to functions from delalloc-space.c). So move the add/remove operations on the root's list of delalloc inodes outside the critical section delimited by the inode's lock. This also allows us to get rid of the BTRFS_INODE_IN_DELALLOC_LIST flag since we can rely on the inode's delalloc bytes counter to determine if the inode is or is not in the list.
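The locking pattern being introduced can be sketched in isolation (hypothetical type and helper names, not the actual btrfs code): sample the counter inside the narrow spinlock and act on the sample outside it, relying on the caller-held io_tree lock to serialize the set and clear paths against each other:

    /* Hypothetical sketch of the pattern, not the btrfs implementation. */
    struct my_inode {
            spinlock_t lock;
            u64 delalloc_bytes;
    };

    void add_to_root_list(struct my_inode *inode);  /* hypothetical helper */

    /* Caller must hold the io_tree lock, which serializes this function
     * against its clearing counterpart. */
    static void set_delalloc_sketch(struct my_inode *inode, u64 len)
    {
            u64 prev_bytes;

            spin_lock(&inode->lock);        /* narrow critical section */
            prev_bytes = inode->delalloc_bytes;
            inode->delalloc_bytes += len;
            spin_unlock(&inode->lock);

            /* Safe outside inode->lock: the io_tree lock guarantees this
             * 0 -> nonzero transition cannot race with a concurrent clear. */
            if (prev_bytes == 0)
                    add_to_root_list(inode);
    }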
The following fio-based test, which exercises IO to multiple files in the same subvolume, was used for testing: $ cat test.sh #!/bin/bash DEV=/dev/nullb0 MNT=/mnt/nullb0 MOUNT_OPTIONS="-o ssd" mkfs.btrfs -f $DEV &> /dev/null mount $MOUNT_OPTIONS $DEV $MNT fio --direct=0 --ioengine=sync --thread --directory=$MNT \ --invalidate=1 --group_reporting=1 \ --new_group --rw=randwrite --size=50m --numjobs=200 \ --bs=4k --fsync_on_close=0 --fallocate=none --end_fsync=0 \ --name=foo --filename_format=FioWorkloads.\$jobnum umount $MNT The test was run on a non-debug kernel (Debian's default kernel config) against a 16G null block device. Result before this patch: WRITE: bw=81.9MiB/s (85.9MB/s), 81.9MiB/s-81.9MiB/s (85.9MB/s-85.9MB/s), io=9.77GiB (10.5GB), run=122136-122136msec Result after this patch: WRITE: bw=86.8MiB/s (91.0MB/s), 86.8MiB/s-86.8MiB/s (91.0MB/s-91.0MB/s), io=9.77GiB (10.5GB), run=115180-115180msec Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 1 - fs/btrfs/inode.c | 60 ++++++++++++++++++++++++++++-------------- 2 files changed, 40 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 4d8c2c5ece0171..a35adc06ce0c6f 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -60,7 +60,6 @@ enum { */ BTRFS_INODE_NEEDS_FULL_SYNC, BTRFS_INODE_COPY_EVERYTHING, - BTRFS_INODE_IN_DELALLOC_LIST, BTRFS_INODE_HAS_PROPS, BTRFS_INODE_SNAPSHOT_FLUSH, /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b273fdbd63cd44..778bb6754e00fa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2391,17 +2391,14 @@ static void btrfs_add_delalloc_inode(struct btrfs_inode *inode) struct btrfs_fs_info *fs_info = root->fs_info; spin_lock(&root->delalloc_lock); - if (list_empty(&inode->delalloc_inodes)) { - list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); - set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags); - root->nr_delalloc_inodes++; - if (root->nr_delalloc_inodes == 1) { - spin_lock(&fs_info->delalloc_root_lock); - BUG_ON(!list_empty(&root->delalloc_root)); - list_add_tail(&root->delalloc_root, - &fs_info->delalloc_roots); - spin_unlock(&fs_info->delalloc_root_lock); - } + ASSERT(list_empty(&inode->delalloc_inodes)); + list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); + root->nr_delalloc_inodes++; + if (root->nr_delalloc_inodes == 1) { + spin_lock(&fs_info->delalloc_root_lock); + BUG_ON(!list_empty(&root->delalloc_root)); + list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots); + spin_unlock(&fs_info->delalloc_root_lock); } spin_unlock(&root->delalloc_lock); } @@ -2413,10 +2410,14 @@ void __btrfs_del_delalloc_inode(struct btrfs_inode *inode) lockdep_assert_held(&root->delalloc_lock); + /* + * We may be called after the inode was already deleted from the list, + * namely in the transaction abort path btrfs_destroy_delalloc_inodes(), + * and then later through btrfs_clear_delalloc_extent() while the inode + * still has ->delalloc_bytes > 0.
+ */ if (!list_empty(&inode->delalloc_inodes)) { list_del_init(&inode->delalloc_inodes); - clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &inode->runtime_flags); root->nr_delalloc_inodes--; if (!root->nr_delalloc_inodes) { ASSERT(list_empty(&root->delalloc_inodes)); @@ -2444,6 +2445,8 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s { struct btrfs_fs_info *fs_info = inode->root->fs_info; + lockdep_assert_held(&inode->io_tree.lock); + if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) WARN_ON(1); /* @@ -2453,6 +2456,7 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s */ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { u64 len = state->end + 1 - state->start; + u64 prev_delalloc_bytes; u32 num_extents = count_max_extents(fs_info, len); bool do_list = !btrfs_is_free_space_inode(inode); @@ -2467,13 +2471,20 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s percpu_counter_add_batch(&fs_info->delalloc_bytes, len, fs_info->delalloc_batch); spin_lock(&inode->lock); + prev_delalloc_bytes = inode->delalloc_bytes; inode->delalloc_bytes += len; if (bits & EXTENT_DEFRAG) inode->defrag_bytes += len; - if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &inode->runtime_flags)) - btrfs_add_delalloc_inode(inode); spin_unlock(&inode->lock); + + /* + * We don't need to be under the protection of the inode's lock, + * because we are called while holding the inode's io_tree lock + * and are therefore protected against concurrent calls of this + * function and btrfs_clear_delalloc_extent(). + */ + if (do_list && prev_delalloc_bytes == 0) + btrfs_add_delalloc_inode(inode); } if (!(state->state & EXTENT_DELALLOC_NEW) && @@ -2495,6 +2506,8 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); + lockdep_assert_held(&inode->io_tree.lock); + if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) { spin_lock(&inode->lock); inode->defrag_bytes -= len; @@ -2508,6 +2521,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, */ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = inode->root; + u64 new_delalloc_bytes; bool do_list = !btrfs_is_free_space_inode(inode); spin_lock(&inode->lock); @@ -2536,11 +2550,17 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, fs_info->delalloc_batch); spin_lock(&inode->lock); inode->delalloc_bytes -= len; - if (do_list && inode->delalloc_bytes == 0 && - test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &inode->runtime_flags)) - btrfs_del_delalloc_inode(inode); + new_delalloc_bytes = inode->delalloc_bytes; spin_unlock(&inode->lock); + + /* + * We don't need to be under the protection of the inode's lock, + * because we are called while holding the inode's io_tree lock + * and are therefore protected against concurrent calls of this + * function and btrfs_set_delalloc_extent(). 
+ */ + if (do_list && new_delalloc_bytes == 0) + btrfs_del_delalloc_inode(inode); } if ((state->state & EXTENT_DELALLOC_NEW) && From db31da27dde801f577fa30603c1c254107a27514 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 9 Feb 2024 12:19:55 +0000 Subject: [PATCH 0421/1406] btrfs: add lockdep assertion to remaining delalloc callbacks The merge and split callbacks for an inode's io tree are supposed to be called while the io tree's spinlock is being held, so that the given extent_state records are stable, not modified or freed while the callbacks are using them. So add lockdep assertions in the callbacks. Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 778bb6754e00fa..c7a5fb1f8b3eef 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2300,6 +2300,8 @@ void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 size; + lockdep_assert_held(&inode->io_tree.lock); + /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) return; @@ -2338,6 +2340,8 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state u64 new_size, old_size; u32 num_extents; + lockdep_assert_held(&inode->io_tree.lock); + /* not delalloc, ignore it */ if (!(other->state & EXTENT_DELALLOC)) return; From 2a91ac8a51e7387eb325e3e0dc315fb725359817 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 9 Feb 2024 12:25:43 +0000 Subject: [PATCH 0422/1406] btrfs: use assertion instead of BUG_ON when adding/removing to delalloc list When adding or removing an inode to/from the root's delalloc list, instead of using a BUG_ON() to validate list emptiness, use ASSERT() since this is to check logic errors rather than real errors. Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c7a5fb1f8b3eef..fe962a6045fde0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2400,7 +2400,7 @@ static void btrfs_add_delalloc_inode(struct btrfs_inode *inode) root->nr_delalloc_inodes++; if (root->nr_delalloc_inodes == 1) { spin_lock(&fs_info->delalloc_root_lock); - BUG_ON(!list_empty(&root->delalloc_root)); + ASSERT(list_empty(&root->delalloc_root)); list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); } @@ -2426,7 +2426,7 @@ void __btrfs_del_delalloc_inode(struct btrfs_inode *inode) if (!root->nr_delalloc_inodes) { ASSERT(list_empty(&root->delalloc_inodes)); spin_lock(&fs_info->delalloc_root_lock); - BUG_ON(list_empty(&root->delalloc_root)); + ASSERT(!list_empty(&root->delalloc_root)); list_del_init(&root->delalloc_root); spin_unlock(&fs_info->delalloc_root_lock); } From 2d330b3b3060a9fe59e393873c377bf222f1f1bb Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 9 Feb 2024 12:35:20 +0000 Subject: [PATCH 0423/1406] btrfs: remove do_list variable at btrfs_set_delalloc_extent() The "do_list" variable is only used once, plus its name/meaning is a bit confusing, so remove it and directly use btrfs_is_free_space_inode().
Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fe962a6045fde0..17b6ab71584ae2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2462,7 +2462,6 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s u64 len = state->end + 1 - state->start; u64 prev_delalloc_bytes; u32 num_extents = count_max_extents(fs_info, len); - bool do_list = !btrfs_is_free_space_inode(inode); spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, num_extents); @@ -2487,7 +2486,7 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s * and are therefore protected against concurrent calls of this * function and btrfs_clear_delalloc_extent(). */ - if (do_list && prev_delalloc_bytes == 0) + if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0) btrfs_add_delalloc_inode(inode); } From ba3f88c3ab16d044cba4223e9735bf85b34d561d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 9 Feb 2024 12:42:28 +0000 Subject: [PATCH 0424/1406] btrfs: remove do_list variable at btrfs_clear_delalloc_extent() The "do_list" variable has a rather confusing name, so remove it and directly use btrfs_is_free_space_inode() instead. Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 17b6ab71584ae2..5bb61d4aa2cbab 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2525,7 +2525,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = inode->root; u64 new_delalloc_bytes; - bool do_list = !btrfs_is_free_space_inode(inode); spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, -num_extents); @@ -2545,7 +2544,8 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, return; if (!btrfs_is_data_reloc_root(root) && - do_list && !(state->state & EXTENT_NORESERVE) && + !btrfs_is_free_space_inode(inode) && + !(state->state & EXTENT_NORESERVE) && (bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(fs_info, len); @@ -2562,7 +2562,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, * and are therefore protected against concurrent calls of this * function and btrfs_set_delalloc_extent(). */ - if (do_list && new_delalloc_bytes == 0) + if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) btrfs_del_delalloc_inode(inode); } From 1866c6061622406eedbb691f69c8908fad967381 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 12 Feb 2024 21:50:53 +0000 Subject: [PATCH 0425/1406] btrfs: zoned: fix chunk map leak when loading block group zone info At btrfs_load_block_group_zone_info() we never drop a reference on the chunk map we have looked up, therefore leaking a reference on it. So add the missing btrfs_free_chunk_map() at the end of the function. 
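Note the pairing this fix restores: the lookup takes a reference on the chunk map, and every exit path must drop it. A hedged sketch of the idiom (the lookup helper and its signature follow the commit referenced in the Fixes tag below; exact details may differ):

/* Sketch: a chunk map lookup takes a reference that must be dropped. */
struct btrfs_chunk_map *map;

map = btrfs_find_chunk_map(fs_info, cache->start, cache->length);
if (!map)
        return -EINVAL;
/* ... load the block group's zone info from the map ... */
btrfs_free_chunk_map(map);      /* drop the reference taken by the lookup */
return ret;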
Fixes: 7dc66abb5a47 ("btrfs: use a dedicated data structure for chunk maps") Reported-by: Johannes Thumshirn Reviewed-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index d9716456bce03a..46537d606dc37d 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1668,6 +1668,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } bitmap_free(active); kfree(zone_info); + btrfs_free_chunk_map(map); return ret; } From 2fefeb0003e987680349844ef9fdcf9b712c517b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 2 Feb 2024 14:32:17 +0000 Subject: [PATCH 0426/1406] btrfs: don't refill whole delayed refs block reserve when starting transaction Since commit 28270e25c69a ("btrfs: always reserve space for delayed refs when starting transaction") we started not only to reserve metadata space for the delayed refs a caller of btrfs_start_transaction() might generate but also to try to fully refill the delayed refs block reserve, because there are several cases where we generate delayed refs and haven't reserved space for them, relying on the global block reserve. Relying too much on the global block reserve is not always safe, and can result in hitting -ENOSPC during transaction commits or worse, in rare cases, being unable to mount a filesystem that needs to do orphan cleanup or anything that requires modifying the filesystem during mount, and has no more unallocated space and the metadata space is nearly full. This was explained in detail in that commit's change log. However the gap between the reserved amount and the size of the delayed refs block reserve can be huge, so attempting to reserve space for such a gap can result in allocating many metadata block groups that end up not being used. After a recent patch, with the subject "btrfs: add new unused block groups to the list of unused block groups", we started to add new block groups that are unused to the list of unused block groups, to avoid having them around for a very long time in case they are never used, because a block group is only added to the list of unused block groups when we deallocate the last extent or when mounting the filesystem and the block group has 0 bytes used. This is not a problem introduced by the commit mentioned earlier; it always existed, as our metadata space reservations are, most of the time, pessimistic and end up not using all the space they reserved, so we can occasionally end up with one or two unused metadata block groups for a long period. However after that commit mentioned earlier, we are just more pessimistic in the metadata space reservations when starting a transaction and therefore the issue is more likely to happen. This however is not always enough because we might create unused metadata block groups when reserving metadata space at a high rate if there's always a gap in the delayed refs block reserve and the cleaner kthread isn't triggered often enough or is busy with other work (running delayed iputs, cleaning deleted roots, etc), not to mention the block group's allocated space is only usable for a new block group after the transaction used to remove it is committed.
A user reported that he was getting a lot of allocated metadata block groups but that the usage percentage of metadata space was very low compared to the total allocated space, especially after running a series of block group relocations. So for now stop trying to refill the gap in the delayed refs block reserve and reserve space only for the delayed refs we are expected to generate when starting a transaction. CC: stable@vger.kernel.org # 6.7+ Reported-by: Ivan Shapovalov Link: https://lore.kernel.org/linux-btrfs/9cdbf0ca9cdda1b4c84e15e548af7d7f9f926382.camel@intelfx.name/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H6802ayLHUJFztzZAVzBLJAGdFx=6FHNNy87+obZXXZpQ@mail.gmail.com/ Tested-by: Ivan Shapovalov Reported-by: Heddxh Link: https://lore.kernel.org/linux-btrfs/CAE93xANEby6RezOD=zcofENYZOT-wpYygJyauyUAZkLv6XVFOA@mail.gmail.com/ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 38 ++------------------------------------ 1 file changed, 2 insertions(+), 36 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 70d7abd1f772f1..3575b2bf3042a9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -562,56 +562,22 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 *delayed_refs_bytes) { - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info; - u64 extra_delayed_refs_bytes = 0; - u64 bytes; + u64 bytes = num_bytes + *delayed_refs_bytes; int ret; - /* - * If there's a gap between the size of the delayed refs reserve and - * its reserved space, than some tasks have added delayed refs or bumped - * its size otherwise (due to block group creation or removal, or block - * group item update). Also try to allocate that gap in order to prevent - * using (and possibly abusing) the global reserve when committing the - * transaction. - */ - if (flush == BTRFS_RESERVE_FLUSH_ALL && - !btrfs_block_rsv_full(delayed_refs_rsv)) { - spin_lock(&delayed_refs_rsv->lock); - if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) - extra_delayed_refs_bytes = delayed_refs_rsv->size - - delayed_refs_rsv->reserved; - spin_unlock(&delayed_refs_rsv->lock); - } - - bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes; - /* * We want to reserve all the bytes we may need all at once, so we only * do 1 enospc flushing cycle per transaction start. */ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); - if (ret == 0) { - if (extra_delayed_refs_bytes > 0) - btrfs_migrate_to_delayed_refs_rsv(fs_info, - extra_delayed_refs_bytes); - return 0; - } - - if (extra_delayed_refs_bytes > 0) { - bytes -= extra_delayed_refs_bytes; - ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); - if (ret == 0) - return 0; - } /* * If we are an emergency flush, which can steal from the global block * reserve, then attempt to not reserve space for the delayed refs, as * we will consume space for them from the global block reserve.
*/ - if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { + if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { bytes -= *delayed_refs_bytes; *delayed_refs_bytes = 0; ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); From 75a0b92805e5cdb53dab4978c8c77846a2f2ed42 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 24 Jan 2024 17:26:25 +0100 Subject: [PATCH 0427/1406] btrfs: push errors up from add_async_extent() The memory allocation error in add_async_extent() is not handled properly: return an error and push the BUG_ON up to the caller. Handling it there is not trivial so at least make it visible. Signed-off-by: David Sterba --- fs/btrfs/inode.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5bb61d4aa2cbab..366f4f8bec0c59 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -738,7 +738,8 @@ static noinline int add_async_extent(struct async_chunk *cow, struct async_extent *async_extent; async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); - BUG_ON(!async_extent); /* -ENOMEM */ + if (!async_extent) + return -ENOMEM; async_extent->start = start; async_extent->ram_size = ram_size; async_extent->compressed_size = compressed_size; @@ -1025,8 +1026,9 @@ static void compress_file_range(struct btrfs_work *work) * The async work queues will take care of doing actual allocation on * disk for these compressed pages, and will submit the bios. */ - add_async_extent(async_chunk, start, total_in, total_compressed, pages, - nr_pages, compress_type); + ret = add_async_extent(async_chunk, start, total_in, total_compressed, pages, + nr_pages, compress_type); + BUG_ON(ret); if (start + total_in < end) { start += total_in; cond_resched(); @@ -1038,8 +1040,9 @@ static void compress_file_range(struct btrfs_work *work) if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress) inode->flags |= BTRFS_INODE_NOCOMPRESS; cleanup_and_bail_uncompressed: - add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, - BTRFS_COMPRESS_NONE); + ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, + BTRFS_COMPRESS_NONE); + BUG_ON(ret); free_pages: if (pages) { for (i = 0; i < nr_pages; i++) { From f8470f84c2b31c050f1afa3c1c1b8871bf130403 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 24 Jan 2024 22:29:46 +0100 Subject: [PATCH 0428/1406] btrfs: update comment and drop assertion in extent item lookup in find_parent_nodes() The same comment was added for this type of error elsewhere; unify that and drop the assertion, as we'd find out quickly that something is wrong after returning -EUCLEAN. Signed-off-by: David Sterba --- fs/btrfs/backref.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index beed7e459dabde..0fa27ed802f6fb 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1435,8 +1435,10 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, if (ret < 0) goto out; if (ret == 0) { - /* This shouldn't happen, indicates a bug or fs corruption. */ - ASSERT(ret != 0); + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range.
+ */ ret = -EUCLEAN; goto out; } From 42b7b25c9d8b815191c796333d8525d6154be6ad Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 24 Jan 2024 22:41:01 +0100 Subject: [PATCH 0429/1406] btrfs: handle invalid extent item reference found in extent_from_logical() The extent_from_logical() helper looks up an extent item by a key, allowing an inexact search when key->offset is -1. It's never expected to find such an item, as it would break the allowed range of an extent item offset. The same error is already handled in btrfs_backref_iter_start() so add a comment for consistency. Signed-off-by: David Sterba --- fs/btrfs/backref.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 0fa27ed802f6fb..6ba743ddfe211c 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2227,6 +2227,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ + return -EUCLEAN; + } ret = btrfs_previous_extent_item(extent_root, path, 0); if (ret) { @@ -2870,6 +2877,10 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) if (ret < 0) return ret; if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ ret = -EUCLEAN; goto release; } From 549476940e9d23b06754fe38f84cf96d5d6cf03c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 24 Jan 2024 22:49:02 +0100 Subject: [PATCH 0430/1406] btrfs: handle invalid extent item reference found in find_first_extent_item() The find_first_extent_item() helper looks up an extent item by a key, allowing an inexact search when key->offset is -1. It's never expected to find such an item, as it would break the allowed range of an extent item offset. Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0123d272892373..c4bd0e60db5925 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1390,8 +1390,15 @@ static int find_first_extent_item(struct btrfs_root *extent_root, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ + btrfs_release_path(path); + return -EUCLEAN; + } - ASSERT(ret > 0); /* * Here we intentionally pass 0 as @min_objectid, as there could be * an extent item starting before @search_start. From a5ee5fe06d7e7d04bd2030025e8f6d98b7c1738a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 24 Jan 2024 22:58:01 +0100 Subject: [PATCH 0431/1406] btrfs: handle invalid root reference found in may_destroy_subvol() may_destroy_subvol() looks up a root by a key, allowing an inexact search when key->offset is -1. It's never expected to find such an item, as it would break the allowed range of a root id.
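All of these fixes lean on the btrfs_search_slot() return convention: a negative value is a real error, 0 means the exact key was found, and a positive value means the key was not found and the path points at the position where it would be inserted. Since no item can legally have offset (u64)-1, an exact match signals corruption. A minimal sketch of the pattern, with the key type and surrounding declarations as in the diffs above:

key.objectid = objectid;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;   /* inexact search: an exact match must not exist */

ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
if (ret < 0)
        goto out;       /* real error, e.g. -EIO */
if (ret == 0) {
        ret = -EUCLEAN; /* impossible exact match: corrupted fs */
        goto out;
}
ret = 0;                /* ret > 0: positioned where the key would go */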
Signed-off-by: David Sterba --- fs/btrfs/inode.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 366f4f8bec0c59..c9e14bd96a7b5a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4412,7 +4412,14 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret == 0); + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist a root + * with such id, but this is out of valid range. + */ + ret = -EUCLEAN; + goto out; + } ret = 0; if (path->slots[0] > 0) { From 80e3afb5b2c164d04bb95b779b65e0d930151da2 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 6 Feb 2024 22:47:13 +0100 Subject: [PATCH 0432/1406] btrfs: send: handle unexpected data in header buffer in begin_cmd() Change the BUG_ON to proper error handling in the unlikely case of seeing data when the command is started. This is supposed to be reset when the command is finished (send_cmd, send_encoded_extent). Signed-off-by: David Sterba --- fs/btrfs/send.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 14ea3085073905..1bff7b3008accc 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -776,7 +776,12 @@ static int begin_cmd(struct send_ctx *sctx, int cmd) if (WARN_ON(!sctx->send_buf)) return -EINVAL; - BUG_ON(sctx->send_size); + if (unlikely(sctx->send_size != 0)) { + btrfs_err(sctx->send_root->fs_info, + "send: command header buffer not empty cmd %d offset %llu", + cmd, sctx->send_off); + return -EINVAL; + } sctx->send_size += sizeof(*hdr); hdr = (struct btrfs_cmd_header *)sctx->send_buf; From b485fa713fb70236cc2d0edfa605cb5779d76a68 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 6 Feb 2024 22:47:13 +0100 Subject: [PATCH 0433/1406] btrfs: send: handle unexpected inode in header process_recorded_refs() Change the BUG_ON to proper error handling when an unexpected inode number is encountered. As the comment says, this should never happen. Signed-off-by: David Sterba --- fs/btrfs/send.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 1bff7b3008accc..778c2da1c9dd23 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4186,7 +4186,13 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * This should never happen as the root dir always has the same ref * which is always '..' */ - BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); + if (unlikely(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID)) { + btrfs_err(fs_info, + "send: unexpected inode %llu in process_recorded_refs()", + sctx->cur_ino); + ret = -EINVAL; + goto out; + } valid_path = fs_path_alloc(); if (!valid_path) { From 41fc58f6b3c2add0c985079a684433ba3010adba Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 6 Feb 2024 22:47:13 +0100 Subject: [PATCH 0434/1406] btrfs: send: handle path ref underflow in header iterate_inode_ref() Change the BUG_ON to proper error handling if building the path buffer fails. The pointers are not printed so we don't accidentally leak kernel addresses.
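The three send fixes share one shape: a condition that used to crash the kernel now reports the inconsistency and fails only the ioctl. Distilled into a sketch, with a hypothetical check_invariant() standing in for the specific conditions:

/* Before: a logic error takes down the whole machine. */
BUG_ON(!check_invariant(sctx));

/* After: log (without kernel pointers) and fail only this operation. */
if (unlikely(!check_invariant(sctx))) {
        btrfs_err(fs_info, "send: invariant violated for inode %llu",
                  sctx->cur_ino);
        return -EINVAL;
}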
Signed-off-by: David Sterba --- fs/btrfs/send.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 778c2da1c9dd23..7a601de7fa7cf8 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1074,7 +1074,15 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, ret = PTR_ERR(start); goto out; } - BUG_ON(start < p->buf); + if (unlikely(start < p->buf)) { + btrfs_err(root->fs_info, + "send: path ref buffer underflow for key (%llu %u %llu)", + found_key->objectid, + found_key->type, + found_key->offset); + ret = -EINVAL; + goto out; + } } p->start = start; } else { From d4a8ebfafcbc99ba059f9ae0de40188c0ae895c5 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 6 Feb 2024 23:06:46 +0100 Subject: [PATCH 0435/1406] btrfs: change BUG_ON to assertion in tree_move_down() tree_move_down() has only one caller and it never passes level 0, so the assertion is better suited here. Signed-off-by: David Sterba --- fs/btrfs/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7a601de7fa7cf8..e96d511f9dd92a 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7438,8 +7438,8 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen u64 reada_done = 0; lockdep_assert_held_read(&parent->fs_info->commit_root_sem); + ASSERT(*level != 0); - BUG_ON(*level == 0); eb = btrfs_read_node_slot(parent, slot); if (IS_ERR(eb)) return PTR_ERR(eb); From 62756b9727fd000246bbc90bb6bf6e0959ecdd71 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 6 Feb 2024 23:06:46 +0100 Subject: [PATCH 0436/1406] btrfs: change BUG_ONs to assertions in btrfs_qgroup_trace_subtree() do_walk_down(), the only caller of btrfs_qgroup_trace_subtree(), validates the value of level and uses it several times before it's passed as an argument. The same holds for root_eb, which is called 'next' in the caller. Change both BUG_ONs to assertions, as they assure proper interface use rather than catch real errors. Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5470e1cdf10c53..cfe366110a6949 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2505,8 +2505,8 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *eb = root_eb; struct btrfs_path *path = NULL; - BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL); - BUG_ON(root_eb == NULL); + ASSERT(0 <= root_level && root_level < BTRFS_MAX_LEVEL); + ASSERT(root_eb != NULL); if (!btrfs_qgroup_full_accounting(fs_info)) return 0; From e8df4c1ea271985a14d8ade91aa1929bccef5a48 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 6 Feb 2024 23:20:53 +0100 Subject: [PATCH 0437/1406] btrfs: delete pointless BUG_ON check on quota root in btrfs_qgroup_account_extent() The BUG_ON is deep in the qgroup code, where we can expect that the quota root exists; a NULL pointer would cause a crash. It was added long ago in 550d7a2ed5db35 ("btrfs: qgroup: Add new qgroup calculation function btrfs_qgroup_account_extents()."). It may have made sense back then, as the quota enable/disable state machine was not as robust as it is nowadays, so we can just delete it.
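The BUG_ON versus ASSERT distinction driving this group of patches: in btrfs, ASSERT() compiles to a real check only when CONFIG_BTRFS_ASSERT is enabled (typical for development builds) and is meant to catch interface misuse, while BUG_ON() is always compiled in and halts the kernel. Roughly, paraphrased rather than the verbatim macro from the btrfs headers:

#ifdef CONFIG_BTRFS_ASSERT
#define ASSERT(expr)                                            \
        do {                                                    \
                if (unlikely(!(expr)))                          \
                        btrfs_assertfail(#expr, __FILE__, __LINE__); \
        } while (0)
#else
#define ASSERT(expr)    (void)(expr)
#endif

So converting a BUG_ON() to ASSERT() says: this condition is a developer-facing contract, not something production kernels should pay for or die on.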
Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index cfe366110a6949..044331228bd048 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2861,8 +2861,6 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (nr_old_roots == 0 && nr_new_roots == 0) goto out_free; - BUG_ON(!fs_info->quota_root); - trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, num_bytes, nr_old_roots, nr_new_roots); From ce05bcedfb117c4fc73c40818530318aa23a261d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 6 Feb 2024 23:20:53 +0100 Subject: [PATCH 0438/1406] btrfs: delete pointless BUG_ONs on extent item size Checking the extent item size in add_inline_refs() is redundant; we already do that in the tree-checker after reading the extent buffer, and it won't change under normal circumstances. It was added long ago in 8da6d5815c592b ("Btrfs: added btrfs_find_all_roots()") and does not seem to have a clear purpose. Similar case in extent_from_logical(), added in a542ad1bafc7df ("btrfs: added helper functions to iterate backrefs"). Signed-off-by: David Sterba --- fs/btrfs/backref.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 6ba743ddfe211c..fe05e2f55bf7bc 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1036,8 +1036,6 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, slot = path->slots[0]; item_size = btrfs_item_size(leaf, slot); - BUG_ON(item_size < sizeof(*ei)); - ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); if (ctx->check_extent_item) { @@ -2256,7 +2254,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, eb = path->nodes[0]; item_size = btrfs_item_size(eb, path->slots[0]); - BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); flags = btrfs_extent_flags(eb, ei); From f5a9fa96c6a49bfe8a994c1a736b6b412c804af7 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 7 Feb 2024 03:24:06 +0100 Subject: [PATCH 0439/1406] btrfs: delete BUG_ON in btrfs_init_locked_inode() The purpose of the BUG_ON is not clear. The helper btrfs_grab_root() can return NULL if args->root is NULL or if its reference count is zero. Then we check if the root pointer stored in the inode still exists. The whole call chain is for iget: btrfs_iget btrfs_iget_path btrfs_iget_locked iget5_locked btrfs_init_locked_inode which is called from many contexts where the root pointer is used and we can safely assume it has enough references.
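For context, btrfs_grab_root() is a conditional reference grab; its behavior, paraphrased from the kernel source (details may differ slightly), is:

struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
{
        /* Returns NULL if root is NULL or its refcount already hit zero. */
        if (root && refcount_inc_not_zero(&root->refs))
                return root;
        return NULL;
}

In the iget path the caller already holds a referenced root, so the NULL case the BUG_ON guarded against cannot occur in practice, which is why the check can simply go.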
Signed-off-by: David Sterba --- fs/btrfs/inode.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c9e14bd96a7b5a..f93cb23ae1ee40 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5571,7 +5571,6 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; BTRFS_I(inode)->location.offset = 0; BTRFS_I(inode)->root = btrfs_grab_root(args->root); - BUG_ON(args->root && !BTRFS_I(inode)->root); if (args->root && args->root == args->root->fs_info->tree_root && args->ino != BTRFS_BTREE_INODE_OBJECTID) From 9b130d5c50cf26e901942cd732a1c898f0c863ae Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 5 Jan 2024 16:27:01 +1030 Subject: [PATCH 0440/1406] btrfs: prefer to allocate larger folio for metadata With all the migration (including the previous ones which are only detected through this patch), we can finally enable larger folio support for metadata. For btrfs metadata, the high order folios are only utilized in the following way: - The extent buffer start is aligned to nodesize This should be the common case for any btrfs in the last 5 years. - The nodesize is larger than page size Or there is no need to use larger folios at all. - MM layer can fulfill our request without retry If we're going to retry, it's better to just fall back to per-page allocation. This also helps us to expose some corner cases mentioned below. - The larger folio must exactly cover the extent buffer No larger, no smaller, it must be an exact fit. This is to make extent buffer accessors much easier. They only need to check the first slot in eb->folios[] to determine their access unit (per-page handling or a large folio covering the whole eb). There is another small blockage: filemap APIs cannot guarantee the folio size. For example, by default we use a 16K nodesize on x86_64, meaning the larger folio we expect would be order 2 (size 16K). We can't accept two order-1 (size 8K) folios, but we can accept four order-0 (size 4K) folios. So here we use a different workaround: allocate an order-2 folio first, then attach it to the filemap of the metadata inode. This leads to several cases, all addressed inside attach_eb_folio_to_filemap(): 1) We can attach the pre-allocated eb folio to the filemap This is the simplest and hottest path; we just continue our work setting up the extent buffer. 2) There is an existing folio in the filemap 2.0) Subpage case We reuse the folio no matter what; subpage handles folio->private differently (a bitmap rather than a pointer to an existing eb). 2.1) There is already a live extent buffer attached to the filemap folio This should be a more or less hot path; we grab the existing eb and free the current one. 2.2) No live eb. 2.2.1) The filemap folio is larger than the eb folio This is a better case; we can reuse the filemap folio, but we need to clean up all the pre-allocated folios of the new eb before reusing it. Later code should take the folio size change into consideration. 2.2.2) The filemap folio is the same size as the eb folio We just free the current folio and reuse the filemap one. No other special handling needed. 2.2.3) The filemap folio is smaller than the eb folio This is the trickiest corner case: we cannot easily replace the folio in the filemap with our eb folio. Thus here we return -EAGAIN to inform our caller to retry with order 0 (of course with our larger folio freed).
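The allocation policy described above (try the big folio opportunistically, never warn, never retry, fall back to single pages) can be sketched as a standalone helper; this is illustrative only, the real logic lives in alloc_eb_folio_array() in the diff below:

#include <linux/gfp.h>
#include <linux/mm.h>

/* Sketch: opportunistic high-order allocation with graceful fallback. */
static struct folio *alloc_metadata_folio(int order)
{
        struct folio *folio;

        if (order) {
                /* No warning on failure, no extra retry loops. */
                folio = folio_alloc(GFP_NOFS | __GFP_NOWARN | __GFP_NORETRY,
                                    order);
                if (folio)
                        return folio;   /* one physically contiguous chunk */
        }
        return folio_alloc(GFP_NOFS, 0);        /* fall back to one page */
}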
Otherwise all the needed infrastructure is already here; we only need to try to allocate a larger folio first in alloc_eb_folio_array(). Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 152 ++++++++++++++++++++++++++++++------------- 1 file changed, 107 insertions(+), 45 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 197b9f50e75cfa..d02b7672906e50 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -715,12 +715,32 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, * * For now, the folios populated are always in order 0 (aka, single page). */ -static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp) +static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp, + int order) { struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 }; int num_pages = num_extent_pages(eb); int ret; + if (order) { + /* + * For higher order folio allocation, we discard the extra_gfp + * (should only be __GFP_NOFAIL, and conflicts with higher order + * folio). + * + * Instead we want no warning when allocation failed, and no + * extra retry (to get a faster allocation). + * As we're completely fine to fall back to lower order. + */ + eb->folios[0] = folio_alloc(GFP_NOFS | __GFP_NOWARN | + __GFP_NORETRY, order); + if (eb->folios[0]) { + eb->folio_size = folio_size(eb->folios[0]); + eb->folio_shift = folio_shift(eb->folios[0]); + return 0; + } + /* Fallback to 0 order (single page) folios. */ + } ret = btrfs_alloc_page_array(num_pages, page_array, extra_gfp); if (ret < 0) return ret; @@ -3242,7 +3262,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) */ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); - ret = alloc_eb_folio_array(new, 0); + ret = alloc_eb_folio_array(new, 0, folio_order(src->folios[0])); if (ret) { btrfs_release_extent_buffer(new); return NULL; } @@ -3276,7 +3296,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, if (!eb) return NULL; - ret = alloc_eb_folio_array(eb, 0); + ret = alloc_eb_folio_array(eb, 0, 0); if (ret) goto err; @@ -3489,6 +3509,18 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) return 0; } +/* + * A helper to free all eb folios, should only be utilized in eb allocation + * path where we know all the folios are safe to be dropped. + */ +static void free_all_eb_folios(struct extent_buffer *eb) +{ + for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) { + if (eb->folios[i]) + folio_put(eb->folios[i]); + eb->folios[i] = NULL; + } +} /* * Return 0 if eb->folios[i] is attached to btree inode successfully. @@ -3505,7 +3537,10 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, struct btrfs_fs_info *fs_info = eb->fs_info; struct address_space *mapping = fs_info->btree_inode->i_mapping; const unsigned long index = eb->start >> PAGE_SHIFT; + struct extent_buffer *existing_eb; struct folio *existing_folio; + int eb_order = folio_order(eb->folios[0]); + int existing_order; int ret; ASSERT(found_eb_ret); @@ -3524,37 +3559,63 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, if (IS_ERR(existing_folio)) goto retry; - /* For now, we should only have single-page folios for btree inode. */ - ASSERT(folio_nr_pages(existing_folio) == 1); + existing_order = folio_order(existing_folio); + if (fs_info->nodesize < PAGE_SIZE) { + /* + * We're going to reuse the existing page, can drop our page + * and subpage structure now.
+ */ + folio_put(eb->folios[i]); + eb->folios[i] = existing_folio; + return 0; + } - if (folio_size(existing_folio) != eb->folio_size) { + /* Non-subpage case, try if we can grab the eb from the existing folio. */ + existing_eb = grab_extent_buffer(fs_info, + folio_page(existing_folio, 0)); + if (existing_eb) { + /* + * The extent buffer still exists, we can use + * it directly. + */ + *found_eb_ret = existing_eb; folio_unlock(existing_folio); folio_put(existing_folio); - return -EAGAIN; + return 1; } - - if (fs_info->nodesize < PAGE_SIZE) { + if (existing_order > eb_order) { /* - * We're going to reuse the existing page, can drop our page - * and subpage structure now. + * The existing one has higher order, we need to drop + * ALL eb folios before reusing it. + * And this can only happen for the first folio. */ - __free_page(folio_page(eb->folios[i], 0)); + ASSERT(i == 0); + free_all_eb_folios(eb); eb->folios[i] = existing_folio; - } else { - struct extent_buffer *existing_eb; - - existing_eb = grab_extent_buffer(fs_info, - folio_page(existing_folio, 0)); - if (existing_eb) { - /* The extent buffer still exists, we can use it directly. */ - *found_eb_ret = existing_eb; - folio_unlock(existing_folio); - folio_put(existing_folio); - return 1; - } - /* The extent buffer no longer exists, we can reuse the folio. */ - __free_page(folio_page(eb->folios[i], 0)); + } else if (existing_order == eb_order) { + /* + * Can safely reuse the filemap folio, just + * release the eb one. + */ + folio_put(eb->folios[i]); eb->folios[i] = existing_folio; + } else if (existing_order < eb_order) { + /* + * The existing one has lower order (page based) + * meanwhile we have a better higher order eb. + * + * In theory we should be able to drop all the + * lower order folios in filemap and replace them + * with our better one. + * But we can not as the existing one still has + * private set. + * So here we force to fallback to 0 order folio + * and retry. + */ + ASSERT(i == 0); + folio_unlock(existing_folio); + folio_put(existing_folio); + return -EAGAIN; } return 0; } @@ -3571,6 +3632,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, struct btrfs_subpage *prealloc = NULL; u64 lockdep_owner = owner_root; bool page_contig = true; + int order = 0; int uptodate = 1; int ret; @@ -3588,6 +3650,10 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, btrfs_warn_32bit_limit(fs_info); #endif + if (fs_info->nodesize > PAGE_SIZE && + IS_ALIGNED(start, fs_info->nodesize)) + order = ilog2(fs_info->nodesize >> PAGE_SHIFT); + eb = find_extent_buffer(fs_info, start); if (eb) return eb; @@ -3622,7 +3688,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, reallocate: /* Allocate all pages first. */ - ret = alloc_eb_folio_array(eb, __GFP_NOFAIL); + ret = alloc_eb_folio_array(eb, __GFP_NOFAIL, order); if (ret < 0) { btrfs_free_subpage(prealloc); goto out; @@ -3640,26 +3706,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } /* - * TODO: Special handling for a corner case where the order of - * folios mismatch between the new eb and filemap. - * - * This happens when: - * - * - the new eb is using higher order folio - * - * - the filemap is still using 0-order folios for the range - * This can happen at the previous eb allocation, and we don't - * have higher order folio for the call. 
- * - * - the existing eb has already been freed - * - * In this case, we have to free the existing folios first, and - * re-allocate using the same order. - * Thankfully this is not going to happen yet, as we're still - * using 0-order folios. + * This happens when we got a higher order (better) folio, but + * the filemap still has lower order (single paged) folio. + * We don't have a good way to replace them yet. + * Thus has to retry with lower order (0) folio. */ if (unlikely(ret == -EAGAIN)) { - ASSERT(0); + order = 0; + free_all_eb_folios(eb); goto reallocate; } attached++; @@ -3672,6 +3726,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, folio = eb->folios[i]; eb->folio_size = folio_size(folio); eb->folio_shift = folio_shift(folio); + + /* + * We may have changed from single page folios to a larger + * folios from filemap. + * Re-calculate num_folios; + */ + num_folios = num_extent_folios(eb); + spin_lock(&mapping->i_private_lock); /* Should not fail, as we have preallocated the memory */ ret = attach_extent_buffer_folio(eb, folio, prealloc); From 44fb7b037a9aae07a7d42c5a06d5e194f468d1cd Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 23 Jan 2024 13:28:05 -0600 Subject: [PATCH 0441/1406] btrfs: page to folio conversion: prealloc_file_extent_cluster() Convert usage of page to folio in prealloc_file_extent_cluster(). Reviewed-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: Goldwyn Rodrigues Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2fca67f2b39b9c..265370185453af 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2858,7 +2858,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( struct address_space *mapping = inode->vfs_inode.i_mapping; struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; - struct page *page; + struct folio *folio; ASSERT(sectorsize < PAGE_SIZE); ASSERT(IS_ALIGNED(i_size, sectorsize)); @@ -2889,16 +2889,16 @@ static noinline_for_stack int prealloc_file_extent_cluster( clear_extent_bits(&inode->io_tree, i_size, round_up(i_size, PAGE_SIZE) - 1, EXTENT_UPTODATE); - page = find_lock_page(mapping, i_size >> PAGE_SHIFT); + folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT); /* * If page is freed we don't need to do anything then, as we * will re-read the whole page anyway. */ - if (page) { - btrfs_subpage_clear_uptodate(fs_info, page_folio(page), i_size, + if (!IS_ERR(folio)) { + btrfs_subpage_clear_uptodate(fs_info, folio, i_size, round_up(i_size, PAGE_SIZE) - i_size); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } } From 10ed9b4736a364054fec4f11b0cb1459a93f84be Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 23 Jan 2024 13:28:06 -0600 Subject: [PATCH 0442/1406] btrfs: convert relocate_one_page() to relocate_one_folio() Convert page references to folios and call the respective folio functions. Since find_or_create_page() takes a mask argument, call __filemap_get_folio() instead of filemap_grab_folio(). The patch assumes the folio size is PAGE_SIZE, so a WARN_ON(folio_order(folio)) is added to warn future development against using larger folio sizes.
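Two API details underpin these page-to-folio conversions and are easy to trip over. First, find_lock_page() returns NULL when the page is absent, while filemap_lock_folio() returns an ERR_PTR (to my knowledge -ENOENT), so the not-found checks flip from !page to IS_ERR(folio). Second, __filemap_get_folio() also reports failure via ERR_PTR. The resulting lookup-or-create idiom, with mapping, index and mask as in the surrounding diffs:

folio = filemap_lock_folio(mapping, index);
if (IS_ERR(folio)) {    /* not found: an ERR_PTR, not NULL */
        folio = __filemap_get_folio(mapping, index,
                                    FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
                                    mask);
        if (IS_ERR(folio))
                return PTR_ERR(folio);
}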
Signed-off-by: Goldwyn Rodrigues Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 91 ++++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 45 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 265370185453af..0882b220bb12da 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2849,7 +2849,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( * btrfs_do_readpage() call of previously relocated file cluster. * * If the current cluster starts in the above range, btrfs_do_readpage() - * will skip the read, and relocate_one_page() will later writeback + * will skip the read, and relocate_one_folio() will later writeback * the padding zeros as new data, causing data corruption. * * Here we have to manually invalidate the range (i_size, PAGE_END + 1). @@ -2983,68 +2983,71 @@ static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster, return cluster->boundary[cluster_nr + 1] - 1; } -static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, +static int relocate_one_folio(struct inode *inode, struct file_ra_state *ra, const struct file_extent_cluster *cluster, - int *cluster_nr, unsigned long page_index) + int *cluster_nr, unsigned long index) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 offset = BTRFS_I(inode)->index_cnt; const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - struct page *page; - u64 page_start; - u64 page_end; + struct folio *folio; + u64 folio_start; + u64 folio_end; u64 cur; int ret; - ASSERT(page_index <= last_index); - page = find_lock_page(inode->i_mapping, page_index); - if (!page) { + ASSERT(index <= last_index); + folio = filemap_lock_folio(inode->i_mapping, index); + if (IS_ERR(folio)) { page_cache_sync_readahead(inode->i_mapping, ra, NULL, - page_index, last_index + 1 - page_index); - page = find_or_create_page(inode->i_mapping, page_index, mask); - if (!page) - return -ENOMEM; + index, last_index + 1 - index); + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); + if (IS_ERR(folio)) + return PTR_ERR(folio); } - if (PageReadahead(page)) + WARN_ON(folio_order(folio)); + + if (folio_test_readahead(folio)) page_cache_async_readahead(inode->i_mapping, ra, NULL, - page_folio(page), page_index, - last_index + 1 - page_index); + folio, index, + last_index + 1 - index); - if (!PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (!folio_test_uptodate(folio)) { ret = -EIO; - goto release_page; + goto release_folio; } } /* - * We could have lost page private when we dropped the lock to read the - * page above, make sure we set_page_extent_mapped here so we have any + * We could have lost folio private when we dropped the lock to read the + * folio above, make sure we set_page_extent_mapped here so we have any * of the subpage blocksize stuff we need in place. */ - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) - goto release_page; + goto release_folio; - page_start = page_offset(page); - page_end = page_start + PAGE_SIZE - 1; + folio_start = folio_pos(folio); + folio_end = folio_start + PAGE_SIZE - 1; /* * Start from the cluster, as for subpage case, the cluster can start - * inside the page. + * inside the folio. 
*/ - cur = max(page_start, cluster->boundary[*cluster_nr] - offset); - while (cur <= page_end) { + cur = max(folio_start, cluster->boundary[*cluster_nr] - offset); + while (cur <= folio_end) { struct extent_state *cached_state = NULL; u64 extent_start = cluster->boundary[*cluster_nr] - offset; u64 extent_end = get_cluster_boundary_end(cluster, *cluster_nr) - offset; - u64 clamped_start = max(page_start, extent_start); - u64 clamped_end = min(page_end, extent_end); + u64 clamped_start = max(folio_start, extent_start); + u64 clamped_end = min(folio_end, extent_end); u32 clamped_len = clamped_end + 1 - clamped_start; /* Reserve metadata for this range */ @@ -3052,7 +3055,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, clamped_len, clamped_len, false); if (ret) - goto release_page; + goto release_folio; /* Mark the range delalloc and dirty for later writeback */ lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, @@ -3068,20 +3071,18 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, clamped_len, true); btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); - goto release_page; + goto release_folio; } - btrfs_folio_set_dirty(fs_info, page_folio(page), - clamped_start, clamped_len); + btrfs_folio_set_dirty(fs_info, folio, clamped_start, clamped_len); /* - * Set the boundary if it's inside the page. + * Set the boundary if it's inside the folio. * Data relocation requires the destination extents to have the * same size as the source. * EXTENT_BOUNDARY bit prevents current extent from being merged * with previous extent. */ - if (in_range(cluster->boundary[*cluster_nr] - offset, - page_start, PAGE_SIZE)) { + if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) { u64 boundary_start = cluster->boundary[*cluster_nr] - offset; u64 boundary_end = boundary_start + @@ -3104,8 +3105,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, break; } } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_throttle(fs_info); @@ -3113,9 +3114,9 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, ret = -ECANCELED; return ret; -release_page: - unlock_page(page); - put_page(page); +release_folio: + folio_unlock(folio); + folio_put(folio); return ret; } @@ -3150,7 +3151,7 @@ static int relocate_file_extent_cluster(struct inode *inode, last_index = (cluster->end - offset) >> PAGE_SHIFT; for (index = (cluster->start - offset) >> PAGE_SHIFT; index <= last_index && !ret; index++) - ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index); + ret = relocate_one_folio(inode, ra, cluster, &cluster_nr, index); if (ret == 0) WARN_ON(cluster_nr != cluster->nr); out: From b6ea4bf9fe0df6b25691cc627a5c6f47f5c35835 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 23 Jan 2024 13:28:07 -0600 Subject: [PATCH 0443/1406] btrfs: page to folio conversion in put_file_data() Use folio instead of page in put_file_data(). Add a WARN_ON(folio_order(folio)) to make sure we are dealing with PAGE_SIZE folios. 
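The heart of the conversion below is the per-folio copy loop. Stripped of the lookup, locking, readahead and error handling, it reduces to this sketch (buf, len, pg_offset, index and the way folio is obtained are stand-ins for the real send context):

/* Sketch: copy a byte range out of the page cache, folio by folio. */
while (len > 0) {
        unsigned int cur = min_t(unsigned int, len, PAGE_SIZE - pg_offset);

        /* lookup + locking of 'folio' at 'index' omitted for brevity */
        memcpy_from_folio(buf, folio, pg_offset, cur);
        buf += cur;
        len -= cur;
        pg_offset = 0;  /* only the first folio may start mid-way */
        index++;
}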
Signed-off-by: Goldwyn Rodrigues Signed-off-by: David Sterba --- fs/btrfs/send.c | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e96d511f9dd92a..c352cc5b1a98aa 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5275,10 +5275,11 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; - struct page *page; + struct folio *folio; pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; unsigned pg_offset = offset_in_page(offset); + struct address_space *mapping = sctx->cur_inode->i_mapping; int ret; ret = put_data_header(sctx, len); @@ -5291,44 +5292,45 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) unsigned cur_len = min_t(unsigned, len, PAGE_SIZE - pg_offset); - page = find_lock_page(sctx->cur_inode->i_mapping, index); - if (!page) { - page_cache_sync_readahead(sctx->cur_inode->i_mapping, + folio = filemap_lock_folio(mapping, index); + if (IS_ERR(folio)) { + page_cache_sync_readahead(mapping, &sctx->ra, NULL, index, last_index + 1 - index); - page = find_or_create_page(sctx->cur_inode->i_mapping, - index, GFP_KERNEL); - if (!page) { - ret = -ENOMEM; + folio = filemap_grab_folio(mapping, index); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); break; } } - if (PageReadahead(page)) - page_cache_async_readahead(sctx->cur_inode->i_mapping, - &sctx->ra, NULL, page_folio(page), + WARN_ON(folio_order(folio)); + + if (folio_test_readahead(folio)) + page_cache_async_readahead(mapping, + &sctx->ra, NULL, folio, index, last_index + 1 - index); - if (!PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); + if (!folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); btrfs_err(fs_info, "send: IO error at offset %llu for inode %llu root %llu", - page_offset(page), sctx->cur_ino, + folio_pos(folio), sctx->cur_ino, sctx->send_root->root_key.objectid); - put_page(page); + folio_put(folio); ret = -EIO; break; } } - memcpy_from_page(sctx->send_buf + sctx->send_size, page, + memcpy_from_folio(sctx->send_buf + sctx->send_size, folio, pg_offset, cur_len); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); index++; pg_offset = 0; len -= cur_len; From 73c92cfa8da859a26f4833526c070274455fe34f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 24 Jan 2024 14:29:07 +1030 Subject: [PATCH 0444/1406] btrfs: introduce cached folio size For the future multipage sectorsize support (sectorsize > PAGE_SIZE), we want to fully utilize the folio interface, thus making every data sector be represented by a folio. However this leads to a small problem: multipage and subpage support have very different folio size expectations. For subpage, since a folio cannot be smaller than a page, one folio would always be page sized. But for multipage, each folio would be sector sized. For callsites directly handling pages/folios (aka, all read/write paths) we don't want to do such a check every time we get a folio. So instead we cache the data folio size and its shift into btrfs_fs_info. This would make later folio conversion easier.
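A worked example of the two regimes this caching anticipates: with 4K pages and a future 16K sectorsize (multipage), folio_size would be 16384 and folio_shift 14; with 64K pages and a 4K sectorsize (subpage), folio_size stays at PAGE_SIZE (65536) and folio_shift at PAGE_SHIFT (16). A byte offset then converts to two different indexes depending on the purpose (sketch, using the fields added in the diff below):

pgoff_t filemap_index = offset >> PAGE_SHIFT;           /* page cache slot */
u64 data_folio_index = offset >> fs_info->folio_shift;  /* data folio */

Caching both values in btrfs_fs_info avoids recomputing folio_size()/folio_shift() on every folio in the hot read/write paths.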
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 11 +++++++++++ fs/btrfs/fs.h | 10 ++++++++++ 2 files changed, 21 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8ab185182c30fe..bececdd63b4dec 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2822,6 +2822,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->sectorsize = 4096; fs_info->sectorsize_bits = ilog2(4096); fs_info->stripesize = 4096; + fs_info->folio_size = PAGE_SIZE; + fs_info->folio_shift = PAGE_SHIFT; /* Default compress algorithm when user does -o compress */ fs_info->compress_type = BTRFS_COMPRESS_ZLIB; @@ -3315,6 +3317,15 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; + if (sectorsize > PAGE_SIZE) { + /* For future multi-page sectorsize support */ + fs_info->folio_size = sectorsize; + fs_info->folio_shift = fs_info->sectorsize_bits; + } else { + fs_info->folio_size = PAGE_SIZE; + fs_info->folio_shift = PAGE_SHIFT; + } + /* * Handle the space caching options appropriately now that we have the * super block loaded and validated. diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index a7c3f9abc53a51..98e9c67034df09 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -790,6 +790,16 @@ struct btrfs_fs_info { u32 csums_per_leaf; u32 stripesize; + /* + * For future subpage and multipage sectorsize support. + * + * For subpage, all of our data folios would still be PAGE_SIZE. + * But for multipage, those data folios would be sector sized. + * This is the cached result for the read/write paths to utilize. + */ + u32 folio_size; + u32 folio_shift; + /* * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular * filesystem, on zoned it depends on the device constraints. From 65f64df5db7c399780ef6d1a71cb177b6bf545a8 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 24 Jan 2024 14:29:08 +1030 Subject: [PATCH 0445/1406] btrfs: defrag: prepare defrag for larger data folio size Although we have migrated defrag to use the folio interface, we can still further enhance it for the future larger data folio size. This patch would: - Rename page-related variables to their folio equivalents - Change "pgoff_t index" to "u64 folio_start" for defrag_prepare_one_folio() For the future multi-page sectorsize support, each data folio would be sector sized (except for subpage cases). Thus we should avoid using index, as there would be two different shifts: * PAGE_SHIFT based index Would be utilized in filemap related interfaces * Folio shift based index Would be utilized for the remaining cases So here we use the "u64 folio_start" to represent one folio. - Use fs_info->folio_shift to replace PAGE_SHIFT Since in the future the data folios would no longer be page sized, use the cached fs_info->folio_shift to handle both multi-page and subpage cases. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 69 +++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 786905731146b9..efea82f9f4fcee 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -860,18 +860,19 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, * NOTE: Caller should also wait for page writeback after the cluster is * prepared, here we don't do writeback wait for each page.
*/ -static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t index) +static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, + u64 folio_start) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; gfp_t mask = btrfs_alloc_write_mask(mapping); - u64 page_start = (u64)index << PAGE_SHIFT; - u64 page_end = page_start + PAGE_SIZE - 1; + u64 folio_end = folio_start + fs_info->folio_size - 1; struct extent_state *cached_state = NULL; struct folio *folio; int ret; again: - folio = __filemap_get_folio(mapping, index, + folio = __filemap_get_folio(mapping, folio_start >> PAGE_SHIFT, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); if (IS_ERR(folio)) return folio; @@ -901,9 +902,10 @@ static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t while (1) { struct btrfs_ordered_extent *ordered; - lock_extent(&inode->io_tree, page_start, page_end, &cached_state); - ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); - unlock_extent(&inode->io_tree, page_start, page_end, + lock_extent(&inode->io_tree, folio_start, folio_end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, folio_start, + fs_info->folio_size); + unlock_extent(&inode->io_tree, folio_start, folio_end, &cached_state); if (!ordered) break; @@ -1162,20 +1164,20 @@ static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); */ static int defrag_one_locked_target(struct btrfs_inode *inode, struct defrag_target_range *target, - struct folio **folios, int nr_pages, + struct folio **folios, int nr_folios, struct extent_state **cached_state) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_changeset *data_reserved = NULL; const u64 start = target->start; const u64 len = target->len; - unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; - unsigned long start_index = start >> PAGE_SHIFT; + unsigned long last_index = (start + len - 1) >> fs_info->folio_shift; + unsigned long start_index = start >> fs_info->folio_shift; unsigned long first_index = folios[0]->index; int ret = 0; int i; - ASSERT(last_index - first_index + 1 <= nr_pages); + ASSERT(last_index - first_index + 1 <= nr_folios); ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); if (ret < 0) @@ -1186,7 +1188,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, set_extent_bit(&inode->io_tree, start, start + len - 1, EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state); - /* Update the page status */ + /* Update the folio status */ for (i = start_index - first_index; i <= last_index - first_index; i++) { folio_clear_checked(folios[i]); btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len); @@ -1201,40 +1203,42 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, u32 extent_thresh, u64 newer_than, bool do_compress, u64 *last_scanned_ret) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_state *cached_state = NULL; struct defrag_target_range *entry; struct defrag_target_range *tmp; LIST_HEAD(target_list); struct folio **folios; - const u32 sectorsize = inode->root->fs_info->sectorsize; - u64 last_index = (start + len - 1) >> PAGE_SHIFT; - u64 start_index = start >> PAGE_SHIFT; - unsigned int nr_pages = last_index - start_index + 1; + const u32 sectorsize = fs_info->sectorsize; + u64 last_index = (start + len - 1) >> fs_info->folio_shift; + u64 start_index = start >> fs_info->folio_shift; + unsigned int nr_folios = last_index - start_index + 1; int ret = 0; 
int i; - ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); + ASSERT(nr_folios <= (CLUSTER_SIZE >> fs_info->folio_shift)); ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); - folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_NOFS); + folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS); if (!folios) return -ENOMEM; /* Prepare all folios */ - for (i = 0; i < nr_pages; i++) { - folios[i] = defrag_prepare_one_folio(inode, start_index + i); + for (i = 0; i < nr_folios; i++) { + folios[i] = defrag_prepare_one_folio(inode, + (start_index + i) << fs_info->folio_shift); if (IS_ERR(folios[i])) { ret = PTR_ERR(folios[i]); - nr_pages = i; + nr_folios = i; goto free_folios; } } - for (i = 0; i < nr_pages; i++) + for (i = 0; i < nr_folios; i++) folio_wait_writeback(folios[i]); /* Lock the folio range */ - lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + lock_extent(&inode->io_tree, start_index << fs_info->folio_shift, + (last_index << fs_info->folio_shift) + fs_info->folio_size - 1, &cached_state); /* * Now we have a consistent view about the extent map, re-check @@ -1250,7 +1254,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, goto unlock_extent; list_for_each_entry(entry, &target_list, list) { - ret = defrag_one_locked_target(inode, entry, folios, nr_pages, + ret = defrag_one_locked_target(inode, entry, folios, nr_folios, &cached_state); if (ret < 0) break; @@ -1261,11 +1265,11 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, kfree(entry); } unlock_extent: - unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + unlock_extent(&inode->io_tree, start_index << fs_info->folio_shift, - (last_index << fs_info->folio_shift) + fs_info->folio_size - 1, &cached_state); free_folios: - for (i = 0; i < nr_pages; i++) { + for (i = 0; i < nr_folios; i++) { folio_unlock(folios[i]); folio_put(folios[i]); } @@ -1281,7 +1285,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode, unsigned long max_sectors, u64 *last_scanned_ret) { - const u32 sectorsize = inode->root->fs_info->sectorsize; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 sectorsize = fs_info->sectorsize; struct defrag_target_range *entry; struct defrag_target_range *tmp; LIST_HEAD(target_list); @@ -1420,7 +1425,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, * Make writeback start from the beginning of the range, so that the * defrag range can be written sequentially.
*/ - start_index = cur >> PAGE_SHIFT; + start_index = cur >> fs_info->folio_shift; if (start_index < inode->i_mapping->writeback_index) inode->i_mapping->writeback_index = start_index; @@ -1435,8 +1440,8 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, } /* We want the cluster end at page boundary when possible */ - cluster_end = (((cur >> PAGE_SHIFT) + - (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; + cluster_end = (((cur >> fs_info->folio_shift) + + (SZ_256K >> fs_info->folio_shift)) << fs_info->folio_shift) - 1; cluster_end = min(cluster_end, last_byte); btrfs_inode_lock(BTRFS_I(inode), 0); From 965fc913425a8f0650b22345088ada9a6a325a97 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 26 Jan 2024 13:51:32 +1030 Subject: [PATCH 0446/1406] btrfs: raid56: extra debug for raid6 syndrome generation [BUG] I have got at least two crash reports for RAID6 syndrome generation; no matter if it's AVX2 or SSE2, they all seem to have a similar calltrace with a corrupted RAX: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI Workqueue: btrfs-rmw rmw_rbio_work [btrfs] RIP: 0010:raid6_sse21_gen_syndrome+0x9e/0x130 [raid6_pq] RAX: 0000000000000000 RBX: 0000000000001000 RCX: ffffa0ff4cfa3248 RDX: 0000000000000000 RSI: ffffa0f74cfa3238 RDI: 0000000000000000 Call Trace: rmw_rbio+0x5c8/0xa80 [btrfs] process_one_work+0x1c7/0x3d0 worker_thread+0x4d/0x380 kthread+0xf3/0x120 ret_from_fork+0x2c/0x50 [CAUSE] In fact I don't have any clue. Recently I also hit this in the AVX512 path, and that's even in a v5.15 backport, which doesn't have any of my RAID56 rework. Furthermore according to the registers: RAX: 0000000000000000 RBX: 0000000000001000 RCX: ffffa0ff4cfa3248 The RAX register is showing the number of stripes (including PQ), which is not correct (0). But the remaining two registers are all sane. - RBX is the sectorsize For x86_64 it should always be 4K, and it matches the output. - RCX is the pointers array Which is from rbio->finish_pointers, and it looks like a sane kernel address. [WORKAROUND] For now, I can only add extra debug ASSERT()s before we call the raid6 gen_syndrome() helper and hope to catch the problem. The debug requires both CONFIG_BTRFS_DEBUG and CONFIG_BTRFS_ASSERT to be enabled. My current guess is some use-after-free, but every report having only a corrupted RAX while the pointers seem valid doesn't make much sense. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 30 ++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 5c4bf3f907c1a1..6f4a9cfeea44a3 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -917,6 +917,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, */ ASSERT(stripe_nsectors <= BITS_PER_LONG); + /* + * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256 + * (limited by u8).
+ */ + ASSERT(real_stripes >= 2); + ASSERT(real_stripes <= U8_MAX); + rbio = kzalloc(sizeof(*rbio), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -954,6 +961,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); + ASSERT(rbio->nr_data > 0); return rbio; } @@ -1180,6 +1188,26 @@ static inline void bio_list_put(struct bio_list *bio_list) bio_put(bio); } +static void assert_rbio(struct btrfs_raid_bio *rbio) +{ + if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) || + !IS_ENABLED(CONFIG_BTRFS_ASSERT)) + return; + + /* + * At least two stripes (2 disks RAID5), and since real_stripes is U8, + * we won't go beyond 256 disks anyway. + */ + ASSERT(rbio->real_stripes >= 2); + ASSERT(rbio->nr_data > 0); + + /* + * This is another check to make sure nr data stripes is smaller + * than total stripes. + */ + ASSERT(rbio->nr_data < rbio->real_stripes); +} + /* Generate PQ for one vertical stripe. */ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) { @@ -1211,6 +1239,7 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; + assert_rbio(rbio); raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); } else { @@ -2472,6 +2501,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) } if (has_qstripe) { + assert_rbio(rbio); /* RAID6, call the library function to fill in our P/Q */ raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); From 54650c6ab3cbb6b2cf0af65d54031f3de8add111 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 27 Jan 2024 10:18:36 +1030 Subject: [PATCH 0447/1406] btrfs: tree-checker: dump the page status if hit something wrong [BUG] There is a bug report about a very suspicious tree-checker warning being triggered: BTRFS critical (device dm-0): corrupted node, root=256 block=8550954455682405139 owner mismatch, have 11858205567642294356 expect [256, 18446744073709551360] BTRFS critical (device dm-0): corrupted node, root=256 block=8550954455682405139 owner mismatch, have 11858205567642294356 expect [256, 18446744073709551360] BTRFS critical (device dm-0): corrupted node, root=256 block=8550954455682405139 owner mismatch, have 11858205567642294356 expect [256, 18446744073709551360] SELinux: inode_doinit_use_xattr: getxattr returned 117 for dev=dm-0 ino=5737268 [ANALYZE] The root cause is still unclear, but there are some clues already: - Unaligned eb bytenr The block bytenr is 8550954455682405139, which is not even aligned to 2. This bytenr is fetched from the extent buffer header, not from eb->start. This means that, at the initial time of read, the eb header bytenr is still correct (the very basic check to continue the read), but later something went wrong and corrupted at least the first page. Thus we got such an obviously incorrect value. - Invalid extent buffer header owner The read itself is triggered for subvolume 256, but the eb header owner is 11858205567642294356, which is not really possible. The problem here is that the subvolume id is limited to (1 << 48) - 1, and this one definitely goes beyond that limit. So this value is also garbage. We already got two garbage values from an extent buffer, which passed the initial bytenr and csum checks, but later the contents became garbage at some point. This looks like a page lifespan problem (e.g. we didn't properly hold the page).
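To make the owner check concrete (an illustrative snippet for this analysis, not taken from the report):

	/* The reported owner vs the 48-bit subvolume id limit. */
	const u64 max_subvol_id = (1ULL << 48) - 1;	/* 281474976710655 */
	const u64 reported_owner = 11858205567642294356ULL;

	/*
	 * reported_owner exceeds max_subvol_id by several orders of
	 * magnitude, so the eb header owner cannot be a valid subvolume id.
	 */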
[ENHANCEMENT] The current tree-checker only outputs information from the extent buffer, with nothing about the page status. So this patch would enhance the tree-checker output by also dumping the first page, which would look like this: page:00000000aa9f3ce8 refcount:4 mapcount:0 mapping:00000000169aa6b6 index:0x1d0c pfn:0x1022e5 memcg:ffff888103456000 aops:btree_aops [btrfs] ino:1 flags: 0x2ffff0000008000(private|node=0|zone=2|lastcpupid=0xffff) page_type: 0xffffffff() raw: 02ffff0000008000 0000000000000000 dead000000000122 ffff88811e06e220 raw: 0000000000001d0c ffff888102fdb1d8 00000004ffffffff ffff888103456000 page dumped because: eb page dump BTRFS critical (device dm-3): corrupt leaf: root=5 block=30457856 slot=6 ino=257 file_offset=0, invalid disk_bytenr for file extent, have 10617606235235216665, should be aligned to 4096 BTRFS error (device dm-3): read time tree block corruption detected on logical 30457856 mirror 1 From the dump we can see some extra info that can help us do extra cross-checks: - Page refcount If it's too low, it definitely means something bad. - Page aops Any mapped eb page should have btree_aops with inode number 1. - Page index Since a mapped eb page should have its bytenr matching the page position, (index << PAGE_SHIFT) should match the bytenr from the critical line. - Page Private flag A mapped eb page should have the Private flag set to indicate it's managed by btrfs. Link: https://marc.info/?l=linux-btrfs&m=170629708724284&w=2 Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 4fa95eca285ec1..c8fbcae4e88ea5 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -65,6 +65,7 @@ static void generic_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -92,6 +93,7 @@ static void file_extent_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -152,6 +154,7 @@ static void dir_item_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -647,6 +650,7 @@ static void block_group_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d bg_start=%llu bg_len=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -1003,6 +1007,7 @@ static void dev_item_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(eb->fs_info, "corrupt %s: root=%llu block=%llu slot=%d devid=%llu %pV", btrfs_header_level(eb) == 0 ?
"leaf" : "node", @@ -1258,6 +1263,7 @@ static void extent_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(eb->fs_info, "corrupt %s: block=%llu slot=%d extent bytenr=%llu len=%llu %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", From 927fada89635c31b2205e68941f02a898b3e17dd Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 29 Jan 2024 20:16:06 +1030 Subject: [PATCH 0448/1406] btrfs: compression: add error handling for missed page cache For all the supported compression algorithms, the compression path would always need to grab the page cache, then do the compression. Normally we would get a page reference without any problem, since the write path should have already locked the pages in the write range. Just for the sake of error handling, we should handle the page cache miss case. This patch adds a common wrapper, btrfs_compress_find_get_page(), which calls find_get_page(), and do the error handling along with an error message with an ASSERT(). Callers inside compression path would only need to call btrfs_compress_find_get_page(), and error out if it returned any error. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/compression.c | 26 ++++++++++++++++++++++++++ fs/btrfs/compression.h | 3 +++ fs/btrfs/lzo.c | 5 +++-- fs/btrfs/zlib.c | 14 ++++++++++---- fs/btrfs/zstd.c | 9 +++++++-- 5 files changed, 49 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 0b8833baf40403..9a16e1c6ced54a 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -974,6 +974,32 @@ static unsigned int btrfs_compress_set_level(int type, unsigned level) return level; } +/* A wrapper around find_get_page(), with extra error message. */ +int btrfs_compress_find_get_page(struct address_space *mapping, u64 start, + struct page **in_page_ret) +{ + struct page *in_page; + + /* + * The compressed write path should have the page locked already, + * thus we only need to grab one reference of the page cache. + */ + in_page = find_get_page(mapping, start >> PAGE_SHIFT); + if (unlikely(!in_page)) { + struct btrfs_inode *binode = BTRFS_I(mapping->host); + struct btrfs_fs_info *fs_info = binode->root->fs_info; + + btrfs_crit(fs_info, + "failed to get page cache, root %lld ino %llu file offset %llu", + binode->root->root_key.objectid, btrfs_ino(binode), + start); + ASSERT(0); + return -ENOENT; + } + *in_page_ret = in_page; + return 0; +} + /* * Given an address space and start and length, compress the bytes into @pages * that are allocated on demand. 
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 4691a84ca83831..7590dc86d0401e 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -149,6 +149,9 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len); int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); +int btrfs_compress_find_get_page(struct address_space *mapping, u64 start, + struct page **in_page_ret); + int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 3e5d3b7028e8ba..6ac2cd177d4410 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -244,8 +244,9 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, /* Get the input page first */ if (!page_in) { - page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT); - ASSERT(page_in); + ret = btrfs_compress_find_get_page(mapping, cur_in, &page_in); + if (ret < 0) + goto out; } /* Compress at most one sector of data each time */ diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index e5b3f20038962f..ad6f011eab6998 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -151,9 +151,12 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, if (data_in) { kunmap_local(data_in); put_page(in_page); + data_in = NULL; } - in_page = find_get_page(mapping, - start >> PAGE_SHIFT); + ret = btrfs_compress_find_get_page(mapping, + start, &in_page); + if (ret < 0) + goto out; data_in = kmap_local_page(in_page); copy_page(workspace->buf + i * PAGE_SIZE, data_in); @@ -164,9 +167,12 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, if (data_in) { kunmap_local(data_in); put_page(in_page); + data_in = NULL; } - in_page = find_get_page(mapping, - start >> PAGE_SHIFT); + ret = btrfs_compress_find_get_page(mapping, + start, &in_page); + if (ret < 0) + goto out; data_in = kmap_local_page(in_page); start += PAGE_SIZE; workspace->strm.next_in = data_in; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 92b3744b819bd4..b647ad036af3e8 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -406,7 +406,9 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, } /* map in the first page of input data */ - in_page = find_get_page(mapping, start >> PAGE_SHIFT); + ret = btrfs_compress_find_get_page(mapping, start, &in_page); + if (ret < 0) + goto out; workspace->in_buf.src = kmap_local_page(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); @@ -479,10 +481,13 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, if (workspace->in_buf.pos == workspace->in_buf.size) { tot_in += PAGE_SIZE; kunmap_local(workspace->in_buf.src); + workspace->in_buf.src = NULL; put_page(in_page); start += PAGE_SIZE; len -= PAGE_SIZE; - in_page = find_get_page(mapping, start >> PAGE_SHIFT); + ret = btrfs_compress_find_get_page(mapping, start, &in_page); + if (ret < 0) + goto out; workspace->in_buf.src = kmap_local_page(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); From 73bc0c778e4b18b6bb82f526015f8735cd0494f0 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 29 Jan 2024 20:16:07 +1030 Subject: [PATCH 0449/1406] btrfs: compression: convert page allocation to folio interfaces Currently we have two wrappers to allocate and free a page for compression usage: - btrfs_alloc_compr_page() - 
btrfs_free_compr_page() The allocator would try to grab a page from the pool, and only allocate a new page if the pool is empty. The reclaimer would check if the pool is full, and if not full it would put the page into the pool. This patch would convert both helpers to use folio interfaces, and allowing further conversion of compression path to folios. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/compression.c | 24 ++++++++++++------------ fs/btrfs/compression.h | 4 ++-- fs/btrfs/inode.c | 4 ++-- fs/btrfs/lzo.c | 4 ++-- fs/btrfs/zlib.c | 6 +++--- fs/btrfs/zstd.c | 6 +++--- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 9a16e1c6ced54a..07a1393f0a6762 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -161,7 +161,7 @@ static int compression_decompress(int type, struct list_head *ws, static void btrfs_free_compressed_pages(struct compressed_bio *cb) { for (unsigned int i = 0; i < cb->nr_pages; i++) - btrfs_free_compr_page(cb->compressed_pages[i]); + btrfs_free_compr_folio(page_folio(cb->compressed_pages[i])); kfree(cb->compressed_pages); } @@ -223,25 +223,25 @@ static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_co /* * Common wrappers for page allocation from compression wrappers */ -struct page *btrfs_alloc_compr_page(void) +struct folio *btrfs_alloc_compr_folio(void) { - struct page *page = NULL; + struct folio *folio = NULL; spin_lock(&compr_pool.lock); if (compr_pool.count > 0) { - page = list_first_entry(&compr_pool.list, struct page, lru); - list_del_init(&page->lru); + folio = list_first_entry(&compr_pool.list, struct folio, lru); + list_del_init(&folio->lru); compr_pool.count--; } spin_unlock(&compr_pool.lock); - if (page) - return page; + if (folio) + return folio; - return alloc_page(GFP_NOFS); + return folio_alloc(GFP_NOFS, 0); } -void btrfs_free_compr_page(struct page *page) +void btrfs_free_compr_folio(struct folio *folio) { bool do_free = false; @@ -249,7 +249,7 @@ void btrfs_free_compr_page(struct page *page) if (compr_pool.count > compr_pool.thresh) { do_free = true; } else { - list_add(&page->lru, &compr_pool.list); + list_add(&folio->lru, &compr_pool.list); compr_pool.count++; } spin_unlock(&compr_pool.lock); @@ -257,8 +257,8 @@ void btrfs_free_compr_page(struct page *page) if (!do_free) return; - ASSERT(page_ref_count(page) == 1); - put_page(page); + ASSERT(folio_ref_count(folio) == 1); + folio_put(folio); } static void end_bbio_comprssed_read(struct btrfs_bio *bbio) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 7590dc86d0401e..2673c25415e595 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -104,8 +104,8 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio); unsigned int btrfs_compress_str2level(unsigned int type, const char *str); -struct page *btrfs_alloc_compr_page(void); -void btrfs_free_compr_page(struct page *page); +struct folio *btrfs_alloc_compr_folio(void); +void btrfs_free_compr_folio(struct folio *folio); enum btrfs_compression_type { BTRFS_COMPRESS_NONE = 0, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f93cb23ae1ee40..76e93fa63421fc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1047,7 +1047,7 @@ static void compress_file_range(struct btrfs_work *work) if (pages) { for (i = 0; i < nr_pages; i++) { WARN_ON(pages[i]->mapping); - btrfs_free_compr_page(pages[i]); + btrfs_free_compr_folio(page_folio(pages[i])); } kfree(pages); } @@ -1062,7 +1062,7 @@ static void 
free_async_extent_pages(struct async_extent *async_extent) for (i = 0; i < async_extent->nr_pages; i++) { WARN_ON(async_extent->pages[i]->mapping); - btrfs_free_compr_page(async_extent->pages[i]); + btrfs_free_compr_folio(page_folio(async_extent->pages[i])); } kfree(async_extent->pages); async_extent->nr_pages = 0; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 6ac2cd177d4410..c0e8894c972756 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -152,7 +152,7 @@ static int copy_compressed_data_to_page(char *compressed_data, cur_page = out_pages[*cur_out / PAGE_SIZE]; /* Allocate a new page */ if (!cur_page) { - cur_page = btrfs_alloc_compr_page(); + cur_page = folio_page(btrfs_alloc_compr_folio(), 0); if (!cur_page) return -ENOMEM; out_pages[*cur_out / PAGE_SIZE] = cur_page; @@ -178,7 +178,7 @@ static int copy_compressed_data_to_page(char *compressed_data, cur_page = out_pages[*cur_out / PAGE_SIZE]; /* Allocate a new page */ if (!cur_page) { - cur_page = btrfs_alloc_compr_page(); + cur_page = folio_page(btrfs_alloc_compr_folio(), 0); if (!cur_page) return -ENOMEM; out_pages[*cur_out / PAGE_SIZE] = cur_page; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index ad6f011eab6998..c260e42b0a3399 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -121,7 +121,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, workspace->strm.total_in = 0; workspace->strm.total_out = 0; - out_page = btrfs_alloc_compr_page(); + out_page = folio_page(btrfs_alloc_compr_folio(), 0); if (out_page == NULL) { ret = -ENOMEM; goto out; @@ -206,7 +206,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_page = btrfs_alloc_compr_page(); + out_page = folio_page(btrfs_alloc_compr_folio(), 0); if (out_page == NULL) { ret = -ENOMEM; goto out; @@ -242,7 +242,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_page = btrfs_alloc_compr_page(); + out_page = folio_page(btrfs_alloc_compr_folio(), 0); if (out_page == NULL) { ret = -ENOMEM; goto out; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index b647ad036af3e8..4ec5dd84b93e33 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -414,7 +414,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); /* Allocate and map in the output buffer */ - out_page = btrfs_alloc_compr_page(); + out_page = folio_page(btrfs_alloc_compr_folio(), 0); if (out_page == NULL) { ret = -ENOMEM; goto out; @@ -459,7 +459,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_page = btrfs_alloc_compr_page(); + out_page = folio_page(btrfs_alloc_compr_folio(), 0); if (out_page == NULL) { ret = -ENOMEM; goto out; @@ -519,7 +519,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_page = btrfs_alloc_compr_page(); + out_page = folio_page(btrfs_alloc_compr_folio(), 0); if (out_page == NULL) { ret = -ENOMEM; goto out; From ab1ffa7021abca4960ed9f6305acc1e91a39dfb3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 29 Jan 2024 20:16:08 +1030 Subject: [PATCH 0450/1406] btrfs: make insert_inline_extent() accept one page directly Since our inline extent cannot accept anything larger than a sector, there is really no need to pass all the compressed pages to insert_inline_extent().
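To spell out the size reasoning (an illustrative sketch for this changelog, assuming the currently supported configurations):

	/* Inline extents never exceed one sector ... */
	ASSERT(compressed_size <= sectorsize);
	/* ... and all currently supported configurations keep a sector within a page, */
	ASSERT(sectorsize <= PAGE_SIZE);
	/* so the whole inline payload lives in compressed_pages[0]. */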
And just in case, expand the ASSERT()s to make sure we only try inline with compressed size no larger than sectorsize. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 47 +++++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 76e93fa63421fc..7222a957251f43 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -512,12 +512,13 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, bool extent_inserted, size_t size, size_t compressed_size, int compress_type, - struct page **compressed_pages, + struct page *compressed_page, bool update_i_size) { struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct page *page = NULL; + const u32 sectorsize = trans->fs_info->sectorsize; char *kaddr; unsigned long ptr; struct btrfs_file_extent_item *ei; @@ -525,10 +526,23 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, size_t cur_size = size; u64 i_size; - ASSERT((compressed_size > 0 && compressed_pages) || - (compressed_size == 0 && !compressed_pages)); + /* + * The decompressed size must still be no larger than a sector. + * Under heavy race, we can have size == 0 passed in, but that + * shouldn't be a big deal and we can continue the insertion. + */ + ASSERT(size <= sectorsize); - if (compressed_size && compressed_pages) + /* + * The compressed size also need to be no larger than a sector. + * That's also why we only need one page as the parameter. + */ + if (compressed_page) + ASSERT(compressed_size <= sectorsize); + else + ASSERT(compressed_size == 0); + + if (compressed_size && compressed_page) cur_size = compressed_size; if (!extent_inserted) { @@ -556,21 +570,10 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, ptr = btrfs_file_extent_inline_start(ei); if (compress_type != BTRFS_COMPRESS_NONE) { - struct page *cpage; - int i = 0; - while (compressed_size > 0) { - cpage = compressed_pages[i]; - cur_size = min_t(unsigned long, compressed_size, - PAGE_SIZE); - - kaddr = kmap_local_page(cpage); - write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap_local(kaddr); + kaddr = kmap_local_page(compressed_page); + write_extent_buffer(leaf, kaddr, ptr, compressed_size); + kunmap_local(kaddr); - i++; - ptr += cur_size; - compressed_size -= cur_size; - } btrfs_set_file_extent_compression(leaf, ei, compress_type); } else { @@ -620,7 +623,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, size_t compressed_size, int compress_type, - struct page **compressed_pages, + struct page *compressed_page, bool update_i_size) { struct btrfs_drop_extents_args drop_args = { 0 }; @@ -668,7 +671,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, size, compressed_size, compress_type, - compressed_pages, update_i_size); + compressed_page, update_i_size); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; @@ -976,7 +979,7 @@ static void compress_file_range(struct btrfs_work *work) } else { ret = cow_file_range_inline(inode, actual_end, total_compressed, - compress_type, pages, + compress_type, pages[0], false); } if (ret <= 0) { @@ -10445,7 +10448,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (start == 0 && encoded->unencoded_len == encoded->len 
&& encoded->unencoded_offset == 0) { ret = cow_file_range_inline(inode, encoded->len, orig_count, - compression, pages, true); + compression, pages[0], true); if (ret <= 0) { if (ret == 0) ret = orig_count; From 85d9eeb5d52a8a5b1a3e8b2f8771ada3999c277f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 29 Jan 2024 20:16:09 +1030 Subject: [PATCH 0451/1406] btrfs: migrate insert_inline_extent() to folio interfaces Since insert_inline_extent() now only accepts a single page, it's much easier to convert it to use folio interfaces. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7222a957251f43..03bf9195fb2273 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -512,7 +512,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, bool extent_inserted, size_t size, size_t compressed_size, int compress_type, - struct page *compressed_page, + struct folio *compressed_folio, bool update_i_size) { struct btrfs_root *root = inode->root; @@ -537,12 +537,12 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, * The compressed size also need to be no larger than a sector. * That's also why we only need one page as the parameter. */ - if (compressed_page) + if (compressed_folio) ASSERT(compressed_size <= sectorsize); else ASSERT(compressed_size == 0); - if (compressed_size && compressed_page) + if (compressed_size && compressed_folio) cur_size = compressed_size; if (!extent_inserted) { @@ -570,7 +570,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, ptr = btrfs_file_extent_inline_start(ei); if (compress_type != BTRFS_COMPRESS_NONE) { - kaddr = kmap_local_page(compressed_page); + kaddr = kmap_local_folio(compressed_folio, 0); write_extent_buffer(leaf, kaddr, ptr, compressed_size); kunmap_local(kaddr); @@ -623,7 +623,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, size_t compressed_size, int compress_type, - struct page *compressed_page, + struct folio *compressed_folio, bool update_i_size) { struct btrfs_drop_extents_args drop_args = { 0 }; @@ -671,7 +671,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, size, compressed_size, compress_type, - compressed_page, update_i_size); + compressed_folio, update_i_size); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; @@ -979,7 +979,8 @@ static void compress_file_range(struct btrfs_work *work) } else { ret = cow_file_range_inline(inode, actual_end, total_compressed, - compress_type, pages[0], + compress_type, + page_folio(pages[0]), false); } if (ret <= 0) { @@ -10448,7 +10449,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (start == 0 && encoded->unencoded_len == encoded->len && encoded->unencoded_offset == 0) { ret = cow_file_range_inline(inode, encoded->len, orig_count, - compression, pages[0], true); + compression, page_folio(pages[0]), + true); if (ret <= 0) { if (ret == 0) ret = orig_count; From 8bf09fd1f8dc57e7d951d4876aa299b6098dfb69 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 29 Jan 2024 20:16:10 +1030 Subject: [PATCH 0452/1406] btrfs: introduce btrfs_alloc_folio_array() The new helper would do the same thing as btrfs_alloc_page_array(), but with 
folios. One extra difference is that there is no helper for bulk allocation, thus it may not be as efficient as the page version. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 32 ++++++++++++++++++++++++++++++++ fs/btrfs/extent_io.h | 2 ++ 2 files changed, 34 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d02b7672906e50..7449e0864b7806 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -666,6 +666,38 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) bio_put(bio); } +/* + * Populate every free slot in a provided array with folios. + * + * @nr_folios: number of folios to allocate + * @folio_array: the array to fill with folios; any existing non-null entries in + * the array will be skipped + * @extra_gfp: the extra GFP flags for the allocation. + * + * Return: 0 if all folios were able to be allocated; + * -ENOMEM otherwise, the partially allocated folios would be freed and + * the array slots zeroed + */ +int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array, + gfp_t extra_gfp) +{ + for (int i = 0; i < nr_folios; i++) { + if (folio_array[i]) + continue; + folio_array[i] = folio_alloc(GFP_NOFS | extra_gfp, 0); + if (!folio_array[i]) + goto error; + } + return 0; +error: + for (int i = 0; i < nr_folios; i++) { + if (folio_array[i]) + folio_put(folio_array[i]); + folio_array[i] = NULL; + } + return -ENOMEM; +} + /* * Populate every free slot in a provided array with pages. * diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index e3530d427e1f9f..ab5f7df29f120d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -361,6 +361,8 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, gfp_t extra_gfp); +int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array, + gfp_t extra_gfp); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, From 10ca718b71df9b065dcbf9d7ff3854e49c0ae059 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 29 Jan 2024 20:16:11 +1030 Subject: [PATCH 0453/1406] btrfs: compression: migrate compression/decompression paths to folios For both compression and decompression paths, btrfs always requires a "struct page **pages" and an "unsigned long nr_pages"; this involves quite a few parts of the btrfs compression paths: - All the compression entrances - compressed_bio structure This affects both compression and decompression. - async_extent structure Unfortunately with all those involved parts, there is no good way to split the conversion into smaller patches while keeping everything compiling. So this patch does the large conversion in one go. Please note this is a pure page->folio conversion, with no change to the page sized folio requirement yet.
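For readers following along, the mechanical pattern applied throughout is roughly the following (a sketch for illustration, not taken from the diff):

	/* Before: page based. */
	struct page *page = alloc_page(GFP_NOFS);
	char *kaddr = kmap_local_page(page);
	/* ... use the mapping ... */
	kunmap_local(kaddr);
	put_page(page);

	/* After: order-0 folio based, same behavior. */
	struct folio *folio = folio_alloc(GFP_NOFS, 0);
	char *faddr = kmap_local_folio(folio, 0);
	/* ... use the mapping ... */
	kunmap_local(faddr);
	folio_put(folio);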
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/compression.c | 89 ++++++++++++++++----------------- fs/btrfs/compression.h | 38 +++++++-------- fs/btrfs/inode.c | 108 ++++++++++++++++++++--------------------- fs/btrfs/lzo.c | 84 ++++++++++++++++---------------- fs/btrfs/zlib.c | 104 +++++++++++++++++++-------------------- fs/btrfs/zstd.c | 74 ++++++++++++++-------------- 6 files changed, 249 insertions(+), 248 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 07a1393f0a6762..781654b74307ac 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -90,20 +90,20 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len) } static int compression_compress_pages(int type, struct list_head *ws, - struct address_space *mapping, u64 start, struct page **pages, - unsigned long *out_pages, unsigned long *total_in, - unsigned long *total_out) + struct address_space *mapping, u64 start, struct folio **folios, + unsigned long *out_folios, unsigned long *total_in, + unsigned long *total_out) { switch (type) { case BTRFS_COMPRESS_ZLIB: - return zlib_compress_pages(ws, mapping, start, pages, - out_pages, total_in, total_out); + return zlib_compress_folios(ws, mapping, start, folios, + out_folios, total_in, total_out); case BTRFS_COMPRESS_LZO: - return lzo_compress_pages(ws, mapping, start, pages, - out_pages, total_in, total_out); + return lzo_compress_folios(ws, mapping, start, folios, + out_folios, total_in, total_out); case BTRFS_COMPRESS_ZSTD: - return zstd_compress_pages(ws, mapping, start, pages, - out_pages, total_in, total_out); + return zstd_compress_folios(ws, mapping, start, folios, + out_folios, total_in, total_out); case BTRFS_COMPRESS_NONE: default: /* @@ -115,7 +115,7 @@ static int compression_compress_pages(int type, struct list_head *ws, * Not a big deal, just need to inform caller that we * haven't allocated any pages yet. 
*/ - *out_pages = 0; + *out_folios = 0; return -E2BIG; } } @@ -158,11 +158,11 @@ static int compression_decompress(int type, struct list_head *ws, } } -static void btrfs_free_compressed_pages(struct compressed_bio *cb) +static void btrfs_free_compressed_folios(struct compressed_bio *cb) { - for (unsigned int i = 0; i < cb->nr_pages; i++) - btrfs_free_compr_folio(page_folio(cb->compressed_pages[i])); - kfree(cb->compressed_pages); + for (unsigned int i = 0; i < cb->nr_folios; i++) + btrfs_free_compr_folio(cb->compressed_folios[i]); + kfree(cb->compressed_folios); } static int btrfs_decompress_bio(struct compressed_bio *cb); @@ -269,7 +269,7 @@ static void end_bbio_comprssed_read(struct btrfs_bio *bbio) if (!status) status = errno_to_blk_status(btrfs_decompress_bio(cb)); - btrfs_free_compressed_pages(cb); + btrfs_free_compressed_folios(cb); btrfs_bio_end_io(cb->orig_bbio, status); bio_put(&bbio->bio); } @@ -323,7 +323,7 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) end_compressed_writeback(cb); /* Note, our inode could be gone now */ - btrfs_free_compressed_pages(cb); + btrfs_free_compressed_folios(cb); bio_put(&cb->bbio.bio); } @@ -342,17 +342,18 @@ static void end_bbio_comprssed_write(struct btrfs_bio *bbio) queue_work(fs_info->compressed_write_workers, &cb->write_end_work); } -static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb) +static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb) { struct bio *bio = &cb->bbio.bio; u32 offset = 0; while (offset < cb->compressed_len) { + int ret; u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE); /* Maximum compressed extent is smaller than bio size limit. */ - __bio_add_page(bio, cb->compressed_pages[offset >> PAGE_SHIFT], - len, 0); + ret = bio_add_folio(bio, cb->compressed_folios[offset >> PAGE_SHIFT], len, 0); + ASSERT(ret); offset += len; } } @@ -367,8 +368,8 @@ static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb) * the end io hooks. 
*/ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, - struct page **compressed_pages, - unsigned int nr_pages, + struct folio **compressed_folios, + unsigned int nr_folios, blk_opf_t write_flags, bool writeback) { @@ -384,14 +385,14 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, end_bbio_comprssed_write); cb->start = ordered->file_offset; cb->len = ordered->num_bytes; - cb->compressed_pages = compressed_pages; + cb->compressed_folios = compressed_folios; cb->compressed_len = ordered->disk_num_bytes; cb->writeback = writeback; INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); - cb->nr_pages = nr_pages; + cb->nr_folios = nr_folios; cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT; cb->bbio.ordered = ordered; - btrfs_add_compressed_bio_pages(cb); + btrfs_add_compressed_bio_folios(cb); btrfs_submit_bio(&cb->bbio, 0); } @@ -599,14 +600,14 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) free_extent_map(em); - cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); - cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS); - if (!cb->compressed_pages) { + cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE); + cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS); + if (!cb->compressed_folios) { ret = BLK_STS_RESOURCE; goto out_free_bio; } - ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages, 0); + ret2 = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios, 0); if (ret2) { ret = BLK_STS_RESOURCE; goto out_free_compressed_pages; @@ -618,7 +619,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) /* include any pages we added in add_ra-bio_pages */ cb->len = bbio->bio.bi_iter.bi_size; cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector; - btrfs_add_compressed_bio_pages(cb); + btrfs_add_compressed_bio_folios(cb); if (memstall) psi_memstall_leave(&pflags); @@ -627,7 +628,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) return; out_free_compressed_pages: - kfree(cb->compressed_pages); + kfree(cb->compressed_folios); out_free_bio: bio_put(&cb->bbio.bio); out: @@ -974,18 +975,18 @@ static unsigned int btrfs_compress_set_level(int type, unsigned level) return level; } -/* A wrapper around find_get_page(), with extra error message. */ -int btrfs_compress_find_get_page(struct address_space *mapping, u64 start, - struct page **in_page_ret) +/* A wrapper around filemap_get_folio(), with extra error message. */ +int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, + struct folio **in_folio_ret) { - struct page *in_page; + struct folio *in_folio; /* - * The compressed write path should have the page locked already, - * thus we only need to grab one reference of the page cache. + * The compressed write path should have the folio locked already, + * thus we only need to grab one reference.
*/ - in_page = find_get_page(mapping, start >> PAGE_SHIFT); - if (unlikely(!in_page)) { + in_folio = filemap_get_folio(mapping, start >> PAGE_SHIFT); + if (IS_ERR(in_folio)) { struct btrfs_inode *binode = BTRFS_I(mapping->host); struct btrfs_fs_info *fs_info = binode->root->fs_info; @@ -996,7 +997,7 @@ int btrfs_compress_find_get_page(struct address_space *mapping, u64 start, ASSERT(0); return -ENOENT; } - *in_page_ret = in_page; + *in_folio_ret = in_folio; return 0; } @@ -1020,9 +1021,9 @@ int btrfs_compress_find_get_page(struct address_space *mapping, u64 start, * @total_out is an in/out parameter, must be set to the input length and will * be also used to return the total number of compressed bytes */ -int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, - u64 start, struct page **pages, - unsigned long *out_pages, +int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, + u64 start, struct folio **folios, + unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { @@ -1033,8 +1034,8 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, level = btrfs_compress_set_level(type, level); workspace = get_workspace(type, level); - ret = compression_compress_pages(type, workspace, mapping, start, pages, - out_pages, total_in, total_out); + ret = compression_compress_pages(type, workspace, mapping, start, folios, + out_folios, total_in, total_out); put_workspace(type, workspace); return ret; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 2673c25415e595..a31e8fc938ac55 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -41,11 +41,11 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); #define BTRFS_ZLIB_DEFAULT_LEVEL 3 struct compressed_bio { - /* Number of compressed pages in the array */ - unsigned int nr_pages; + /* Number of compressed folios in the array */ + unsigned int nr_folios; - /* the pages with the compressed data on them */ - struct page **compressed_pages; + /* the folios with the compressed data on them */ + struct folio **compressed_folios; /* starting offset in the inode for our pages */ u64 start; @@ -85,9 +85,9 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level) int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); -int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, - u64 start, struct page **pages, - unsigned long *out_pages, +int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, + u64 start, struct folio **folios, + unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, @@ -96,10 +96,10 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, - struct page **compressed_pages, - unsigned int nr_pages, - blk_opf_t write_flags, - bool writeback); + struct folio **compressed_folios, + unsigned int nr_folios, + blk_opf_t write_flags, + bool writeback); void btrfs_submit_compressed_read(struct btrfs_bio *bbio); unsigned int btrfs_compress_str2level(unsigned int type, const char *str); @@ -149,11 +149,11 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len); int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); -int btrfs_compress_find_get_page(struct address_space *mapping, u64 
start, - struct page **in_page_ret); +int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, + struct folio **in_folio_ret); -int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, +int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, @@ -163,8 +163,8 @@ struct list_head *zlib_alloc_workspace(unsigned int level); void zlib_free_workspace(struct list_head *ws); struct list_head *zlib_get_workspace(unsigned int level); -int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, +int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, @@ -173,8 +173,8 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, struct list_head *lzo_alloc_workspace(unsigned int level); void lzo_free_workspace(struct list_head *ws); -int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, +int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 03bf9195fb2273..df55dd89113755 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -708,8 +708,8 @@ struct async_extent { u64 start; u64 ram_size; u64 compressed_size; - struct page **pages; - unsigned long nr_pages; + struct folio **folios; + unsigned long nr_folios; int compress_type; struct list_head list; }; @@ -734,8 +734,8 @@ struct async_cow { static noinline int add_async_extent(struct async_chunk *cow, u64 start, u64 ram_size, u64 compressed_size, - struct page **pages, - unsigned long nr_pages, + struct folio **folios, + unsigned long nr_folios, int compress_type) { struct async_extent *async_extent; @@ -746,8 +746,8 @@ static noinline int add_async_extent(struct async_chunk *cow, async_extent->start = start; async_extent->ram_size = ram_size; async_extent->compressed_size = compressed_size; - async_extent->pages = pages; - async_extent->nr_pages = nr_pages; + async_extent->folios = folios; + async_extent->nr_folios = nr_folios; async_extent->compress_type = compress_type; list_add_tail(&async_extent->list, &cow->extents); return 0; @@ -851,8 +851,8 @@ static void compress_file_range(struct btrfs_work *work) u64 actual_end; u64 i_size; int ret = 0; - struct page **pages; - unsigned long nr_pages; + struct folio **folios; + unsigned long nr_folios; unsigned long total_compressed = 0; unsigned long total_in = 0; unsigned int poff; @@ -882,9 +882,9 @@ static void compress_file_range(struct btrfs_work *work) barrier(); actual_end = min_t(u64, i_size, end + 1); again: - pages = NULL; - nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; - 
nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES); + folios = NULL; + nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; + nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES); /* * we don't want to send crud past the end of i_size through @@ -933,8 +933,8 @@ static void compress_file_range(struct btrfs_work *work) if (!inode_need_compress(inode, start, end)) goto cleanup_and_bail_uncompressed; - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) { + folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS); + if (!folios) { /* * Memory allocation failure is not a fatal error, we can fall * back to uncompressed code. @@ -948,8 +948,8 @@ static void compress_file_range(struct btrfs_work *work) compress_type = inode->prop_compress; /* Compression level is applied here. */ - ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4), - mapping, start, pages, &nr_pages, &total_in, + ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4), + mapping, start, folios, &nr_folios, &total_in, &total_compressed); if (ret) goto mark_incompressible; @@ -960,7 +960,7 @@ static void compress_file_range(struct btrfs_work *work) */ poff = offset_in_page(total_compressed); if (poff) - memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff); + folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff); /* * Try to create an inline extent. @@ -979,8 +979,7 @@ static void compress_file_range(struct btrfs_work *work) } else { ret = cow_file_range_inline(inode, actual_end, total_compressed, - compress_type, - page_folio(pages[0]), + compress_type, folios[0], false); } if (ret <= 0) { @@ -1030,8 +1029,8 @@ static void compress_file_range(struct btrfs_work *work) * The async work queues will take care of doing actual allocation on * disk for these compressed pages, and will submit the bios. 
*/ - ret = add_async_extent(async_chunk, start, total_in, total_compressed, pages, - nr_pages, compress_type); + ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios, + nr_folios, compress_type); BUG_ON(ret); if (start + total_in < end) { start += total_in; @@ -1048,12 +1047,12 @@ static void compress_file_range(struct btrfs_work *work) BTRFS_COMPRESS_NONE); BUG_ON(ret); free_pages: - if (pages) { - for (i = 0; i < nr_pages; i++) { - WARN_ON(pages[i]->mapping); - btrfs_free_compr_folio(page_folio(pages[i])); + if (folios) { + for (i = 0; i < nr_folios; i++) { + WARN_ON(folios[i]->mapping); + btrfs_free_compr_folio(folios[i]); } - kfree(pages); + kfree(folios); } } @@ -1061,16 +1060,16 @@ static void free_async_extent_pages(struct async_extent *async_extent) { int i; - if (!async_extent->pages) + if (!async_extent->folios) return; - for (i = 0; i < async_extent->nr_pages; i++) { - WARN_ON(async_extent->pages[i]->mapping); - btrfs_free_compr_folio(page_folio(async_extent->pages[i])); + for (i = 0; i < async_extent->nr_folios; i++) { + WARN_ON(async_extent->folios[i]->mapping); + btrfs_free_compr_folio(async_extent->folios[i]); } - kfree(async_extent->pages); - async_extent->nr_pages = 0; - async_extent->pages = NULL; + kfree(async_extent->folios); + async_extent->nr_folios = 0; + async_extent->folios = NULL; } static void submit_uncompressed_range(struct btrfs_inode *inode, @@ -1194,8 +1193,8 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_START_WRITEBACK); btrfs_submit_compressed_write(ordered, - async_extent->pages, /* compressed_pages */ - async_extent->nr_pages, + async_extent->folios, /* compressed_folios */ + async_extent->nr_folios, async_chunk->write_flags, true); *alloc_hint = ins.objectid + ins.offset; done: @@ -10294,8 +10293,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, size_t orig_count; u64 start, end; u64 num_bytes, ram_bytes, disk_num_bytes; - unsigned long nr_pages, i; - struct page **pages; + unsigned long nr_folios, i; + struct folio **folios; struct btrfs_key ins; bool extent_reserved = false; struct extent_map *em; @@ -10384,24 +10383,24 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, * isn't. 
*/ disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); - nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); - pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT); - if (!pages) + nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); + folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT); + if (!folios) return -ENOMEM; - for (i = 0; i < nr_pages; i++) { + for (i = 0; i < nr_folios; i++) { size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); char *kaddr; - pages[i] = alloc_page(GFP_KERNEL_ACCOUNT); - if (!pages[i]) { + folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0); + if (!folios[i]) { ret = -ENOMEM; - goto out_pages; + goto out_folios; } - kaddr = kmap_local_page(pages[i]); + kaddr = kmap_local_folio(folios[i], 0); if (copy_from_iter(kaddr, bytes, from) != bytes) { kunmap_local(kaddr); ret = -EFAULT; - goto out_pages; + goto out_folios; } if (bytes < PAGE_SIZE) memset(kaddr + bytes, 0, PAGE_SIZE - bytes); @@ -10413,12 +10412,12 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes); if (ret) - goto out_pages; + goto out_folios; ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); if (ret) - goto out_pages; + goto out_folios; lock_extent(io_tree, start, end, &cached_state); ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); if (!ordered && @@ -10449,8 +10448,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (start == 0 && encoded->unencoded_len == encoded->len && encoded->unencoded_offset == 0) { ret = cow_file_range_inline(inode, encoded->len, orig_count, - compression, page_folio(pages[0]), - true); + compression, folios[0], true); if (ret <= 0) { if (ret == 0) ret = orig_count; @@ -10494,7 +10492,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, btrfs_delalloc_release_extents(inode, num_bytes); - btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false); + btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false); ret = orig_count; goto out; @@ -10516,12 +10514,12 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); out_unlock: unlock_extent(io_tree, start, end, &cached_state); -out_pages: - for (i = 0; i < nr_pages; i++) { - if (pages[i]) - __free_page(pages[i]); +out_folios: + for (i = 0; i < nr_folios; i++) { + if (folios[i]) + __folio_put(folios[i]); } - kvfree(pages); + kvfree(folios); out: if (ret >= 0) iocb->ki_pos += encoded->len; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index c0e8894c972756..e0fe2bf27bc71f 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -130,17 +130,17 @@ static inline size_t read_compress_length(const char *buf) */ static int copy_compressed_data_to_page(char *compressed_data, size_t compressed_size, - struct page **out_pages, - unsigned long max_nr_page, + struct folio **out_folios, + unsigned long max_nr_folio, u32 *cur_out, const u32 sectorsize) { u32 sector_bytes_left; u32 orig_out; - struct page *cur_page; + struct folio *cur_folio; char *kaddr; - if ((*cur_out / PAGE_SIZE) >= max_nr_page) + if ((*cur_out / PAGE_SIZE) >= max_nr_folio) return -E2BIG; /* @@ -149,16 +149,16 @@ static int copy_compressed_data_to_page(char *compressed_data, */ ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize); - cur_page = out_pages[*cur_out / PAGE_SIZE]; + cur_folio =
out_folios[*cur_out / PAGE_SIZE]; /* Allocate a new page */ - if (!cur_page) { - cur_page = folio_page(btrfs_alloc_compr_folio(), 0); - if (!cur_page) + if (!cur_folio) { + cur_folio = btrfs_alloc_compr_folio(); + if (!cur_folio) return -ENOMEM; - out_pages[*cur_out / PAGE_SIZE] = cur_page; + out_folios[*cur_out / PAGE_SIZE] = cur_folio; } - kaddr = kmap_local_page(cur_page); + kaddr = kmap_local_folio(cur_folio, 0); write_compress_length(kaddr + offset_in_page(*cur_out), compressed_size); *cur_out += LZO_LEN; @@ -172,18 +172,18 @@ static int copy_compressed_data_to_page(char *compressed_data, kunmap_local(kaddr); - if ((*cur_out / PAGE_SIZE) >= max_nr_page) + if ((*cur_out / PAGE_SIZE) >= max_nr_folio) return -E2BIG; - cur_page = out_pages[*cur_out / PAGE_SIZE]; + cur_folio = out_folios[*cur_out / PAGE_SIZE]; /* Allocate a new page */ - if (!cur_page) { - cur_page = folio_page(btrfs_alloc_compr_folio(), 0); - if (!cur_page) + if (!cur_folio) { + cur_folio = btrfs_alloc_compr_folio(); + if (!cur_folio) return -ENOMEM; - out_pages[*cur_out / PAGE_SIZE] = cur_page; + out_folios[*cur_out / PAGE_SIZE] = cur_folio; } - kaddr = kmap_local_page(cur_page); + kaddr = kmap_local_folio(cur_folio, 0); memcpy(kaddr + offset_in_page(*cur_out), compressed_data + *cur_out - orig_out, copy_len); @@ -209,15 +209,15 @@ static int copy_compressed_data_to_page(char *compressed_data, return 0; } -int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, +int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); const u32 sectorsize = inode_to_fs_info(mapping->host)->sectorsize; - struct page *page_in = NULL; + struct folio *folio_in = NULL; char *sizes_ptr; - const unsigned long max_nr_page = *out_pages; + const unsigned long max_nr_folio = *out_folios; int ret = 0; /* Points to the file offset of input data */ u64 cur_in = start; @@ -225,8 +225,8 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, u32 cur_out = 0; u32 len = *total_out; - ASSERT(max_nr_page > 0); - *out_pages = 0; + ASSERT(max_nr_folio > 0); + *out_folios = 0; *total_out = 0; *total_in = 0; @@ -243,8 +243,8 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, size_t out_len; /* Get the input page first */ - if (!page_in) { - ret = btrfs_compress_find_get_page(mapping, cur_in, &page_in); + if (!folio_in) { + ret = btrfs_compress_filemap_get_folio(mapping, cur_in, &folio_in); if (ret < 0) goto out; } @@ -252,7 +252,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, /* Compress at most one sector of data each time */ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); ASSERT(in_len); - data_in = kmap_local_page(page_in); + data_in = kmap_local_folio(folio_in, 0); ret = lzo1x_1_compress(data_in + offset_in_page(cur_in), in_len, workspace->cbuf, &out_len, @@ -265,7 +265,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, } ret = copy_compressed_data_to_page(workspace->cbuf, out_len, - pages, max_nr_page, + folios, max_nr_folio, &cur_out, sectorsize); if (ret < 0) goto out; @@ -283,13 +283,13 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, /* Check if we have reached page boundary */ if (PAGE_ALIGNED(cur_in)) { - 
put_page(page_in); - page_in = NULL; + folio_put(folio_in); + folio_in = NULL; } } /* Store the size of all chunks of compressed data */ - sizes_ptr = kmap_local_page(pages[0]); + sizes_ptr = kmap_local_folio(folios[0], 0); write_compress_length(sizes_ptr, cur_out); kunmap_local(sizes_ptr); @@ -297,9 +297,9 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, *total_out = cur_out; *total_in = cur_in - start; out: - if (page_in) - put_page(page_in); - *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE); + if (folio_in) + folio_put(folio_in); + *out_folios = DIV_ROUND_UP(cur_out, PAGE_SIZE); return ret; } @@ -314,15 +314,15 @@ static void copy_compressed_segment(struct compressed_bio *cb, u32 orig_in = *cur_in; while (*cur_in < orig_in + len) { - struct page *cur_page; + struct folio *cur_folio; u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), orig_in + len - *cur_in); ASSERT(copy_len); - cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE]; + cur_folio = cb->compressed_folios[*cur_in / PAGE_SIZE]; - memcpy_from_page(dest + *cur_in - orig_in, cur_page, - offset_in_page(*cur_in), copy_len); + memcpy_from_folio(dest + *cur_in - orig_in, cur_folio, + offset_in_folio(cur_folio, *cur_in), copy_len); *cur_in += copy_len; } @@ -342,7 +342,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* Bytes decompressed so far */ u32 cur_out = 0; - kaddr = kmap_local_page(cb->compressed_pages[0]); + kaddr = kmap_local_folio(cb->compressed_folios[0], 0); len_in = read_compress_length(kaddr); kunmap_local(kaddr); cur_in += LZO_LEN; @@ -364,7 +364,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* Go through each lzo segment */ while (cur_in < len_in) { - struct page *cur_page; + struct folio *cur_folio; /* Length of the compressed segment */ u32 seg_len; u32 sector_bytes_left; @@ -376,9 +376,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) */ ASSERT(cur_in / sectorsize == (cur_in + LZO_LEN - 1) / sectorsize); - cur_page = cb->compressed_pages[cur_in / PAGE_SIZE]; - ASSERT(cur_page); - kaddr = kmap_local_page(cur_page); + cur_folio = cb->compressed_folios[cur_in / PAGE_SIZE]; + ASSERT(cur_folio); + kaddr = kmap_local_folio(cur_folio, 0); seg_len = read_compress_length(kaddr + offset_in_page(cur_in)); kunmap_local(kaddr); cur_in += LZO_LEN; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index c260e42b0a3399..1a4de77ba46d77 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -91,24 +91,24 @@ struct list_head *zlib_alloc_workspace(unsigned int level) return ERR_PTR(-ENOMEM); } -int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, +int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret; char *data_in = NULL; - char *cpage_out; - int nr_pages = 0; - struct page *in_page = NULL; - struct page *out_page = NULL; + char *cfolio_out; + int nr_folios = 0; + struct folio *in_folio = NULL; + struct folio *out_folio = NULL; unsigned long bytes_left; - unsigned int in_buf_pages; + unsigned int in_buf_folios; unsigned long len = *total_out; - unsigned long nr_dest_pages = *out_pages; - const unsigned long max_out = nr_dest_pages * PAGE_SIZE; + unsigned long nr_dest_folios = *out_folios; + const unsigned long 
max_out = nr_dest_folios * PAGE_SIZE; - *out_pages = 0; + *out_folios = 0; *total_out = 0; *total_in = 0; @@ -121,18 +121,18 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, workspace->strm.total_in = 0; workspace->strm.total_out = 0; - out_page = folio_page(btrfs_alloc_compr_folio(), 0); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); - pages[0] = out_page; - nr_pages = 1; + cfolio_out = folio_address(out_folio); + folios[0] = out_folio; + nr_folios = 1; workspace->strm.next_in = workspace->buf; workspace->strm.avail_in = 0; - workspace->strm.next_out = cpage_out; + workspace->strm.next_out = cfolio_out; workspace->strm.avail_out = PAGE_SIZE; while (workspace->strm.total_in < len) { @@ -142,22 +142,22 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, */ if (workspace->strm.avail_in == 0) { bytes_left = len - workspace->strm.total_in; - in_buf_pages = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE), - workspace->buf_size / PAGE_SIZE); - if (in_buf_pages > 1) { + in_buf_folios = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE), + workspace->buf_size / PAGE_SIZE); + if (in_buf_folios > 1) { int i; - for (i = 0; i < in_buf_pages; i++) { + for (i = 0; i < in_buf_folios; i++) { if (data_in) { kunmap_local(data_in); - put_page(in_page); + folio_put(in_folio); data_in = NULL; } - ret = btrfs_compress_find_get_page(mapping, - start, &in_page); + ret = btrfs_compress_filemap_get_folio(mapping, + start, &in_folio); if (ret < 0) goto out; - data_in = kmap_local_page(in_page); + data_in = kmap_local_folio(in_folio, 0); copy_page(workspace->buf + i * PAGE_SIZE, data_in); start += PAGE_SIZE; @@ -166,14 +166,14 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, } else { if (data_in) { kunmap_local(data_in); - put_page(in_page); + folio_put(in_folio); data_in = NULL; } - ret = btrfs_compress_find_get_page(mapping, - start, &in_page); + ret = btrfs_compress_filemap_get_folio(mapping, + start, &in_folio); if (ret < 0) goto out; - data_in = kmap_local_page(in_page); + data_in = kmap_local_folio(in_folio, 0); start += PAGE_SIZE; workspace->strm.next_in = data_in; } @@ -202,20 +202,20 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, * the stream end if required */ if (workspace->strm.avail_out == 0) { - if (nr_pages == nr_dest_pages) { + if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_page = folio_page(btrfs_alloc_compr_folio(), 0); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); - pages[nr_pages] = out_page; - nr_pages++; + cfolio_out = folio_address(out_folio); + folios[nr_folios] = out_folio; + nr_folios++; workspace->strm.avail_out = PAGE_SIZE; - workspace->strm.next_out = cpage_out; + workspace->strm.next_out = cfolio_out; } /* we're all done */ if (workspace->strm.total_in >= len) @@ -237,21 +237,21 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -EIO; goto out; } else if (workspace->strm.avail_out == 0) { - /* get another page for the stream end */ - if (nr_pages == nr_dest_pages) { + /* get another folio for the stream end */ + if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_page = folio_page(btrfs_alloc_compr_folio(), 0); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == 
NULL) { ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); - pages[nr_pages] = out_page; - nr_pages++; + cfolio_out = folio_address(out_folio); + folios[nr_folios] = out_folio; + nr_folios++; workspace->strm.avail_out = PAGE_SIZE; - workspace->strm.next_out = cpage_out; + workspace->strm.next_out = cfolio_out; } } zlib_deflateEnd(&workspace->strm); @@ -265,10 +265,10 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, *total_out = workspace->strm.total_out; *total_in = workspace->strm.total_in; out: - *out_pages = nr_pages; + *out_folios = nr_folios; if (data_in) { kunmap_local(data_in); - put_page(in_page); + folio_put(in_folio); } return ret; @@ -281,13 +281,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) int wbits = MAX_WBITS; char *data_in; size_t total_out = 0; - unsigned long page_in_index = 0; + unsigned long folio_in_index = 0; size_t srclen = cb->compressed_len; - unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; - struct page **pages_in = cb->compressed_pages; + struct folio **folios_in = cb->compressed_folios; - data_in = kmap_local_page(pages_in[page_in_index]); + data_in = kmap_local_folio(folios_in[folio_in_index], 0); workspace->strm.next_in = data_in; workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); workspace->strm.total_in = 0; @@ -337,12 +337,12 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->strm.avail_in == 0) { unsigned long tmp; kunmap_local(data_in); - page_in_index++; - if (page_in_index >= total_pages_in) { + folio_in_index++; + if (folio_in_index >= total_folios_in) { data_in = NULL; break; } - data_in = kmap_local_page(pages_in[page_in_index]); + data_in = kmap_local_folio(folios_in[folio_in_index], 0); workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; workspace->strm.avail_in = min(tmp, PAGE_SIZE); diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 4ec5dd84b93e33..c018f0fab349ad 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -374,25 +374,25 @@ struct list_head *zstd_alloc_workspace(unsigned int level) return ERR_PTR(-ENOMEM); } -int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, +int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); zstd_cstream *stream; int ret = 0; - int nr_pages = 0; - struct page *in_page = NULL; /* The current page to read */ - struct page *out_page = NULL; /* The current page to write to */ + int nr_folios = 0; + struct folio *in_folio = NULL; /* The current page to read */ + struct folio *out_folio = NULL; /* The current page to write to */ unsigned long tot_in = 0; unsigned long tot_out = 0; unsigned long len = *total_out; - const unsigned long nr_dest_pages = *out_pages; - unsigned long max_out = nr_dest_pages * PAGE_SIZE; + const unsigned long nr_dest_folios = *out_folios; + unsigned long max_out = nr_dest_folios * PAGE_SIZE; zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level, len); - *out_pages = 0; + *out_folios = 0; *total_out = 0; *total_in = 0; @@ -406,21 +406,21 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, } /* map in 
the first page of input data */ - ret = btrfs_compress_find_get_page(mapping, start, &in_page); + ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - workspace->in_buf.src = kmap_local_page(in_page); + workspace->in_buf.src = kmap_local_folio(in_folio, 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); /* Allocate and map in the output buffer */ - out_page = folio_page(btrfs_alloc_compr_folio(), 0); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + folios[nr_folios++] = out_folio; + workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -455,17 +455,17 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, if (workspace->out_buf.pos == workspace->out_buf.size) { tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; - if (nr_pages == nr_dest_pages) { + if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_page = folio_page(btrfs_alloc_compr_folio(), 0); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + folios[nr_folios++] = out_folio; + workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -482,13 +482,14 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, tot_in += PAGE_SIZE; kunmap_local(workspace->in_buf.src); workspace->in_buf.src = NULL; - put_page(in_page); + folio_put(in_folio); start += PAGE_SIZE; len -= PAGE_SIZE; - ret = btrfs_compress_find_get_page(mapping, start, &in_page); + ret = btrfs_compress_filemap_get_folio(mapping, start, + &in_folio); if (ret < 0) goto out; - workspace->in_buf.src = kmap_local_page(in_page); + workspace->in_buf.src = kmap_local_folio(in_folio, 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); } @@ -515,17 +516,17 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; - if (nr_pages == nr_dest_pages) { + if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_page = folio_page(btrfs_alloc_compr_folio(), 0); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + folios[nr_folios++] = out_folio; + workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); } @@ -539,10 +540,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, *total_in = tot_in; *total_out = tot_out; out: - *out_pages = nr_pages; + *out_folios = nr_folios; if (workspace->in_buf.src) { kunmap_local(workspace->in_buf.src); - put_page(in_page); + folio_put(in_folio); } return ret; } @@ -550,12 +551,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); - struct page **pages_in = cb->compressed_pages; + struct folio **folios_in = 
cb->compressed_folios; size_t srclen = cb->compressed_len; zstd_dstream *stream; int ret = 0; - unsigned long page_in_index = 0; - unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long folio_in_index = 0; + unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; unsigned long total_out = 0; @@ -567,7 +568,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } - workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]); + workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); @@ -604,14 +605,15 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->in_buf.pos == workspace->in_buf.size) { kunmap_local(workspace->in_buf.src); - page_in_index++; - if (page_in_index >= total_pages_in) { + folio_in_index++; + if (folio_in_index >= total_folios_in) { workspace->in_buf.src = NULL; ret = -EIO; goto done; } srclen -= PAGE_SIZE; - workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]); + workspace->in_buf.src = + kmap_local_folio(folios_in[folio_in_index], 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); } From ae4a7bb03e7a47e8617137398632b3b1fdace2c9 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Mon, 5 Feb 2024 22:01:16 +0900 Subject: [PATCH 0454/1406] btrfs: introduce offload_csum_mode to tweak checksum offloading behavior We disable offloading checksum to workqueues and do it synchronously when the checksum algorithm is fast. However, as reported in the link below, RAID0 with multiple devices may suffer from the sync checksum, because even a "fast checksum" cannot keep up with RAID0 writes. To measure the effectiveness of sync checksum and checksum offloading for developers, it would be better to have a switch for checksum offloading hidden behind CONFIG_BTRFS_DEBUG. This commit introduces fs_devices->offload_csum_mode for CONFIG_BTRFS_DEBUG, so that a btrfs developer can change the behavior by writing to /sys/fs/btrfs//offload_csum. The default is "auto", which is the same as the previous behavior. Alternatively, you can set "on" or "off" (or "y" or "n", or whatever else kstrtobool() accepts) to always/never offload checksums. More benchmarks should be collected with this knob to work out proper criteria for enabling/disabling checksum offloading.
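As an illustration of the intended workflow (the <fsid> path component below stands in for the filesystem UUID, which is left elided above):

  # cat /sys/fs/btrfs/<fsid>/offload_csum
  auto
  # echo off > /sys/fs/btrfs/<fsid>/offload_csum
  # cat /sys/fs/btrfs/<fsid>/offload_csum
  0

Per the show callback below, the forced modes read back as "1"/"0", while the default reads back as "auto".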
Link: https://lore.kernel.org/linux-btrfs/20230731152223.4EFB.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/p3vo3g7pqn664mhmdhlotu5dzcna6vjtcoc2hb2lsgo2fwct7k@xzaxclba5tae/ Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/bio.c | 14 +++++++++++++- fs/btrfs/sysfs.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 24 ++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 960b81718e2958..477f350a8bd09e 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -608,8 +608,20 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) static bool should_async_write(struct btrfs_bio *bbio) { + bool auto_csum_mode = true; + +#ifdef CONFIG_BTRFS_DEBUG + struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices; + enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode); + + if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_OFF) + return false; + + auto_csum_mode = (csum_mode == BTRFS_OFFLOAD_CSUM_AUTO); +#endif + /* Submit synchronously if the checksum implementation is fast. */ - if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags)) + if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags)) return false; /* diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 21586ecc35bf9c..7248b966bf24d9 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1307,6 +1307,47 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, btrfs_bg_reclaim_threshold_store); +#ifdef CONFIG_BTRFS_DEBUG +static ssize_t btrfs_offload_csum_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + + switch (READ_ONCE(fs_devices->offload_csum_mode)) { + case BTRFS_OFFLOAD_CSUM_AUTO: + return sysfs_emit(buf, "auto\n"); + case BTRFS_OFFLOAD_CSUM_FORCE_ON: + return sysfs_emit(buf, "1\n"); + case BTRFS_OFFLOAD_CSUM_FORCE_OFF: + return sysfs_emit(buf, "0\n"); + default: + WARN_ON(1); + return -EINVAL; + } +} + +static ssize_t btrfs_offload_csum_store(struct kobject *kobj, + struct kobj_attribute *a, const char *buf, + size_t len) +{ + struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + int ret; + bool val; + + ret = kstrtobool(buf, &val); + if (ret == 0) + WRITE_ONCE(fs_devices->offload_csum_mode, + val ? BTRFS_OFFLOAD_CSUM_FORCE_ON : BTRFS_OFFLOAD_CSUM_FORCE_OFF); + else if (ret == -EINVAL && sysfs_streq(buf, "auto")) + WRITE_ONCE(fs_devices->offload_csum_mode, BTRFS_OFFLOAD_CSUM_AUTO); + else + return -EINVAL; + + return len; +} +BTRFS_ATTR_RW(, offload_csum, btrfs_offload_csum_show, btrfs_offload_csum_store); +#endif + /* * Per-filesystem information and stats. * @@ -1326,6 +1367,9 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), BTRFS_ATTR_PTR(, temp_fsid), +#ifdef CONFIG_BTRFS_DEBUG + BTRFS_ATTR_PTR(, offload_csum), +#endif NULL, }; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 21d4de0e3f1f5b..055e095c2f6117 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -291,6 +291,25 @@ enum btrfs_read_policy { BTRFS_NR_READ_POLICY, }; +#ifdef CONFIG_BTRFS_DEBUG +/* + * Checksum mode - offload it to workqueues or do it synchronously in + * btrfs_submit_chunk(). + */ +enum btrfs_offload_csum_mode { + /* + * Choose offloading checksum or do it synchronously automatically. 
+ * Do it synchronously if the checksum is fast, or offload to workqueues + * otherwise. + */ + BTRFS_OFFLOAD_CSUM_AUTO, + /* Always offload checksum to workqueues. */ + BTRFS_OFFLOAD_CSUM_FORCE_ON, + /* Never offload checksum to workqueues. */ + BTRFS_OFFLOAD_CSUM_FORCE_OFF, +}; +#endif + struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ @@ -395,6 +414,11 @@ struct btrfs_fs_devices { /* Policy used to read the mirrored stripes. */ enum btrfs_read_policy read_policy; + +#ifdef CONFIG_BTRFS_DEBUG + /* Checksum mode - offload it or do it synchronously. */ + enum btrfs_offload_csum_mode offload_csum_mode; +#endif }; #define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ From 5b30dcfa8fc27f392eeaf0859bee56826254b9eb Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 2 Feb 2024 15:12:43 -0800 Subject: [PATCH 0455/1406] btrfs: report reclaim count in sysfs When evaluating various reclaim strategies/thresholds against each other, it is useful to collect data about the amount of reclaim happening. Expose it via sysfs per space_info. Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 3 +++ fs/btrfs/space-info.h | 6 ++++++ fs/btrfs/sysfs.c | 2 ++ 3 files changed, 11 insertions(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 58f2e8951dbf23..19abb6fd93a30c 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1912,6 +1912,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) div64_u64(bg->used * 100, bg->length), div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); + spin_lock(&space_info->lock); + space_info->reclaim_count++; + spin_unlock(&space_info->lock); ret = btrfs_relocate_chunk(fs_info, bg->start); if (ret) { btrfs_dec_block_group_ro(bg); diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index a733458fd13b35..cd3fa503561f38 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -165,6 +165,12 @@ struct btrfs_space_info { struct kobject kobj; struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; + + /* + * Monotonically increasing counter of relocated block groups. + * Exposed in /sys/fs//allocation//reclaim_count + */ + u64 reclaim_count; }; struct reserve_ticket { diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 7248b966bf24d9..d60f44dee8faea 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -894,6 +894,7 @@ SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); +SPACE_INFO_ATTR(reclaim_count); BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store); BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show); @@ -949,6 +950,7 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold), BTRFS_ATTR_PTR(space_info, chunk_size), BTRFS_ATTR_PTR(space_info, size_classes), + BTRFS_ATTR_PTR(space_info, reclaim_count), #ifdef CONFIG_BTRFS_DEBUG BTRFS_ATTR_PTR(space_info, force_chunk_alloc), #endif From 070172b6776fc1b8ee27bf9a1ecc9a626cb0ab2a Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 2 Feb 2024 15:12:44 -0800 Subject: [PATCH 0456/1406] btrfs: store fs_info on space_info This is handy when computing space_info dynamic reclaim thresholds where we do not have access to a block group. We could add it to the various functions as a parameter, but it seems reasonable for space_info to have an fs_info pointer. 
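To make the motivation concrete, here is a minimal sketch of a hypothetical helper (not part of this patch) that has only a space_info at hand and can now reach filesystem-wide state without an extra parameter:

static u64 example_unallocated_bytes(struct btrfs_space_info *space_info)
{
	/* Illustrative only: reach fs-wide state through the back-pointer. */
	return atomic64_read(&space_info->fs_info->free_chunk_space);
}

The dynamic reclaim threshold calculation added later in this series uses exactly this pattern.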
Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 1 + fs/btrfs/space-info.h | 1 + 2 files changed, 2 insertions(+) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index a5b652c1650ad8..1477429b44e17b 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -232,6 +232,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) if (!space_info) return -ENOMEM; + space_info->fs_info = info; for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&space_info->block_groups[i]); init_rwsem(&space_info->groups_sem); diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index cd3fa503561f38..3e067503c53a69 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -94,6 +94,7 @@ enum btrfs_flush_state { }; struct btrfs_space_info { + struct btrfs_fs_info *fs_info; spinlock_t lock; u64 total_bytes; /* total bytes in the space, From ea2abf316722e781f533bba30d1777cfb9c56fa8 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 2 Feb 2024 15:12:45 -0800 Subject: [PATCH 0457/1406] btrfs: dynamic block_group reclaim threshold We can currently recover allocated block_groups by: - explicitly starting balance operations - "auto reclaim" via bg_reclaim_threshold The latter works by checking against a fixed threshold on frees. If we pass from above the threshold to below, relocation triggers and the block group will get reclaimed by the cleaner thread (assuming it is still eligible). Picking a threshold is challenging. Too high, and you end up trying to reclaim very full block_groups, which is quite costly, and you don't do reclaim on block_groups that don't get quite THAT full but could still be quite fragmented and stranding a lot of space. Too low, and you similarly miss out on reclaim even if you badly need it to avoid running out of unallocated space, if you have heavily fragmented block groups living above the threshold. No matter the threshold, it suffers from a workload that happens to bounce around that threshold, which can introduce arbitrary amounts of reclaim waste. To improve this situation, introduce a dynamic threshold. The basic idea behind this threshold is that it should be high when there is lots of unused space and little unallocated space, relative to fs size. OTOH, when either unused is low or unallocated is high, reclaim is not that important, so we can set a quite low threshold. The formula to achieve this is: (unused / allocated) * (unused / unallocated) which is also clamped to 90%, as anything fuller than that is very challenging to reclaim outright, and means the filesystem is legitimately quite full. I tested this by running it on three interesting workloads: 1. bounce allocations around X% full. 2. fill up all the way and introduce full fragmentation. 3. write in a fragmented way until the filesystem is just about full. 1. and 2. attack the weaknesses of a fixed threshold; fixed either works perfectly or fully falls apart, depending on the threshold. Dynamic always handles these cases well. 3. attacks dynamic by checking whether it is too zealous about reclaiming in conditions with low unallocated and low unused. It tends to claw back 1GiB of unallocated fairly aggressively, but not much more. Early versions of the dynamic threshold struggled on this test. Additional work could be done to intelligently ratchet up the urgency of reclaim in very low unallocated conditions. Existing mechanisms are already useless in that case anyway.
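As a worked example (the numbers are invented for illustration): with 100GiB allocated, 40GiB used and 20GiB unallocated, unused is 60GiB and the threshold is (60/100) * (60/20) = 60% * 300% = 180%, which gets clamped to the 90% maximum, so reclaim is very aggressive. With 50GiB allocated, 45GiB used and 70GiB unallocated, unused is only 5GiB and the threshold is (5/50) * (5/70) ~= 0.7%, so essentially no block group qualifies for reclaim.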
Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 18 +++++----- fs/btrfs/space-info.c | 74 ++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/space-info.h | 8 +++++ fs/btrfs/sysfs.c | 43 +++++++++++++++++++++++- 4 files changed, 134 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 19abb6fd93a30c..6f0bf4bf68a00e 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1756,24 +1756,21 @@ static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed) { - const struct btrfs_space_info *space_info = bg->space_info; - const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); + const int thresh_pct = btrfs_calc_reclaim_threshold(bg->space_info); + u64 thresh_bytes = mult_perc(bg->length, thresh_pct); const u64 new_val = bg->used; const u64 old_val = new_val + bytes_freed; - u64 thresh; - if (reclaim_thresh == 0) + if (thresh_bytes == 0) return false; - thresh = mult_perc(bg->length, reclaim_thresh); - /* * If we were below the threshold before don't reclaim, we are likely a * brand new block group and we don't want to relocate new block groups. */ - if (old_val < thresh) + if (old_val < thresh_bytes) return false; - if (new_val >= thresh) + if (new_val >= thresh_bytes) return false; return true; } @@ -1833,6 +1830,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) /* Don't race with allocators so take the groups_sem */ down_write(&space_info->groups_sem); + spin_lock(&space_info->lock); spin_lock(&bg->lock); if (bg->reserved || bg->pinned || bg->ro) { /* @@ -1842,6 +1840,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) * this block group. */ spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); up_write(&space_info->groups_sem); goto next; } @@ -1860,6 +1859,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) btrfs_mark_bg_unused(bg); spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); up_write(&space_info->groups_sem); goto next; @@ -1876,10 +1876,12 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) */ if (!should_reclaim_block_group(bg, bg->length)) { spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); up_write(&space_info->groups_sem); goto next; } spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); /* * Get out fast, in case we're read-only or unmounting the diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 1477429b44e17b..3d3c91876b7152 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include "misc.h" #include "ctree.h" #include "space-info.h" @@ -190,6 +191,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) */ #define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75) +#define BTRFS_DYNAMIC_RECLAIM_THRESH_MAX (90) + /* * Calculate chunk size depending on volume type (regular or zoned). 
*/ @@ -1869,3 +1872,74 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) return free_bytes; } + +static u64 calc_pct_ratio(u64 x, u64 y) +{ + int err; + + if (!y) + return 0; +again: + err = check_mul_overflow(100, x, &x); + if (err) + goto lose_precision; + return div64_u64(x, y); +lose_precision: + x >>= 10; + y >>= 10; + if (!y) + y = 1; + goto again; +} + +/* + * The dynamic threshold formula is: + * (unused / allocated) * (unused / unallocated) or equivalently + * unused^2 / (allocated * unallocated) + * + * The fundamental goal of automatic reclaim is to protect the filesystem's + * unallocated space and thus minimize the probability of the filesystem going + * read only when a metadata allocation failure causes a transaction abort. + * + * However, relocations happen into the space_info's unused space, therefore + * automatic reclaim must also back off as that space runs low. There is no + * value in doing trivial "relocations" of re-writing the same block group + * into a fresh one. + * + * unused / allocated sets a baseline, very conservative threshold which + * properly goes to 0 as unused goes to a small portion of the allocated space. + * + * On its own, this would likely do very little reclaim, so include + * unused / unallocated (which can be greatly in excess of 100%) to bias heavily + * towards reclaim when unallocated goes low or unused goes high. + */ + +static int calc_dynamic_reclaim_threshold(struct btrfs_space_info *space_info) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + u64 unalloc = atomic64_read(&fs_info->free_chunk_space); + u64 alloc = space_info->total_bytes; + u64 used = btrfs_space_info_used(space_info, false); + u64 unused = alloc - used; + /* unused <= alloc; clamped to 100 */ + int unused_pct = calc_pct_ratio(unused, alloc); + u64 unused_unalloc_ratio = calc_pct_ratio(unused, unalloc); + int err; + u64 thresh; + + err = check_mul_overflow(unused_pct, unused_unalloc_ratio, &thresh); + if (err) + return BTRFS_DYNAMIC_RECLAIM_THRESH_MAX; + /* Both quantities are percentages; remove the squared factor of 100. */ + thresh = div64_u64(thresh, 100); + return clamp_val(thresh, 0, BTRFS_DYNAMIC_RECLAIM_THRESH_MAX); +} + +int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info) +{ + lockdep_assert_held(&space_info->lock); + + if (READ_ONCE(space_info->dynamic_reclaim)) + return calc_dynamic_reclaim_threshold(space_info); + return READ_ONCE(space_info->bg_reclaim_threshold); +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 3e067503c53a69..bf2b3fb79104a3 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -172,6 +172,12 @@ struct btrfs_space_info { * Exposed in /sys/fs//allocation//reclaim_count */ u64 reclaim_count; + + /* + * If true, use the dynamic relocation threshold, instead of the + * fixed bg_reclaim_threshold. 
+ */ + bool dynamic_reclaim; }; struct reserve_ticket { @@ -254,4 +260,6 @@ void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info); void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); +int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info); + #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index d60f44dee8faea..dfe790ad4d2bd2 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -903,8 +903,12 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj, char *buf) { struct btrfs_space_info *space_info = to_space_info(kobj); + ssize_t ret; - return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold)); + spin_lock(&space_info->lock); + ret = sysfs_emit(buf, "%d\n", btrfs_calc_reclaim_threshold(space_info)); + spin_unlock(&space_info->lock); + return ret; } static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj, @@ -915,6 +919,9 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj, int thresh; int ret; + if (READ_ONCE(space_info->dynamic_reclaim)) + return -EINVAL; + ret = kstrtoint(buf, 10, &thresh); if (ret) return ret; @@ -931,6 +938,39 @@ BTRFS_ATTR_RW(space_info, bg_reclaim_threshold, btrfs_sinfo_bg_reclaim_threshold_show, btrfs_sinfo_bg_reclaim_threshold_store); +static ssize_t btrfs_sinfo_dynamic_reclaim_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + + return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->dynamic_reclaim)); +} + +static ssize_t btrfs_sinfo_dynamic_reclaim_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + int dynamic_reclaim; + int ret; + + ret = kstrtoint(buf, 10, &dynamic_reclaim); + if (ret) + return ret; + + if (dynamic_reclaim < 0) + return -EINVAL; + + WRITE_ONCE(space_info->dynamic_reclaim, dynamic_reclaim != 0); + + return len; +} + +BTRFS_ATTR_RW(space_info, dynamic_reclaim, + btrfs_sinfo_dynamic_reclaim_show, + btrfs_sinfo_dynamic_reclaim_store); + /* * Allocation information about block group types. * @@ -948,6 +988,7 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, disk_used), BTRFS_ATTR_PTR(space_info, disk_total), BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold), + BTRFS_ATTR_PTR(space_info, dynamic_reclaim), BTRFS_ATTR_PTR(space_info, chunk_size), BTRFS_ATTR_PTR(space_info, size_classes), BTRFS_ATTR_PTR(space_info, reclaim_count), From 224a37ebdc68cdc8eedf64ee829af3193785e1e4 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 2 Feb 2024 15:12:46 -0800 Subject: [PATCH 0458/1406] btrfs: periodic block_group reclaim We currently employ an edge-triggered block group reclaim strategy which marks block groups for reclaim as they free down past a threshold. With a dynamic threshold, this is worse than doing it in a level-triggered fashion periodically. That is because the reclaim itself happens periodically, so the threshold at that point in time is what really matters, not the threshold at freeing time. If we mark the reclaim in a big pass, then sort by usage and do reclaim, we also benefit from a negative feedback loop preventing unnecessary reclaims as we crunch through the "best" candidates. Since this is quite a different model, it requires some additional support.
The edge triggered reclaim has a good heuristic for not reclaiming fresh block groups, so we need to replace that with a typical GC sweep mark which skips block groups that have seen an allocation since the last sweep. Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 ++ fs/btrfs/block-group.h | 1 + fs/btrfs/space-info.c | 51 ++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/space-info.h | 7 ++++++ fs/btrfs/sysfs.c | 34 ++++++++++++++++++++++++++++ 5 files changed, 95 insertions(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 6f0bf4bf68a00e..59c5d7c653598b 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1952,6 +1952,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) { + btrfs_reclaim_sweep(fs_info); spin_lock(&fs_info->unused_bgs_lock); if (!list_empty(&fs_info->reclaim_bgs)) queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work); @@ -3650,6 +3651,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, old_val += num_bytes; cache->used = old_val; cache->reserved -= num_bytes; + cache->reclaim_mark = 0; space_info->bytes_reserved -= num_bytes; space_info->bytes_used += num_bytes; space_info->disk_used += num_bytes * factor; diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 5ef52b9ea37176..c03971f505215e 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -263,6 +263,7 @@ struct btrfs_block_group { struct work_struct zone_finish_work; struct extent_buffer *last_eb; enum btrfs_block_group_size_class size_class; + u64 reclaim_mark; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 3d3c91876b7152..ddd1224fcbb15f 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1943,3 +1943,54 @@ int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info) return calc_dynamic_reclaim_threshold(space_info); return READ_ONCE(space_info->bg_reclaim_threshold); } + +static int do_reclaim_sweep(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, int raid) +{ + struct btrfs_block_group *bg; + int thresh_pct; + + spin_lock(&space_info->lock); + thresh_pct = btrfs_calc_reclaim_threshold(space_info); + spin_unlock(&space_info->lock); + + down_read(&space_info->groups_sem); + list_for_each_entry(bg, &space_info->block_groups[raid], list) { + u64 thresh; + bool reclaim = false; + + btrfs_get_block_group(bg); + spin_lock(&bg->lock); + thresh = mult_perc(bg->length, thresh_pct); + if (bg->used < thresh && bg->reclaim_mark) + reclaim = true; + bg->reclaim_mark++; + spin_unlock(&bg->lock); + if (reclaim) + btrfs_mark_bg_to_reclaim(bg); + btrfs_put_block_group(bg); + } + up_read(&space_info->groups_sem); + return 0; +} + +int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info) +{ + int ret; + int raid; + struct btrfs_space_info *space_info; + + list_for_each_entry(space_info, &fs_info->space_info, list) { + if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) + continue; + if (!READ_ONCE(space_info->periodic_reclaim)) + continue; + for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) { + ret = do_reclaim_sweep(fs_info, space_info, raid); + if (ret) + return ret; + } + } + + return ret; +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index bf2b3fb79104a3..c83dc5ed6eaafb 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -178,6 +178,12 @@ struct btrfs_space_info { * fixed 
bg_reclaim_threshold. */ bool dynamic_reclaim; + + /* + * Periodically check all block groups against the reclaim + * threshold in the cleaner thread. + */ + bool periodic_reclaim; }; struct reserve_ticket { @@ -261,5 +267,6 @@ void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info); +int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info); #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index dfe790ad4d2bd2..d7c81d4aa7bc33 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -971,6 +971,39 @@ BTRFS_ATTR_RW(space_info, dynamic_reclaim, btrfs_sinfo_dynamic_reclaim_show, btrfs_sinfo_dynamic_reclaim_store); +static ssize_t btrfs_sinfo_periodic_reclaim_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + + return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->periodic_reclaim)); +} + +static ssize_t btrfs_sinfo_periodic_reclaim_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + int periodic_reclaim; + int ret; + + ret = kstrtoint(buf, 10, &periodic_reclaim); + if (ret) + return ret; + + if (periodic_reclaim < 0) + return -EINVAL; + + WRITE_ONCE(space_info->periodic_reclaim, periodic_reclaim != 0); + + return len; +} + +BTRFS_ATTR_RW(space_info, periodic_reclaim, + btrfs_sinfo_periodic_reclaim_show, + btrfs_sinfo_periodic_reclaim_store); + /* * Allocation information about block group types. * @@ -992,6 +1025,7 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, chunk_size), BTRFS_ATTR_PTR(space_info, size_classes), BTRFS_ATTR_PTR(space_info, reclaim_count), + BTRFS_ATTR_PTR(space_info, periodic_reclaim), #ifdef CONFIG_BTRFS_DEBUG BTRFS_ATTR_PTR(space_info, force_chunk_alloc), #endif From 53adbfe41a7e9dc12369a58b5a5ae68852219bfc Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 2 Feb 2024 15:12:47 -0800 Subject: [PATCH 0459/1406] btrfs: urgent periodic reclaim pass Periodic reclaim attempts to avoid block_groups seeing active use with a sweep mark that gets cleared on allocation and set on a sweep. In urgent conditions where we have very little unallocated space, we want to be able to override this mechanism. Introduce a second pass that only happens if we fail to find a reclaim candidate and reclaim is urgent. In that case, do a second pass where all block groups are eligible. Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index ddd1224fcbb15f..251f2894e2c100 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1944,17 +1944,35 @@ int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info) return READ_ONCE(space_info->bg_reclaim_threshold); } +/* + * Under "urgent" reclaim, we will reclaim even fresh block groups that have + * recently seen successful allocations, as we are desperate to reclaim + * whatever we can to avoid ENOSPC in a transaction leading to a readonly fs. 
+ */ +static bool is_reclaim_urgent(struct btrfs_space_info *space_info) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + u64 unalloc = atomic64_read(&fs_info->free_chunk_space); + u64 chunk_size = min(READ_ONCE(space_info->chunk_size), SZ_1G); + + return unalloc < chunk_size; +} + static int do_reclaim_sweep(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, int raid) { struct btrfs_block_group *bg; int thresh_pct; + bool try_again = true; + bool urgent; spin_lock(&space_info->lock); + urgent = is_reclaim_urgent(space_info); thresh_pct = btrfs_calc_reclaim_threshold(space_info); spin_unlock(&space_info->lock); down_read(&space_info->groups_sem); +again: list_for_each_entry(bg, &space_info->block_groups[raid], list) { u64 thresh; bool reclaim = false; @@ -1962,14 +1980,29 @@ static int do_reclaim_sweep(struct btrfs_fs_info *fs_info, btrfs_get_block_group(bg); spin_lock(&bg->lock); thresh = mult_perc(bg->length, thresh_pct); - if (bg->used < thresh && bg->reclaim_mark) + if (bg->used < thresh && bg->reclaim_mark) { + try_again = false; reclaim = true; + } bg->reclaim_mark++; spin_unlock(&bg->lock); if (reclaim) btrfs_mark_bg_to_reclaim(bg); btrfs_put_block_group(bg); } + + /* + * In situations where we are very motivated to reclaim (low unalloc) + * use two passes to make the reclaim mark check best effort. + * + * If we have any staler groups, we don't touch the fresher ones, but if we + * really need a block group, do take a fresh one. + */ + if (try_again && urgent) { + try_again = false; + goto again; + } + up_read(&space_info->groups_sem); return 0; } From f35fafe43174b5f8d91ecec31148c57f1aab46f0 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 2 Feb 2024 15:12:48 -0800 Subject: [PATCH 0460/1406] btrfs: prevent pathological periodic reclaim loops Periodic reclaim runs the risk of getting stuck in a state where it keeps reclaiming the same block group over and over. This can happen if 1. reclaiming that block_group fails 2. reclaiming that block_group fails to move any extents into existing block_groups and just allocates a fresh chunk and moves everything. Currently, 1. is a very tight loop inside the reclaim worker. That is critical for edge triggered reclaim or else we risk forgetting about a reclaimable group. On the other hand, with level triggered reclaim we can break out of that loop and get it later. With that fixed, 2. applies to both failures and "successes" with no progress. If we have done a periodic reclaim on a space_info and nothing has changed in that space_info, there is not much point to trying again, so don't, until some space gets free. 
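As an illustrative scenario: suppose a sweep relocates a block group but the relocation either fails or merely copies every extent into a freshly allocated chunk, so the space_info ends up no emptier than before. With this patch, periodic_reclaim_ready remains false in that case and the next sweep becomes a no-op; only an actual free recorded by btrfs_update_block_group() re-arms it.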
Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 3 ++- fs/btrfs/space-info.c | 6 ++++++ fs/btrfs/space-info.h | 6 ++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 59c5d7c653598b..1b4be41495eaee 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1925,7 +1925,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } next: - if (ret) + if (ret && !READ_ONCE(space_info->periodic_reclaim)) btrfs_mark_bg_to_reclaim(bg); btrfs_put_block_group(bg); @@ -3665,6 +3665,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, space_info->bytes_used -= num_bytes; space_info->disk_used -= num_bytes * factor; + space_info->periodic_reclaim_ready = true; reclaim = should_reclaim_block_group(cache, num_bytes); spin_unlock(&cache->lock); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 251f2894e2c100..a9b333fcf319f0 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1967,6 +1967,12 @@ static int do_reclaim_sweep(struct btrfs_fs_info *fs_info, bool urgent; spin_lock(&space_info->lock); + if (space_info->periodic_reclaim_ready) { + space_info->periodic_reclaim_ready = false; + } else { + spin_unlock(&space_info->lock); + return 0; + } urgent = is_reclaim_urgent(space_info); thresh_pct = btrfs_calc_reclaim_threshold(space_info); spin_unlock(&space_info->lock); diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index c83dc5ed6eaafb..739f5953a2a59c 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -184,6 +184,12 @@ struct btrfs_space_info { * threshold in the cleaner thread. */ bool periodic_reclaim; + + /* + * Periodic reclaim should be a no-op if a space_info hasn't + * freed any space since the last time we tried. + */ + bool periodic_reclaim_ready; }; struct reserve_ticket { From fc8304ff9241d78d4ec9a64c0b07a8adff6b974d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 7 Feb 2024 02:34:23 +0100 Subject: [PATCH 0461/1406] btrfs: handle unexpected parent block offset in btrfs_alloc_tree_block() Change a BUG_ON to proper error handling. The check ensures that a root other than the reloc tree does not see a non-zero parent offset; this is set by btrfs_force_cow_block() and is a special case, so the check makes sure it's not accidentally set by other callers. Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0d72d0f7cefcad..3708f886d21a88 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5187,8 +5187,16 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, parent = ins.objectid; flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; owning_root = reloc_src_root; - } else - BUG_ON(parent > 0); + } else { + if (unlikely(parent > 0)) { + /* + * Other roots than reloc tree don't expect start + * offset of a parent block. + */ + ret = -EUCLEAN; + goto out_free_reserved; + } + } if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { extent_op = btrfs_alloc_delayed_extent_op(); From 01e165497ea7d96cb0c93e0f11763e06f3c5ed25 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 7 Feb 2024 10:00:42 +1030 Subject: [PATCH 0462/1406] btrfs: defrag: avoid unnecessary defrag caused by incorrect extent size [BUG] With the following file extent layout, defrag would do unnecessary IO and result in more on-disk space usage.
# mkfs.btrfs -f $dev # mount $dev $mnt # xfs_io -f -c "pwrite 0 40m" $mnt/foobar # sync # xfs_io -f -c "pwrite 40m 16k" $mnt/foobar # sync The above commands would lead to the following file extent layout: item 6 key (257 EXTENT_DATA 0) itemoff 15816 itemsize 53 generation 7 type 1 (regular) extent data disk byte 298844160 nr 41943040 extent data offset 0 nr 41943040 ram 41943040 extent compression 0 (none) item 7 key (257 EXTENT_DATA 41943040) itemoff 15763 itemsize 53 generation 8 type 1 (regular) extent data disk byte 13631488 nr 16384 extent data offset 0 nr 16384 ram 16384 extent compression 0 (none) Which is mostly fine. We can allow the final 16K to be merged with the previous 40M, but that is up to the end user's preference. But if we defrag the file using the default parameters, it would result in a worse file layout: # btrfs filesystem defrag $mnt/foobar # sync item 6 key (257 EXTENT_DATA 0) itemoff 15816 itemsize 53 generation 7 type 1 (regular) extent data disk byte 298844160 nr 41943040 extent data offset 0 nr 8650752 ram 41943040 extent compression 0 (none) item 7 key (257 EXTENT_DATA 8650752) itemoff 15763 itemsize 53 generation 9 type 1 (regular) extent data disk byte 340787200 nr 33292288 extent data offset 0 nr 33292288 ram 33292288 extent compression 0 (none) item 8 key (257 EXTENT_DATA 41943040) itemoff 15710 itemsize 53 generation 8 type 1 (regular) extent data disk byte 13631488 nr 16384 extent data offset 0 nr 16384 ram 16384 extent compression 0 (none) Note the original 40M extent is still there, but a new 32M extent is created for no benefit at all. [CAUSE] There is an existing check to make sure we won't defrag a large enough extent (the threshold is by default 32M). But the check is using the length to the end of the extent: range_len = em->len - (cur - em->start); /* Skip too large extent */ if (range_len >= extent_thresh) goto next; This means that for the first 8MiB of the extent, range_len is never smaller than the default threshold, so that part would not be defragged. But past the first 8MiB, the remaining part drops below the threshold and would be defragged. Such different behavior inside the same extent caused the above problem, and we should avoid making different defrag decisions inside the same extent. [FIX] Instead of using @range_len, just use @em->len, so that we have a consistent decision within the same file extent. Now with this fix, we won't touch the extent, thus not making it any worse. Fixes: 0cb5950f3f3b ("btrfs: fix deadlock when reserving space during defrag") Reported-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index efea82f9f4fcee..6d3abfcf92d4b2 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -1048,7 +1048,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, goto add; /* Skip too large extent */ - if (range_len >= extent_thresh) + if (em->len >= extent_thresh) goto next; /* From da4cb9663e2fb7a8d9785e35c49e0b60a27429e9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 12 Feb 2024 11:56:02 -0500 Subject: [PATCH 0463/1406] btrfs: fix deadlock with fiemap and extent locking While working on the patchset to remove extent locking I got a lockdep splat with fiemap and pagefaulting with my new extent lock replacement lock. This deadlock exists with our normal code; we just don't have lockdep annotations for the extent locking, so we've never noticed it.
Since we're copying the fiemap extent to user space on every iteration we have the chance of pagefaulting. Because we hold the extent lock for the entire range we could mkwrite into a range in the file that we have mmap'ed. This would deadlock with the following stack trace [<0>] lock_extent+0x28d/0x2f0 [<0>] btrfs_page_mkwrite+0x273/0x8a0 [<0>] do_page_mkwrite+0x50/0xb0 [<0>] do_fault+0xc1/0x7b0 [<0>] __handle_mm_fault+0x2fa/0x460 [<0>] handle_mm_fault+0xa4/0x330 [<0>] do_user_addr_fault+0x1f4/0x800 [<0>] exc_page_fault+0x7c/0x1e0 [<0>] asm_exc_page_fault+0x26/0x30 [<0>] rep_movs_alternative+0x33/0x70 [<0>] _copy_to_user+0x49/0x70 [<0>] fiemap_fill_next_extent+0xc8/0x120 [<0>] emit_fiemap_extent+0x4d/0xa0 [<0>] extent_fiemap+0x7f8/0xad0 [<0>] btrfs_fiemap+0x49/0x80 [<0>] __x64_sys_ioctl+0x3e1/0xb50 [<0>] do_syscall_64+0x94/0x1a0 [<0>] entry_SYSCALL_64_after_hwframe+0x6e/0x76 I wrote an fstest to reproduce this deadlock without my replacement lock and verified that the deadlock exists with our existing locking. To fix this simply don't take the extent lock for the entire duration of the fiemap. This is safe in general because we keep track of where we are when we're searching the tree, so if an ordered extent updates in the middle of our fiemap call we'll still emit the correct extents because we know what offset we were on before. The only place we maintain the lock is searching delalloc. Since the delalloc stuff can change during writeback we want to lock the extent range so we have a consistent view of delalloc at the time we're checking to see if we need to set the delalloc flag. With this patch applied we no longer deadlock with my testcase. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 62 ++++++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7449e0864b7806..bfef67c6822146 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2733,16 +2733,34 @@ static int fiemap_process_hole(struct btrfs_inode *inode, * it beyond i_size. */ while (cur_offset < end && cur_offset < i_size) { + struct extent_state *cached_state = NULL; u64 delalloc_start; u64 delalloc_end; u64 prealloc_start; + u64 lockstart; + u64 lockend; u64 prealloc_len = 0; bool delalloc; + lockstart = round_down(cur_offset, inode->root->fs_info->sectorsize); + lockend = round_up(end, inode->root->fs_info->sectorsize); + + /* + * We are only locking for the delalloc range because that's the + * only thing that can change here. With fiemap we have a lock + * on the inode, so no buffered or direct writes can happen. + * + * However mmaps and normal page writeback will cause this to + * change arbitrarily. We have to lock the extent lock here to + * make sure that nobody messes with the tree while we're doing + * btrfs_find_delalloc_in_range. 
+ */ + lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, delalloc_cached_state, &delalloc_start, &delalloc_end); + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); if (!delalloc) break; @@ -2910,15 +2928,15 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { const u64 ino = btrfs_ino(inode); - struct extent_state *cached_state = NULL; struct extent_state *delalloc_cached_state = NULL; struct btrfs_path *path; struct fiemap_cache cache = { 0 }; struct btrfs_backref_share_check_ctx *backref_ctx; u64 last_extent_end; u64 prev_extent_end; - u64 lockstart; - u64 lockend; + u64 range_start; + u64 range_end; + const u64 sectorsize = inode->root->fs_info->sectorsize; bool stopped = false; int ret; @@ -2929,12 +2947,11 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, goto out; } - lockstart = round_down(start, inode->root->fs_info->sectorsize); - lockend = round_up(start + len, inode->root->fs_info->sectorsize); - prev_extent_end = lockstart; + range_start = round_down(start, sectorsize); + range_end = round_up(start + len, sectorsize); + prev_extent_end = range_start; btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); - lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); if (ret < 0) @@ -2942,7 +2959,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, btrfs_release_path(path); path->reada = READA_FORWARD; - ret = fiemap_search_slot(inode, path, lockstart); + ret = fiemap_search_slot(inode, path, range_start); if (ret < 0) { goto out_unlock; } else if (ret > 0) { @@ -2954,7 +2971,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, goto check_eof_delalloc; } - while (prev_extent_end < lockend) { + while (prev_extent_end < range_end) { struct extent_buffer *leaf = path->nodes[0]; struct btrfs_file_extent_item *ei; struct btrfs_key key; @@ -2977,19 +2994,19 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, * The first iteration can leave us at an extent item that ends * before our range's start. Move to the next item. */ - if (extent_end <= lockstart) + if (extent_end <= range_start) goto next_item; backref_ctx->curr_leaf_bytenr = leaf->start; /* We have in implicit hole (NO_HOLES feature enabled). */ if (prev_extent_end < key.offset) { - const u64 range_end = min(key.offset, lockend) - 1; + const u64 hole_end = min(key.offset, range_end) - 1; ret = fiemap_process_hole(inode, fieinfo, &cache, &delalloc_cached_state, backref_ctx, 0, 0, 0, - prev_extent_end, range_end); + prev_extent_end, hole_end); if (ret < 0) { goto out_unlock; } else if (ret > 0) { @@ -2999,7 +3016,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, } /* We've reached the end of the fiemap range, stop. 
*/ - if (key.offset >= lockend) { + if (key.offset >= range_end) { stopped = true; break; } @@ -3093,29 +3110,41 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, btrfs_free_path(path); path = NULL; - if (!stopped && prev_extent_end < lockend) { + if (!stopped && prev_extent_end < range_end) { ret = fiemap_process_hole(inode, fieinfo, &cache, &delalloc_cached_state, backref_ctx, - 0, 0, 0, prev_extent_end, lockend - 1); + 0, 0, 0, prev_extent_end, range_end - 1); if (ret < 0) goto out_unlock; - prev_extent_end = lockend; + prev_extent_end = range_end; } if (cache.cached && cache.offset + cache.len >= last_extent_end) { const u64 i_size = i_size_read(&inode->vfs_inode); if (prev_extent_end < i_size) { + struct extent_state *cached_state = NULL; u64 delalloc_start; u64 delalloc_end; + u64 lockstart; + u64 lockend; bool delalloc; + lockstart = round_down(prev_extent_end, sectorsize); + lockend = round_up(i_size, sectorsize); + + /* + * See the comment in fiemap_process_hole as to why + * we're doing the locking here. + */ + lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); delalloc = btrfs_find_delalloc_in_range(inode, prev_extent_end, i_size - 1, &delalloc_cached_state, &delalloc_start, &delalloc_end); + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); if (!delalloc) cache.flags |= FIEMAP_EXTENT_LAST; } else { @@ -3126,7 +3155,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, ret = emit_last_fiemap_cache(fieinfo, &cache); out_unlock: - unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); out: free_extent_state(delalloc_cached_state); From ffdc36daabf26230de3e9850025f12cf5c21acca Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 13 Feb 2024 09:13:56 +0800 Subject: [PATCH 0464/1406] btrfs: do not skip re-registration for the mounted device There are reports that since version 6.7 update-grub fails to find the device of the root on systems without initrd and on a single device. This looks like the device name changed in the output of /proc/self/mountinfo: 6.5-rc5 working 18 1 0:16 / / rw,noatime - btrfs /dev/sda8 ... 6.7 not working: 17 1 0:15 / / rw,noatime - btrfs /dev/root ... and "update-grub" shows this error: /usr/sbin/grub-probe: error: cannot find a device for / (is /dev mounted?) This looks like it's related to the device name, but grub-probe recognizes the "/dev/root" path and tries to find the underlying device. However there's a special case for some filesystems, for btrfs in particular. The generic root device detection heuristic is not done and it all relies on reading the device infos by a btrfs specific ioctl. This ioctl returns the device name as it was saved at the time of device scan (in this case it's /dev/root). The change in 6.7 for temp_fsid to allow several single device filesystem to exist with the same fsid (and transparently generate a new UUID at mount time) was to skip caching/registering such devices. This also skipped mounted device. One step of scanning is to check if the device name hasn't changed, and if yes then update the cached value. This broke the grub-probe as it always read the device /dev/root and couldn't find it in the system. A temporary workaround is to create a symlink but this does not survive reboot. The right fix is to allow updating the device path of a mounted filesystem even if this is a single device one. 
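The core of that fix can be sketched as follows (an illustrative fragment only; locking and the seed-device test are elided here, and the complete helper, btrfs_skip_registration(), is added in the diff below):

	/* A device that belongs to a mounted filesystem must not be
	 * skipped, so that device_list_add() can update its cached path.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list)
		list_for_each_entry(device, &fs_devices->devices, dev_list)
			if (device->devt == devt)
				return false;	/* do not skip: allow re-registration */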
In the fix, check whether the device's major:minor number matches that of the cached device. If it does, we can allow the scan to happen so that device_list_add() can take care of updating the device path. The file descriptor remains unchanged.

This does not affect the temp_fsid feature: the UUID of the mounted filesystem remains the same, and the matching is based on the device major:minor number, which is unique per mounted filesystem.

This covers the case where the name of a device (which exists for every mounted filesystem) changes, e.g. updating /dev/root to /dev/sdx. Any other single-device filesystem that is not mounted is still skipped.

Note that if a system is booted and the initial mount is done on the /dev/root device, this will be the cached name of the device. Only after running "btrfs device scan" will it change, as the scan triggers the rename.

The fix was verified by users whose systems were affected.

CC: stable@vger.kernel.org # 6.7+
Fixes: bc27d6f0aa0e ("btrfs: scan but don't register device on single device filesystem")
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=218353
Link: https://lore.kernel.org/lkml/CAKLYgeJ1tUuqLcsquwuFqjDXPSJpEiokrWK2gisPKDZLs8Y2TQ@mail.gmail.com/
Signed-off-by: Anand Jain
Tested-by: Alex Romosan
Tested-by: CHECK_1234543212345@protonmail.com
Signed-off-by: David Sterba
---
 fs/btrfs/volumes.c | 44 ++++++++++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4ad9eca9b46c4a..bfe54745eae3f4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1299,6 +1299,31 @@ int btrfs_forget_devices(dev_t devt)
 	return ret;
 }

+static bool btrfs_skip_registration(struct btrfs_super_block *disk_super,
+				    dev_t devt, bool mount_arg_dev)
+{
+	struct btrfs_fs_devices *fs_devices;
+
+	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+		struct btrfs_device *device;
+
+		mutex_lock(&fs_devices->device_list_mutex);
+		list_for_each_entry(device, &fs_devices->devices, dev_list) {
+			if (device->devt == devt) {
+				mutex_unlock(&fs_devices->device_list_mutex);
+				return false;
+			}
+		}
+		mutex_unlock(&fs_devices->device_list_mutex);
+	}
+
+	if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
+	    !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING))
+		return true;
+
+	return false;
+}
+
 /*
  * Look for a btrfs signature on a device. This may be called out of the mount path
  * and we are not allowed to call set_blocksize during the scan.
The superblock @@ -1316,6 +1341,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, struct btrfs_device *device = NULL; struct bdev_handle *bdev_handle; u64 bytenr, bytenr_orig; + dev_t devt = 0; int ret; lockdep_assert_held(&uuid_mutex); @@ -1355,18 +1381,16 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, goto error_bdev_put; } - if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 && - !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) { - dev_t devt; + ret = lookup_bdev(path, &devt); + if (ret) + btrfs_warn(NULL, "lookup bdev failed for path %s: %d", + path, ret); - ret = lookup_bdev(path, &devt); - if (ret) - btrfs_warn(NULL, "lookup bdev failed for path %s: %d", - path, ret); - else + if (btrfs_skip_registration(disk_super, devt, mount_arg_dev)) { + pr_debug("BTRFS: skip registering single non-seed device %s\n", + path); + if (devt) btrfs_free_stale_devices(devt, NULL); - - pr_debug("BTRFS: skip registering single non-seed device %s\n", path); device = NULL; goto free_disk_super; } From 1d25dbbe328d52acc902d83f0238cd98d8455842 Mon Sep 17 00:00:00 2001 From: Neal Gompa Date: Sun, 11 Feb 2024 20:34:44 -0500 Subject: [PATCH 0465/1406] btrfs: sysfs: drop unnecessary double logical negation in acl_show() The IS_ENABLED() macro already guarantees the result will be a suitable boolean return value ("1" for enabled, and "0" for disabled). Thus, it seems that the "!!" used right before is unnecessary to force the 0/1 values. Reviewed-by: Anand Jain Signed-off-by: Neal Gompa Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index d7c81d4aa7bc33..40c6bca5ebcf5b 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -421,7 +421,7 @@ BTRFS_ATTR(static_feature, supported_sectorsizes, static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { - return sysfs_emit(buf, "%d\n", !!IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL)); + return sysfs_emit(buf, "%d\n", IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL)); } BTRFS_ATTR(static_feature, acl, acl_show); From 4a8a8f446ef2686b16607d3074ac6f4e369f1844 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 9 Aug 2023 13:43:53 -0700 Subject: [PATCH 0466/1406] Bluetooth: hci_sync: Add helper functions to manipulate cmd_sync queue This adds functions to queue, dequeue and lookup into the cmd_sync list. 
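As an illustration of the intended use, a caller could pair the queue and dequeue helpers like this (send_foo_sync and conn are hypothetical names; only the hci_cmd_sync_*() calls below are introduced by this patch):

	/* Queue the callback unless an identical entry is already pending */
	err = hci_cmd_sync_queue_once(hdev, send_foo_sync, conn, NULL);
	if (err)
		return err;

	/* Later, if the operation is aborted before the callback has run,
	 * cancel the pending entry; entries are matched by func/data/destroy.
	 */
	if (hci_cmd_sync_dequeue_once(hdev, send_foo_sync, conn, NULL))
		return 0;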
Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_sync.h | 12 +++ net/bluetooth/hci_sync.c | 132 +++++++++++++++++++++++++++++-- 2 files changed, 136 insertions(+), 8 deletions(-) diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index ed334c253ebcd9..4ff4aa68ee196d 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -48,6 +48,18 @@ int hci_cmd_sync_submit(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy); int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy); +struct hci_cmd_sync_work_entry * +hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy); +int hci_cmd_sync_queue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy); +void hci_cmd_sync_cancel_entry(struct hci_dev *hdev, + struct hci_cmd_sync_work_entry *entry); +bool hci_cmd_sync_dequeue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy); +bool hci_cmd_sync_dequeue_once(struct hci_dev *hdev, + hci_cmd_sync_work_func_t func, void *data, + hci_cmd_sync_work_destroy_t destroy); int hci_update_eir_sync(struct hci_dev *hdev); int hci_update_class_sync(struct hci_dev *hdev); diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index e1fdcb3c270625..5b314bf844f847 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -566,6 +566,17 @@ void hci_cmd_sync_init(struct hci_dev *hdev) INIT_DELAYED_WORK(&hdev->adv_instance_expire, adv_timeout_expire); } +static void _hci_cmd_sync_cancel_entry(struct hci_dev *hdev, + struct hci_cmd_sync_work_entry *entry, + int err) +{ + if (entry->destroy) + entry->destroy(hdev, entry->data, err); + + list_del(&entry->list); + kfree(entry); +} + void hci_cmd_sync_clear(struct hci_dev *hdev) { struct hci_cmd_sync_work_entry *entry, *tmp; @@ -574,13 +585,8 @@ void hci_cmd_sync_clear(struct hci_dev *hdev) cancel_work_sync(&hdev->reenable_adv_work); mutex_lock(&hdev->cmd_sync_work_lock); - list_for_each_entry_safe(entry, tmp, &hdev->cmd_sync_work_list, list) { - if (entry->destroy) - entry->destroy(hdev, entry->data, -ECANCELED); - - list_del(&entry->list); - kfree(entry); - } + list_for_each_entry_safe(entry, tmp, &hdev->cmd_sync_work_list, list) + _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED); mutex_unlock(&hdev->cmd_sync_work_lock); } @@ -669,6 +675,115 @@ int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, } EXPORT_SYMBOL(hci_cmd_sync_queue); +static struct hci_cmd_sync_work_entry * +_hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy) +{ + struct hci_cmd_sync_work_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, &hdev->cmd_sync_work_list, list) { + if (func && entry->func != func) + continue; + + if (data && entry->data != data) + continue; + + if (destroy && entry->destroy != destroy) + continue; + + return entry; + } + + return NULL; +} + +/* Queue HCI command entry once: + * + * - Lookup if an entry already exist and only if it doesn't creates a new entry + * and queue it. 
+ */ +int hci_cmd_sync_queue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy) +{ + if (hci_cmd_sync_lookup_entry(hdev, func, data, destroy)) + return 0; + + return hci_cmd_sync_queue(hdev, func, data, destroy); +} +EXPORT_SYMBOL(hci_cmd_sync_queue_once); + +/* Lookup HCI command entry: + * + * - Return first entry that matches by function callback or data or + * destroy callback. + */ +struct hci_cmd_sync_work_entry * +hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy) +{ + struct hci_cmd_sync_work_entry *entry; + + mutex_lock(&hdev->cmd_sync_work_lock); + entry = _hci_cmd_sync_lookup_entry(hdev, func, data, destroy); + mutex_unlock(&hdev->cmd_sync_work_lock); + + return entry; +} +EXPORT_SYMBOL(hci_cmd_sync_lookup_entry); + +/* Cancel HCI command entry */ +void hci_cmd_sync_cancel_entry(struct hci_dev *hdev, + struct hci_cmd_sync_work_entry *entry) +{ + mutex_lock(&hdev->cmd_sync_work_lock); + _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED); + mutex_unlock(&hdev->cmd_sync_work_lock); +} +EXPORT_SYMBOL(hci_cmd_sync_cancel_entry); + +/* Dequeue one HCI command entry: + * + * - Lookup and cancel first entry that matches. + */ +bool hci_cmd_sync_dequeue_once(struct hci_dev *hdev, + hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy) +{ + struct hci_cmd_sync_work_entry *entry; + + entry = hci_cmd_sync_lookup_entry(hdev, func, data, destroy); + if (!entry) + return false; + + hci_cmd_sync_cancel_entry(hdev, entry); + + return true; +} +EXPORT_SYMBOL(hci_cmd_sync_dequeue_once); + +/* Dequeue HCI command entry: + * + * - Lookup and cancel any entry that matches by function callback or data or + * destroy callback. + */ +bool hci_cmd_sync_dequeue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy) +{ + struct hci_cmd_sync_work_entry *entry; + bool ret = false; + + mutex_lock(&hdev->cmd_sync_work_lock); + while ((entry = _hci_cmd_sync_lookup_entry(hdev, func, data, + destroy))) { + _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED); + ret = true; + } + mutex_unlock(&hdev->cmd_sync_work_lock); + + return ret; +} +EXPORT_SYMBOL(hci_cmd_sync_dequeue); + int hci_update_eir_sync(struct hci_dev *hdev) { struct hci_cp_write_eir cp; @@ -2881,7 +2996,8 @@ int hci_update_passive_scan(struct hci_dev *hdev) hci_dev_test_flag(hdev, HCI_UNREGISTER)) return 0; - return hci_cmd_sync_queue(hdev, update_passive_scan_sync, NULL, NULL); + return hci_cmd_sync_queue_once(hdev, update_passive_scan_sync, NULL, + NULL); } int hci_write_sc_support_sync(struct hci_dev *hdev, u8 val) From 96fb2aab16bf3eb2fd69477fff9e70f128b52d30 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 13 Feb 2024 09:59:32 -0500 Subject: [PATCH 0467/1406] Bluetooth: hci_sync: Attempt to dequeue connection attempt If connection is still queued/pending in the cmd_sync queue it means no command has been generated and it should be safe to just dequeue the callback when it is being aborted. 
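Condensed, the resulting abort path looks roughly like this (a simplified sketch of the hci_abort_conn() and hci_cancel_connect_sync() changes in the diff below; the established-connection branches are omitted):

	if (conn->state == BT_OPEN) {
		/* No HCI command has been issued yet, so dequeuing the
		 * queued create-connection callback is sufficient.
		 */
		if (!hci_cancel_connect_sync(hdev, conn))
			return 0;
	}

	return hci_cmd_sync_queue_once(hdev, abort_conn_sync, conn, NULL);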
Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 19 ++++++++ include/net/bluetooth/hci_sync.h | 10 +++-- net/bluetooth/hci_conn.c | 70 ++++++------------------------ net/bluetooth/hci_sync.c | 74 ++++++++++++++++++++++++++++---- 4 files changed, 102 insertions(+), 71 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 2bdea85b7c447c..317d495cfcf5e8 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1083,6 +1083,24 @@ static inline unsigned int hci_conn_count(struct hci_dev *hdev) return c->acl_num + c->amp_num + c->sco_num + c->le_num + c->iso_num; } +static inline bool hci_conn_valid(struct hci_dev *hdev, struct hci_conn *conn) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c == conn) { + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); + + return false; +} + static inline __u8 hci_conn_lookup_type(struct hci_dev *hdev, __u16 handle) { struct hci_conn_hash *h = &hdev->conn_hash; @@ -1493,6 +1511,7 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, bool dst_resolved, u8 sec_level, u16 conn_timeout, u8 role); +void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status); struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, u8 sec_level, u8 auth_type, enum conn_reasons conn_reason, u16 timeout); diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index 4ff4aa68ee196d..6a9d063e9f472d 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -48,11 +48,11 @@ int hci_cmd_sync_submit(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy); int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy); +int hci_cmd_sync_queue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy); struct hci_cmd_sync_work_entry * hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy); -int hci_cmd_sync_queue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, - void *data, hci_cmd_sync_work_destroy_t destroy); void hci_cmd_sync_cancel_entry(struct hci_dev *hdev, struct hci_cmd_sync_work_entry *entry); bool hci_cmd_sync_dequeue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, @@ -139,8 +139,6 @@ struct hci_conn; int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason); -int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn); - int hci_le_create_cis_sync(struct hci_dev *hdev); int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle); @@ -152,3 +150,7 @@ int hci_le_big_terminate_sync(struct hci_dev *hdev, u8 handle); int hci_le_pa_terminate_sync(struct hci_dev *hdev, u16 handle); int hci_connect_acl_sync(struct hci_dev *hdev, struct hci_conn *conn); + +int hci_connect_le_sync(struct hci_dev *hdev, struct hci_conn *conn); + +int hci_cancel_connect_sync(struct hci_dev *hdev, struct hci_conn *conn); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 587eb27f374c98..21e0b4064d05d6 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -68,7 +68,7 @@ static const struct sco_param 
esco_param_msbc[] = { }; /* This function requires the caller holds hdev->lock */ -static void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status) +void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status) { struct hci_conn_params *params; struct hci_dev *hdev = conn->hdev; @@ -1124,6 +1124,9 @@ void hci_conn_del(struct hci_conn *conn) * rest of hci_conn_del. */ hci_conn_cleanup(conn); + + /* Dequeue callbacks using connection pointer as data */ + hci_cmd_sync_dequeue(hdev, NULL, conn, NULL); } struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, uint8_t src_type) @@ -1258,53 +1261,6 @@ u8 hci_conn_set_handle(struct hci_conn *conn, u16 handle) return 0; } -static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err) -{ - struct hci_conn *conn; - u16 handle = PTR_UINT(data); - - conn = hci_conn_hash_lookup_handle(hdev, handle); - if (!conn) - return; - - bt_dev_dbg(hdev, "err %d", err); - - hci_dev_lock(hdev); - - if (!err) { - hci_connect_le_scan_cleanup(conn, 0x00); - goto done; - } - - /* Check if connection is still pending */ - if (conn != hci_lookup_le_connect(hdev)) - goto done; - - /* Flush to make sure we send create conn cancel command if needed */ - flush_delayed_work(&conn->le_conn_timeout); - hci_conn_failed(conn, bt_status(err)); - -done: - hci_dev_unlock(hdev); -} - -static int hci_connect_le_sync(struct hci_dev *hdev, void *data) -{ - struct hci_conn *conn; - u16 handle = PTR_UINT(data); - - conn = hci_conn_hash_lookup_handle(hdev, handle); - if (!conn) - return 0; - - bt_dev_dbg(hdev, "conn %p", conn); - - clear_bit(HCI_CONN_SCANNING, &conn->flags); - conn->state = BT_CONNECT; - - return hci_le_create_conn_sync(hdev, conn); -} - struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, bool dst_resolved, u8 sec_level, u16 conn_timeout, u8 role) @@ -1371,9 +1327,7 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, conn->sec_level = BT_SECURITY_LOW; conn->conn_timeout = conn_timeout; - err = hci_cmd_sync_queue(hdev, hci_connect_le_sync, - UINT_PTR(conn->handle), - create_le_conn_complete); + err = hci_connect_le_sync(hdev, conn); if (err) { hci_conn_del(conn); return ERR_PTR(err); @@ -2909,12 +2863,10 @@ u32 hci_conn_get_phy(struct hci_conn *conn) static int abort_conn_sync(struct hci_dev *hdev, void *data) { - struct hci_conn *conn; - u16 handle = PTR_UINT(data); + struct hci_conn *conn = data; - conn = hci_conn_hash_lookup_handle(hdev, handle); - if (!conn) - return 0; + if (!hci_conn_valid(hdev, conn)) + return -ECANCELED; return hci_abort_conn_sync(hdev, conn, conn->abort_reason); } @@ -2949,8 +2901,10 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason) hci_cmd_sync_cancel(hdev, -ECANCELED); break; } + /* Cancel connect attempt if still queued/pending */ + } else if (!hci_cancel_connect_sync(hdev, conn)) { + return 0; } - return hci_cmd_sync_queue(hdev, abort_conn_sync, UINT_PTR(conn->handle), - NULL); + return hci_cmd_sync_queue_once(hdev, abort_conn_sync, conn, NULL); } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 5b314bf844f847..b7d8e99e2a30e0 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -6285,12 +6285,21 @@ static int hci_le_ext_create_conn_sync(struct hci_dev *hdev, conn->conn_timeout, NULL); } -int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn) +static int hci_le_create_conn_sync(struct hci_dev *hdev, void *data) { struct hci_cp_le_create_conn cp; struct hci_conn_params *params; u8 own_addr_type; int 
err; + struct hci_conn *conn = data; + + if (!hci_conn_valid(hdev, conn)) + return -ECANCELED; + + bt_dev_dbg(hdev, "conn %p", conn); + + clear_bit(HCI_CONN_SCANNING, &conn->flags); + conn->state = BT_CONNECT; /* If requested to connect as peripheral use directed advertising */ if (conn->role == HCI_ROLE_SLAVE) { @@ -6611,16 +6620,11 @@ int hci_update_adv_data(struct hci_dev *hdev, u8 instance) static int hci_acl_create_conn_sync(struct hci_dev *hdev, void *data) { - struct hci_conn *conn; - u16 handle = PTR_UINT(data); + struct hci_conn *conn = data; struct inquiry_entry *ie; struct hci_cp_create_conn cp; int err; - conn = hci_conn_hash_lookup_handle(hdev, handle); - if (!conn) - return 0; - /* Many controllers disallow HCI Create Connection while it is doing * HCI Inquiry. So we cancel the Inquiry first before issuing HCI Create * Connection. This may cause the MGMT discovering state to become false @@ -6679,6 +6683,58 @@ static int hci_acl_create_conn_sync(struct hci_dev *hdev, void *data) int hci_connect_acl_sync(struct hci_dev *hdev, struct hci_conn *conn) { - return hci_cmd_sync_queue(hdev, hci_acl_create_conn_sync, - UINT_PTR(conn->handle), NULL); + return hci_cmd_sync_queue_once(hdev, hci_acl_create_conn_sync, conn, + NULL); +} + +static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err) +{ + struct hci_conn *conn = data; + + bt_dev_dbg(hdev, "err %d", err); + + if (err == -ECANCELED) + return; + + hci_dev_lock(hdev); + + if (!err) { + hci_connect_le_scan_cleanup(conn, 0x00); + goto done; + } + + /* Check if connection is still pending */ + if (conn != hci_lookup_le_connect(hdev)) + goto done; + + /* Flush to make sure we send create conn cancel command if needed */ + flush_delayed_work(&conn->le_conn_timeout); + hci_conn_failed(conn, bt_status(err)); + +done: + hci_dev_unlock(hdev); +} + +int hci_connect_le_sync(struct hci_dev *hdev, struct hci_conn *conn) +{ + return hci_cmd_sync_queue_once(hdev, hci_le_create_conn_sync, conn, + create_le_conn_complete); +} + +int hci_cancel_connect_sync(struct hci_dev *hdev, struct hci_conn *conn) +{ + if (conn->state != BT_OPEN) + return -EINVAL; + + switch (conn->type) { + case ACL_LINK: + return !hci_cmd_sync_dequeue_once(hdev, + hci_acl_create_conn_sync, + conn, NULL); + case LE_LINK: + return !hci_cmd_sync_dequeue_once(hdev, hci_le_create_conn_sync, + conn, create_le_conn_complete); + } + + return -ENOENT; } From 9ef38cc0e19743039a077aade95556d2ffcbb06c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jan 2024 11:06:59 -0500 Subject: [PATCH 0468/1406] bcachefs: Kill unnecessary wakeups in journal reclaim Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 2cf626315652c0..f4d0c726f34817 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -394,8 +394,6 @@ void bch2_journal_pin_copy(struct journal *j, struct journal_entry_pin *src, journal_pin_flush_fn flush_fn) { - bool reclaim; - spin_lock(&j->lock); u64 seq = READ_ONCE(src->seq); @@ -411,44 +409,44 @@ void bch2_journal_pin_copy(struct journal *j, return; } - reclaim = __journal_pin_drop(j, dst); + bool reclaim = __journal_pin_drop(j, dst); bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); - spin_unlock(&j->lock); /* * If the journal is currently full, we might want to call flush_fn * 
immediately: */ - journal_wake(j); + if (seq == journal_last_seq(j)) + journal_wake(j); + spin_unlock(&j->lock); } void bch2_journal_pin_set(struct journal *j, u64 seq, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) { - bool reclaim; - spin_lock(&j->lock); BUG_ON(seq < journal_last_seq(j)); - reclaim = __journal_pin_drop(j, pin); + bool reclaim = __journal_pin_drop(j, pin); bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); - spin_unlock(&j->lock); - /* * If the journal is currently full, we might want to call flush_fn * immediately: */ - journal_wake(j); + if (seq == journal_last_seq(j)) + journal_wake(j); + + spin_unlock(&j->lock); } /** From 5f8e0f379671e5d82bb732c6afcb88f01fc3314b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jan 2024 11:21:46 -0500 Subject: [PATCH 0469/1406] bcachefs: Split out journal workqueue We don't want journal write completions to be blocked behind btree transactions - io_complete_wq is used for btree updates after data and metadata writes. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 22 ++++++++++++---------- fs/bcachefs/journal_io.c | 12 ++++++------ fs/bcachefs/journal_types.h | 1 + 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index bc890776eb5793..7c6f3ae47507b2 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -181,14 +181,12 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) */ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - lockdep_assert_held(&j->lock); if (__bch2_journal_pin_put(j, seq)) bch2_journal_reclaim_fast(j); if (write) - closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + closure_call(&j->io, bch2_journal_write, j->wq, NULL); } /* @@ -418,7 +416,7 @@ static int journal_entry_open(struct journal *j) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - mod_delayed_work(c->io_complete_wq, + mod_delayed_work(j->wq, &j->write_work, msecs_to_jiffies(c->opts.journal_flush_delay)); journal_wake(j); @@ -445,7 +443,6 @@ static void journal_quiesce(struct journal *j) static void journal_write_work(struct work_struct *work) { struct journal *j = container_of(work, struct journal, write_work.work); - struct bch_fs *c = container_of(j, struct bch_fs, journal); long delta; spin_lock(&j->lock); @@ -455,7 +452,7 @@ static void journal_write_work(struct work_struct *work) delta = journal_cur_buf(j)->expires - jiffies; if (delta > 0) - mod_delayed_work(c->io_complete_wq, &j->write_work, delta); + mod_delayed_work(j->wq, &j->write_work, delta); else __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); unlock: @@ -1303,11 +1300,12 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) void bch2_fs_journal_exit(struct journal *j) { - unsigned i; + if (j->wq) + destroy_workqueue(j->wq); darray_exit(&j->early_journal_entries); - for (i = 0; i < ARRAY_SIZE(j->buf); i++) + for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) kvpfree(j->buf[i].data, j->buf[i].buf_size); free_fifo(&j->pin); } @@ -1315,7 +1313,6 @@ void bch2_fs_journal_exit(struct journal *j) int bch2_fs_journal_init(struct journal *j) { static struct lock_class_key res_key; - unsigned i; mutex_init(&j->buf_lock); spin_lock_init(&j->lock); @@ -1336,7 +1333,7 @@ int bch2_fs_journal_init(struct journal *j) if (!(init_fifo(&j->pin, 
JOURNAL_PIN, GFP_KERNEL))) return -BCH_ERR_ENOMEM_journal_pin_fifo; - for (i = 0; i < ARRAY_SIZE(j->buf); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) { j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); if (!j->buf[i].data) @@ -1344,6 +1341,11 @@ int bch2_fs_journal_init(struct journal *j) } j->pin.front = j->pin.back = 1; + + j->wq = alloc_workqueue("bcachefs_journal", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512); + if (!j->wq) + return -BCH_ERR_ENOMEM_fs_other_alloc; return 0; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 47805193f18cc7..5dcb4f4ceae77b 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1648,7 +1648,7 @@ static CLOSURE_CALLBACK(journal_write_done) if (!journal_state_count(new, new.unwritten_idx) && journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { spin_unlock(&j->lock); - closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + closure_call(&j->io, bch2_journal_write, j->wq, NULL); } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { struct journal_buf *buf = journal_cur_buf(j); @@ -1661,7 +1661,7 @@ static CLOSURE_CALLBACK(journal_write_done) */ spin_unlock(&j->lock); - mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); + mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); } else { spin_unlock(&j->lock); } @@ -1731,7 +1731,7 @@ static CLOSURE_CALLBACK(do_journal_write) le64_to_cpu(w->data->seq); } - continue_at(cl, journal_write_done, c->io_complete_wq); + continue_at(cl, journal_write_done, j->wq); } static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) @@ -1998,12 +1998,12 @@ CLOSURE_CALLBACK(bch2_journal_write) } } - continue_at(cl, do_journal_write, c->io_complete_wq); + continue_at(cl, do_journal_write, j->wq); return; no_io: - continue_at(cl, journal_write_done, c->io_complete_wq); + continue_at(cl, journal_write_done, j->wq); return; err: bch2_fatal_error(c); - continue_at(cl, journal_write_done, c->io_complete_wq); + continue_at(cl, journal_write_done, j->wq); } diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 38817c7a085159..2ca5d5014cf654 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -205,6 +205,7 @@ struct journal { struct closure io; struct delayed_work write_work; + struct workqueue_struct *wq; /* Sequence number of most recent journal entry (last entry in @pin) */ atomic64_t seq; From 38dc5877330846b8ca8d2085d9ab9621a3d9787e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jan 2024 11:24:37 -0500 Subject: [PATCH 0470/1406] bcachefs: Avoid setting j->write_work unnecessarily Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 7c6f3ae47507b2..04309e7c108bee 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -416,9 +416,10 @@ static int journal_entry_open(struct journal *j) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - mod_delayed_work(j->wq, - &j->write_work, - msecs_to_jiffies(c->opts.journal_flush_delay)); + if (nr_unwritten_journal_entries(j) == 1) + mod_delayed_work(j->wq, + &j->write_work, + msecs_to_jiffies(c->opts.journal_flush_delay)); journal_wake(j); if (j->early_journal_entries.nr) @@ -443,19 +444,16 @@ static void 
journal_quiesce(struct journal *j) static void journal_write_work(struct work_struct *work) { struct journal *j = container_of(work, struct journal, write_work.work); - long delta; spin_lock(&j->lock); - if (!__journal_entry_is_open(j->reservations)) - goto unlock; - - delta = journal_cur_buf(j)->expires - jiffies; + if (__journal_entry_is_open(j->reservations)) { + long delta = journal_cur_buf(j)->expires - jiffies; - if (delta > 0) - mod_delayed_work(j->wq, &j->write_work, delta); - else - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); -unlock: + if (delta > 0) + mod_delayed_work(j->wq, &j->write_work, delta); + else + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); + } spin_unlock(&j->lock); } From 94f0df169f84a7be29aa5dde5e64f59520ebfc55 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jan 2024 11:25:46 -0500 Subject: [PATCH 0471/1406] bcachefs: Journal writes should be REQ_SYNC|REQ_META Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 5dcb4f4ceae77b..8047425e84c311 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1991,7 +1991,7 @@ CLOSURE_CALLBACK(bch2_journal_write) bio = ca->journal.bio; bio_reset(bio, ca->disk_sb.bdev, - REQ_OP_WRITE|REQ_PREFLUSH); + REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); bio->bi_end_io = journal_write_endio; bio->bi_private = ca; closure_bio_submit(bio, cl); From 9d140ecc3a5497f9f6d6315a4249462bc4a44ddb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jan 2024 11:28:13 -0500 Subject: [PATCH 0472/1406] bcachefs: Avoid taking journal lock unnecessarily Previously, any time we failed to get a journal reservation we'd retry, with the journal lock held; but this isn't necessary given wait_event()/wake_up() ordering. This avoids performance cliffs when the journal starts to get backed up and lock contention shoots up. 
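The resulting shape of __journal_res_get() is roughly the following (a simplified sketch of the diff below; the lockless full/error checks now run before j->lock is taken, and a successful journal_entry_open() turns into a retry of the fast path):

	retry:
		if (journal_res_get_fast(j, res, flags))
			return 0;

		/* lockless watermark/blocked/error checks here ... */

		spin_lock(&j->lock);
		/* recheck the fast path, then try to open a new entry */
		ret = journal_entry_open(j) ?: JOURNAL_ERR_retry;
		spin_unlock(&j->lock);

		if (ret == JOURNAL_ERR_retry)
			goto retry;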
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 107 ++++++++++++++++++------------------ fs/bcachefs/journal_types.h | 1 + 2 files changed, 55 insertions(+), 53 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 04309e7c108bee..20fdacb86f517f 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -27,6 +27,26 @@ static const char * const bch2_journal_errors[] = { NULL }; +static inline bool journal_seq_unwritten(struct journal *j, u64 seq) +{ + return seq > j->seq_ondisk; +} + +static bool __journal_entry_is_open(union journal_res_state state) +{ + return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; +} + +static inline unsigned nr_unwritten_journal_entries(struct journal *j) +{ + return atomic64_read(&j->seq) - j->seq_ondisk; +} + +static bool journal_entry_is_open(struct journal *j) +{ + return __journal_entry_is_open(j->reservations); +} + static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq) { union journal_res_state s = READ_ONCE(j->reservations); @@ -66,26 +86,7 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) seq <= journal_cur_seq(j); seq++) bch2_journal_buf_to_text(out, j, seq); -} - -static inline bool journal_seq_unwritten(struct journal *j, u64 seq) -{ - return seq > j->seq_ondisk; -} - -static bool __journal_entry_is_open(union journal_res_state state) -{ - return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; -} - -static inline unsigned nr_unwritten_journal_entries(struct journal *j) -{ - return atomic64_read(&j->seq) - j->seq_ondisk; -} - -static bool journal_entry_is_open(struct journal *j) -{ - return __journal_entry_is_open(j->reservations); + prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed"); } static inline struct journal_buf * @@ -468,33 +469,32 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, if (journal_res_get_fast(j, res, flags)) return 0; - if (bch2_journal_error(j)) - return -BCH_ERR_erofs_journal_err; + if ((flags & BCH_WATERMARK_MASK) < j->watermark) { + ret = JOURNAL_ERR_journal_full; + can_discard = j->can_discard; + goto out; + } - spin_lock(&j->lock); + if (j->blocked) + return -BCH_ERR_journal_res_get_blocked; - /* check once more in case somebody else shut things down... 
*/ - if (bch2_journal_error(j)) { - spin_unlock(&j->lock); + if (bch2_journal_error(j)) return -BCH_ERR_erofs_journal_err; + + if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { + ret = JOURNAL_ERR_max_in_flight; + goto out; } + spin_lock(&j->lock); + /* * Recheck after taking the lock, so we don't race with another thread * that just did journal_entry_open() and call bch2_journal_entry_close() * unnecessarily */ if (journal_res_get_fast(j, res, flags)) { - spin_unlock(&j->lock); - return 0; - } - - if ((flags & BCH_WATERMARK_MASK) < j->watermark) { - /* - * Don't want to close current journal entry, just need to - * invoke reclaim: - */ - ret = JOURNAL_ERR_journal_full; + ret = 0; goto unlock; } @@ -510,30 +510,31 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false); - ret = journal_entry_open(j); - - if (ret == JOURNAL_ERR_max_in_flight) { - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], - &j->max_in_flight_start, true); - if (trace_journal_entry_full_enabled()) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - - bch2_journal_bufs_to_text(&buf, j); - trace_journal_entry_full(c, buf.buf); - printbuf_exit(&buf); - } - count_event(c, journal_entry_full); - } + ret = journal_entry_open(j) ?: JOURNAL_ERR_retry; unlock: can_discard = j->can_discard; spin_unlock(&j->lock); - - if (!ret) +out: + if (ret == JOURNAL_ERR_retry) goto retry; + if (!ret) + return 0; + if (journal_error_check_stuck(j, ret, flags)) ret = -BCH_ERR_journal_res_get_blocked; + if (ret == JOURNAL_ERR_max_in_flight && + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], + &j->max_in_flight_start, true)) { + + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); + bch2_journal_bufs_to_text(&buf, j); + trace_journal_entry_full(c, buf.buf); + printbuf_exit(&buf); + count_event(c, journal_entry_full); + } + /* * Journal is full - can't rely on reclaim from work item due to * freezing: diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 2ca5d5014cf654..1493c262eaf407 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -134,6 +134,7 @@ enum journal_flags { /* Reasons we may fail to get a journal reservation: */ #define JOURNAL_ERRORS() \ x(ok) \ + x(retry) \ x(blocked) \ x(max_in_flight) \ x(journal_full) \ From 34ed9afffafec6a20d2806ff0c51629b4532bfd8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Jan 2024 17:29:15 -0500 Subject: [PATCH 0473/1406] bcachefs: fixup for building in userspace Signed-off-by: Kent Overstreet --- fs/bcachefs/checksum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 3c761ad6b1c8ef..4701457f6381ca 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -558,7 +558,7 @@ static int __bch2_request_key(char *key_description, struct bch_key *key) return 0; } -#include "../crypto.h" +#include "crypto.h" #endif int bch2_request_key(struct bch_sb *sb, struct bch_key *key) From 21fcde5349c97fb23fc9b7915f5485bdd3d88f50 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Jan 2024 17:46:14 -0500 Subject: [PATCH 0474/1406] bcachefs: Improve bch2_dirent_to_text() For DT_SUBVOL, we now print both parent and child subvol IDs. 
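With made-up names and IDs, the printed form changes for subvolume dirents from a single target number to an explicit parent -> child pair, while regular dirents keep the old format:

	foo -> 4099 type reg
	snap -> 1 -> 7 type subvol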
Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 4ae1e9f002a09b..ae29ad0c63e574 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -144,19 +144,21 @@ int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k, return ret; } -void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); struct qstr d_name = bch2_dirent_get_name(d); - prt_printf(out, "%.*s -> %llu type %s", - d_name.len, - d_name.name, - d.v->d_type != DT_SUBVOL - ? le64_to_cpu(d.v->d_inum) - : le32_to_cpu(d.v->d_child_subvol), - bch2_d_type_str(d.v->d_type)); + prt_printf(out, "%.*s -> ", d_name.len, d_name.name); + + if (d.v->d_type != DT_SUBVOL) + prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum)); + else + prt_printf(out, "%u -> %u", + le32_to_cpu(d.v->d_parent_subvol), + le32_to_cpu(d.v->d_child_subvol)); + + prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type)); } static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, From 28463af552020eab70330f0f5c0e6b12fae93848 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 Jan 2024 20:55:08 -0500 Subject: [PATCH 0475/1406] bcachefs: Workqueues should be WQ_HIGHPRI Most bcachefs workqueues are used for completions, and should be WQ_HIGHPRI - this helps reduce queuing delays, we want to complete quickly once we can no longer signal backpressure by blocking. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 6b23e11825e6d4..d5bd337ec64f7c 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -862,13 +862,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); if (!(c->btree_update_wq = alloc_workqueue("bcachefs", - WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", - WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || !(c->io_complete_wq = alloc_workqueue("bcachefs_io", - WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 512)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", WQ_FREEZABLE, 0)) || #ifndef BCH_WRITE_REF_DEBUG From ef2ef95c168ac80a451feb3e1f243f45eb877fcd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 Jan 2024 19:57:26 -0500 Subject: [PATCH 0476/1406] bcachefs: bch2_hash_set_snapshot() -> bch2_hash_set_in_snapshot() Minor renaming for clarity, bit of refactoring. 
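For illustration, the resulting allocation pattern is as follows (the queue name and max_active value here are placeholders; the flag combination is taken from the diff below):

	wq = alloc_workqueue("bcachefs_example",
			     WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512);

WQ_HIGHPRI places work items on the per-CPU high-priority worker pool, which suits completion work that should run promptly once blocking can no longer apply backpressure.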
Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 8 ++++---- fs/bcachefs/fsck.c | 7 +++---- fs/bcachefs/str_hash.h | 15 +++++---------- 3 files changed, 12 insertions(+), 18 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index ae29ad0c63e574..8c1673e6ac4c6d 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -219,10 +219,10 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, dirent->k.p.inode = dir; dirent->k.p.snapshot = snapshot; - ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info, - zero_inum, snapshot, - &dirent->k_i, str_hash_flags, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, + zero_inum, snapshot, + &dirent->k_i, str_hash_flags, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); *dir_offset = dirent->k.p.offset; return ret; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 6a760777bafb06..8e53ac2fbcd3b8 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -100,8 +100,8 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, } static int lookup_inode(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode, - u32 *snapshot) + struct bch_inode_unpacked *inode, + u32 *snapshot) { struct btree_iter iter; struct bkey_s_c k; @@ -722,7 +722,7 @@ static int hash_redo_key(struct btree_trans *trans, delete->k.p = k_iter->pos; return bch2_btree_iter_traverse(k_iter) ?: bch2_trans_update(trans, k_iter, delete, 0) ?: - bch2_hash_set_snapshot(trans, desc, hash_info, + bch2_hash_set_in_snapshot(trans, desc, hash_info, (subvol_inum) { 0, k.k->p.inode }, k.k->p.snapshot, tmp, BCH_HASH_SET_MUST_CREATE, @@ -1781,7 +1781,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (d.v->d_type == DT_DIR) for_each_visible_inode(c, s, dir, equiv.snapshot, i) i->count++; - out: err: fsck_err: diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index fcaa5a88874488..3976f80721bf1b 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -259,7 +259,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, } static __always_inline -int bch2_hash_set_snapshot(struct btree_trans *trans, +int bch2_hash_set_in_snapshot(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, u32 snapshot, @@ -328,17 +328,12 @@ int bch2_hash_set(struct btree_trans *trans, struct bkey_i *insert, bch_str_hash_flags_t str_hash_flags) { - u32 snapshot; - int ret; - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - return ret; - insert->k.p.inode = inum.inum; - return bch2_hash_set_snapshot(trans, desc, info, inum, - snapshot, insert, str_hash_flags, 0); + u32 snapshot; + return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: + bch2_hash_set_in_snapshot(trans, desc, info, inum, + snapshot, insert, str_hash_flags, 0); } static __always_inline From e89242d7b92e2333549e88e5390ab981dafe0b79 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 Jan 2024 12:35:06 -0500 Subject: [PATCH 0477/1406] bcachefs: Cleanup bch2_dirent_lookup_trans() Drop an unnecessary bch2_subvolume_get_snapshot() call, and drop the __ from the name - this is a normal interface. 
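The refactored bch2_hash_set() relies on the GNU C a ?: b extension for error chaining: since both helpers return 0 on success and a negative error code on failure, the first failing call short-circuits the expression (taken from the diff below):

	u32 snapshot;
	return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
	       bch2_hash_set_in_snapshot(trans, desc, info, inum,
					 snapshot, insert, str_hash_flags, 0);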
Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 34 +++++++++++----------------------- fs/bcachefs/dirent.h | 2 +- fs/bcachefs/fs-common.c | 4 ++-- 3 files changed, 14 insertions(+), 26 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 8c1673e6ac4c6d..116752a7d1cd05 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -458,41 +458,29 @@ int bch2_dirent_rename(struct btree_trans *trans, return ret; } -int __bch2_dirent_lookup_trans(struct btree_trans *trans, - struct btree_iter *iter, - subvol_inum dir, - const struct bch_hash_info *hash_info, - const struct qstr *name, subvol_inum *inum, - unsigned flags) +int bch2_dirent_lookup_trans(struct btree_trans *trans, + struct btree_iter *iter, + subvol_inum dir, + const struct bch_hash_info *hash_info, + const struct qstr *name, subvol_inum *inum, + unsigned flags) { - struct bkey_s_c k; - struct bkey_s_c_dirent d; - u32 snapshot; - int ret; - - ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); + int ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, + hash_info, dir, name, flags); if (ret) return ret; - ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, - hash_info, dir, name, flags); - if (ret) - return ret; - - k = bch2_btree_iter_peek_slot(iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) goto err; - d = bkey_s_c_to_dirent(k); - - ret = bch2_dirent_read_target(trans, dir, d, inum); + ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum); if (ret > 0) ret = -ENOENT; err: if (ret) bch2_trans_iter_exit(trans, iter); - return ret; } @@ -504,7 +492,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, struct btree_iter iter = { NULL }; int ret = lockrestart_do(trans, - __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); + bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); return ret; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 21ffeb78f02ee3..f1dd7208a58e05 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -62,7 +62,7 @@ int bch2_dirent_rename(struct btree_trans *, const struct qstr *, subvol_inum *, u64 *, enum bch_rename_mode); -int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, +int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, subvol_inum, const struct bch_hash_info *, const struct qstr *, subvol_inum *, unsigned); u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 1c1ea0f0c692a6..8ee716e4c2e72e 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -260,8 +260,8 @@ int bch2_unlink_trans(struct btree_trans *trans, dir_hash = bch2_hash_info_init(c, dir_u); - ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, - name, &inum, BTREE_ITER_INTENT); + ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, + name, &inum, BTREE_ITER_INTENT); if (ret) goto err; From d2fba5604106d335e99eb6bc54b37941242ddb0b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 Jan 2024 00:05:03 -0500 Subject: [PATCH 0478/1406] bcachefs: convert journal replay ptrs to darray Eliminates some error paths - no longer have a hardcoded BCH_REPLICAS_MAX limit. 
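The normal-interface calling convention, as used by bch2_dirent_lookup() in the diff below:

	struct btree_iter iter = { NULL };

	int ret = lockrestart_do(trans,
		bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
	bch2_trans_iter_exit(trans, &iter);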
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 5 ++- fs/bcachefs/journal_io.c | 70 +++++++++++++--------------------------- fs/bcachefs/journal_io.h | 19 ++++++----- 3 files changed, 36 insertions(+), 58 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 20fdacb86f517f..edbbedaa0def63 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1153,7 +1153,6 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) struct journal_replay *i, **_i; struct genradix_iter iter; bool had_entries = false; - unsigned ptr; u64 last_seq = cur_seq, nr, seq; genradix_for_each_reverse(&c->journal_entries, iter, _i) { @@ -1207,8 +1206,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) p = journal_seq_pin(j, seq); p->devs.nr = 0; - for (ptr = 0; ptr < i->nr_ptrs; ptr++) - bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); + darray_for_each(i->ptrs, ptr) + bch2_dev_list_add_dev(&p->devs, ptr->dev); had_entries = true; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 8047425e84c311..8cfb8f1fe02d58 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -84,7 +84,6 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, { struct genradix_iter iter; struct journal_replay **_i, *i, *dup; - struct journal_ptr *ptr; size_t bytes = vstruct_bytes(j); u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; int ret = JOURNAL_ENTRY_ADD_OK; @@ -156,45 +155,29 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, if (!i) return -BCH_ERR_ENOMEM_journal_entry_add; - i->nr_ptrs = 0; + darray_init(&i->ptrs); i->csum_good = entry_ptr.csum_good; i->ignore = false; unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); - i->ptrs[i->nr_ptrs++] = entry_ptr; + darray_push(&i->ptrs, entry_ptr); if (dup) { - if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) { - bch_err(c, "found too many copies of journal entry %llu", - le64_to_cpu(i->j.seq)); - dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1; - } - /* The first ptr should represent the jset we kept: */ - memcpy(i->ptrs + i->nr_ptrs, - dup->ptrs, - sizeof(dup->ptrs[0]) * dup->nr_ptrs); - i->nr_ptrs += dup->nr_ptrs; + darray_for_each(dup->ptrs, ptr) + darray_push(&i->ptrs, *ptr); __journal_replay_free(c, dup); } *_i = i; - return 0; found: - for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { + darray_for_each(i->ptrs, ptr) if (ptr->dev == ca->dev_idx) { bch_err(c, "duplicate journal entry %llu on same device", le64_to_cpu(i->j.seq)); goto out; } - } - if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { - bch_err(c, "found too many copies of journal entry %llu", - le64_to_cpu(i->j.seq)); - goto out; - } - - i->ptrs[i->nr_ptrs++] = entry_ptr; + ret = darray_push(&i->ptrs, entry_ptr); out: fsck_err: return ret; @@ -1102,16 +1085,15 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) if (!r) continue; - for (i = 0; i < r->nr_ptrs; i++) { - if (r->ptrs[i].dev == ca->dev_idx) { - unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + + darray_for_each(r->ptrs, i) + if (i->dev == ca->dev_idx) { + unsigned wrote = bucket_remainder(ca, i->sector) + vstruct_sectors(&r->j, c->block_bits); - ja->cur_idx = r->ptrs[i].bucket; + ja->cur_idx = i->bucket; ja->sectors_free = ca->mi.bucket_size - wrote; goto found; } - } } found: mutex_unlock(&jlist->lock); @@ -1158,21 +1140,16 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j) { - unsigned i; - - for (i = 0; i < 
j->nr_ptrs; i++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); + darray_for_each(j->ptrs, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); u64 offset; - div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); + div64_u64_rem(i->sector, ca->mi.bucket_size, &offset); - if (i) + if (i != j->ptrs.data) prt_printf(out, " "); prt_printf(out, "%u:%u:%u (sector %llu)", - j->ptrs[i].dev, - j->ptrs[i].bucket, - j->ptrs[i].bucket_offset, - j->ptrs[i].sector); + i->dev, i->bucket, i->bucket_offset, i->sector); } } @@ -1353,32 +1330,31 @@ int bch2_journal_read(struct bch_fs *c, .e.data_type = BCH_DATA_journal, .e.nr_required = 1, }; - unsigned ptr; i = *_i; if (!i || i->ignore) continue; - for (ptr = 0; ptr < i->nr_ptrs; ptr++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); + darray_for_each(i->ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - if (!i->ptrs[ptr].csum_good) - bch_err_dev_offset(ca, i->ptrs[ptr].sector, + if (!ptr->csum_good) + bch_err_dev_offset(ca, ptr->sector, "invalid journal checksum, seq %llu%s", le64_to_cpu(i->j.seq), i->csum_good ? " (had good copy on another device)" : ""); } ret = jset_validate(c, - bch_dev_bkey_exists(c, i->ptrs[0].dev), + bch_dev_bkey_exists(c, i->ptrs.data[0].dev), &i->j, - i->ptrs[0].sector, + i->ptrs.data[0].sector, READ); if (ret) goto err; - for (ptr = 0; ptr < i->nr_ptrs; ptr++) - replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; + darray_for_each(i->ptrs, ptr) + replicas.e.devs[replicas.e.nr_devs++] = ptr->dev; bch2_replicas_entry_sort(&replicas.e); diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index c035e7c108e190..1e0b9a57164868 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -2,19 +2,22 @@ #ifndef _BCACHEFS_JOURNAL_IO_H #define _BCACHEFS_JOURNAL_IO_H +#include "darray.h" + +struct journal_ptr { + bool csum_good; + u8 dev; + u32 bucket; + u32 bucket_offset; + u64 sector; +}; + /* * Only used for holding the journal entries we read in btree_journal_read() * during cache_registration */ struct journal_replay { - struct journal_ptr { - bool csum_good; - u8 dev; - u32 bucket; - u32 bucket_offset; - u64 sector; - } ptrs[BCH_REPLICAS_MAX]; - unsigned nr_ptrs; + DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs; bool csum_good; bool ignore; From 50172b92ece31e4a17983fb7feabf9c4c408adf0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 Jan 2024 00:31:13 -0500 Subject: [PATCH 0479/1406] bcachefs: fix split brain message Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index d5bd337ec64f7c..ab14277438d009 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1124,7 +1124,7 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, prt_newline(&buf); prt_bdevname(&buf, fs->bdev); - prt_str(&buf, "believes seq of "); + prt_str(&buf, " believes seq of "); prt_bdevname(&buf, sb->bdev); prt_printf(&buf, " to be %llu, but ", seq_from_fs); prt_bdevname(&buf, sb->bdev); From f7443adebfc2645c79d3810923ca5791f04c2a70 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 Jan 2024 10:01:23 -0500 Subject: [PATCH 0480/1406] bcachefs: improve journal entry read fsck error messages Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 96 ++++++++++++++++++++--------------- fs/bcachefs/sb-errors_types.h | 3 +- 2 files changed, 57 insertions(+), 42 deletions(-) diff --git a/fs/bcachefs/journal_io.c 
b/fs/bcachefs/journal_io.c index 8cfb8f1fe02d58..66ce522950fff6 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -17,6 +17,30 @@ #include "sb-clean.h" #include "trace.h" +void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + darray_for_each(j->ptrs, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); + u64 offset; + + div64_u64_rem(i->sector, ca->mi.bucket_size, &offset); + + if (i != j->ptrs.data) + prt_printf(out, " "); + prt_printf(out, "%u:%u:%u (sector %llu)", + i->dev, i->bucket, i->bucket_offset, i->sector); + } +} + +static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); + + bch2_journal_ptrs_to_text(out, c, j); +} + static struct nonce journal_nonce(const struct jset *jset) { return (struct nonce) {{ @@ -86,6 +110,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, struct journal_replay **_i, *i, *dup; size_t bytes = vstruct_bytes(j); u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; + struct printbuf buf = PRINTBUF; int ret = JOURNAL_ENTRY_ADD_OK; /* Is this entry older than the range we need? */ @@ -130,25 +155,37 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, */ dup = *_i; if (dup) { - if (bytes == vstruct_bytes(&dup->j) && - !memcmp(j, &dup->j, bytes)) { - i = dup; - goto found; - } + bool identical = bytes == vstruct_bytes(&dup->j) && + !memcmp(j, &dup->j, bytes); + bool not_identical = !identical && + entry_ptr.csum_good && + dup->csum_good; - if (!entry_ptr.csum_good) { - i = dup; - goto found; - } + bool same_device = false; + darray_for_each(dup->ptrs, ptr) + if (ptr->dev == ca->dev_idx) + same_device = true; + + ret = darray_push(&dup->ptrs, entry_ptr); + if (ret) + goto out; - if (!dup->csum_good) + bch2_journal_replay_to_text(&buf, c, dup); + + fsck_err_on(same_device, + c, journal_entry_dup_same_device, + "duplicate journal entry on same device\n %s", + buf.buf); + + fsck_err_on(not_identical, + c, journal_entry_replicas_data_mismatch, + "found duplicate but non identical journal entries\n %s", + buf.buf); + + if (entry_ptr.csum_good && !identical) goto replace; - fsck_err(c, journal_entry_replicas_data_mismatch, - "found duplicate but non identical journal entries (seq %llu)", - le64_to_cpu(j->seq)); - i = dup; - goto found; + goto out; } replace: i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); @@ -159,27 +196,20 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, i->csum_good = entry_ptr.csum_good; i->ignore = false; unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); - darray_push(&i->ptrs, entry_ptr); if (dup) { /* The first ptr should represent the jset we kept: */ darray_for_each(dup->ptrs, ptr) darray_push(&i->ptrs, *ptr); __journal_replay_free(c, dup); + } else { + darray_push(&i->ptrs, entry_ptr); } *_i = i; -found: - darray_for_each(i->ptrs, ptr) - if (ptr->dev == ca->dev_idx) { - bch_err(c, "duplicate journal entry %llu on same device", - le64_to_cpu(i->j.seq)); - goto out; - } - - ret = darray_push(&i->ptrs, entry_ptr); out: fsck_err: + printbuf_exit(&buf); return ret; } @@ -1137,22 +1167,6 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) goto out; } -void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) -{ - darray_for_each(j->ptrs, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); - u64 offset; - 
- div64_u64_rem(i->sector, ca->mi.bucket_size, &offset); - - if (i != j->ptrs.data) - prt_printf(out, " "); - prt_printf(out, "%u:%u:%u (sector %llu)", - i->dev, i->bucket, i->bucket_offset, i->sector); - } -} - int bch2_journal_read(struct bch_fs *c, u64 *last_seq, u64 *blacklist_seq, diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index c08aacdfd073c2..dbfd91ab86cfae 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -250,7 +250,8 @@ x(hash_table_key_duplicate, 242) \ x(hash_table_key_wrong_offset, 243) \ x(unlinked_inode_not_on_deleted_list, 244) \ - x(reflink_p_front_pad_bad, 245) + x(reflink_p_front_pad_bad, 245) \ + x(journal_entry_dup_same_device, 246) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, From 06e55b11c34b8bf0ad0aef7a982234817b97974b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 Jan 2024 10:16:15 -0500 Subject: [PATCH 0481/1406] bcachefs: jset_entry_datetime This gives us a way to record the date and time every journal entry was written - useful for debugging. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 8 ++++++- fs/bcachefs/journal_io.c | 44 +++++++++++++++++++++++++++++++++++ fs/bcachefs/journal_io.h | 16 +++++++++++++ fs/bcachefs/sb-clean.c | 16 ------------- 4 files changed, 67 insertions(+), 17 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 0668b682a21ca8..14f613617913e1 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1275,7 +1275,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(dev_usage, 8) \ x(log, 9) \ x(overwrite, 10) \ - x(write_buffer_keys, 11) + x(write_buffer_keys, 11) \ + x(datetime, 12) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -1376,6 +1377,11 @@ struct jset_entry_log { u8 d[]; } __packed __aligned(8); +struct jset_entry_datetime { + struct jset_entry entry; + __le64 seconds; +} __packed __aligned(8); + /* * On disk format for a journal entry: * seq is monotonically increasing; every journal entry has its own unique diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 66ce522950fff6..0ca6d976f4d51a 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -39,6 +39,14 @@ static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); bch2_journal_ptrs_to_text(out, c, j); + + struct jset_entry *entry; + for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) { + struct jset_entry_datetime *datetime = + container_of(entry, struct jset_entry_datetime, entry); + bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); + break; + } } static struct nonce journal_nonce(const struct jset *jset) @@ -754,6 +762,37 @@ static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct journal_entry_btree_keys_to_text(out, c, entry); } +static int journal_entry_datetime_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + unsigned bytes = vstruct_bytes(entry); + unsigned expected = 16; + int ret = 0; + + if (journal_entry_err_on(vstruct_bytes(entry) < expected, + c, version, jset, entry, + journal_entry_dev_usage_bad_size, + "bad size (%u < %u)", + bytes, expected)) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } +fsck_err: + return ret; +} + +static void journal_entry_datetime_to_text(struct printbuf *out, struct 
bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_datetime *datetime = + container_of(entry, struct jset_entry_datetime, entry); + + bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, @@ -1794,6 +1833,11 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); + struct jset_entry_datetime *d = + container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry); + d->entry.type = BCH_JSET_ENTRY_datetime; + d->seconds = cpu_to_le64(ktime_get_real_seconds()); + bch2_journal_super_entries_add_common(c, &end, seq); u64s = (u64 *) end - (u64 *) start; BUG_ON(u64s > j->entry_u64s_reserved); diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 1e0b9a57164868..1f395f43cf76f3 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -65,4 +65,20 @@ int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); CLOSURE_CALLBACK(bch2_journal_write); +static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) +{ + struct jset_entry *entry = *end; + unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); + + memset(entry, 0, u64s * sizeof(u64)); + /* + * The u64s field counts from the start of data, ignoring the shared + * fields. + */ + entry->u64s = cpu_to_le16(u64s - 1); + + *end = vstruct_next(*end); + return entry; +} + #endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index b6bf0ebe7e8404..5980ba2563fe9f 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -171,22 +171,6 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) return ERR_PTR(ret); } -static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) -{ - struct jset_entry *entry = *end; - unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); - - memset(entry, 0, u64s * sizeof(u64)); - /* - * The u64s field counts from the start of data, ignoring the shared - * fields. - */ - entry->u64s = cpu_to_le16(u64s - 1); - - *end = vstruct_next(*end); - return entry; -} - void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry **end, u64 journal_seq) From 2cb13b3cb7fdab08c459cdc5a128284212aeda57 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jan 2024 13:20:28 -0500 Subject: [PATCH 0482/1406] bcachefs: bio per journal buf Prep work for having multiple journal writes in flight. 
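The essence of the change, condensed (a sketch of the shape of the code in the diff below, not a verbatim excerpt):

	/* Before: one bio per device, reused for every journal write */
	struct journal_device {
		struct bio	*bio;
	};

	/* After: one bio per device per journal buffer */
	struct journal_device {
		struct bio	*bio[JOURNAL_BUF_NR];
	};

	unsigned buf_idx = journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK;
	struct bio *bio = ja->bio[buf_idx];

Once the rest of the pipelining work lands, this lets the write for one journal buffer be submitted while the write for another is still in flight, instead of serializing on a single shared bio.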
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 40 ++++++++++++++++++++----------------- fs/bcachefs/journal_io.c | 21 +++++++++---------- fs/bcachefs/journal_types.h | 2 +- 3 files changed, 34 insertions(+), 29 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index edbbedaa0def63..f714fc7238f872 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1235,13 +1235,17 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) void bch2_dev_journal_exit(struct bch_dev *ca) { - kfree(ca->journal.bio); - kfree(ca->journal.buckets); - kfree(ca->journal.bucket_seq); + struct journal_device *ja = &ca->journal; + + for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { + kfree(ja->bio[i]); + ja->bio[i] = NULL; + } - ca->journal.bio = NULL; - ca->journal.buckets = NULL; - ca->journal.bucket_seq = NULL; + kfree(ja->buckets); + kfree(ja->bucket_seq); + ja->buckets = NULL; + ja->bucket_seq = NULL; } int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) @@ -1251,14 +1255,13 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) bch2_sb_field_get(sb, journal); struct bch_sb_field_journal_v2 *journal_buckets_v2 = bch2_sb_field_get(sb, journal_v2); - unsigned i, nr_bvecs; ja->nr = 0; if (journal_buckets_v2) { unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); - for (i = 0; i < nr; i++) + for (unsigned i = 0; i < nr; i++) ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); } else if (journal_buckets) { ja->nr = bch2_nr_journal_buckets(journal_buckets); @@ -1268,13 +1271,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) if (!ja->bucket_seq) return -BCH_ERR_ENOMEM_dev_journal_init; - nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); + unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); - ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); - if (!ca->journal.bio) - return -BCH_ERR_ENOMEM_dev_journal_init; - - bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0); + for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { + ja->bio[i] = bio_kmalloc(nr_bvecs, GFP_KERNEL); + if (!ja->bio[i]) + return -BCH_ERR_ENOMEM_dev_journal_init; + bio_init(ja->bio[i], NULL, ja->bio[i]->bi_inline_vecs, nr_bvecs, 0); + } ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->buckets) @@ -1282,14 +1286,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) if (journal_buckets_v2) { unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); - unsigned j, dst = 0; + unsigned dst = 0; - for (i = 0; i < nr; i++) - for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) + for (unsigned i = 0; i < nr; i++) + for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) ja->buckets[dst++] = le64_to_cpu(journal_buckets_v2->d[i].start) + j; } else if (journal_buckets) { - for (i = 0; i < ja->nr; i++) + for (unsigned i = 0; i < ja->nr; i++) ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 0ca6d976f4d51a..1dc8318e1f143b 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1721,13 +1721,14 @@ static CLOSURE_CALLBACK(do_journal_write) { closure_type(j, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - struct journal_buf *w = journal_last_unwritten_buf(j); - struct bio *bio; + unsigned buf_idx = journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK; + struct journal_buf *w = 
j->buf + buf_idx; unsigned sectors = vstruct_sectors(w->data, c->block_bits); extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct journal_device *ja = &ca->journal; + if (!percpu_ref_tryget(&ca->io_ref)) { /* XXX: fix this */ bch_err(c, "missing device for journal write\n"); @@ -1737,7 +1738,7 @@ static CLOSURE_CALLBACK(do_journal_write) this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], sectors); - bio = ca->journal.bio; + struct bio *bio = ja->bio[buf_idx]; bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; @@ -1756,8 +1757,7 @@ static CLOSURE_CALLBACK(do_journal_write) trace_and_count(c, journal_write, bio); closure_bio_submit(bio, cl); - ca->journal.bucket_seq[ca->journal.cur_idx] = - le64_to_cpu(w->data->seq); + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); } continue_at(cl, journal_write_done, j->wq); @@ -1939,9 +1939,9 @@ CLOSURE_CALLBACK(bch2_journal_write) { closure_type(j, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *w = journal_last_unwritten_buf(j); + unsigned buf_idx = journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK; + struct journal_buf *w = j->buf + buf_idx; struct bch_replicas_padded replicas; - struct bio *bio; struct printbuf journal_debug_buf = PRINTBUF; unsigned nr_rw_members = 0; int ret; @@ -2023,7 +2023,8 @@ CLOSURE_CALLBACK(bch2_journal_write) for_each_rw_member(c, ca) { percpu_ref_get(&ca->io_ref); - bio = ca->journal.bio; + struct journal_device *ja = &ca->journal; + struct bio *bio = ja->bio[buf_idx]; bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); bio->bi_end_io = journal_write_endio; diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 1493c262eaf407..79db1daa1de2e7 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -315,7 +315,7 @@ struct journal_device { u64 *buckets; /* Bio for journal reads/writes to this device */ - struct bio *bio; + struct bio *bio[JOURNAL_BUF_NR]; /* for bch_journal_read_device */ struct closure read; From 113876d40453cdce5213d4a8aee38821ff3e4a8e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jan 2024 13:42:48 -0500 Subject: [PATCH 0483/1406] bcachefs: closure per journal buf Prep work for having multiple journal writes in flight. 
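Condensed, the ownership change looks like this (a sketch of the shape of the code in the diff below, not a verbatim excerpt):

	/* Before: a single closure embedded in struct journal */
	struct journal {
		struct closure	io;
	};

	/* After: a closure per journal buffer, which can find its journal */
	struct journal_buf {
		struct closure	io;
		u8		idx;
	};

	static CLOSURE_CALLBACK(journal_write_done)
	{
		closure_type(w, struct journal_buf, io);
		struct journal *j = container_of(w, struct journal, buf[w->idx]);
		...
	}

With the completion state moved into the buffer itself, each in-flight journal write gets its own closure rather than contending for one shared with the whole journal.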
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 18 ++++++++++++------ fs/bcachefs/journal_io.c | 34 +++++++++++++++++++--------------- fs/bcachefs/journal_types.h | 12 ++++++++++-- 3 files changed, 41 insertions(+), 23 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index f714fc7238f872..96393393036609 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -186,8 +186,10 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) if (__bch2_journal_pin_put(j, seq)) bch2_journal_reclaim_fast(j); - if (write) - closure_call(&j->io, bch2_journal_write, j->wq, NULL); + if (write) { + struct journal_buf *w = j->buf + (seq & JOURNAL_BUF_MASK); + closure_call(&w->io, bch2_journal_write, j->wq, NULL); + } } /* @@ -1274,10 +1276,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { - ja->bio[i] = bio_kmalloc(nr_bvecs, GFP_KERNEL); + ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, + nr_bvecs), GFP_KERNEL); if (!ja->bio[i]) return -BCH_ERR_ENOMEM_dev_journal_init; - bio_init(ja->bio[i], NULL, ja->bio[i]->bi_inline_vecs, nr_bvecs, 0); + + ja->bio[i]->ca = ca; + ja->bio[i]->buf_idx = i; + bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0); } ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); @@ -1340,6 +1346,7 @@ int bch2_fs_journal_init(struct journal *j) j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); if (!j->buf[i].data) return -BCH_ERR_ENOMEM_journal_buf; + j->buf[i].idx = i; } j->pin.front = j->pin.back = 1; @@ -1459,7 +1466,6 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *pin; - unsigned i; spin_lock(&j->lock); *seq = max(*seq, j->pin.front); @@ -1477,7 +1483,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 prt_newline(out); printbuf_indent_add(out, 2); - for (i = 0; i < ARRAY_SIZE(pin_list->list); i++) + for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++) list_for_each_entry(pin, &pin_list->list[i], list) { prt_printf(out, "\t%px %ps", pin, pin->flush); prt_newline(out); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 1dc8318e1f143b..d02e499566219f 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1597,9 +1597,9 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) static CLOSURE_CALLBACK(journal_write_done) { - closure_type(j, struct journal, io); + closure_type(w, struct journal_buf, io); + struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *w = journal_last_unwritten_buf(j); struct bch_replicas_padded replicas; union journal_res_state old, new; u64 v, seq; @@ -1676,8 +1676,9 @@ static CLOSURE_CALLBACK(journal_write_done) if (!journal_state_count(new, new.unwritten_idx) && journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { + struct journal_buf *w = j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); spin_unlock(&j->lock); - closure_call(&j->io, bch2_journal_write, j->wq, NULL); + closure_call(&w->io, bch2_journal_write, j->wq, NULL); } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { struct journal_buf *buf = journal_cur_buf(j); @@ -1698,31 
+1699,32 @@ static CLOSURE_CALLBACK(journal_write_done) static void journal_write_endio(struct bio *bio) { - struct bch_dev *ca = bio->bi_private; + struct journal_bio *jbio = container_of(bio, struct journal_bio, bio); + struct bch_dev *ca = jbio->ca; struct journal *j = &ca->fs->journal; - struct journal_buf *w = journal_last_unwritten_buf(j); - unsigned long flags; + struct journal_buf *w = j->buf + jbio->buf_idx; if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, "error writing journal entry %llu: %s", le64_to_cpu(w->data->seq), bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("journal")) { + unsigned long flags; + spin_lock_irqsave(&j->err_lock, flags); bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); spin_unlock_irqrestore(&j->err_lock, flags); } - closure_put(&j->io); + closure_put(&w->io); percpu_ref_put(&ca->io_ref); } static CLOSURE_CALLBACK(do_journal_write) { - closure_type(j, struct journal, io); + closure_type(w, struct journal_buf, io); + struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned buf_idx = journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK; - struct journal_buf *w = j->buf + buf_idx; unsigned sectors = vstruct_sectors(w->data, c->block_bits); extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { @@ -1738,7 +1740,7 @@ static CLOSURE_CALLBACK(do_journal_write) this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], sectors); - struct bio *bio = ja->bio[buf_idx]; + struct bio *bio = &ja->bio[w->idx]->bio; bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; @@ -1937,10 +1939,9 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * CLOSURE_CALLBACK(bch2_journal_write) { - closure_type(j, struct journal, io); + closure_type(w, struct journal_buf, io); + struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned buf_idx = journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK; - struct journal_buf *w = j->buf + buf_idx; struct bch_replicas_padded replicas; struct printbuf journal_debug_buf = PRINTBUF; unsigned nr_rw_members = 0; @@ -2019,12 +2020,15 @@ CLOSURE_CALLBACK(bch2_journal_write) if (ret) goto err; + if (!JSET_NO_FLUSH(w->data)) + closure_wait_event(&j->async_wait, j->seq_ondisk + 1 == le64_to_cpu(w->data->seq)); + if (!JSET_NO_FLUSH(w->data) && w->separate_flush) { for_each_rw_member(c, ca) { percpu_ref_get(&ca->io_ref); struct journal_device *ja = &ca->journal; - struct bio *bio = ja->bio[buf_idx]; + struct bio *bio = &ja->bio[w->idx]->bio; bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); bio->bi_end_io = journal_write_endio; diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 79db1daa1de2e7..d75fbd3b1d34d1 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -18,6 +18,7 @@ * the journal that are being staged or in flight. 
*/ struct journal_buf { + struct closure io; struct jset *data; __BKEY_PADDED(key, BCH_REPLICAS_MAX); @@ -37,6 +38,7 @@ struct journal_buf { bool must_flush; /* something wants a flush */ bool separate_flush; bool need_flush_to_write_buffer; + u8 idx; }; /* @@ -150,6 +152,13 @@ enum journal_errors { typedef DARRAY(u64) darray_u64; +struct journal_bio { + struct bch_dev *ca; + unsigned buf_idx; + + struct bio bio; +}; + /* Embedded in struct bch_fs */ struct journal { /* Fastpath stuff up front: */ @@ -204,7 +213,6 @@ struct journal { wait_queue_head_t wait; struct closure_waitlist async_wait; - struct closure io; struct delayed_work write_work; struct workqueue_struct *wq; @@ -315,7 +323,7 @@ struct journal_device { u64 *buckets; /* Bio for journal reads/writes to this device */ - struct bio *bio[JOURNAL_BUF_NR]; + struct journal_bio *bio[JOURNAL_BUF_NR]; /* for bch_journal_read_device */ struct closure read; From f491d96a8778c90bbce8536eb36588b86b7d2f03 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jan 2024 14:26:15 -0500 Subject: [PATCH 0484/1406] bcachefs: better journal pipelining Recently a severe performance regression was discovered, which bisected to a6548c8b5eb5 bcachefs: Avoid flushing the journal in the discard path It turns out the old behaviour, which issued excessive journal flushes, worked around a performance issue where queueing delays would cause the journal to not be able to write quickly enough and stall. The journal flushes masked the issue because they periodically flushed the device write cache, reducing write latency for non flushes. This patch reworks the journalling code to allow more than one (non-flush) write to be in flight at a time. With this patch, doing 4k random writes and an iodepth of 128, we are now able to hit 560k iops to a Samsung 970 EVO Plus - previously, we were stuck in the ~200k range. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 47 +++++++++++++++---- fs/bcachefs/journal.h | 7 +-- fs/bcachefs/journal_io.c | 92 ++++++++++++++++++++----------------- fs/bcachefs/journal_types.h | 11 +++-- 4 files changed, 98 insertions(+), 59 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 96393393036609..fe5f7a944ad308 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -74,6 +74,13 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 prt_printf(out, "%li jiffies", buf->expires - jiffies); prt_newline(out); + if (buf->write_done) + prt_printf(out, "write done\n"); + else if (buf->write_allocated) + prt_printf(out, "write allocated\n"); + else if (buf->write_started) + prt_printf(out, "write started\n"); + printbuf_indent_sub(out, 2); } @@ -175,21 +182,40 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) return stuck; } +void bch2_journal_do_writes(struct journal *j) +{ + for (u64 seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) { + unsigned idx = seq & JOURNAL_BUF_MASK; + struct journal_buf *w = j->buf + idx; + + if (w->write_started && !w->write_allocated) + break; + if (w->write_started) + continue; + + if (!journal_state_count(j->reservations, idx)) { + w->write_started = true; + closure_call(&w->io, bch2_journal_write, j->wq, NULL); + } + + break; + } +} + /* * Final processing when the last reference of a journal buffer has been * dropped. Drop the pin list reference acquired at journal entry open and write * the buffer, if requested. 
*/ -void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) +void bch2_journal_buf_put_final(struct journal *j, u64 seq) { lockdep_assert_held(&j->lock); if (__bch2_journal_pin_put(j, seq)) bch2_journal_reclaim_fast(j); - if (write) { - struct journal_buf *w = j->buf + (seq & JOURNAL_BUF_MASK); - closure_call(&w->io, bch2_journal_write, j->wq, NULL); - } + bch2_journal_do_writes(j); } /* @@ -381,11 +407,14 @@ static int journal_entry_open(struct journal *j) BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); bkey_extent_init(&buf->key); - buf->noflush = false; - buf->must_flush = false; - buf->separate_flush = false; - buf->flush_time = 0; + buf->noflush = false; + buf->must_flush = false; + buf->separate_flush = false; + buf->flush_time = 0; buf->need_flush_to_write_buffer = true; + buf->write_started = false; + buf->write_allocated = false; + buf->write_done = false; memset(buf->data, 0, sizeof(*buf->data)); buf->data->seq = cpu_to_le64(journal_cur_seq(j)); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 4544ce24bb8a65..7c7528f839c567 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -264,7 +264,8 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u } bool bch2_journal_entry_close(struct journal *); -void bch2_journal_buf_put_final(struct journal *, u64, bool); +void bch2_journal_do_writes(struct journal *); +void bch2_journal_buf_put_final(struct journal *, u64); static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) { @@ -272,7 +273,7 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s s = journal_state_buf_put(j, idx); if (!journal_state_count(s, idx)) - bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); + bch2_journal_buf_put_final(j, seq); } static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) @@ -282,7 +283,7 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq s = journal_state_buf_put(j, idx); if (!journal_state_count(s, idx)) { spin_lock(&j->lock); - bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); + bch2_journal_buf_put_final(j, seq); spin_unlock(&j->lock); } } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index d02e499566219f..cd8921a2c0daed 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1602,7 +1602,7 @@ static CLOSURE_CALLBACK(journal_write_done) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_replicas_padded replicas; union journal_res_state old, new; - u64 v, seq; + u64 v, seq = le64_to_cpu(w->data->seq); int err = 0; bch2_time_stats_update(!JSET_NO_FLUSH(w->data) @@ -1622,64 +1622,69 @@ static CLOSURE_CALLBACK(journal_write_done) if (err) bch2_fatal_error(c); - spin_lock(&j->lock); - seq = le64_to_cpu(w->data->seq); + closure_debug_destroy(cl); + spin_lock(&j->lock); if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = w->devs_written; + if (err && (!j->err_seq || seq < j->err_seq)) + j->err_seq = seq; + w->write_done = true; + + bool completed = false; + + for (seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) { + w = j->buf + (seq & JOURNAL_BUF_MASK); + if (!w->write_done) + break; - if (!err) { - if (!JSET_NO_FLUSH(w->data)) { + if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { j->flushed_seq_ondisk = seq; j->last_seq_ondisk = w->last_seq; bch2_do_discards(c); closure_wake_up(&c->freelist_wait); - bch2_reset_alloc_cursors(c); } - } else if 
(!j->err_seq || seq < j->err_seq) - j->err_seq = seq; - j->seq_ondisk = seq; + j->seq_ondisk = seq; - /* - * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard - * more buckets: - * - * Must come before signaling write completion, for - * bch2_fs_journal_stop(): - */ - if (j->watermark != BCH_WATERMARK_stripe) - journal_reclaim_kick(&c->journal); + /* + * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard + * more buckets: + * + * Must come before signaling write completion, for + * bch2_fs_journal_stop(): + */ + if (j->watermark != BCH_WATERMARK_stripe) + journal_reclaim_kick(&c->journal); - /* also must come before signalling write completion: */ - closure_debug_destroy(cl); + v = atomic64_read(&j->reservations.counter); + do { + old.v = new.v = v; + BUG_ON(journal_state_count(new, new.unwritten_idx)); + BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); - v = atomic64_read(&j->reservations.counter); - do { - old.v = new.v = v; - BUG_ON(journal_state_count(new, new.unwritten_idx)); + new.unwritten_idx++; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - new.unwritten_idx++; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); + completed = true; + } - bch2_journal_reclaim_fast(j); - bch2_journal_space_available(j); + if (completed) { + bch2_journal_reclaim_fast(j); + bch2_journal_space_available(j); - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], - &j->max_in_flight_start, false); + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], + &j->max_in_flight_start, false); - closure_wake_up(&w->wait); - journal_wake(j); + closure_wake_up(&w->wait); + journal_wake(j); + } - if (!journal_state_count(new, new.unwritten_idx) && - journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { - struct journal_buf *w = j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); - spin_unlock(&j->lock); - closure_call(&w->io, bch2_journal_write, j->wq, NULL); - } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && + if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { struct journal_buf *buf = journal_cur_buf(j); long delta = buf->expires - jiffies; @@ -1689,12 +1694,10 @@ static CLOSURE_CALLBACK(journal_write_done) * previous entries still in flight - the current journal entry * might want to be written now: */ - - spin_unlock(&j->lock); mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); - } else { - spin_unlock(&j->lock); } + + spin_unlock(&j->lock); } static void journal_write_endio(struct bio *bio) @@ -1948,6 +1951,7 @@ CLOSURE_CALLBACK(bch2_journal_write) int ret; BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + BUG_ON(w->write_allocated); j->write_start_time = local_clock(); @@ -1991,12 +1995,14 @@ CLOSURE_CALLBACK(bch2_journal_write) * bch2_journal_space_available(): */ w->sectors = 0; + w->write_allocated = true; /* * journal entry has been compacted and allocated, recalculate space * available: */ bch2_journal_space_available(j); + bch2_journal_do_writes(j); spin_unlock(&j->lock); w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index d75fbd3b1d34d1..3696aac3ccb728 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -34,10 +34,13 @@ struct journal_buf { unsigned disk_sectors; /* maximum size entry could have been, if buf_size was bigger */ unsigned u64s_reserved; - bool noflush; 
/* write has already been kicked off, and was noflush */ - bool must_flush; /* something wants a flush */ - bool separate_flush; - bool need_flush_to_write_buffer; + bool noflush:1; /* write has already been kicked off, and was noflush */ + bool must_flush:1; /* something wants a flush */ + bool separate_flush:1; + bool need_flush_to_write_buffer:1; + bool write_started:1; + bool write_allocated:1; + bool write_done:1; u8 idx; }; From 5722b6112d291d65e8bd787b4f6b0f190cc5b572 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 Jan 2024 14:37:42 -0500 Subject: [PATCH 0485/1406] bcachefs: btree_and_journal_iter.trans we now always have a btree_trans when using a btree_and_journal_iter; prep work for adding prefetching to btree_and_journal_iter Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 8 ++++---- fs/bcachefs/btree_iter.c | 2 +- fs/bcachefs/btree_journal_iter.c | 15 ++++++++------- fs/bcachefs/btree_journal_iter.h | 13 ++++++++----- 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 1102995643b137..bc14e9c70ab171 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -389,7 +389,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct have_child = dropped_children = false; bch2_bkey_buf_init(&prev_k); bch2_bkey_buf_init(&cur_k); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { BUG_ON(bpos_lt(k.k->p, b->data->min_key)); @@ -478,7 +478,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct goto err; bch2_btree_and_journal_iter_exit(&iter); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { bch2_bkey_buf_reassemble(&cur_k, c, k); @@ -931,7 +931,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b struct printbuf buf = PRINTBUF; int ret = 0; - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); bch2_bkey_buf_init(&prev); bch2_bkey_buf_init(&cur); bkey_init(&prev.k->k); @@ -963,7 +963,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b if (b->c.level > target_depth) { bch2_btree_and_journal_iter_exit(&iter); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { struct btree *child; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 5467a8635be113..1b2970dc4f1d54 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -891,7 +891,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, struct bkey_s_c k; int ret = 0; - __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos); + __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos); k = bch2_btree_and_journal_iter_peek(&jiter); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 719a94a84950b7..fa907293ba4336 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -376,17 +376,18 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) 
bch2_journal_iter_exit(&iter->journal); } -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, + struct btree_and_journal_iter *iter, struct btree *b, struct btree_node_iter node_iter, struct bpos pos) { memset(iter, 0, sizeof(*iter)); + iter->trans = trans; iter->b = b; iter->node_iter = node_iter; - bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); + bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos); INIT_LIST_HEAD(&iter->journal.list); iter->pos = b->data->min_key; iter->at_end = false; @@ -396,15 +397,15 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter * this version is used by btree_gc before filesystem has gone RW and * multithreaded, so uses the journal_iters list: */ -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, +void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, + struct btree_and_journal_iter *iter, struct btree *b) { struct btree_node_iter node_iter; bch2_btree_node_iter_init_from_start(&node_iter, b); - __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); - list_add(&iter->journal.list, &c->journal_iters); + __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key); + list_add(&iter->journal.list, &trans->c->journal_iters); } /* sort and dedup all keys in the journal: */ diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 8ca4c100b2e3e4..1793cf89148b01 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -15,6 +15,7 @@ struct journal_iter { */ struct btree_and_journal_iter { + struct btree_trans *trans; struct btree *b; struct btree_node_iter node_iter; struct bkey unpacked; @@ -29,6 +30,9 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, unsigned, struct bpos); +int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *, + struct btree_and_journal_iter *); + int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); int bch2_journal_key_insert(struct bch_fs *, enum btree_id, @@ -42,12 +46,11 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct bch_fs *, struct btree *, +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, + struct btree_and_journal_iter *, struct btree *, struct btree_node_iter, struct bpos); -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct bch_fs *, - struct btree *); +void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, + struct btree_and_journal_iter *, struct btree *); void bch2_journal_keys_put(struct bch_fs *); From afdb7d0eafd4db3778f32d55ad779811f9669a66 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 Jan 2024 14:25:00 -0500 Subject: [PATCH 0486/1406] bcachefs: btree node prefetching in check_topology btree_and_journal_iter is old code that we want to get rid of, but we're not ready to yet. 
lack of btree node prefetching is, it turns out, a real performance issue for fsck on spinning rust, so - add it. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 10 +++++++--- fs/bcachefs/btree_gc.c | 3 +++ fs/bcachefs/btree_journal_iter.c | 31 +++++++++++++++++++++++++++++++ fs/bcachefs/btree_journal_iter.h | 1 + 4 files changed, 42 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index d7c81beac14afa..a8b393bc7567b4 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -711,6 +711,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, b = bch2_btree_node_mem_alloc(trans, level != 0); if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) { + if (!path) + return b; + trans->memory_allocation_failure = true; trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); @@ -760,8 +763,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, } if (!six_relock_type(&b->c.lock, lock_type, seq)) { - if (path) - trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); + BUG_ON(!path); + + trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); } @@ -1096,7 +1100,7 @@ int bch2_btree_node_prefetch(struct btree_trans *trans, struct btree_cache *bc = &c->btree_cache; struct btree *b; - BUG_ON(trans && !btree_node_locked(path, level + 1)); + BUG_ON(path && !btree_node_locked(path, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); b = btree_cache_find(bc, k); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index bc14e9c70ab171..4a08af94634a4f 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -390,6 +390,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct bch2_bkey_buf_init(&prev_k); bch2_bkey_buf_init(&cur_k); bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + iter.prefetch = true; while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { BUG_ON(bpos_lt(k.k->p, b->data->min_key)); @@ -479,6 +480,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct bch2_btree_and_journal_iter_exit(&iter); bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + iter.prefetch = true; while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { bch2_bkey_buf_reassemble(&cur_k, c, k); @@ -964,6 +966,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b if (b->c.level > target_depth) { bch2_btree_and_journal_iter_exit(&iter); bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + iter.prefetch = true; while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { struct btree *child; diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index fa907293ba4336..b7ac93c8fdd8ab 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "bset.h" +#include "btree_cache.h" #include "btree_journal_iter.h" #include "journal_io.h" @@ -334,9 +336,38 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) iter->pos = bpos_successor(iter->pos); } +static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter) +{ + struct 
btree_and_journal_iter iter = *_iter; + struct bch_fs *c = iter.trans->c; + unsigned level = iter.journal.level; + struct bkey_buf tmp; + unsigned nr = test_bit(BCH_FS_started, &c->flags) + ? (level > 1 ? 0 : 2) : (level > 1 ? 1 : 16); + + iter.prefetch = false; + bch2_bkey_buf_init(&tmp); + + while (nr--) { + bch2_btree_and_journal_iter_advance(&iter); + struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter); + if (!k.k) + break; + + bch2_bkey_buf_reassemble(&tmp, c, k); + bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1); + } + + bch2_bkey_buf_exit(&tmp, c); +} + struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) { struct bkey_s_c btree_k, journal_k, ret; + + if (iter->prefetch && iter->journal.level) + btree_and_journal_iter_prefetch(iter); again: if (iter->at_end) return bkey_s_c_null; diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 1793cf89148b01..c9d19da3ea0480 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -23,6 +23,7 @@ struct btree_and_journal_iter { struct journal_iter journal; struct bpos pos; bool at_end; + bool prefetch; }; struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, From 10abefd90bfb3f448ec573444c6eab14306ec70d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Jan 2024 16:46:45 -0500 Subject: [PATCH 0487/1406] bcachefs: Subvolumes may now be renamed Files within a subvolume cannot be renamed into another subvolume, but subvolumes themselves were intended to be. This implements subvolume renaming - we need to ensure that there's only a single dirent that points to a subvolume key (not multiple versions in different snapshots), and we need to ensure that dirent.d_parent_subvol and inode.bi_parent_subvol are updated. 
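Condensed, the two fixups look like this (a sketch assembled from the diff below, not a verbatim excerpt):

	/* keep the parent-subvol backpointers in sync with the new parent: */
	if (new_dst->v.d_type == DT_SUBVOL)
		new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol);
	if (src_inode_u->bi_parent_subvol)
		src_inode_u->bi_parent_subvol = dst_dir.subvol;

	/*
	 * a dirent pointing to a subvolume must be really deleted, not
	 * whiteout'd, since only one such dirent may exist:
	 */
	delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL &&
		     new_src->k.p.snapshot != old_src.k->p.snapshot;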
Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 66 +++++++++++++++++++++++++---------------- fs/bcachefs/fs-common.c | 15 ++++++++++ 2 files changed, 55 insertions(+), 26 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 116752a7d1cd05..97773cffccae8d 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -293,12 +293,10 @@ int bch2_dirent_rename(struct btree_trans *trans, struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; struct bpos dst_pos = POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); - unsigned src_type = 0, dst_type = 0, src_update_flags = 0; + unsigned src_update_flags = 0; + bool delete_src, delete_dst; int ret = 0; - if (src_dir.subvol != dst_dir.subvol) - return -EXDEV; - memset(src_inum, 0, sizeof(*src_inum)); memset(dst_inum, 0, sizeof(*dst_inum)); @@ -319,12 +317,6 @@ int bch2_dirent_rename(struct btree_trans *trans, if (ret) goto out; - src_type = bkey_s_c_to_dirent(old_src).v->d_type; - - if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE) - return -EOPNOTSUPP; - - /* Lookup dst: */ if (mode == BCH_RENAME) { /* @@ -352,11 +344,6 @@ int bch2_dirent_rename(struct btree_trans *trans, bkey_s_c_to_dirent(old_dst), dst_inum); if (ret) goto out; - - dst_type = bkey_s_c_to_dirent(old_dst).v->d_type; - - if (dst_type == DT_SUBVOL) - return -EOPNOTSUPP; } if (mode != BCH_RENAME_EXCHANGE) @@ -426,28 +413,55 @@ int bch2_dirent_rename(struct btree_trans *trans, } } + if (new_dst->v.d_type == DT_SUBVOL) + new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol); + + if ((mode == BCH_RENAME_EXCHANGE) && + new_src->v.d_type == DT_SUBVOL) + new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol); + ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); if (ret) goto out; out_set_src: - /* - * If we're deleting a subvolume, we need to really delete the dirent, - * not just emit a whiteout in the current snapshot: + * If we're deleting a subvolume we need to really delete the dirent, + * not just emit a whiteout in the current snapshot - there can only be + * single dirent that points to a given subvolume. + * + * IOW, we don't maintain multiple versions in different snapshots of + * dirents that point to subvolumes - dirents that point to subvolumes + * are only visible in one particular subvolume so it's not necessary, + * and it would be particularly confusing for fsck to have to deal with. 
*/ - if (src_type == DT_SUBVOL) { - bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); - ret = bch2_btree_iter_traverse(&src_iter); + delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL && + new_src->k.p.snapshot != old_src.k->p.snapshot; + + delete_dst = old_dst.k && + bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL && + new_dst->k.p.snapshot != old_dst.k->p.snapshot; + + if (!delete_src || !bkey_deleted(&new_src->k)) { + ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); if (ret) goto out; + } - new_src->k.p = src_iter.pos; - src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE; + if (delete_src) { + bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); + ret = bch2_btree_iter_traverse(&src_iter) ?: + bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + if (ret) + goto out; } - ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); - if (ret) - goto out; + if (delete_dst) { + bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot); + ret = bch2_btree_iter_traverse(&dst_iter) ?: + bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + if (ret) + goto out; + } if (mode == BCH_RENAME_EXCHANGE) *src_offset = new_src->k.p.offset; diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 8ee716e4c2e72e..523507e38887bf 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -410,6 +410,21 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } + /* Can't move across subvolumes, unless it's a subvolume root: */ + if (src_dir.subvol != dst_dir.subvol && + (!src_inode_u->bi_subvol || + (dst_inum.inum && !dst_inode_u->bi_subvol))) { + ret = -EXDEV; + goto err; + } + + if (src_inode_u->bi_parent_subvol) + src_inode_u->bi_parent_subvol = dst_dir.subvol; + + if ((mode == BCH_RENAME_EXCHANGE) && + dst_inode_u->bi_parent_subvol) + dst_inode_u->bi_parent_subvol = src_dir.subvol; + src_inode_u->bi_dir = dst_dir_u->bi_inum; src_inode_u->bi_dir_offset = dst_offset; From 15882bf6a705a16435b532682eb1ac26404f2521 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Feb 2024 15:05:17 -0500 Subject: [PATCH 0488/1406] bcachefs: Switch to uuid_to_fsid() switch the statfs code from something horrible and open coded to the more standard uuid_to_fsid() Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 77ae65542db916..ec9cf4b8faf1ba 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1572,7 +1572,6 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) * number: */ u64 avail_inodes = ((usage.capacity - usage.used) << 3); - u64 fsid; buf->f_type = BCACHEFS_STATFS_MAGIC; buf->f_bsize = sb->s_blocksize; @@ -1583,10 +1582,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = usage.nr_inodes + avail_inodes; buf->f_ffree = avail_inodes; - fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ - le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); - buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; - buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b); buf->f_namelen = BCH_NAME_MAX; return 0; From 0d440fa494960f422acbe4d68de7a56ace74a351 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Feb 2024 15:23:07 -0500 Subject: [PATCH 0489/1406] bcachefs: Initialize super_block->s_uuid Need to fix this oversight for the new 
FS_IOC_(GET|SET)UUID ioctls. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index ec9cf4b8faf1ba..be0c059d70f03d 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1878,6 +1878,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, sb->s_time_gran = c->sb.nsec_per_time_unit; sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); + sb->s_uuid = c->sb.user_uuid; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); From 2f275a368d85656b6c84c3afc3c4aaa169b48a74 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jan 2024 16:32:12 -0500 Subject: [PATCH 0490/1406] bcachefs: Set path->uptodate when no node at level We were failing to set path->uptodate when reaching the end of a btree node iterator, causing the new prefetch code for backpointers gc to go into an infinite loop. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 1b2970dc4f1d54..924c58823f0d56 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1146,7 +1146,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, path = &trans->paths[path_idx]; if (unlikely(path->level >= BTREE_MAX_DEPTH)) - goto out; + goto out_uptodate; path->level = btree_path_up_until_good_node(trans, path, 0); @@ -1179,7 +1179,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, goto out; } } - +out_uptodate: path->uptodate = BTREE_ITER_UPTODATE; out: if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted) From 5755090412af5dba7ea419db5d250493578b7001 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 1 Feb 2024 07:35:28 -0500 Subject: [PATCH 0491/1406] bcachefs: move fsck_write_inode() to inode.c Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 53 +++++++++++---------------------------------- fs/bcachefs/inode.c | 28 ++++++++++++++++++++++++ fs/bcachefs/inode.h | 3 +++ 3 files changed, 44 insertions(+), 40 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 8e53ac2fbcd3b8..65616af69db7af 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -142,34 +142,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, return 0; } -static int __write_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 snapshot) -{ - struct bkey_inode_buf *inode_p = - bch2_trans_kmalloc(trans, sizeof(*inode_p)); - - if (IS_ERR(inode_p)) - return PTR_ERR(inode_p); - - bch2_inode_pack(inode_p, inode); - inode_p->inode.k.p.snapshot = snapshot; - - return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, - &inode_p->inode.k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -} - -static int fsck_write_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 snapshot) -{ - int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __write_inode(trans, inode, snapshot)); - bch_err_fn(trans->c, ret); - return ret; -} - static int __remove_dirent(struct btree_trans *trans, struct bpos pos) { struct bch_fs *c = trans->c; @@ -312,7 +284,7 @@ static int reattach_inode(struct btree_trans *trans, if (S_ISDIR(inode->bi_mode)) { lostfound.bi_nlink++; - ret = __write_inode(trans, &lostfound, U32_MAX); + ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX); if (ret) return ret; } @@ -334,7 +306,7 @@ static 
int reattach_inode(struct btree_trans *trans, inode->bi_dir = lostfound.bi_inum; inode->bi_dir_offset = dir_offset; - return __write_inode(trans, inode, inode_snapshot); + return __bch2_fsck_write_inode(trans, inode, inode_snapshot); } static int remove_backpointer(struct btree_trans *trans, @@ -861,7 +833,8 @@ static int check_inode(struct btree_trans *trans, u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked; - ret = __write_inode(trans, &u, iter->pos.snapshot); + ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); + bch_err_msg(c, ret, "in fsck updating inode"); if (ret) return ret; @@ -951,7 +924,7 @@ static int check_inode(struct btree_trans *trans, } if (do_update) { - ret = __write_inode(trans, &u, iter->pos.snapshot); + ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); bch_err_msg(c, ret, "in fsck updating inode"); if (ret) return ret; @@ -1032,7 +1005,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) w->last_pos.inode, i->snapshot, i->inode.bi_sectors, i->count)) { i->inode.bi_sectors = i->count; - ret = fsck_write_inode(trans, &i->inode, i->snapshot); + ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot); if (ret) break; } @@ -1481,7 +1454,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) "directory %llu:%u with wrong i_nlink: got %u, should be %llu", w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { i->inode.bi_nlink = i->count; - ret = fsck_write_inode(trans, &i->inode, i->snapshot); + ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot); if (ret) break; } @@ -1508,7 +1481,7 @@ static int check_dirent_target(struct btree_trans *trans, target->bi_dir = d.k->p.inode; target->bi_dir_offset = d.k->p.offset; - ret = __write_inode(trans, target, target_snapshot); + ret = __bch2_fsck_write_inode(trans, target, target_snapshot); if (ret) goto err; } @@ -1548,7 +1521,7 @@ static int check_dirent_target(struct btree_trans *trans, target->bi_nlink++; target->bi_flags &= ~BCH_INODE_unlinked; - ret = __write_inode(trans, target, target_snapshot); + ret = __bch2_fsck_write_inode(trans, target, target_snapshot); if (ret) goto err; } @@ -1566,7 +1539,7 @@ static int check_dirent_target(struct btree_trans *trans, target->bi_dir = d.k->p.inode; target->bi_dir_offset = d.k->p.offset; - ret = __write_inode(trans, target, target_snapshot); + ret = __bch2_fsck_write_inode(trans, target, target_snapshot); if (ret) goto err; } @@ -1744,7 +1717,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, target_inum, subvol_root.bi_subvol, target_subvol)) { subvol_root.bi_subvol = target_subvol; - ret = __write_inode(trans, &subvol_root, target_snapshot); + ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); if (ret) goto err; } @@ -1918,7 +1891,7 @@ static int check_root_trans(struct btree_trans *trans) 0, NULL); root_inode.bi_inum = inum; - ret = __write_inode(trans, &root_inode, snapshot); + ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot); bch_err_msg(c, ret, "writing root inode"); } err: @@ -2290,7 +2263,7 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], bch2_inode_nlink_get(&u), link->count)) { bch2_inode_nlink_set(&u, link->count); - ret = __write_inode(trans, &u, k.k->p.snapshot); + ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot); } fsck_err: return ret; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 
086f0090b03a40..e7ba169c4e5472 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -384,6 +384,34 @@ int bch2_inode_write_flags(struct btree_trans *trans, return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags); } +int __bch2_fsck_write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + struct bkey_inode_buf *inode_p = + bch2_trans_kmalloc(trans, sizeof(*inode_p)); + + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + + bch2_inode_pack(inode_p, inode); + inode_p->inode.k.p.snapshot = snapshot; + + return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, + &inode_p->inode.k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +} + +int bch2_fsck_write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + __bch2_fsck_write_inode(trans, inode, snapshot)); + bch_err_fn(trans->c, ret); + return ret; +} + struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) { struct bch_inode_unpacked u; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index b63f312581cfa5..b8da7ff8069d08 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -108,6 +108,9 @@ static inline int bch2_inode_write(struct btree_trans *trans, return bch2_inode_write_flags(trans, iter, inode, 0); } +int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32); +int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32); + void bch2_inode_init_early(struct bch_fs *, struct bch_inode_unpacked *); void bch2_inode_init_late(struct bch_inode_unpacked *, u64, From e00994f875c6bf494e235e3a567ff062476efa28 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Feb 2024 19:28:03 -0500 Subject: [PATCH 0492/1406] bcachefs: bump max_active on btree_interior_update_worker WQ_UNBOUND with max_active 1 means an ordered workqueue, but we don't actually need or want ordered semantics - and probably want a higher concurrency limit anyway. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 4530b14ff2c371..cd87ac3a9b0d49 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2485,7 +2485,7 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c) int bch2_fs_btree_interior_update_init(struct bch_fs *c) { c->btree_interior_update_worker = - alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); + alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8); if (!c->btree_interior_update_worker) return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; From a47e747de21a57ce90da42b2c7fd266dc028aa23 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Feb 2024 21:44:23 -0500 Subject: [PATCH 0493/1406] bcachefs: Kill some -EINVALs Repurposing standard error codes is banned in new bcachefs code, and we need to get rid of the remaining ones - private error codes give us much better error messages.
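For readers new to this convention, here is a minimal, self-contained sketch of the private-error-code pattern. All names (MY_ERRCODES, my_err_matches, and so on) are illustrative stand-ins, not the real bcachefs API, which lives in fs/bcachefs/errcode.h; the point is that each private code records the standard errno class it refines, so generic callers can keep testing for EINVAL while logs get a specific, greppable name:

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* Each private error code declares the errno class it refines: */
	#define MY_ERRCODES()				\
		x(EINVAL, remove_would_lose_data)	\
		x(EINVAL, opt_parse_error)		\
		x(EROFS,  erofs_no_writes)

	enum my_errcode {
		MY_ERR_START = 2048,
	#define x(class, err)	MY_ERR_##err,
		MY_ERRCODES()
	#undef x
		MY_ERR_MAX
	};

	static const int my_err_class[] = {
	#define x(class, err)	[MY_ERR_##err - MY_ERR_START - 1] = class,
		MY_ERRCODES()
	#undef x
	};

	static const char * const my_err_name[] = {
	#define x(class, err)	[MY_ERR_##err - MY_ERR_START - 1] = #err,
		MY_ERRCODES()
	#undef x
	};

	/* -MY_ERR_remove_would_lose_data still "matches" -EINVAL: */
	static bool my_err_matches(int err, int class)
	{
		err = abs(err);
		return err > MY_ERR_START && err < MY_ERR_MAX
			? my_err_class[err - MY_ERR_START - 1] == class
			: err == class;
	}

	/* ...but the message names the precise failure: */
	static void my_err_report(int err)
	{
		err = abs(err);
		fprintf(stderr, "error: %s\n",
			err > MY_ERR_START && err < MY_ERR_MAX
			? my_err_name[err - MY_ERR_START - 1]
			: strerror(err));
	}

With a table like this, the device-removal path in the patch below can return -BCH_ERR_remove_would_lose_data and log that name instead of a bare "Invalid argument", while callers that only check for EINVAL behave exactly as before.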
Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 2 ++ fs/bcachefs/migrate.c | 8 +++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 8c40c2067a0471..3fd33b307a77f9 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -176,6 +176,8 @@ x(EINVAL, invalid) \ x(EINVAL, internal_fsck_err) \ x(EINVAL, opt_parse_error) \ + x(EINVAL, remove_with_metadata_missing_unimplemented)\ + x(EINVAL, remove_would_lose_data) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 5623cee3ef8693..69098eeb5d48e3 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -31,7 +31,7 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, nr_good = bch2_bkey_durability(c, k.s_c); if ((!nr_good && !(flags & lost)) || (nr_good < replicas && !(flags & degraded))) - return -EINVAL; + return -BCH_ERR_remove_would_lose_data; return 0; } @@ -111,7 +111,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) /* don't handle this yet: */ if (flags & BCH_FORCE_IF_METADATA_LOST) - return -EINVAL; + return -BCH_ERR_remove_with_metadata_missing_unimplemented; trans = bch2_trans_get(c); bch2_bkey_buf_init(&k); @@ -132,10 +132,8 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true); - if (ret) { - bch_err(c, "Cannot drop device without losing data"); + if (ret) break; - } ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { From e2108b45ca6edf579ff587fa66b84dd6ba97b2d0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Feb 2024 03:22:29 -0500 Subject: [PATCH 0494/1406] bcachefs: Factor out check_subvol_dirent() Going to be adding more code here for checking subvol structure. 
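One idiom worth spelling out before the next few fsck patches: the new check_subvol_dirent() below, like the rest of fsck.c, is built around fsck_err_on(). As a rough, hypothetical model (the real macro additionally handles repair policy, ratelimiting, and jumping to a fsck_err: label on fatal errors), it behaves like:

	#include <stdbool.h>
	#include <stdio.h>

	static bool fix_errors = true;	/* roughly, fsck -y vs. fsck -n */

	/*
	 * Log the inconsistency when @cond fires; the whole expression
	 * evaluates to true when the caller should go on to repair it,
	 * so call sites read as: if (fsck_err_on(bad, ...)) { repair; }
	 */
	#define fsck_err_on(cond, fmt, ...)					\
	({									\
		bool _fired = (cond);						\
		if (_fired)							\
			fprintf(stderr, "fsck: " fmt "\n", ##__VA_ARGS__);	\
		_fired && fix_errors;						\
	})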
Signed-off-by: Kent Overstreet --- MAINTAINERS | 1 + fs/bcachefs/fsck.c | 105 ++++++++++++++++++++++++--------------------- 2 files changed, 58 insertions(+), 48 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 8d1052fa6a6924..1a46ba4e496a56 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3536,6 +3536,7 @@ R: Brian Foster L: linux-bcachefs@vger.kernel.org S: Supported C: irc://irc.oftc.net/bcache +T: git https://evilpiepirate.org/git/bcachefs.git F: fs/bcachefs/ BDISP ST MEDIA DRIVER diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 65616af69db7af..873ae22b78dce8 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1596,6 +1596,58 @@ static int check_dirent_target(struct btree_trans *trans, return ret; } +static int check_subvol_dirent(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c_dirent d) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked subvol_root; + u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); + u32 target_snapshot; + u64 target_inum; + int ret = 0; + + ret = subvol_lookup(trans, target_subvol, + &target_snapshot, &target_inum); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret, c, dirent_to_missing_subvol, + "dirent points to missing subvolume %u", + le32_to_cpu(d.v->d_child_subvol))) + return __remove_dirent(trans, d.k->p); + + ret = lookup_inode(trans, target_inum, + &subvol_root, &target_snapshot); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret, c, subvol_to_missing_root, + "subvolume %u points to missing subvolume root %llu", + target_subvol, + target_inum)) { + bch_err(c, "repair not implemented yet"); + return -EINVAL; + } + + if (fsck_err_on(subvol_root.bi_subvol != target_subvol, + c, subvol_root_wrong_bi_subvol, + "subvol root %llu has wrong bi_subvol field: got %u, should be %u", + target_inum, + subvol_root.bi_subvol, target_subvol)) { + subvol_root.bi_subvol = target_subvol; + ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); + if (ret) + return ret; + } + + ret = check_dirent_target(trans, iter, d, &subvol_root, + target_snapshot); + if (ret) + return ret; +fsck_err: + return ret; +} + static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, struct bch_hash_info *hash_info, @@ -1680,50 +1732,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, d = bkey_s_c_to_dirent(k); if (d.v->d_type == DT_SUBVOL) { - struct bch_inode_unpacked subvol_root; - u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); - u32 target_snapshot; - u64 target_inum; - - ret = subvol_lookup(trans, target_subvol, - &target_snapshot, &target_inum); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (fsck_err_on(ret, c, dirent_to_missing_subvol, - "dirent points to missing subvolume %u", - le32_to_cpu(d.v->d_child_subvol))) { - ret = __remove_dirent(trans, d.k->p); - goto err; - } - - ret = lookup_inode(trans, target_inum, - &subvol_root, &target_snapshot); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (fsck_err_on(ret, c, subvol_to_missing_root, - "subvolume %u points to missing subvolume root %llu", - target_subvol, - target_inum)) { - bch_err(c, "repair not implemented yet"); - ret = -EINVAL; - goto err; - } - - if (fsck_err_on(subvol_root.bi_subvol != target_subvol, - c, subvol_root_wrong_bi_subvol, - "subvol root %llu has wrong bi_subvol field: got %u, should be %u", - target_inum, - subvol_root.bi_subvol, target_subvol)) { - subvol_root.bi_subvol 
= target_subvol; - ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); - if (ret) - goto err; - } - - ret = check_dirent_target(trans, iter, d, &subvol_root, - target_snapshot); + ret = check_subvol_dirent(trans, iter, d); if (ret) goto err; } else { @@ -1749,11 +1758,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; } - } - if (d.v->d_type == DT_DIR) - for_each_visible_inode(c, s, dir, equiv.snapshot, i) - i->count++; + if (d.v->d_type == DT_DIR) + for_each_visible_inode(c, s, dir, equiv.snapshot, i) + i->count++; + } out: err: fsck_err: From f8873872873f8412176c683537894e9d5288e115 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Feb 2024 19:38:19 -0500 Subject: [PATCH 0495/1406] bcachefs: factor out check_inode_backpointer() Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 873ae22b78dce8..10d144c5a37a5c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1464,16 +1464,15 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) return ret ?: trans_was_restarted(trans, restart_count); } -static int check_dirent_target(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target, - u32 target_snapshot) +static int check_inode_backpointer(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + u32 target_snapshot) { struct bch_fs *c = trans->c; - struct bkey_i_dirent *n; - struct printbuf buf = PRINTBUF; struct btree_iter bp_iter = { NULL }; + struct printbuf buf = PRINTBUF; int ret = 0; if (!target->bi_dir && @@ -1544,6 +1543,29 @@ static int check_dirent_target(struct btree_trans *trans, goto err; } } +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &bp_iter); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + +static int check_dirent_target(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + u32 target_snapshot) +{ + struct bch_fs *c = trans->c; + struct bkey_i_dirent *n; + struct printbuf buf = PRINTBUF; + int ret = 0; + + ret = check_inode_backpointer(trans, iter, d, target, target_snapshot); + if (ret) + goto err; if (fsck_err_on(d.v->d_type != inode_d_type(target), c, dirent_d_type_wrong, @@ -1587,10 +1609,8 @@ static int check_dirent_target(struct btree_trans *trans, d = dirent_i_to_s_c(n); } -out: err: fsck_err: - bch2_trans_iter_exit(trans, &bp_iter); printbuf_exit(&buf); bch_err_fn(c, ret); return ret; From c7b3f17061ea0dd4a81bfac5229e88252039982e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 Jan 2024 19:00:24 -0500 Subject: [PATCH 0496/1406] mm: introduce memalloc_flags_{save,restore} Our proliferation of memalloc_*_{save,restore} APIs is getting a bit silly, this adds a generic version and converts the existing save/restore functions to wrappers. Signed-off-by: Kent Overstreet Cc: Vlastimil Babka Cc: Matthew Wilcox Cc: Michal Hocko Cc: Darrick J. 
Wong Cc: linux-mm@kvack.org Acked-by: Vlastimil Babka --- include/linux/sched/mm.h | 43 ++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 9a19f1b42f6412..f00d7ecc2adf51 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -306,6 +306,24 @@ static inline void might_alloc(gfp_t gfp_mask) might_sleep_if(gfpflags_allow_blocking(gfp_mask)); } +/** + * memalloc_flags_save - Add a PF_* flag to current->flags, save old value + * + * This allows PF_* flags to be conveniently added, irrespective of current + * value, and then the old version restored with memalloc_flags_restore(). + */ +static inline unsigned memalloc_flags_save(unsigned flags) +{ + unsigned oldflags = ~current->flags & flags; + current->flags |= flags; + return oldflags; +} + +static inline void memalloc_flags_restore(unsigned flags) +{ + current->flags &= ~flags; +} + /** * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope. * @@ -319,9 +337,7 @@ static inline void might_alloc(gfp_t gfp_mask) */ static inline unsigned int memalloc_noio_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC_NOIO; - current->flags |= PF_MEMALLOC_NOIO; - return flags; + return memalloc_flags_save(PF_MEMALLOC_NOIO); } /** @@ -334,7 +350,7 @@ static inline unsigned int memalloc_noio_save(void) */ static inline void memalloc_noio_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; + memalloc_flags_restore(flags); } /** @@ -350,9 +366,7 @@ static inline void memalloc_noio_restore(unsigned int flags) */ static inline unsigned int memalloc_nofs_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC_NOFS; - current->flags |= PF_MEMALLOC_NOFS; - return flags; + return memalloc_flags_save(PF_MEMALLOC_NOFS); } /** @@ -365,32 +379,27 @@ static inline unsigned int memalloc_nofs_save(void) */ static inline void memalloc_nofs_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; + memalloc_flags_restore(flags); } static inline unsigned int memalloc_noreclaim_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC; - current->flags |= PF_MEMALLOC; - return flags; + return memalloc_flags_save(PF_MEMALLOC); } static inline void memalloc_noreclaim_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC) | flags; + memalloc_flags_restore(flags); } static inline unsigned int memalloc_pin_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC_PIN; - - current->flags |= PF_MEMALLOC_PIN; - return flags; + return memalloc_flags_save(PF_MEMALLOC_PIN); } static inline void memalloc_pin_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags; + memalloc_flags_restore(flags); } #ifdef CONFIG_MEMCG From 78f404cd27a72bda6b03a96379ba0e63c70a486e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 Jan 2024 19:00:24 -0500 Subject: [PATCH 0497/1406] mm: introduce PF_MEMALLOC_NORECLAIM, PF_MEMALLOC_NOWARN Introduce PF_MEMALLOC_* equivalents of some GFP_ flags: PF_MEMALLOC_NORECLAIM -> GFP_NOWAIT PF_MEMALLOC_NOWARN -> __GFP_NOWARN Cc: Vlastimil Babka Cc: Matthew Wilcox Cc: Michal Hocko Cc: Darrick J. 
Wong Cc: linux-mm@kvack.org Signed-off-by: Kent Overstreet --- include/linux/sched.h | 4 ++-- include/linux/sched/mm.h | 17 +++++++++++++---- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index cdb8ea53c365ba..187e4f5f654588 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1636,8 +1636,8 @@ extern struct pid *cad_pid; * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ -#define PF__HOLE__00800000 0x00800000 -#define PF__HOLE__01000000 0x01000000 +#define PF_MEMALLOC_NORECLAIM 0x00800000 /* All allocation requests will clear __GFP_DIRECT_RECLAIM */ +#define PF_MEMALLOC_NOWARN 0x01000000 /* All allocation requests will inherit __GFP_NOWARN */ #define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index f00d7ecc2adf51..c29059a7605258 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -236,16 +236,25 @@ static inline gfp_t current_gfp_context(gfp_t flags) { unsigned int pflags = READ_ONCE(current->flags); - if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) { + if (unlikely(pflags & (PF_MEMALLOC_NOIO | + PF_MEMALLOC_NOFS | + PF_MEMALLOC_NORECLAIM | + PF_MEMALLOC_NOWARN | + PF_MEMALLOC_PIN))) { /* - * NOIO implies both NOIO and NOFS and it is a weaker context - * so always make sure it makes precedence + * Stronger flags before weaker flags: + * NORECLAIM implies NOIO, which in turn implies NOFS */ - if (pflags & PF_MEMALLOC_NOIO) + if (pflags & PF_MEMALLOC_NORECLAIM) + flags &= ~__GFP_DIRECT_RECLAIM; + else if (pflags & PF_MEMALLOC_NOIO) flags &= ~(__GFP_IO | __GFP_FS); else if (pflags & PF_MEMALLOC_NOFS) flags &= ~__GFP_FS; + if (pflags & PF_MEMALLOC_NOWARN) + flags |= __GFP_NOWARN; + if (pflags & PF_MEMALLOC_PIN) flags &= ~__GFP_MOVABLE; } From 26fd7108826c7c2ff383c726ded5cd56630aacb7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 Jan 2024 20:25:49 -0500 Subject: [PATCH 0498/1406] bcachefs: bch2_inode_insert() Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 138 ++++++++++++++++++++++++++--------------------- 1 file changed, 76 insertions(+), 62 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index be0c059d70f03d..bcc1436c172068 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -176,45 +176,88 @@ static unsigned bch2_inode_hash(subvol_inum inum) return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); } -struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) +static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode) { - struct bch_inode_unpacked inode_u; - struct bch_inode_info *inode; - struct btree_trans *trans; - struct bch_subvolume subvol; - int ret; + subvol_inum inum = inode_inum(inode); + struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v, + bch2_inode_hash(inum), + bch2_iget5_test, + bch2_iget5_set, + &inum)); + BUG_ON(!old); - inode = to_bch_ei(iget5_locked(c->vfs_sb, - bch2_inode_hash(inum), - bch2_iget5_test, - bch2_iget5_set, - &inum)); - if (unlikely(!inode)) - return ERR_PTR(-ENOMEM); - if (!(inode->v.i_state & I_NEW)) - return &inode->v; + if (unlikely(old != inode)) { + 
discard_new_inode(&inode->v); + inode = old; + } else { + mutex_lock(&c->vfs_inodes_lock); + list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); + mutex_unlock(&c->vfs_inodes_lock); + /* + * we really don't want insert_inode_locked2() to be setting + * I_NEW... + */ + unlock_new_inode(&inode->v); + } - trans = bch2_trans_get(c); - ret = lockrestart_do(trans, - bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: - bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); + return inode; +} - if (!ret) - bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); - bch2_trans_put(trans); +#define memalloc_flags_do(_flags, _do) \ +({ \ + unsigned _saved_flags = memalloc_flags_save(_flags); \ + typeof(_do) _ret = _do; \ + memalloc_noreclaim_restore(_saved_flags); \ + _ret; \ +}) - if (ret) { - iget_failed(&inode->v); - return ERR_PTR(bch2_err_class(ret)); +/* + * Allocate a new inode, dropping/retaking btree locks if necessary: + */ +static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + + struct bch_inode_info *inode = + memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, + to_bch_ei(new_inode(c->vfs_sb))); + + if (unlikely(!inode)) { + int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM); + if (ret && inode) + discard_new_inode(&inode->v); + if (ret) + return ERR_PTR(ret); } - mutex_lock(&c->vfs_inodes_lock); - list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); - mutex_unlock(&c->vfs_inodes_lock); + return inode; +} - unlock_new_inode(&inode->v); +struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) +{ + struct bch_inode_info *inode = + to_bch_ei(ilookup5_nowait(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, + &inum)); + if (inode) + return &inode->v; - return &inode->v; + struct btree_trans *trans = bch2_trans_get(c); + + struct bch_inode_unpacked inode_u; + struct bch_subvolume subvol; + int ret = lockrestart_do(trans, + bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: + PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); + if (!ret) { + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); + inode = bch2_inode_insert(c, inode); + } + bch2_trans_put(trans); + + return ret ? 
ERR_PTR(ret) : &inode->v; } struct bch_inode_info * @@ -226,7 +269,7 @@ __bch2_create(struct mnt_idmap *idmap, struct bch_fs *c = dir->v.i_sb->s_fs_info; struct btree_trans *trans; struct bch_inode_unpacked dir_u; - struct bch_inode_info *inode, *old; + struct bch_inode_info *inode; struct bch_inode_unpacked inode_u; struct posix_acl *default_acl = NULL, *acl = NULL; subvol_inum inum; @@ -293,7 +336,6 @@ __bch2_create(struct mnt_idmap *idmap, mutex_unlock(&dir->ei_update_lock); } - bch2_iget5_set(&inode->v, &inum); bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); @@ -304,36 +346,7 @@ __bch2_create(struct mnt_idmap *idmap, * bch2_trans_exit() and dropping locks, else we could race with another * thread pulling the inode in and modifying it: */ - - inode->v.i_state |= I_CREATING; - - old = to_bch_ei(inode_insert5(&inode->v, - bch2_inode_hash(inum), - bch2_iget5_test, - bch2_iget5_set, - &inum)); - BUG_ON(!old); - - if (unlikely(old != inode)) { - /* - * We raced, another process pulled the new inode into cache - * before us: - */ - make_bad_inode(&inode->v); - iput(&inode->v); - - inode = old; - } else { - mutex_lock(&c->vfs_inodes_lock); - list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); - mutex_unlock(&c->vfs_inodes_lock); - /* - * we really don't want insert_inode_locked2() to be setting - * I_NEW... - */ - unlock_new_inode(&inode->v); - } - + inode = bch2_inode_insert(c, inode); bch2_trans_put(trans); err: posix_acl_release(default_acl); @@ -1372,6 +1385,7 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *bi, struct bch_subvolume *subvol) { + bch2_iget5_set(&inode->v, &inum); bch2_inode_update_after_write(trans, inode, bi, ~0); if (BCH_SUBVOLUME_SNAP(subvol)) From 3abd0b6693673ffac73b598dad7158ec9718d814 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 Jan 2024 12:36:37 -0500 Subject: [PATCH 0499/1406] bcachefs: bch2_lookup() gives better error message on inode not found When a dirent points to a missing inode, we really should print out the dirent. This requires quite a bit of refactoring, but there are some other benefits: we now do the entire lookup (dirent and inode) in a single btree transaction, and copy to the VFS inode with btree locks still held, like the create path.
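A side note on the code below: the transaction body chains its steps with GNU C's "a ?: b" extension, which these patches use heavily. "a ?: b" evaluates to a when a is nonzero and to b otherwise, so a chain of int-returning helpers short-circuits at the first error. A tiny standalone illustration (helper names invented):

	#include <stdio.h>

	static int step_a(void) { return 0; }	/* succeeds */
	static int step_b(void) { return -2; }	/* fails */
	static int step_c(void) { puts("step_c"); return 0; }

	int main(void)
	{
		/* step_c() is never evaluated: step_b()'s -2 ends the chain */
		int ret = step_a() ?: step_b() ?: step_c();
		printf("ret = %d\n", ret);	/* prints: ret = -2 */
		return 0;
	}

Wrapped in lockrestart_do() or bch2_trans_do(), the whole chain is simply re-run from the top on a transaction restart, which is what makes it safe to express the dirent lookup, subvolume lookup, and inode read below as one transaction body.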
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 73 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 64 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index bcc1436c172068..093f5404a655a0 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -365,23 +365,78 @@ __bch2_create(struct mnt_idmap *idmap, /* methods */ +static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, + subvol_inum dir, struct bch_hash_info *dir_hash_info, + const struct qstr *name) +{ + struct bch_fs *c = trans->c; + struct btree_iter dirent_iter = {}; + subvol_inum inum = {}; + + int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, + dir_hash_info, dir, name, 0); + if (ret) + return ERR_PTR(ret); + + struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter); + ret = bkey_err(k); + if (ret) + goto err; + + ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum); + if (ret > 0) + ret = -ENOENT; + if (ret) + goto err; + + struct bch_inode_info *inode = + to_bch_ei(ilookup5_nowait(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, + &inum)); + if (inode) + goto out; + + struct bch_subvolume subvol; + struct bch_inode_unpacked inode_u; + ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: + PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); + if (bch2_err_matches(ret, ENOENT)) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + bch_err(c, "%s points to missing inode", buf.buf); + printbuf_exit(&buf); + } + if (ret) + goto err; + + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); + inode = bch2_inode_insert(c, inode); +out: + bch2_trans_iter_exit(trans, &dirent_iter); + return inode; +err: + inode = ERR_PTR(ret); + goto out; +} + static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, unsigned int flags) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); - struct inode *vinode = NULL; - subvol_inum inum = { .subvol = 1 }; - int ret; - - ret = bch2_dirent_lookup(c, inode_inum(dir), &hash, - &dentry->d_name, &inum); - if (!ret) - vinode = bch2_vfs_inode_get(c, inum); + struct bch_inode_info *inode; + bch2_trans_do(c, NULL, NULL, 0, + PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir), + &hash, &dentry->d_name))); + if (IS_ERR(inode)) + inode = NULL; - return d_splice_alias(vinode, dentry); + return d_splice_alias(&inode->v, dentry); } static int bch2_mknod(struct mnt_idmap *idmap, From aee119a94da6500daa6857ad51b729147e7a2aa6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Jul 2023 20:35:06 -0400 Subject: [PATCH 0500/1406] mean and variance: Promote to lib/math Small statistics library, for taking in a series of values and computing mean, weighted mean, standard deviation and weighted deviation. The main use case is for statistics on latency measurements. Signed-off-by: Kent Overstreet Cc: Daniel Hill Cc: Darrick J.
Wong --- MAINTAINERS | 9 +++++++++ fs/bcachefs/Kconfig | 10 +--------- fs/bcachefs/Makefile | 3 --- fs/bcachefs/util.c | 2 +- fs/bcachefs/util.h | 3 +-- {fs/bcachefs => include/linux}/mean_and_variance.h | 0 lib/Kconfig.debug | 9 +++++++++ lib/math/Kconfig | 3 +++ lib/math/Makefile | 2 ++ {fs/bcachefs => lib/math}/mean_and_variance.c | 3 +-- {fs/bcachefs => lib/math}/mean_and_variance_test.c | 3 +-- 11 files changed, 28 insertions(+), 19 deletions(-) rename {fs/bcachefs => include/linux}/mean_and_variance.h (100%) rename {fs/bcachefs => lib/math}/mean_and_variance.c (99%) rename {fs/bcachefs => lib/math}/mean_and_variance_test.c (99%) diff --git a/MAINTAINERS b/MAINTAINERS index 1a46ba4e496a56..32354f59ead9d9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13380,6 +13380,15 @@ S: Maintained F: drivers/net/mdio/mdio-regmap.c F: include/linux/mdio/mdio-regmap.h +MEAN AND VARIANCE LIBRARY +M: Daniel B. Hill +M: Kent Overstreet +S: Maintained +T: git https://github.com/YellowOnion/linux/ +F: include/linux/mean_and_variance.h +F: lib/math/mean_and_variance.c +F: lib/math/mean_and_variance_test.c + MEASUREMENT COMPUTING CIO-DAC IIO DRIVER M: William Breathitt Gray L: linux-iio@vger.kernel.org diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 5cdfef3b551a78..72d1179262b33e 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -24,6 +24,7 @@ config BCACHEFS_FS select XXHASH select SRCU select SYMBOLIC_ERRNAME + select MEAN_AND_VARIANCE help The bcachefs filesystem - a modern, copy on write filesystem, with support for multiple devices, compression, checksumming, etc. @@ -86,12 +87,3 @@ config BCACHEFS_SIX_OPTIMISTIC_SPIN Instead of immediately sleeping when attempting to take a six lock that is held by another thread, spin for a short while, as long as the thread owning the lock is running. - -config MEAN_AND_VARIANCE_UNIT_TEST - tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS - depends on KUNIT - depends on BCACHEFS_FS - default KUNIT_ALL_TESTS - help - This option enables the kunit tests for mean_and_variance module. - If unsure, say N. diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 1a05cecda7cc5c..b11ba74b8ad41a 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -57,7 +57,6 @@ bcachefs-y := \ keylist.o \ logged_ops.o \ lru.o \ - mean_and_variance.o \ migrate.o \ move.o \ movinggc.o \ @@ -88,5 +87,3 @@ bcachefs-y := \ util.o \ varint.o \ xattr.o - -obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 231003b405efc3..c9d13dcf3ef1ab 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -22,9 +22,9 @@ #include #include #include +#include #include "eytzinger.h" -#include "mean_and_variance.h" #include "util.h" static const char si_units[] = "?kMGTPEZY"; diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index b414736d59a5b3..0059481995ef74 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -17,8 +17,7 @@ #include #include #include - -#include "mean_and_variance.h" +#include #include "darray.h" diff --git a/fs/bcachefs/mean_and_variance.h b/include/linux/mean_and_variance.h similarity index 100% rename from fs/bcachefs/mean_and_variance.h rename to include/linux/mean_and_variance.h diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 975a07f9f1cc08..817ddfe132cdab 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2191,6 +2191,15 @@ config CPUMASK_KUNIT_TEST If unsure, say N. 
+config MEAN_AND_VARIANCE_UNIT_TEST + tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS + depends on KUNIT + select MEAN_AND_VARIANCE + default KUNIT_ALL_TESTS + help + This option enables the kunit tests for mean_and_variance module. + If unsure, say N. + config TEST_LIST_SORT tristate "Linked list sorting test" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/math/Kconfig b/lib/math/Kconfig index 0634b428d0cb7f..7530ae9a3584fa 100644 --- a/lib/math/Kconfig +++ b/lib/math/Kconfig @@ -15,3 +15,6 @@ config PRIME_NUMBERS config RATIONAL tristate + +config MEAN_AND_VARIANCE + tristate diff --git a/lib/math/Makefile b/lib/math/Makefile index 91fcdb0c9efe44..8cdfa13a67ce03 100644 --- a/lib/math/Makefile +++ b/lib/math/Makefile @@ -4,6 +4,8 @@ obj-y += div64.o gcd.o lcm.o int_log.o int_pow.o int_sqrt.o reciprocal_div.o obj-$(CONFIG_CORDIC) += cordic.o obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o obj-$(CONFIG_RATIONAL) += rational.o +obj-$(CONFIG_MEAN_AND_VARIANCE) += mean_and_variance.o obj-$(CONFIG_TEST_DIV64) += test_div64.o obj-$(CONFIG_RATIONAL_KUNIT_TEST) += rational-test.o +obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o diff --git a/fs/bcachefs/mean_and_variance.c b/lib/math/mean_and_variance.c similarity index 99% rename from fs/bcachefs/mean_and_variance.c rename to lib/math/mean_and_variance.c index bf0ef668fd3832..ba90293204bae1 100644 --- a/fs/bcachefs/mean_and_variance.c +++ b/lib/math/mean_and_variance.c @@ -40,10 +40,9 @@ #include #include #include +#include #include -#include "mean_and_variance.h" - u128_u u128_div(u128_u n, u64 d) { u128_u r; diff --git a/fs/bcachefs/mean_and_variance_test.c b/lib/math/mean_and_variance_test.c similarity index 99% rename from fs/bcachefs/mean_and_variance_test.c rename to lib/math/mean_and_variance_test.c index 019583c3ca0eab..f45591a169d879 100644 --- a/fs/bcachefs/mean_and_variance_test.c +++ b/lib/math/mean_and_variance_test.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include - -#include "mean_and_variance.h" +#include #define MAX_SQR (SQRT_U64_MAX*SQRT_U64_MAX) From 80be0ad8102f3a8b0e5264f830855ac77f987fa7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Jan 2024 12:18:05 -0500 Subject: [PATCH 0501/1406] eytzinger: Promote to include/linux/ eytzinger trees are a faster alternative to binary search. They're a bit more expensive to set up, but lookups perform much better assuming the tree isn't entirely in cache. Binary search is a worst case scenario for branch prediction and prefetching, but eytzinger trees have children adjacent in memory and thus we can prefetch before knowing the result of a comparison. An eytzinger tree is a binary tree laid out in an array, with the same geometry as the usual binary heap construction, but used as a search tree instead. Signed-off-by: Kent Overstreet Reviewed-by: Darrick J.
Wong Signed-off-by: Kent Overstreet --- MAINTAINERS | 6 + fs/bcachefs/bset.c | 2 +- fs/bcachefs/journal_seq_blacklist.c | 6 +- fs/bcachefs/replicas.c | 19 ++- fs/bcachefs/replicas.h | 3 +- fs/bcachefs/super-io.h | 2 +- fs/bcachefs/util.c | 145 +-------------------- fs/bcachefs/util.h | 4 - {fs/bcachefs => include/linux}/eytzinger.h | 58 +++++---- lib/sort.c | 89 +++++++++++++ 10 files changed, 148 insertions(+), 186 deletions(-) rename {fs/bcachefs => include/linux}/eytzinger.h (77%) diff --git a/MAINTAINERS b/MAINTAINERS index 32354f59ead9d9..8322dbc120404c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8062,6 +8062,12 @@ L: iommu@lists.linux.dev S: Maintained F: drivers/iommu/exynos-iommu.c +EYTZINGER TREE LIB +M: Kent Overstreet +L: linux-bcachefs@vger.kernel.org +S: Maintained +F: include/linux/eytzinger.h + F2FS FILE SYSTEM M: Jaegeuk Kim M: Chao Yu diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 3fd1085b6c61ee..1d77aa55d641c6 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -9,12 +9,12 @@ #include "bcachefs.h" #include "btree_cache.h" #include "bset.h" -#include "eytzinger.h" #include "trace.h" #include "util.h" #include #include +#include #include #include diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 0200e299cfbb9c..024c9b1b323f84 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -2,10 +2,11 @@ #include "bcachefs.h" #include "btree_iter.h" -#include "eytzinger.h" #include "journal_seq_blacklist.h" #include "super-io.h" +#include + /* * journal_seq_blacklist machinery: * @@ -119,8 +120,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) return ret ?: bch2_blacklist_table_initialize(c); } -static int journal_seq_blacklist_table_cmp(const void *_l, - const void *_r, size_t size) +static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r) { const struct journal_seq_blacklist_table_entry *l = _l; const struct journal_seq_blacklist_table_entry *r = _r; diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index cc2672c120312c..678b9c20e2514b 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -6,12 +6,15 @@ #include "replicas.h" #include "super-io.h" +#include + static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, struct bch_replicas_cpu *); /* Some (buggy!) 
compilers don't allow memcmp to be passed as a pointer */ -static int bch2_memcmp(const void *l, const void *r, size_t size) +static int bch2_memcmp(const void *l, const void *r, const void *priv) { + size_t size = (size_t) priv; return memcmp(l, r, size); } @@ -39,7 +42,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) { - eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL); + eytzinger0_sort_r(r->entries, r->nr, r->entry_size, + bch2_memcmp, NULL, (void *)(size_t)r->entry_size); } static void bch2_replicas_entry_v0_to_text(struct printbuf *out, @@ -228,7 +232,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, verify_replicas_entry(search); -#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) +#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size) idx = eytzinger0_find(r->entries, r->nr, r->entry_size, entry_cmp, search); #undef entry_cmp @@ -824,10 +828,11 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, { unsigned i; - sort_cmp_size(cpu_r->entries, - cpu_r->nr, - cpu_r->entry_size, - bch2_memcmp, NULL); + sort_r(cpu_r->entries, + cpu_r->nr, + cpu_r->entry_size, + bch2_memcmp, NULL, + (void *)(size_t)cpu_r->entry_size); for (i = 0; i < cpu_r->nr; i++) { struct bch_replicas_entry_v1 *e = diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 654a4b26d3a3c9..983cce782ac2a2 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -3,9 +3,10 @@ #define _BCACHEFS_REPLICAS_H #include "bkey.h" -#include "eytzinger.h" #include "replicas_types.h" +#include + void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *); void bch2_replicas_entry_to_text(struct printbuf *, struct bch_replicas_entry_v1 *); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 95e80e06316bf4..f37620919e11a5 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -3,12 +3,12 @@ #define _BCACHEFS_SUPER_IO_H #include "extents.h" -#include "eytzinger.h" #include "super_types.h" #include "super.h" #include "sb-members.h" #include +#include static inline bool bch2_version_compatible(u16 version) { diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index c9d13dcf3ef1ab..902f6b1a8a1429 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -24,7 +25,6 @@ #include #include -#include "eytzinger.h" #include "util.h" static const char si_units[] = "?kMGTPEZY"; @@ -864,149 +864,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) } } -static int alignment_ok(const void *base, size_t align) -{ - return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || - ((unsigned long)base & (align - 1)) == 0; -} - -static void u32_swap(void *a, void *b, size_t size) -{ - u32 t = *(u32 *)a; - *(u32 *)a = *(u32 *)b; - *(u32 *)b = t; -} - -static void u64_swap(void *a, void *b, size_t size) -{ - u64 t = *(u64 *)a; - *(u64 *)a = *(u64 *)b; - *(u64 *)b = t; -} - -static void generic_swap(void *a, void *b, size_t size) -{ - char t; - - do { - t = *(char *)a; - *(char *)a++ = *(char *)b; - *(char *)b++ = t; - } while (--size > 0); -} - -static inline int do_cmp(void *base, size_t n, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - size_t l, size_t r) -{ - return cmp_func(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, - size); -} - -static inline void do_swap(void *base, size_t n, 
size_t size, - void (*swap_func)(void *, void *, size_t), - size_t l, size_t r) -{ - swap_func(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, - size); -} - -void eytzinger0_sort(void *base, size_t n, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t)) -{ - int i, c, r; - - if (!swap_func) { - if (size == 4 && alignment_ok(base, 4)) - swap_func = u32_swap; - else if (size == 8 && alignment_ok(base, 8)) - swap_func = u64_swap; - else - swap_func = generic_swap; - } - - /* heapify */ - for (i = n / 2 - 1; i >= 0; --i) { - for (r = i; r * 2 + 1 < n; r = c) { - c = r * 2 + 1; - - if (c + 1 < n && - do_cmp(base, n, size, cmp_func, c, c + 1) < 0) - c++; - - if (do_cmp(base, n, size, cmp_func, r, c) >= 0) - break; - - do_swap(base, n, size, swap_func, r, c); - } - } - - /* sort */ - for (i = n - 1; i > 0; --i) { - do_swap(base, n, size, swap_func, 0, i); - - for (r = 0; r * 2 + 1 < i; r = c) { - c = r * 2 + 1; - - if (c + 1 < i && - do_cmp(base, n, size, cmp_func, c, c + 1) < 0) - c++; - - if (do_cmp(base, n, size, cmp_func, r, c) >= 0) - break; - - do_swap(base, n, size, swap_func, r, c); - } - } -} - -void sort_cmp_size(void *base, size_t num, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t size)) -{ - /* pre-scale counters for performance */ - int i = (num/2 - 1) * size, n = num * size, c, r; - - if (!swap_func) { - if (size == 4 && alignment_ok(base, 4)) - swap_func = u32_swap; - else if (size == 8 && alignment_ok(base, 8)) - swap_func = u64_swap; - else - swap_func = generic_swap; - } - - /* heapify */ - for ( ; i >= 0; i -= size) { - for (r = i; r * 2 + size < n; r = c) { - c = r * 2 + size; - if (c < n - size && - cmp_func(base + c, base + c + size, size) < 0) - c += size; - if (cmp_func(base + r, base + c, size) >= 0) - break; - swap_func(base + r, base + c, size); - } - } - - /* sort */ - for (i = n - size; i > 0; i -= size) { - swap_func(base, base + i, size); - for (r = 0; r * 2 + size < i; r = c) { - c = r * 2 + size; - if (c < i - size && - cmp_func(base + c, base + c + size, size) < 0) - c += size; - if (cmp_func(base + r, base + c, size) >= 0) - break; - swap_func(base + r, base + c, size); - } - } -} - static void mempool_free_vp(void *element, void *pool_data) { size_t size = (size_t) pool_data; diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 0059481995ef74..c3b11c3d24ea98 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -737,10 +737,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes) memset(s + bytes, c, rem); } -void sort_cmp_size(void *base, size_t num, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t)); - /* just the memmove, doesn't update @_nr */ #define __array_insert_item(_array, _nr, _pos) \ memmove(&(_array)[(_pos) + 1], \ diff --git a/fs/bcachefs/eytzinger.h b/include/linux/eytzinger.h similarity index 77% rename from fs/bcachefs/eytzinger.h rename to include/linux/eytzinger.h index b04750dbf870bc..1031501030449d 100644 --- a/fs/bcachefs/eytzinger.h +++ b/include/linux/eytzinger.h @@ -1,27 +1,37 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _EYTZINGER_H -#define _EYTZINGER_H +#ifndef _LINUX_EYTZINGER_H +#define _LINUX_EYTZINGER_H #include #include -#include "util.h" +#ifdef EYTZINGER_DEBUG +#define EYTZINGER_BUG_ON(cond) BUG_ON(cond) +#else +#define EYTZINGER_BUG_ON(cond) +#endif /* * Traversal for 
trees in eytzinger layout - a full binary tree layed out in an - * array - */ - -/* - * One based indexing version: + * array. * - * With one based indexing each level of the tree starts at a power of two - - * good for cacheline alignment: + * Consider using an eytzinger tree any time you would otherwise be doing binary + * search over an array. Binary search is a worst case scenario for branch + * prediction and prefetching, but in an eytzinger tree every node's children + * are adjacent in memory, thus we can prefetch children before knowing the + * result of the comparison, assuming multiple nodes fit on a cacheline. + * + * Two variants are provided, for one based indexing and zero based indexing. + * + * Zero based indexing is more convenient, but one based indexing has better + * alignment and thus better performance because each new level of the tree + * starts at a power of two, and thus if element 0 was cacheline aligned, each + * new level will be as well. */ static inline unsigned eytzinger1_child(unsigned i, unsigned child) { - EBUG_ON(child > 1); + EYTZINGER_BUG_ON(child > 1); return (i << 1) + child; } @@ -58,7 +68,7 @@ static inline unsigned eytzinger1_last(unsigned size) static inline unsigned eytzinger1_next(unsigned i, unsigned size) { - EBUG_ON(i > size); + EYTZINGER_BUG_ON(i > size); if (eytzinger1_right_child(i) <= size) { i = eytzinger1_right_child(i); @@ -74,7 +84,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size) static inline unsigned eytzinger1_prev(unsigned i, unsigned size) { - EBUG_ON(i > size); + EYTZINGER_BUG_ON(i > size); if (eytzinger1_left_child(i) <= size) { i = eytzinger1_left_child(i) + 1; @@ -101,7 +111,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, unsigned shift = __fls(size) - b; int s; - EBUG_ON(!i || i > size); + EYTZINGER_BUG_ON(!i || i > size); i ^= 1U << b; i <<= 1; @@ -126,7 +136,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, unsigned shift; int s; - EBUG_ON(!i || i > size); + EYTZINGER_BUG_ON(!i || i > size); /* * sign bit trick: @@ -164,7 +174,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) static inline unsigned eytzinger0_child(unsigned i, unsigned child) { - EBUG_ON(child > 1); + EYTZINGER_BUG_ON(child > 1); return (i << 1) + 1 + child; } @@ -231,11 +241,9 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) (_i) != -1; \ (_i) = eytzinger0_next((_i), (_size))) -typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); - /* return greatest node <= @search, or -1 if not found */ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, - eytzinger_cmp_fn cmp, const void *search) + cmp_func_t cmp, const void *search) { unsigned i, n = 0; @@ -244,7 +252,7 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, do { i = n; - n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); + n = eytzinger0_child(i, cmp(search, base + i * size) >= 0); } while (n < nr); if (n & 1) { @@ -269,13 +277,13 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, int _res; \ \ while (_i < _nr && \ - (_res = _cmp(_search, _base + _i * _size, _size))) \ + (_res = _cmp(_search, _base + _i * _size))) \ _i = eytzinger0_child(_i, _res > 0); \ _i; \ }) -void eytzinger0_sort(void *, size_t, size_t, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t)); +void eytzinger0_sort_r(void *, size_t, 
size_t, + cmp_r_func_t, swap_r_func_t, const void *); +void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t); -#endif /* _EYTZINGER_H */ +#endif /* _LINUX_EYTZINGER_H */ diff --git a/lib/sort.c b/lib/sort.c index b399bf10d6759b..f5b2206c73461e 100644 --- a/lib/sort.c +++ b/lib/sort.c @@ -290,3 +290,92 @@ void sort(void *base, size_t num, size_t size, return sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); } EXPORT_SYMBOL(sort); + +#include + +static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size, + cmp_r_func_t cmp_func, const void *priv, + size_t l, size_t r) +{ + return do_cmp(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + cmp_func, priv); +} + +static inline void eytzinger0_do_swap(void *base, size_t n, size_t size, + swap_r_func_t swap_func, const void *priv, + size_t l, size_t r) +{ + do_swap(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + size, swap_func, priv); +} + +void eytzinger0_sort_r(void *base, size_t n, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) +{ + int i, c, r; + + /* called from 'sort' without swap function, let's pick the default */ + if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap) + swap_func = NULL; + + if (!swap_func) { + if (is_aligned(base, size, 8)) + swap_func = SWAP_WORDS_64; + else if (is_aligned(base, size, 4)) + swap_func = SWAP_WORDS_32; + else + swap_func = SWAP_BYTES; + } + + /* heapify */ + for (i = n / 2 - 1; i >= 0; --i) { + for (r = i; r * 2 + 1 < n; r = c) { + c = r * 2 + 1; + + if (c + 1 < n && + eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0) + c++; + + if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0) + break; + + eytzinger0_do_swap(base, n, size, swap_func, priv, r, c); + } + } + + /* sort */ + for (i = n - 1; i > 0; --i) { + eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i); + + for (r = 0; r * 2 + 1 < i; r = c) { + c = r * 2 + 1; + + if (c + 1 < i && + eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0) + c++; + + if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0) + break; + + eytzinger0_do_swap(base, n, size, swap_func, priv, r, c); + } + } +} +EXPORT_SYMBOL_GPL(eytzinger0_sort_r); + +void eytzinger0_sort(void *base, size_t n, size_t size, + cmp_func_t cmp_func, + swap_func_t swap_func) +{ + struct wrapper w = { + .cmp = cmp_func, + .swap = swap_func, + }; + + return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); +} +EXPORT_SYMBOL_GPL(eytzinger0_sort); From 316c2091ed09d86ed37bae8f73557d5b6d70e801 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Jan 2024 11:44:38 -0500 Subject: [PATCH 0502/1406] bcachefs: bch2_time_stats_to_seq_buf() Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 2 + fs/bcachefs/util.c | 129 +++++++++++++++++++++++++++++++++++++++----- fs/bcachefs/util.h | 4 ++ 3 files changed, 121 insertions(+), 14 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index ab14277438d009..6ca877f1c3325a 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1262,6 +1262,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, bch2_time_stats_init(&ca->io_latency[READ]); bch2_time_stats_init(&ca->io_latency[WRITE]); + ca->io_latency[READ].quantiles_enabled = true; + ca->io_latency[WRITE].quantiles_enabled = true; ca->mi = bch2_mi_to_cpu(member); diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 
902f6b1a8a1429..4c63f81e18bc45 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -506,10 +506,8 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) { - const struct time_unit *u; s64 f_mean = 0, d_mean = 0; - u64 q, last_q = 0, f_stddev = 0, d_stddev = 0; - int i; + u64 f_stddev = 0, d_stddev = 0; if (stats->buffer) { int cpu; @@ -608,19 +606,122 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats printbuf_tabstops_reset(out); - i = eytzinger0_first(NR_QUANTILES); - u = pick_time_units(stats->quantiles.entries[i].m); + if (stats->quantiles_enabled) { + int i = eytzinger0_first(NR_QUANTILES); + const struct time_unit *u = + pick_time_units(stats->quantiles.entries[i].m); + u64 last_q = 0; + + prt_printf(out, "quantiles (%s):\t", u->name); + eytzinger0_for_each(i, NR_QUANTILES) { + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + + u64 q = max(stats->quantiles.entries[i].m, last_q); + prt_printf(out, "%llu ", div_u64(q, u->nsecs)); + if (is_last) + prt_newline(out); + last_q = q; + } + } +} + +#include + +static void seq_buf_time_units_aligned(struct seq_buf *out, u64 ns) +{ + const struct time_unit *u = pick_time_units(ns); + + seq_buf_printf(out, "%8llu %s", div64_u64(ns, u->nsecs), u->name); +} + +void bch2_time_stats_to_seq_buf(struct seq_buf *out, struct bch2_time_stats *stats) +{ + s64 f_mean = 0, d_mean = 0; + u64 f_stddev = 0, d_stddev = 0; + + if (stats->buffer) { + int cpu; - prt_printf(out, "quantiles (%s):\t", u->name); - eytzinger0_for_each(i, NR_QUANTILES) { - bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + spin_lock_irq(&stats->lock); + for_each_possible_cpu(cpu) + __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu)); + spin_unlock_irq(&stats->lock); + } - q = max(stats->quantiles.entries[i].m, last_q); - prt_printf(out, "%llu ", - div_u64(q, u->nsecs)); - if (is_last) - prt_newline(out); - last_q = q; + /* + * avoid divide by zero + */ + if (stats->freq_stats.n) { + f_mean = mean_and_variance_get_mean(stats->freq_stats); + f_stddev = mean_and_variance_get_stddev(stats->freq_stats); + d_mean = mean_and_variance_get_mean(stats->duration_stats); + d_stddev = mean_and_variance_get_stddev(stats->duration_stats); + } + + seq_buf_printf(out, "count: %llu\n", stats->duration_stats.n); + + seq_buf_printf(out, " since mount recent\n"); + + seq_buf_printf(out, "duration of events\n"); + + seq_buf_printf(out, " min: "); + seq_buf_time_units_aligned(out, stats->min_duration); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " max: "); + seq_buf_time_units_aligned(out, stats->max_duration); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " total: "); + seq_buf_time_units_aligned(out, stats->total_duration); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " mean: "); + seq_buf_time_units_aligned(out, d_mean); + seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " stddev: "); + seq_buf_time_units_aligned(out, d_stddev); + seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, "time between events\n"); + + seq_buf_printf(out, " min: "); + seq_buf_time_units_aligned(out, stats->min_freq); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " max: "); + seq_buf_time_units_aligned(out, 
stats->max_freq); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " mean: "); + seq_buf_time_units_aligned(out, f_mean); + seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " stddev: "); + seq_buf_time_units_aligned(out, f_stddev); + seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); + seq_buf_printf(out, "\n"); + + if (stats->quantiles_enabled) { + int i = eytzinger0_first(NR_QUANTILES); + const struct time_unit *u = + pick_time_units(stats->quantiles.entries[i].m); + u64 last_q = 0; + + prt_printf(out, "quantiles (%s):\t", u->name); + eytzinger0_for_each(i, NR_QUANTILES) { + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + + u64 q = max(stats->quantiles.entries[i].m, last_q); + seq_buf_printf(out, "%llu ", div_u64(q, u->nsecs)); + if (is_last) + seq_buf_printf(out, "\n"); + last_q = q; + } } } #else diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index c3b11c3d24ea98..7ff2d4fe26f684 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -382,6 +382,7 @@ struct bch2_time_stat_buffer { struct bch2_time_stats { spinlock_t lock; + bool quantiles_enabled; /* all fields are in nanoseconds */ u64 min_duration; u64 max_duration; @@ -435,6 +436,9 @@ static inline bool track_event_change(struct bch2_time_stats *stats, void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); +struct seq_buf; +void bch2_time_stats_to_seq_buf(struct seq_buf *, struct bch2_time_stats *); + void bch2_time_stats_exit(struct bch2_time_stats *); void bch2_time_stats_init(struct bch2_time_stats *); From fefc5e735c643912f5dd69fbfad6c75050ad6ed0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Jan 2024 11:58:44 -0500 Subject: [PATCH 0503/1406] time_stats: Promote to lib/ Library code from bcachefs for tracking latency measurements. The main interface is time_stats_update(stats, start_time); which collects a new event with an end time of the current time. It features percpu buffering of input values, making it very low overhead, and nicely formatted output to printbufs or seq_buf. Sample output, from the bcache conversion: root@moria-kvm:/sys/fs/bcache/bdaedb8c-4554-4dd2-87e4-276e51eb47cc# cat internal/btree_sort_times count: 6414 since mount recent duration of events min: 440 ns max: 1102 us total: 674 ms mean: 105 us 102 us stddev: 101 us 88 us time between events min: 881 ns max: 3 s mean: 7 ms 6 ms stddev: 52 ms 6 ms Cc: Darrick J. Wong Cc: Dave Chinner Cc: Theodore Ts'o Cc: Coly Li Signed-off-by: Kent Overstreet Reviewed-by: Darrick J. 
Wong --- MAINTAINERS | 7 + fs/bcachefs/Kconfig | 2 +- fs/bcachefs/alloc_foreground.c | 13 +- fs/bcachefs/bcachefs.h | 11 +- fs/bcachefs/btree_cache.c | 2 +- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/btree_io.c | 8 +- fs/bcachefs/btree_iter.c | 8 +- fs/bcachefs/btree_locking.h | 2 +- fs/bcachefs/btree_update_interior.c | 8 +- fs/bcachefs/io_read.c | 4 +- fs/bcachefs/io_write.c | 4 +- fs/bcachefs/journal.c | 5 +- fs/bcachefs/journal_io.c | 9 +- fs/bcachefs/journal_reclaim.c | 9 +- fs/bcachefs/journal_types.h | 11 +- fs/bcachefs/nocow_locking.c | 2 +- fs/bcachefs/super.c | 12 +- fs/bcachefs/util.c | 263 +-------------------------- fs/bcachefs/util.h | 83 +-------- include/linux/time_stats.h | 134 ++++++++++++++ lib/Kconfig | 4 + lib/Makefile | 2 + lib/time_stats.c | 271 ++++++++++++++++++++++++++++ 24 files changed, 470 insertions(+), 406 deletions(-) create mode 100644 include/linux/time_stats.h create mode 100644 lib/time_stats.c diff --git a/MAINTAINERS b/MAINTAINERS index 8322dbc120404c..f2face46f3650d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22140,6 +22140,13 @@ F: kernel/time/ntp.c F: kernel/time/time*.c F: tools/testing/selftests/timers/ +TIME STATS: +M: Kent Overstreet +M: Darrick J. Wong +S: Maintained +F: include/linux/time_stats.h +F: lib/time_stats.c + TIPC NETWORK LAYER M: Jon Maloy M: Ying Xue diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 72d1179262b33e..8c587ddd2f85ef 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -24,7 +24,7 @@ config BCACHEFS_FS select XXHASH select SRCU select SYMBOLIC_ERRNAME - select MEAN_AND_VARIANCE + select TIME_STATS help The bcachefs filesystem - a modern, copy on write filesystem, with support for multiple devices, compression, checksumming, etc. diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 633d3223b353f8..ca58193dd90279 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -236,8 +236,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * if (cl) closure_wait(&c->open_buckets_wait, cl); - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], - &c->blocked_allocate_open_bucket, true); + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true); spin_unlock(&c->freelist_lock); return ERR_PTR(-BCH_ERR_open_buckets_empty); } @@ -263,11 +262,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * ca->nr_open_buckets++; bch2_open_bucket_hash_add(c, ob); - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], - &c->blocked_allocate_open_bucket, false); - - track_event_change(&c->times[BCH_TIME_blocked_allocate], - &c->blocked_allocate, false); + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false); + track_event_change(&c->times[BCH_TIME_blocked_allocate], false); spin_unlock(&c->freelist_lock); return ob; @@ -555,8 +551,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, goto again; } - track_event_change(&c->times[BCH_TIME_blocked_allocate], - &c->blocked_allocate, true); + track_event_change(&c->times[BCH_TIME_blocked_allocate], true); ob = ERR_PTR(-BCH_ERR_freelist_empty); goto err; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 69d0d60d50e366..92547d6fd2d95b 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -200,6 +200,7 @@ #include #include #include +#include #include #include #include @@ -593,7 +594,7 @@ struct bch_dev { /* The rest of this all shows 
up in sysfs */ atomic64_t cur_latency[2]; - struct bch2_time_stats io_latency[2]; + struct time_stats io_latency[2]; #define CONGESTED_MAX 1024 atomic_t congested; @@ -640,8 +641,8 @@ struct btree_debug { #define BCH_TRANSACTIONS_NR 128 struct btree_transaction_stats { - struct bch2_time_stats duration; - struct bch2_time_stats lock_hold_times; + struct time_stats duration; + struct time_stats lock_hold_times; struct mutex lock; unsigned nr_max_paths; unsigned journal_entries_size; @@ -919,8 +920,6 @@ struct bch_fs { /* ALLOCATOR */ spinlock_t freelist_lock; struct closure_waitlist freelist_wait; - u64 blocked_allocate; - u64 blocked_allocate_open_bucket; open_bucket_idx_t open_buckets_freelist; open_bucket_idx_t open_buckets_nr_free; @@ -1104,7 +1103,7 @@ struct bch_fs { unsigned copy_gc_enabled:1; bool promote_whole_extents; - struct bch2_time_stats times[BCH_TIME_STAT_NR]; + struct time_stats times[BCH_TIME_STAT_NR]; struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index a8b393bc7567b4..9dcc4f9334bf22 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -648,7 +648,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea bch2_btree_keys_init(b); set_btree_node_accessed(b); - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], + time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], start_time); memalloc_nofs_restore(flags); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 4a08af94634a4f..642f4c929b3c4e 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1973,7 +1973,7 @@ int bch2_gc_gens(struct bch_fs *c) c->gc_count++; - bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); + time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); trace_and_count(c, gc_gens_end, c); err: for_each_member_device(c, ca) { diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index aa9b6cbe322690..a56dcabb7ace7a 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -327,7 +327,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes); if (sorting_entire_node) - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], + time_stats_update(&c->times[BCH_TIME_btree_node_sort], start_time); /* Make sure we preserve bset journal_seq: */ @@ -397,7 +397,7 @@ void bch2_btree_sort_into(struct bch_fs *c, &dst->format, true); - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], + time_stats_update(&c->times[BCH_TIME_btree_node_sort], start_time); set_btree_bset_end(dst, dst->set); @@ -1251,7 +1251,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, out: mempool_free(iter, &c->fill_iter); printbuf_exit(&buf); - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time); + time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time); return retry_read; fsck_err: if (ret == -BCH_ERR_btree_node_read_err_want_retry || @@ -1323,7 +1323,7 @@ static void btree_node_read_work(struct work_struct *work) } } - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], + time_stats_update(&c->times[BCH_TIME_btree_node_read], rb->start_time); bio_put(&rb->bio); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 924c58823f0d56..3aac6ed5446ebd 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2899,7 +2899,7 @@ u32 
bch2_trans_begin(struct btree_trans *trans) if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) && time_after64(now, trans->last_begin_time + 10)) - __bch2_time_stats_update(&btree_trans_stats(trans)->duration, + __time_stats_update(&btree_trans_stats(trans)->duration, trans->last_begin_time, now); if (!trans->restarted && @@ -3224,7 +3224,7 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { kfree(s->max_paths_text); - bch2_time_stats_exit(&s->lock_hold_times); + time_stats_exit(&s->lock_hold_times); } if (c->btree_trans_barrier_initialized) @@ -3240,8 +3240,8 @@ void bch2_fs_btree_iter_init_early(struct bch_fs *c) for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { - bch2_time_stats_init(&s->duration); - bch2_time_stats_init(&s->lock_hold_times); + time_stats_init(&s->duration); + time_stats_init(&s->lock_hold_times); mutex_init(&s->lock); } diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 4bd72c855da1a4..f2e2c5881b7e4b 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -122,7 +122,7 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans, struct btree_path *path, unsigned level) { #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - __bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times, + __time_stats_update(&btree_trans_stats(trans)->lock_hold_times, path->l[level].lock_taken_time, local_clock()); #endif diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index cd87ac3a9b0d49..efe51d99dce40c 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -517,7 +517,7 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans * bch2_disk_reservation_put(c, &as->disk_res); bch2_btree_reserve_put(as, trans); - bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], + time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], as->start_time); mutex_lock(&c->btree_interior_update_lock); @@ -1039,7 +1039,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * continue_at(&as->cl, btree_update_set_nodes_written, as->c->btree_interior_update_worker); - bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground], + time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground], start_time); } @@ -1630,7 +1630,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, bch2_trans_verify_locks(trans); - bch2_time_stats_update(&c->times[n2 + time_stats_update(&c->times[n2 ? 
BCH_TIME_btree_node_split : BCH_TIME_btree_node_compact], start_time); @@ -1936,7 +1936,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_update_done(as, trans); - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); + time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); out: err: if (new_path) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 3c574d8873a1e2..dce136cd227132 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -134,7 +134,7 @@ static void promote_done(struct bch_write_op *wop) container_of(wop, struct promote_op, write.op); struct bch_fs *c = op->write.op.c; - bch2_time_stats_update(&c->times[BCH_TIME_data_promote], + time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); promote_free(c, op); } @@ -356,7 +356,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) static void bch2_rbio_done(struct bch_read_bio *rbio) { if (rbio->start_time) - bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], + time_stats_update(&rbio->c->times[BCH_TIME_data_read], rbio->start_time); bio_endio(&rbio->bio); } diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 2c098ac017b30b..8123a84320e3f1 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -88,7 +88,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) bch2_congested_acct(ca, io_latency, now, rw); - __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); + __time_stats_update(&ca->io_latency[rw], submit_time, now); } #endif @@ -457,7 +457,7 @@ static void bch2_write_done(struct closure *cl) EBUG_ON(op->open_buckets.nr); - bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); + time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); bch2_disk_reservation_put(c, &op->res); if (!(op->flags & BCH_WRITE_MOVE)) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index fe5f7a944ad308..c040f69dfb5c70 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -555,8 +555,7 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, ret = -BCH_ERR_journal_res_get_blocked; if (ret == JOURNAL_ERR_max_in_flight && - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], - &j->max_in_flight_start, true)) { + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) { struct printbuf buf = PRINTBUF; prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); @@ -754,7 +753,7 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); if (!ret) - bch2_time_stats_update(j->flush_seq_time, start_time); + time_stats_update(j->flush_seq_time, start_time); return ret ?: ret2 < 0 ? ret2 : 0; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index cd8921a2c0daed..7c2321c5af2ac3 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1605,9 +1605,9 @@ static CLOSURE_CALLBACK(journal_write_done) u64 v, seq = le64_to_cpu(w->data->seq); int err = 0; - bch2_time_stats_update(!JSET_NO_FLUSH(w->data) - ? j->flush_write_time - : j->noflush_write_time, j->write_start_time); + time_stats_update(!JSET_NO_FLUSH(w->data) + ? 
j->flush_write_time + : j->noflush_write_time, j->write_start_time); if (!w->devs_written.nr) { bch_err(c, "unable to write journal to sufficient devices"); @@ -1677,8 +1677,7 @@ static CLOSURE_CALLBACK(journal_write_done) bch2_journal_reclaim_fast(j); bch2_journal_space_available(j); - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], - &j->max_in_flight_start, false); + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); closure_wake_up(&w->wait); journal_wake(j); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index f4d0c726f34817..a71550816c3040 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -62,12 +62,9 @@ void bch2_journal_set_watermark(struct journal *j) ? BCH_WATERMARK_reclaim : BCH_WATERMARK_stripe; - if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], - &j->low_on_space_start, low_on_space) || - track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], - &j->low_on_pin_start, low_on_pin) || - track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], - &j->write_buffer_full_start, low_on_wb)) + if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) || + track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) || + track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb)) trace_and_count(c, journal_full, c); swap(watermark, j->watermark); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 3696aac3ccb728..011f7a0d4ebd8c 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -287,14 +287,9 @@ struct journal { u64 nr_noflush_writes; u64 entry_bytes_written; - u64 low_on_space_start; - u64 low_on_pin_start; - u64 max_in_flight_start; - u64 write_buffer_full_start; - - struct bch2_time_stats *flush_write_time; - struct bch2_time_stats *noflush_write_time; - struct bch2_time_stats *flush_seq_time; + struct time_stats *flush_write_time; + struct time_stats *noflush_write_time; + struct time_stats *flush_seq_time; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map res_map; diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c index 3c21981a4a1c09..181efa4a83fa12 100644 --- a/fs/bcachefs/nocow_locking.c +++ b/fs/bcachefs/nocow_locking.c @@ -85,7 +85,7 @@ void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, u64 start_time = local_clock(); __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags)); - bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); + time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); } } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 6ca877f1c3325a..647ab8ebe3582d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -520,7 +520,7 @@ static void __bch2_fs_free(struct bch_fs *c) unsigned i; for (i = 0; i < BCH_TIME_STAT_NR; i++) - bch2_time_stats_exit(&c->times[i]); + time_stats_exit(&c->times[i]); bch2_free_pending_node_rewrites(c); bch2_fs_sb_errors_exit(c); @@ -753,7 +753,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->journal_keys.initial_ref_held = true; for (i = 0; i < BCH_TIME_STAT_NR; i++) - bch2_time_stats_init(&c->times[i]); + time_stats_init(&c->times[i]); bch2_fs_copygc_init(c); bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); @@ -1168,8 +1168,8 @@ static void bch2_dev_free(struct bch_dev *ca) bch2_dev_buckets_free(ca); 
free_page((unsigned long) ca->sb_read_scratch); - bch2_time_stats_exit(&ca->io_latency[WRITE]); - bch2_time_stats_exit(&ca->io_latency[READ]); + time_stats_exit(&ca->io_latency[WRITE]); + time_stats_exit(&ca->io_latency[READ]); percpu_ref_exit(&ca->io_ref); percpu_ref_exit(&ca->ref); @@ -1260,8 +1260,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, INIT_WORK(&ca->io_error_work, bch2_io_error_work); - bch2_time_stats_init(&ca->io_latency[READ]); - bch2_time_stats_init(&ca->io_latency[WRITE]); + time_stats_init(&ca->io_latency[READ]); + time_stats_init(&ca->io_latency[WRITE]); ca->io_latency[READ].quantiles_enabled = true; ca->io_latency[WRITE].quantiles_enabled = true; diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 4c63f81e18bc45..88853513a15faf 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -337,32 +337,6 @@ void bch2_prt_datetime(struct printbuf *out, time64_t sec) } #endif -static const struct time_unit { - const char *name; - u64 nsecs; -} time_units[] = { - { "ns", 1 }, - { "us", NSEC_PER_USEC }, - { "ms", NSEC_PER_MSEC }, - { "s", NSEC_PER_SEC }, - { "m", (u64) NSEC_PER_SEC * 60}, - { "h", (u64) NSEC_PER_SEC * 3600}, - { "eon", U64_MAX }, -}; - -static const struct time_unit *pick_time_units(u64 ns) -{ - const struct time_unit *u; - - for (u = time_units; - u + 1 < time_units + ARRAY_SIZE(time_units) && - ns >= u[1].nsecs << 1; - u++) - ; - - return u; -} - void bch2_pr_time_units(struct printbuf *out, u64 ns) { const struct time_unit *u = pick_time_units(ns); @@ -370,121 +344,6 @@ void bch2_pr_time_units(struct printbuf *out, u64 ns) prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); } -/* time stats: */ - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v) -{ - unsigned i = 0; - - while (i < ARRAY_SIZE(q->entries)) { - struct bch2_quantile_entry *e = q->entries + i; - - if (unlikely(!e->step)) { - e->m = v; - e->step = max_t(unsigned, v / 2, 1024); - } else if (e->m > v) { - e->m = e->m >= e->step - ? e->m - e->step - : 0; - } else if (e->m < v) { - e->m = e->m + e->step > e->m - ? e->m + e->step - : U32_MAX; - } - - if ((e->m > v ? 
e->m - v : v - e->m) < e->step) - e->step = max_t(unsigned, e->step / 2, 1); - - if (v >= e->m) - break; - - i = eytzinger0_child(i, v > e->m); - } -} - -static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, - u64 start, u64 end) -{ - u64 duration, freq; - - if (time_after64(end, start)) { - duration = end - start; - mean_and_variance_update(&stats->duration_stats, duration); - mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); - stats->max_duration = max(stats->max_duration, duration); - stats->min_duration = min(stats->min_duration, duration); - stats->total_duration += duration; - bch2_quantiles_update(&stats->quantiles, duration); - } - - if (stats->last_event && time_after64(end, stats->last_event)) { - freq = end - stats->last_event; - mean_and_variance_update(&stats->freq_stats, freq); - mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq); - stats->max_freq = max(stats->max_freq, freq); - stats->min_freq = min(stats->min_freq, freq); - } - - stats->last_event = end; -} - -static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, - struct bch2_time_stat_buffer *b) -{ - for (struct bch2_time_stat_buffer_entry *i = b->entries; - i < b->entries + ARRAY_SIZE(b->entries); - i++) - bch2_time_stats_update_one(stats, i->start, i->end); - b->nr = 0; -} - -static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, - struct bch2_time_stat_buffer *b) -{ - unsigned long flags; - - spin_lock_irqsave(&stats->lock, flags); - __bch2_time_stats_clear_buffer(stats, b); - spin_unlock_irqrestore(&stats->lock, flags); -} - -void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) -{ - unsigned long flags; - - WARN_ONCE(!stats->duration_stats_weighted.weight || - !stats->freq_stats_weighted.weight, - "uninitialized time_stats"); - - if (!stats->buffer) { - spin_lock_irqsave(&stats->lock, flags); - bch2_time_stats_update_one(stats, start, end); - - if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 && - stats->duration_stats.n > 1024) - stats->buffer = - alloc_percpu_gfp(struct bch2_time_stat_buffer, - GFP_ATOMIC); - spin_unlock_irqrestore(&stats->lock, flags); - } else { - struct bch2_time_stat_buffer *b; - - preempt_disable(); - b = this_cpu_ptr(stats->buffer); - - BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); - b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) { - .start = start, - .end = end - }; - - if (unlikely(b->nr == ARRAY_SIZE(b->entries))) - bch2_time_stats_clear_buffer(stats, b); - preempt_enable(); - } -} - static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) { const struct time_unit *u = pick_time_units(ns); @@ -504,7 +363,7 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 #define TABSTOP_SIZE 12 -void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) +void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) { s64 f_mean = 0, d_mean = 0; u64 f_stddev = 0, d_stddev = 0; @@ -514,7 +373,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats spin_lock_irq(&stats->lock); for_each_possible_cpu(cpu) - __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu)); + __time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu)); spin_unlock_irq(&stats->lock); } @@ -625,124 +484,6 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats } } -#include - -static void 
seq_buf_time_units_aligned(struct seq_buf *out, u64 ns) -{ - const struct time_unit *u = pick_time_units(ns); - - seq_buf_printf(out, "%8llu %s", div64_u64(ns, u->nsecs), u->name); -} - -void bch2_time_stats_to_seq_buf(struct seq_buf *out, struct bch2_time_stats *stats) -{ - s64 f_mean = 0, d_mean = 0; - u64 f_stddev = 0, d_stddev = 0; - - if (stats->buffer) { - int cpu; - - spin_lock_irq(&stats->lock); - for_each_possible_cpu(cpu) - __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu)); - spin_unlock_irq(&stats->lock); - } - - /* - * avoid divide by zero - */ - if (stats->freq_stats.n) { - f_mean = mean_and_variance_get_mean(stats->freq_stats); - f_stddev = mean_and_variance_get_stddev(stats->freq_stats); - d_mean = mean_and_variance_get_mean(stats->duration_stats); - d_stddev = mean_and_variance_get_stddev(stats->duration_stats); - } - - seq_buf_printf(out, "count: %llu\n", stats->duration_stats.n); - - seq_buf_printf(out, " since mount recent\n"); - - seq_buf_printf(out, "duration of events\n"); - - seq_buf_printf(out, " min: "); - seq_buf_time_units_aligned(out, stats->min_duration); - seq_buf_printf(out, "\n"); - - seq_buf_printf(out, " max: "); - seq_buf_time_units_aligned(out, stats->max_duration); - seq_buf_printf(out, "\n"); - - seq_buf_printf(out, " total: "); - seq_buf_time_units_aligned(out, stats->total_duration); - seq_buf_printf(out, "\n"); - - seq_buf_printf(out, " mean: "); - seq_buf_time_units_aligned(out, d_mean); - seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); - seq_buf_printf(out, "\n"); - - seq_buf_printf(out, " stddev: "); - seq_buf_time_units_aligned(out, d_stddev); - seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); - seq_buf_printf(out, "\n"); - - seq_buf_printf(out, "time between events\n"); - - seq_buf_printf(out, " min: "); - seq_buf_time_units_aligned(out, stats->min_freq); - seq_buf_printf(out, "\n"); - - seq_buf_printf(out, " max: "); - seq_buf_time_units_aligned(out, stats->max_freq); - seq_buf_printf(out, "\n"); - - seq_buf_printf(out, " mean: "); - seq_buf_time_units_aligned(out, f_mean); - seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); - seq_buf_printf(out, "\n"); - - seq_buf_printf(out, " stddev: "); - seq_buf_time_units_aligned(out, f_stddev); - seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); - seq_buf_printf(out, "\n"); - - if (stats->quantiles_enabled) { - int i = eytzinger0_first(NR_QUANTILES); - const struct time_unit *u = - pick_time_units(stats->quantiles.entries[i].m); - u64 last_q = 0; - - prt_printf(out, "quantiles (%s):\t", u->name); - eytzinger0_for_each(i, NR_QUANTILES) { - bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; - - u64 q = max(stats->quantiles.entries[i].m, last_q); - seq_buf_printf(out, "%llu ", div_u64(q, u->nsecs)); - if (is_last) - seq_buf_printf(out, "\n"); - last_q = q; - } - } -} -#else -void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {} -#endif - -void bch2_time_stats_exit(struct bch2_time_stats *stats) -{ - free_percpu(stats->buffer); -} - -void bch2_time_stats_init(struct bch2_time_stats *stats) -{ - memset(stats, 0, sizeof(*stats)); - stats->duration_stats_weighted.weight = 8; - stats->freq_stats_weighted.weight = 8; - stats->min_duration = U64_MAX; - stats->min_freq = U64_MAX; - spin_lock_init(&stats->lock); -} - /* ratelimit: */ /** diff --git 
a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 7ff2d4fe26f684..cf8d16a9116223 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -360,87 +361,7 @@ static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev) #endif } -#define NR_QUANTILES 15 -#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) -#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) -#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) - -struct bch2_quantiles { - struct bch2_quantile_entry { - u64 m; - u64 step; - } entries[NR_QUANTILES]; -}; - -struct bch2_time_stat_buffer { - unsigned nr; - struct bch2_time_stat_buffer_entry { - u64 start; - u64 end; - } entries[32]; -}; - -struct bch2_time_stats { - spinlock_t lock; - bool quantiles_enabled; - /* all fields are in nanoseconds */ - u64 min_duration; - u64 max_duration; - u64 total_duration; - u64 max_freq; - u64 min_freq; - u64 last_event; - struct bch2_quantiles quantiles; - - struct mean_and_variance duration_stats; - struct mean_and_variance_weighted duration_stats_weighted; - struct mean_and_variance freq_stats; - struct mean_and_variance_weighted freq_stats_weighted; - struct bch2_time_stat_buffer __percpu *buffer; -}; - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); - -static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) -{ - __bch2_time_stats_update(stats, start, local_clock()); -} - -static inline bool track_event_change(struct bch2_time_stats *stats, - u64 *start, bool v) -{ - if (v != !!*start) { - if (!v) { - bch2_time_stats_update(stats, *start); - *start = 0; - } else { - *start = local_clock() ?: 1; - return true; - } - } - - return false; -} -#else -static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {} -static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {} -static inline bool track_event_change(struct bch2_time_stats *stats, - u64 *start, bool v) -{ - bool ret = v && !*start; - *start = v; - return ret; -} -#endif - -void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); - -struct seq_buf; -void bch2_time_stats_to_seq_buf(struct seq_buf *, struct bch2_time_stats *); - -void bch2_time_stats_exit(struct bch2_time_stats *); -void bch2_time_stats_init(struct bch2_time_stats *); +void bch2_time_stats_to_text(struct printbuf *, struct time_stats *); #define ewma_add(ewma, val, weight) \ ({ \ diff --git a/include/linux/time_stats.h b/include/linux/time_stats.h new file mode 100644 index 00000000000000..caefa7aba65a06 --- /dev/null +++ b/include/linux/time_stats.h @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * time_stats - collect statistics on events that have a duration, with nicely + * formatted textual output on demand + * + * - percpu buffering of event collection: cheap enough to shotgun + * everywhere without worrying about overhead + * + * tracks: + * - number of events + * - maximum event duration ever seen + * - sum of all event durations + * - average event duration, standard and weighted + * - standard deviation of event durations, standard and weighted + * and analogous statistics for the frequency of events + * + * We provide both mean and weighted mean (exponentially weighted), and standard + * deviation and weighted standard deviation, to give an efficient-to-compute + * view of current behaviour versus
average behaviour - "did this event source + * just become wonky, or is this typical?". + * + * Particularly useful for tracking down latency issues. + */ +#ifndef _LINUX_TIME_STATS_H +#define _LINUX_TIME_STATS_H + +#include +#include +#include + +struct time_unit { + const char *name; + u64 nsecs; +}; + +/* + * given a nanosecond value, pick the preferred time units for printing: + */ +const struct time_unit *pick_time_units(u64 ns); + +/* + * quantiles - do not use: + * + * Only enabled if time_stats->quantiles_enabled has been manually set - don't + * use in new code. + */ + +#define NR_QUANTILES 15 +#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) +#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) +#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) + +struct quantiles { + struct quantile_entry { + u64 m; + u64 step; + } entries[NR_QUANTILES]; +}; + +struct time_stat_buffer { + unsigned nr; + struct time_stat_buffer_entry { + u64 start; + u64 end; + } entries[32]; +}; + +struct time_stats { + spinlock_t lock; + bool quantiles_enabled; + /* all fields are in nanoseconds */ + u64 min_duration; + u64 max_duration; + u64 total_duration; + u64 max_freq; + u64 min_freq; + u64 last_event; + u64 last_event_start; + struct quantiles quantiles; + + struct mean_and_variance duration_stats; + struct mean_and_variance_weighted duration_stats_weighted; + struct mean_and_variance freq_stats; + struct mean_and_variance_weighted freq_stats_weighted; + struct time_stat_buffer __percpu *buffer; +}; + +void __time_stats_clear_buffer(struct time_stats *, struct time_stat_buffer *); +void __time_stats_update(struct time_stats *stats, u64, u64); + +/** + * time_stats_update - collect a new event being tracked + * + * @stats - time_stats to update + * @start - start time of event, recorded with local_clock() + * + * The end time of the event will be the current time + */ +static inline void time_stats_update(struct time_stats *stats, u64 start) +{ + __time_stats_update(stats, start, local_clock()); +} + +/** + * track_event_change - track state change events + * + * @stats - time_stats to update + * @v - new state, true or false + * + * Use this when tracking time stats for state changes, i.e. resource X becoming + * blocked/unblocked.
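+ *
+ * Illustrative sketch (editor's addition, not part of the original patch),
+ * modeled on the allocator call sites converted elsewhere in this patch:
+ *
+ *	track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
+ *		first "blocked" report starts the clock and returns true
+ *	track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
+ *		the matching "unblocked" report records the elapsed time as
+ *		one event via time_stats_update()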
+ */ +static inline bool track_event_change(struct time_stats *stats, bool v) +{ + if (v != !!stats->last_event_start) { + if (!v) { + time_stats_update(stats, stats->last_event_start); + stats->last_event_start = 0; + } else { + stats->last_event_start = local_clock() ?: 1; + return true; + } + } + + return false; +} + +struct seq_buf; +void time_stats_to_seq_buf(struct seq_buf *, struct time_stats *); + +void time_stats_exit(struct time_stats *); +void time_stats_init(struct time_stats *); + +#endif /* _LINUX_TIME_STATS_H */ diff --git a/lib/Kconfig b/lib/Kconfig index 5ddda7c2ed9b33..3ba8b965f8c7ec 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -785,3 +785,7 @@ config POLYNOMIAL config FIRMWARE_TABLE bool + +config TIME_STATS + tristate + select MEAN_AND_VARIANCE diff --git a/lib/Makefile b/lib/Makefile index 6b09731d8e6195..57858997c87aa1 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -370,6 +370,8 @@ obj-$(CONFIG_SBITMAP) += sbitmap.o obj-$(CONFIG_PARMAN) += parman.o +obj-$(CONFIG_TIME_STATS) += time_stats.o + obj-y += group_cpus.o # GCC library routines diff --git a/lib/time_stats.c b/lib/time_stats.c new file mode 100644 index 00000000000000..081aeba88b5354 --- /dev/null +++ b/lib/time_stats.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct time_unit time_units[] = { + { "ns", 1 }, + { "us", NSEC_PER_USEC }, + { "ms", NSEC_PER_MSEC }, + { "s", NSEC_PER_SEC }, + { "m", (u64) NSEC_PER_SEC * 60}, + { "h", (u64) NSEC_PER_SEC * 3600}, + { "eon", U64_MAX }, +}; + +const struct time_unit *pick_time_units(u64 ns) +{ + const struct time_unit *u; + + for (u = time_units; + u + 1 < time_units + ARRAY_SIZE(time_units) && + ns >= u[1].nsecs << 1; + u++) + ; + + return u; +} +EXPORT_SYMBOL_GPL(pick_time_units); + +static void quantiles_update(struct quantiles *q, u64 v) +{ + unsigned i = 0; + + while (i < ARRAY_SIZE(q->entries)) { + struct quantile_entry *e = q->entries + i; + + if (unlikely(!e->step)) { + e->m = v; + e->step = max_t(unsigned, v / 2, 1024); + } else if (e->m > v) { + e->m = e->m >= e->step + ? e->m - e->step + : 0; + } else if (e->m < v) { + e->m = e->m + e->step > e->m + ? e->m + e->step + : U32_MAX; + } + + if ((e->m > v ? 
e->m - v : v - e->m) < e->step) + e->step = max_t(unsigned, e->step / 2, 1); + + if (v >= e->m) + break; + + i = eytzinger0_child(i, v > e->m); + } +} + +static inline void time_stats_update_one(struct time_stats *stats, + u64 start, u64 end) +{ + u64 duration, freq; + + if (time_after64(end, start)) { + duration = end - start; + mean_and_variance_update(&stats->duration_stats, duration); + mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); + stats->max_duration = max(stats->max_duration, duration); + stats->min_duration = min(stats->min_duration, duration); + stats->total_duration += duration; + + if (stats->quantiles_enabled) + quantiles_update(&stats->quantiles, duration); + } + + if (stats->last_event && time_after64(end, stats->last_event)) { + freq = end - stats->last_event; + mean_and_variance_update(&stats->freq_stats, freq); + mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq); + stats->max_freq = max(stats->max_freq, freq); + stats->min_freq = min(stats->min_freq, freq); + } + + stats->last_event = end; +} + +void __time_stats_clear_buffer(struct time_stats *stats, + struct time_stat_buffer *b) +{ + for (struct time_stat_buffer_entry *i = b->entries; + i < b->entries + ARRAY_SIZE(b->entries); + i++) + time_stats_update_one(stats, i->start, i->end); + b->nr = 0; +} +EXPORT_SYMBOL_GPL(__time_stats_clear_buffer); + +static noinline void time_stats_clear_buffer(struct time_stats *stats, + struct time_stat_buffer *b) +{ + unsigned long flags; + + spin_lock_irqsave(&stats->lock, flags); + __time_stats_clear_buffer(stats, b); + spin_unlock_irqrestore(&stats->lock, flags); +} + +void __time_stats_update(struct time_stats *stats, u64 start, u64 end) +{ + unsigned long flags; + + WARN_ONCE(!stats->duration_stats_weighted.weight || + !stats->freq_stats_weighted.weight, + "uninitialized time_stats"); + + if (!stats->buffer) { + spin_lock_irqsave(&stats->lock, flags); + time_stats_update_one(stats, start, end); + + if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 && + stats->duration_stats.n > 1024) + stats->buffer = + alloc_percpu_gfp(struct time_stat_buffer, + GFP_ATOMIC); + spin_unlock_irqrestore(&stats->lock, flags); + } else { + struct time_stat_buffer *b; + + preempt_disable(); + b = this_cpu_ptr(stats->buffer); + + BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); + b->entries[b->nr++] = (struct time_stat_buffer_entry) { + .start = start, + .end = end + }; + + if (unlikely(b->nr == ARRAY_SIZE(b->entries))) + time_stats_clear_buffer(stats, b); + preempt_enable(); + } +} +EXPORT_SYMBOL_GPL(__time_stats_update); + +#include + +static void seq_buf_time_units_aligned(struct seq_buf *out, u64 ns) +{ + const struct time_unit *u = pick_time_units(ns); + + seq_buf_printf(out, "%8llu %s", div64_u64(ns, u->nsecs), u->name); +} + +void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats) +{ + s64 f_mean = 0, d_mean = 0; + u64 f_stddev = 0, d_stddev = 0; + + if (stats->buffer) { + int cpu; + + spin_lock_irq(&stats->lock); + for_each_possible_cpu(cpu) + __time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu)); + spin_unlock_irq(&stats->lock); + } + + /* + * avoid divide by zero + */ + if (stats->freq_stats.n) { + f_mean = mean_and_variance_get_mean(stats->freq_stats); + f_stddev = mean_and_variance_get_stddev(stats->freq_stats); + d_mean = mean_and_variance_get_mean(stats->duration_stats); + d_stddev = mean_and_variance_get_stddev(stats->duration_stats); + } + + seq_buf_printf(out, "count: %llu\n", 
stats->duration_stats.n); + + seq_buf_printf(out, " since mount recent\n"); + + seq_buf_printf(out, "duration of events\n"); + + seq_buf_printf(out, " min: "); + seq_buf_time_units_aligned(out, stats->min_duration); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " max: "); + seq_buf_time_units_aligned(out, stats->max_duration); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " total: "); + seq_buf_time_units_aligned(out, stats->total_duration); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " mean: "); + seq_buf_time_units_aligned(out, d_mean); + seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " stddev: "); + seq_buf_time_units_aligned(out, d_stddev); + seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, "time between events\n"); + + seq_buf_printf(out, " min: "); + seq_buf_time_units_aligned(out, stats->min_freq); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " max: "); + seq_buf_time_units_aligned(out, stats->max_freq); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " mean: "); + seq_buf_time_units_aligned(out, f_mean); + seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); + seq_buf_printf(out, "\n"); + + seq_buf_printf(out, " stddev: "); + seq_buf_time_units_aligned(out, f_stddev); + seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); + seq_buf_printf(out, "\n"); + + if (stats->quantiles_enabled) { + int i = eytzinger0_first(NR_QUANTILES); + const struct time_unit *u = + pick_time_units(stats->quantiles.entries[i].m); + u64 last_q = 0; + + seq_buf_printf(out, "quantiles (%s):\t", u->name); + eytzinger0_for_each(i, NR_QUANTILES) { + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + + u64 q = max(stats->quantiles.entries[i].m, last_q); + seq_buf_printf(out, "%llu ", div_u64(q, u->nsecs)); + if (is_last) + seq_buf_printf(out, "\n"); + last_q = q; + } + } +} +EXPORT_SYMBOL_GPL(time_stats_to_seq_buf); + +void time_stats_exit(struct time_stats *stats) +{ + free_percpu(stats->buffer); +} +EXPORT_SYMBOL_GPL(time_stats_exit); + +void time_stats_init(struct time_stats *stats) +{ + memset(stats, 0, sizeof(*stats)); + stats->duration_stats_weighted.weight = 8; + stats->freq_stats_weighted.weight = 8; + stats->min_duration = U64_MAX; + stats->min_freq = U64_MAX; + spin_lock_init(&stats->lock); +} +EXPORT_SYMBOL_GPL(time_stats_init); + +MODULE_AUTHOR("Kent Overstreet"); +MODULE_LICENSE("GPL"); From 164a3b12816a4ada4044013bebebfeed0a20d681 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Jan 2024 16:26:30 -0500 Subject: [PATCH 0504/1406] bcache: Convert to lib/time_stats delete bcache's time stats code, convert to newer version from bcachefs. 
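The call-site conversion is mechanical. As an illustrative sketch (editor's addition assembled from names in the diff below, not text from the original commit message), timing a btree sort now looks like:

	uint64_t start_time = local_clock();

	btree_mergesort(b, new->set->data, &iter, false, true);

	/* records one event of duration local_clock() - start_time */
	time_stats_update(&state->time, start_time);

with matching time_stats_init()/time_stats_exit() calls added next to the existing setup and teardown paths.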
example output: root@moria-kvm:/sys/fs/bcache/bdaedb8c-4554-4dd2-87e4-276e51eb47cc# cat internal/btree_sort_times count: 6414 since mount recent duration of events min: 440 ns max: 1102 us total: 674 ms mean: 105 us 102 us stddev: 101 us 88 us time between events min: 881 ns max: 3 s mean: 7 ms 6 ms stddev: 52 ms 6 ms Cc: Coly Li Cc: linux-bcache@vger.kernel.org Acked-by: Coly Li Signed-off-by: Kent Overstreet --- drivers/md/bcache/Kconfig | 1 + drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/bset.c | 6 +++-- drivers/md/bcache/bset.h | 1 + drivers/md/bcache/btree.c | 6 ++--- drivers/md/bcache/super.c | 7 +++++ drivers/md/bcache/sysfs.c | 25 +++++++++--------- drivers/md/bcache/util.c | 30 ---------------------- drivers/md/bcache/util.h | 52 +++++--------------------------------- 9 files changed, 37 insertions(+), 92 deletions(-) diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index b2d10063d35fb4..7ea057983d3da9 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -5,6 +5,7 @@ config BCACHE select BLOCK_HOLDER_DEPRECATED if SYSFS select CRC64 select CLOSURES + select TIME_STATS help Allows a block device to be used as cache for other devices; uses a btree for indexing and the layout is optimized for SSDs. diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 6ae2329052c92c..76e7b494c3943f 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -186,6 +186,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 2bba4d6aaaa28c..31c08d4ab83bf2 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1177,6 +1177,7 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, void bch_bset_sort_state_free(struct bset_sort_state *state) { + time_stats_exit(&state->time); mempool_exit(&state->pool); } @@ -1184,6 +1185,7 @@ int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned int page_order) { spin_lock_init(&state->time.lock); + time_stats_init(&state->time); state->page_order = page_order; state->crit_factor = int_sqrt(1 << page_order); @@ -1286,7 +1288,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, bch_bset_build_written_tree(b); if (!start) - bch_time_stats_update(&state->time, start_time); + time_stats_update(&state->time, start_time); } void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, @@ -1329,7 +1331,7 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, btree_mergesort(b, new->set->data, &iter, false, true); - bch_time_stats_update(&state->time, start_time); + time_stats_update(&state->time, start_time); new->set->size = 0; // XXX: why? 
} diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index d795c84246b018..13e524ad7783df 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -3,6 +3,7 @@ #define _BCACHE_BSET_H #include +#include #include #include "bcache_ondisk.h" diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 196cdacce38f25..0ed337c5f0dc6d 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -270,7 +270,7 @@ static void bch_btree_node_read(struct btree *b) goto err; bch_btree_node_read_done(b); - bch_time_stats_update(&b->c->btree_read_time, start_time); + time_stats_update(&b->c->btree_read_time, start_time); return; err: @@ -1852,7 +1852,7 @@ static void bch_btree_gc(struct cache_set *c) bch_btree_gc_finish(c); wake_up_allocators(c); - bch_time_stats_update(&c->btree_gc_time, start_time); + time_stats_update(&c->btree_gc_time, start_time); stats.key_bytes *= sizeof(uint64_t); stats.data <<= 9; @@ -2343,7 +2343,7 @@ static int btree_split(struct btree *b, struct btree_op *op, btree_node_free(b); rw_unlock(true, n1); - bch_time_stats_update(&b->c->btree_split_time, start_time); + time_stats_update(&b->c->btree_split_time, start_time); return 0; err_free2: diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index dc3f50f6971417..625e4883299cb9 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1676,6 +1676,9 @@ static CLOSURE_CALLBACK(cache_set_free) debugfs_remove(c->debug); + time_stats_exit(&c->btree_read_time); + time_stats_exit(&c->btree_split_time); + time_stats_exit(&c->btree_gc_time); bch_open_buckets_free(c); bch_btree_cache_free(c); bch_journal_free(c); @@ -1913,6 +1916,10 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) INIT_LIST_HEAD(&c->btree_cache_freed); INIT_LIST_HEAD(&c->data_buckets); + time_stats_init(&c->btree_gc_time); + time_stats_init(&c->btree_split_time); + time_stats_init(&c->btree_read_time); + iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) * sizeof(struct btree_iter_set); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index a438efb660699b..01cc5c632f0844 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -14,6 +14,7 @@ #include "features.h" #include +#include #include #include @@ -79,10 +80,10 @@ read_attribute(active_journal_entries); read_attribute(backing_dev_name); read_attribute(backing_dev_uuid); -sysfs_time_stats_attribute(btree_gc, sec, ms); -sysfs_time_stats_attribute(btree_split, sec, us); -sysfs_time_stats_attribute(btree_sort, ms, us); -sysfs_time_stats_attribute(btree_read, ms, us); +read_attribute(btree_gc_times); +read_attribute(btree_split_times); +read_attribute(btree_sort_times); +read_attribute(btree_read_times); read_attribute(btree_nodes); read_attribute(btree_used_percent); @@ -743,10 +744,10 @@ SHOW(__bch_cache_set) sysfs_print(btree_cache_max_chain, bch_cache_max_chain(c)); sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use); - sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); - sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); - sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us); - sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); + sysfs_print_time_stats(&c->btree_gc_time, btree_gc_times); + sysfs_print_time_stats(&c->btree_split_time, btree_split_times); + sysfs_print_time_stats(&c->sort.time, btree_sort_times); + sysfs_print_time_stats(&c->btree_read_time, btree_read_times); 
sysfs_print(btree_used_percent, bch_btree_used(c)); sysfs_print(btree_nodes, c->gc_stats.nodes); @@ -989,10 +990,10 @@ KTYPE(bch_cache_set); static struct attribute *bch_cache_set_internal_attrs[] = { &sysfs_active_journal_entries, - sysfs_time_stats_attribute_list(btree_gc, sec, ms) - sysfs_time_stats_attribute_list(btree_split, sec, us) - sysfs_time_stats_attribute_list(btree_sort, ms, us) - sysfs_time_stats_attribute_list(btree_read, ms, us) + &sysfs_btree_gc_times, + &sysfs_btree_split_times, + &sysfs_btree_sort_times, + &sysfs_btree_read_times, &sysfs_btree_nodes, &sysfs_btree_used_percent, diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index ae380bc3992e3c..95282bf0f9a7aa 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -160,36 +160,6 @@ int bch_parse_uuid(const char *s, char *uuid) return i; } -void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) -{ - uint64_t now, duration, last; - - spin_lock(&stats->lock); - - now = local_clock(); - duration = time_after64(now, start_time) - ? now - start_time : 0; - last = time_after64(now, stats->last) - ? now - stats->last : 0; - - stats->max_duration = max(stats->max_duration, duration); - - if (stats->last) { - ewma_add(stats->average_duration, duration, 8, 8); - - if (stats->average_frequency) - ewma_add(stats->average_frequency, last, 8, 8); - else - stats->average_frequency = last << 8; - } else { - stats->average_duration = duration << 8; - } - - stats->last = now ?: 1; - - spin_unlock(&stats->lock); -} - /** * bch_next_delay() - update ratelimiting statistics and calculate next delay * @d: the struct bch_ratelimit to update diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index f61ab1bada6cf5..6fcb9db4f50dc1 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -344,20 +344,6 @@ ssize_t bch_hprint(char *buf, int64_t v); bool bch_is_zero(const char *p, size_t n); int bch_parse_uuid(const char *s, char *uuid); -struct time_stats { - spinlock_t lock; - /* - * all fields are in nanoseconds, averages are ewmas stored left shifted - * by 8 - */ - uint64_t max_duration; - uint64_t average_duration; - uint64_t average_frequency; - uint64_t last; -}; - -void bch_time_stats_update(struct time_stats *stats, uint64_t time); - static inline unsigned int local_clock_us(void) { return local_clock() >> 10; @@ -372,40 +358,16 @@ static inline unsigned int local_clock_us(void) sysfs_print(name ## _ ## stat ## _ ## units, \ div_u64((stats)->stat >> 8, NSEC_PER_ ## units)) -#define sysfs_print_time_stats(stats, name, \ - frequency_units, \ - duration_units) \ +#define sysfs_print_time_stats(stats, name) \ do { \ - __print_time_stat(stats, name, \ - average_frequency, frequency_units); \ - __print_time_stat(stats, name, \ - average_duration, duration_units); \ - sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \ - div_u64((stats)->max_duration, \ - NSEC_PER_ ## duration_units)); \ - \ - sysfs_print(name ## _last_ ## frequency_units, (stats)->last \ - ? 
div_s64(local_clock() - (stats)->last, \ - NSEC_PER_ ## frequency_units) \ - : -1LL); \ + if (attr == &sysfs_##name) { \ + struct seq_buf seq; \ + seq_buf_init(&seq, buf, PAGE_SIZE); \ + time_stats_to_seq_buf(&seq, stats); \ + return seq.len; \ + } \ } while (0) -#define sysfs_time_stats_attribute(name, \ - frequency_units, \ - duration_units) \ -read_attribute(name ## _average_frequency_ ## frequency_units); \ -read_attribute(name ## _average_duration_ ## duration_units); \ -read_attribute(name ## _max_duration_ ## duration_units); \ -read_attribute(name ## _last_ ## frequency_units) - -#define sysfs_time_stats_attribute_list(name, \ - frequency_units, \ - duration_units) \ -&sysfs_ ## name ## _average_frequency_ ## frequency_units, \ -&sysfs_ ## name ## _average_duration_ ## duration_units, \ -&sysfs_ ## name ## _max_duration_ ## duration_units, \ -&sysfs_ ## name ## _last_ ## frequency_units, - #define ewma_add(ewma, val, weight, factor) \ ({ \ (ewma) *= (weight) - 1; \ From 44fc34a7467dbc1fadf57ee0316bb8c8126b79bd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Feb 2024 11:50:53 -0800 Subject: [PATCH 0505/1406] time_stats: report lifetime of the stats object Capture the initialization time of the time_stats object so that we can report how long the counter has been observing data. Signed-off-by: Darrick J. Wong Signed-off-by: Kent Overstreet --- include/linux/time_stats.h | 2 ++ lib/time_stats.c | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/include/linux/time_stats.h b/include/linux/time_stats.h index caefa7aba65a06..eb1957cb77c0de 100644 --- a/include/linux/time_stats.h +++ b/include/linux/time_stats.h @@ -78,6 +78,8 @@ struct time_stats { u64 last_event_start; struct quantiles quantiles; + u64 start_time; + struct mean_and_variance duration_stats; struct mean_and_variance_weighted duration_stats_weighted; struct mean_and_variance freq_stats; diff --git a/lib/time_stats.c b/lib/time_stats.c index 081aeba88b5354..8df4b55fc63371 100644 --- a/lib/time_stats.c +++ b/lib/time_stats.c @@ -158,10 +158,16 @@ static void seq_buf_time_units_aligned(struct seq_buf *out, u64 ns) seq_buf_printf(out, "%8llu %s", div64_u64(ns, u->nsecs), u->name); } +static inline u64 time_stats_lifetime(const struct time_stats *stats) +{ + return local_clock() - stats->start_time; +} + void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats) { s64 f_mean = 0, d_mean = 0; u64 f_stddev = 0, d_stddev = 0; + u64 lifetime = time_stats_lifetime(stats); if (stats->buffer) { int cpu; @@ -183,6 +189,9 @@ void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats) } seq_buf_printf(out, "count: %llu\n", stats->duration_stats.n); + seq_buf_printf(out, "lifetime: "); + seq_buf_time_units_aligned(out, lifetime); + seq_buf_printf(out, "\n"); seq_buf_printf(out, " since mount recent\n"); @@ -263,6 +272,7 @@ void time_stats_init(struct time_stats *stats) stats->freq_stats_weighted.weight = 8; stats->min_duration = U64_MAX; stats->min_freq = U64_MAX; + stats->start_time = local_clock(); spin_lock_init(&stats->lock); } EXPORT_SYMBOL_GPL(time_stats_init); From 5338de973a5804fc691b258931fbd86218c00ff6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 1 Feb 2024 12:41:42 -0800 Subject: [PATCH 0506/1406] time_stats: split stats-with-quantiles into a separate structure Currently, struct time_stats has the optional ability to quantize the information that it collects. 
This is /probably/ useful for callers who want to see quantized information, but it more than doubles the size of the structure from 224 bytes to 464. For users who don't care about that (e.g. upcoming xfs patches) and want to avoid wasting 240 bytes per counter, split the two into separate pieces. Signed-off-by: Darrick J. Wong Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/io_write.c | 2 +- fs/bcachefs/super.c | 10 ++++------ fs/bcachefs/sysfs.c | 4 ++-- fs/bcachefs/util.c | 7 ++++--- include/linux/time_stats.h | 36 ++++++++++++++++++++++++++++++++++-- lib/time_stats.c | 17 ++++++++++------- 7 files changed, 56 insertions(+), 22 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 92547d6fd2d95b..04e4a65909a4f6 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -594,7 +594,7 @@ struct bch_dev { /* The rest of this all shows up in sysfs */ atomic64_t cur_latency[2]; - struct time_stats io_latency[2]; + struct time_stats_quantiles io_latency[2]; #define CONGESTED_MAX 1024 atomic_t congested; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 8123a84320e3f1..3fa2cb1d5b13aa 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -88,7 +88,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) bch2_congested_acct(ca, io_latency, now, rw); - __time_stats_update(&ca->io_latency[rw], submit_time, now); + __time_stats_update(&ca->io_latency[rw].stats, submit_time, now); } #endif diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 647ab8ebe3582d..30f8b6e9af38e9 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1168,8 +1168,8 @@ static void bch2_dev_free(struct bch_dev *ca) bch2_dev_buckets_free(ca); free_page((unsigned long) ca->sb_read_scratch); - time_stats_exit(&ca->io_latency[WRITE]); - time_stats_exit(&ca->io_latency[READ]); + time_stats_quantiles_exit(&ca->io_latency[WRITE]); + time_stats_quantiles_exit(&ca->io_latency[READ]); percpu_ref_exit(&ca->io_ref); percpu_ref_exit(&ca->ref); @@ -1260,10 +1260,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, INIT_WORK(&ca->io_error_work, bch2_io_error_work); - time_stats_init(&ca->io_latency[READ]); - time_stats_init(&ca->io_latency[WRITE]); - ca->io_latency[READ].quantiles_enabled = true; - ca->io_latency[WRITE].quantiles_enabled = true; + time_stats_quantiles_init(&ca->io_latency[READ]); + time_stats_quantiles_init(&ca->io_latency[WRITE]); ca->mi = bch2_mi_to_cpu(member); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index cee80c47feea2b..c86a93a8d8fc81 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -930,10 +930,10 @@ SHOW(bch2_dev) sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); if (attr == &sysfs_io_latency_stats_read) - bch2_time_stats_to_text(out, &ca->io_latency[READ]); + bch2_time_stats_to_text(out, &ca->io_latency[READ].stats); if (attr == &sysfs_io_latency_stats_write) - bch2_time_stats_to_text(out, &ca->io_latency[WRITE]); + bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats); sysfs_printf(congested, "%u%%", clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 88853513a15faf..ef620bfe76cd21 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -365,6 +365,7 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) { + struct quantiles *quantiles = 
time_stats_to_quantiles(stats); s64 f_mean = 0, d_mean = 0; u64 f_stddev = 0, d_stddev = 0; @@ -465,17 +466,17 @@ void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) printbuf_tabstops_reset(out); - if (stats->quantiles_enabled) { + if (quantiles) { int i = eytzinger0_first(NR_QUANTILES); const struct time_unit *u = - pick_time_units(stats->quantiles.entries[i].m); + pick_time_units(quantiles->entries[i].m); u64 last_q = 0; prt_printf(out, "quantiles (%s):\t", u->name); eytzinger0_for_each(i, NR_QUANTILES) { bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; - u64 q = max(stats->quantiles.entries[i].m, last_q); + u64 q = max(quantiles->entries[i].m, last_q); prt_printf(out, "%llu ", div_u64(q, u->nsecs)); if (is_last) prt_newline(out); diff --git a/include/linux/time_stats.h b/include/linux/time_stats.h index eb1957cb77c0de..c05490101d1970 100644 --- a/include/linux/time_stats.h +++ b/include/linux/time_stats.h @@ -27,6 +27,7 @@ #include #include #include +#include struct time_unit { const char *name; @@ -67,7 +68,6 @@ struct time_stat_buffer { struct time_stats { spinlock_t lock; - bool quantiles_enabled; /* all fields are in nanoseconds */ u64 min_duration; u64 max_duration; @@ -76,7 +76,12 @@ struct time_stats { u64 min_freq; u64 last_event; u64 last_event_start; - struct quantiles quantiles; + +/* + * Is this really a struct time_stats_quantiled? Hide this flag in the least + * significant bit of the start time to avoid blowing up the structure size. + */ +#define TIME_STATS_HAVE_QUANTILES (1ULL << 0) u64 start_time; @@ -87,6 +92,22 @@ struct time_stats { struct time_stat_buffer __percpu *buffer; }; +struct time_stats_quantiles { + struct time_stats stats; + struct quantiles quantiles; +}; + +static inline struct quantiles *time_stats_to_quantiles(struct time_stats *stats) +{ + struct time_stats_quantiles *statq; + + if (!(stats->start_time & TIME_STATS_HAVE_QUANTILES)) + return NULL; + + statq = container_of(stats, struct time_stats_quantiles, stats); + return &statq->quantiles; +} + void __time_stats_clear_buffer(struct time_stats *, struct time_stat_buffer *); void __time_stats_update(struct time_stats *stats, u64, u64); @@ -133,4 +154,15 @@ void time_stats_to_seq_buf(struct seq_buf *, struct time_stats *); void time_stats_exit(struct time_stats *); void time_stats_init(struct time_stats *); +static inline void time_stats_quantiles_exit(struct time_stats_quantiles *statq) +{ + time_stats_exit(&statq->stats); +} +static inline void time_stats_quantiles_init(struct time_stats_quantiles *statq) +{ + time_stats_init(&statq->stats); + statq->stats.start_time |= TIME_STATS_HAVE_QUANTILES; + memset(&statq->quantiles, 0, sizeof(statq->quantiles)); +} + #endif /* _LINUX_TIME_STATS_H */ diff --git a/lib/time_stats.c b/lib/time_stats.c index 8df4b55fc63371..767b1a340e8050 100644 --- a/lib/time_stats.c +++ b/lib/time_stats.c @@ -69,6 +69,8 @@ static inline void time_stats_update_one(struct time_stats *stats, u64 duration, freq; if (time_after64(end, start)) { + struct quantiles *quantiles = time_stats_to_quantiles(stats); + duration = end - start; mean_and_variance_update(&stats->duration_stats, duration); mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); @@ -76,8 +78,8 @@ static inline void time_stats_update_one(struct time_stats *stats, stats->min_duration = min(stats->min_duration, duration); stats->total_duration += duration; - if (stats->quantiles_enabled) - quantiles_update(&stats->quantiles, duration); + if (quantiles) + 
quantiles_update(quantiles, duration); } if (stats->last_event && time_after64(end, stats->last_event)) { @@ -160,11 +162,12 @@ static void seq_buf_time_units_aligned(struct seq_buf *out, u64 ns) static inline u64 time_stats_lifetime(const struct time_stats *stats) { - return local_clock() - stats->start_time; + return local_clock() - (stats->start_time & ~TIME_STATS_HAVE_QUANTILES); } void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats) { + struct quantiles *quantiles = time_stats_to_quantiles(stats); s64 f_mean = 0, d_mean = 0; u64 f_stddev = 0, d_stddev = 0; u64 lifetime = time_stats_lifetime(stats); @@ -239,17 +242,17 @@ void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats) seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); seq_buf_printf(out, "\n"); - if (stats->quantiles_enabled) { + if (quantiles) { int i = eytzinger0_first(NR_QUANTILES); const struct time_unit *u = - pick_time_units(stats->quantiles.entries[i].m); + pick_time_units(quantiles->entries[i].m); u64 last_q = 0; seq_buf_printf(out, "quantiles (%s):\t", u->name); eytzinger0_for_each(i, NR_QUANTILES) { bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; - u64 q = max(stats->quantiles.entries[i].m, last_q); + u64 q = max(quantiles->entries[i].m, last_q); seq_buf_printf(out, "%llu ", div_u64(q, u->nsecs)); if (is_last) seq_buf_printf(out, "\n"); @@ -272,7 +275,7 @@ void time_stats_init(struct time_stats *stats) stats->freq_stats_weighted.weight = 8; stats->min_duration = U64_MAX; stats->min_freq = U64_MAX; - stats->start_time = local_clock(); + stats->start_time = local_clock() & ~TIME_STATS_HAVE_QUANTILES; spin_lock_init(&stats->lock); } EXPORT_SYMBOL_GPL(time_stats_init); From 349f0db993b66d3da5f116b960808b26dcb5e481 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 2 Feb 2024 08:31:40 -0800 Subject: [PATCH 0507/1406] time_stats: fix struct layout bloat Make these more efficient by getting rid of the holes. This reduces the structure size from 224 bytes to 208 bytes. Signed-off-by: Darrick J. Wong Signed-off-by: Kent Overstreet --- include/linux/time_stats.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/time_stats.h b/include/linux/time_stats.h index c05490101d1970..1c1ba8efa7bfea 100644 --- a/include/linux/time_stats.h +++ b/include/linux/time_stats.h @@ -77,6 +77,12 @@ struct time_stats { u64 last_event; u64 last_event_start; + struct mean_and_variance duration_stats; + struct mean_and_variance freq_stats; + struct mean_and_variance_weighted duration_stats_weighted; + struct mean_and_variance_weighted freq_stats_weighted; + struct time_stat_buffer __percpu *buffer; + /* * Is this really a struct time_stats_quantiled? Hide this flag in the least * significant bit of the start time to avoid blowing up the structure size. @@ -84,12 +90,6 @@ struct time_stats { #define TIME_STATS_HAVE_QUANTILES (1ULL << 0) u64 start_time; - - struct mean_and_variance duration_stats; - struct mean_and_variance_weighted duration_stats_weighted; - struct mean_and_variance freq_stats; - struct mean_and_variance_weighted freq_stats_weighted; - struct time_stat_buffer __percpu *buffer; }; struct time_stats_quantiles { From e2235ee6f16dea5cd6b70f699ede1b279cbe22b0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Feb 2024 10:50:15 -0800 Subject: [PATCH 0508/1406] time_stats: add larger units Filesystems can stay mounted for a very long time, so add some larger units. 
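As a sanity check on the new constants: a day is 86,400 s, a week 604,800 s, and 365.25 days is 3600 * 24 * 365 + 3600 * 6 = 31,557,600 s, i.e. roughly 3.16e16 ns, comfortably inside a u64. Below is a minimal user-space sketch of how a pick_time_units()-style helper selects among the extended table; the selection loop is an assumption about the kernel helper's behavior, not quoted from it, and the table is abbreviated:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

struct time_unit { const char *name; uint64_t nsecs; };

static const struct time_unit time_units[] = {
	{ "s", NSEC_PER_SEC },
	{ "m", NSEC_PER_SEC * 60 },
	{ "h", NSEC_PER_SEC * 3600 },
	{ "d", NSEC_PER_SEC * 3600 * 24 },
	{ "w", NSEC_PER_SEC * 3600 * 24 * 7 },
	{ "y", NSEC_PER_SEC * (3600 * 24 * 365 + 3600 * 6) }, /* 365.25d */
};

/* advance while the next, larger unit still fits into ns */
static const struct time_unit *pick_time_units(uint64_t ns)
{
	const struct time_unit *u = time_units;

	while (u + 1 < time_units + sizeof(time_units) / sizeof(*time_units) &&
	       ns >= u[1].nsecs)
		u++;
	return u;
}

int main(void)
{
	uint64_t age = NSEC_PER_SEC * 3600 * 24 * 3; /* a three-day-old mount */
	const struct time_unit *u = pick_time_units(age);

	printf("%llu %s\n", (unsigned long long)(age / u->nsecs), u->name); /* "3 d" */
	return 0;
}

With the new entries this renders as "3 d"; before the patch the largest available unit would have produced "72 h".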
Signed-off-by: Darrick J. Wong Signed-off-by: Kent Overstreet --- lib/time_stats.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/time_stats.c b/lib/time_stats.c index 767b1a340e8050..43106bda43a929 100644 --- a/lib/time_stats.c +++ b/lib/time_stats.c @@ -16,6 +16,9 @@ static const struct time_unit time_units[] = { { "s", NSEC_PER_SEC }, { "m", (u64) NSEC_PER_SEC * 60}, { "h", (u64) NSEC_PER_SEC * 3600}, + { "d", (u64) NSEC_PER_SEC * 3600 * 24}, + { "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7}, + { "y", (u64) NSEC_PER_SEC * ((3600 * 24 * 365) + (3600 * (24 / 4)))}, /* 365.25d */ { "eon", U64_MAX }, }; From d6604b38de15a8c80839f141a624d54c7036a55b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sat, 3 Feb 2024 13:00:56 -0800 Subject: [PATCH 0509/1406] time_stats: don't print any output if event count is zero There's no point in printing an empty report for no data, so add a flag that lets callers skip the report entirely in that case. Signed-off-by: Darrick J. Wong Signed-off-by: Kent Overstreet --- drivers/md/bcache/util.h | 2 +- include/linux/time_stats.h | 4 +++- lib/time_stats.c | 10 ++++++---- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 6fcb9db4f50dc1..54822f910e1beb 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -363,7 +363,7 @@ do { \ if (attr == &sysfs_##name) { \ struct seq_buf seq; \ seq_buf_init(&seq, buf, PAGE_SIZE); \ - time_stats_to_seq_buf(&seq, stats); \ + time_stats_to_seq_buf(&seq, stats, 0); \ return seq.len; \ } \ } while (0) diff --git a/include/linux/time_stats.h b/include/linux/time_stats.h index 1c1ba8efa7bfea..994823c17bca9e 100644 --- a/include/linux/time_stats.h +++ b/include/linux/time_stats.h @@ -148,8 +148,10 @@ static inline bool track_event_change(struct time_stats *stats, bool v) return false; } +#define TIME_STATS_PRINT_NO_ZEROES (1U << 0) /* print nothing if zero count */ struct seq_buf; -void time_stats_to_seq_buf(struct seq_buf *, struct time_stats *); +void time_stats_to_seq_buf(struct seq_buf *, struct time_stats *, + unsigned int flags); void time_stats_exit(struct time_stats *); void time_stats_init(struct time_stats *); diff --git a/lib/time_stats.c b/lib/time_stats.c index 43106bda43a929..382935979f8f7b 100644 --- a/lib/time_stats.c +++ b/lib/time_stats.c @@ -168,7 +168,8 @@ static inline u64 time_stats_lifetime(const struct time_stats *stats) return local_clock() - (stats->start_time & ~TIME_STATS_HAVE_QUANTILES); } -void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats) +void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats, + unsigned int flags) { struct quantiles *quantiles = time_stats_to_quantiles(stats); s64 f_mean = 0, d_mean = 0; @@ -184,14 +185,15 @@ void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats) spin_unlock_irq(&stats->lock); } - /* - * avoid divide by zero - */ if (stats->freq_stats.n) { + /* avoid divide by zero */ f_mean = mean_and_variance_get_mean(stats->freq_stats); f_stddev = mean_and_variance_get_stddev(stats->freq_stats); d_mean = mean_and_variance_get_mean(stats->duration_stats); d_stddev = mean_and_variance_get_stddev(stats->duration_stats); + } else if (flags & TIME_STATS_PRINT_NO_ZEROES) { + /* unless we didn't want zeroes anyway */ + return; } seq_buf_printf(out, "count: %llu\n", stats->duration_stats.n); From 6986c97828362bf51e54eaaa7a874f9e5bab1b97 Mon Sep 17 00:00:00 2001 From: "Darrick J.
Wong" Date: Mon, 5 Feb 2024 13:47:09 -0800 Subject: [PATCH 0510/1406] time_stats: allow custom epoch names Let callers of time_stats_to_seq_buf define the epoch name; "mount" doesn't make sense generally. Signed-off-by: Darrick J. Wong Signed-off-by: Kent Overstreet --- drivers/md/bcache/util.h | 2 +- include/linux/time_stats.h | 2 +- lib/time_stats.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 54822f910e1beb..f3bc6476ba8ebd 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -363,7 +363,7 @@ do { \ if (attr == &sysfs_##name) { \ struct seq_buf seq; \ seq_buf_init(&seq, buf, PAGE_SIZE); \ - time_stats_to_seq_buf(&seq, stats, 0); \ + time_stats_to_seq_buf(&seq, stats, "startup", 0); \ return seq.len; \ } \ } while (0) diff --git a/include/linux/time_stats.h b/include/linux/time_stats.h index 994823c17bca9e..b2f71e3862c0f5 100644 --- a/include/linux/time_stats.h +++ b/include/linux/time_stats.h @@ -151,7 +151,7 @@ static inline bool track_event_change(struct time_stats *stats, bool v) #define TIME_STATS_PRINT_NO_ZEROES (1U << 0) /* print nothing if zero count */ struct seq_buf; void time_stats_to_seq_buf(struct seq_buf *, struct time_stats *, - unsigned int flags); + const char *epoch_name, unsigned int flags); void time_stats_exit(struct time_stats *); void time_stats_init(struct time_stats *); diff --git a/lib/time_stats.c b/lib/time_stats.c index 382935979f8f7b..f4a21409006bd3 100644 --- a/lib/time_stats.c +++ b/lib/time_stats.c @@ -169,7 +169,7 @@ static inline u64 time_stats_lifetime(const struct time_stats *stats) } void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats, - unsigned int flags) + const char *epoch_name, unsigned int flags) { struct quantiles *quantiles = time_stats_to_quantiles(stats); s64 f_mean = 0, d_mean = 0; @@ -201,7 +201,7 @@ void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats, seq_buf_time_units_aligned(out, lifetime); seq_buf_printf(out, "\n"); - seq_buf_printf(out, " since mount recent\n"); + seq_buf_printf(out, " since %-12s recent\n", epoch_name); seq_buf_printf(out, "duration of events\n"); From 0cf31c5665a422430230a5870a51b32b2c3ef24e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 8 Feb 2024 18:33:35 -0500 Subject: [PATCH 0511/1406] mean_and_variance: put struct mean_and_variance_weighted on a diet The only caller of this code (time_stats) always knows the weights and whether or not any information has been collected. Pass this information into the mean and variance code so that it doesn't have to store that information. This reduces the structure size from 24 to 16 bytes, which shrinks each time_stats counter to 192 bytes from 208. Signed-off-by: Darrick J. 
Wong Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 8 ++-- include/linux/mean_and_variance.h | 14 +++--- include/linux/time_stats.h | 4 ++ lib/math/mean_and_variance.c | 28 +++++++---- lib/math/mean_and_variance_test.c | 80 +++++++++++++++++-------------- lib/time_stats.c | 23 ++++----- 6 files changed, 87 insertions(+), 70 deletions(-) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index ef620bfe76cd21..4c3e19d562852e 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -429,14 +429,14 @@ void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) prt_tab(out); bch2_pr_time_units_aligned(out, d_mean); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); prt_newline(out); prt_printf(out, "stddev:"); prt_tab(out); bch2_pr_time_units_aligned(out, d_stddev); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); printbuf_indent_sub(out, 2); prt_newline(out); @@ -452,14 +452,14 @@ void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) prt_tab(out); bch2_pr_time_units_aligned(out, f_mean); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); prt_newline(out); prt_printf(out, "stddev:"); prt_tab(out); bch2_pr_time_units_aligned(out, f_stddev); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); printbuf_indent_sub(out, 2); prt_newline(out); diff --git a/include/linux/mean_and_variance.h b/include/linux/mean_and_variance.h index 64df11ab422bf4..4fcf062dd22c71 100644 --- a/include/linux/mean_and_variance.h +++ b/include/linux/mean_and_variance.h @@ -154,8 +154,6 @@ struct mean_and_variance { /* expontentially weighted variant */ struct mean_and_variance_weighted { - bool init; - u8 weight; /* base 2 logarithim */ s64 mean; u64 variance; }; @@ -192,10 +190,14 @@ s64 mean_and_variance_get_mean(struct mean_and_variance s); u64 mean_and_variance_get_variance(struct mean_and_variance s1); u32 mean_and_variance_get_stddev(struct mean_and_variance s); -void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v); +void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, + s64 v, bool initted, u8 weight); -s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s); -u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s); -u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s); +s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s, + u8 weight); +u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s, + u8 weight); +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s, + u8 weight); #endif // MEAN_AND_VAIRANCE_H_ diff --git a/include/linux/time_stats.h b/include/linux/time_stats.h index b2f71e3862c0f5..dc539123f7997d 100644 
--- a/include/linux/time_stats.h +++ b/include/linux/time_stats.h @@ -79,6 +79,10 @@ struct time_stats { struct mean_and_variance duration_stats; struct mean_and_variance freq_stats; + +/* default weight for weighted mean and variance calculations */ +#define TIME_STATS_MV_WEIGHT 8 + struct mean_and_variance_weighted duration_stats_weighted; struct mean_and_variance_weighted freq_stats_weighted; struct time_stat_buffer __percpu *buffer; diff --git a/lib/math/mean_and_variance.c b/lib/math/mean_and_variance.c index ba90293204bae1..21ec6afc678841 100644 --- a/lib/math/mean_and_variance.c +++ b/lib/math/mean_and_variance.c @@ -102,14 +102,17 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update() * @s: mean and variance number of samples and their sums * @x: new value to include in the &mean_and_variance_weighted + * @initted: caller must track whether this is the first use or not + * @weight: ewma weight * * see linked pdf: function derived from equations 140-143 where alpha = 2^w. * values are stored bitshifted for performance and added precision. */ -void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x) +void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, + s64 x, bool initted, u8 weight) { // previous weighted variance. - u8 w = s->weight; + u8 w = weight; u64 var_w0 = s->variance; // new value weighted. s64 x_w = x << w; @@ -118,45 +121,50 @@ void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 // new mean weighted. s64 u_w1 = s->mean + diff; - if (!s->init) { + if (!initted) { s->mean = x_w; s->variance = 0; } else { s->mean = u_w1; s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w; } - s->init = true; } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); /** * mean_and_variance_weighted_get_mean() - get mean from @s * @s: mean and variance number of samples and their sums + * @weight: ewma weight */ -s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s) +s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s, + u8 weight) { - return fast_divpow2(s.mean, s.weight); + return fast_divpow2(s.mean, weight); } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); /** * mean_and_variance_weighted_get_variance() -- get variance from @s * @s: mean and variance number of samples and their sums + * @weight: ewma weight */ -u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s) +u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s, + u8 weight) { // always positive don't need fast divpow2 - return s.variance >> s.weight; + return s.variance >> weight; } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); /** * mean_and_variance_weighted_get_stddev() - get standard deviation from @s * @s: mean and variance number of samples and their sums + * @weight: ewma weight */ -u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s) +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s, + u8 weight) { - return int_sqrt64(mean_and_variance_weighted_get_variance(s)); + return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight)); } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev); diff --git a/lib/math/mean_and_variance_test.c b/lib/math/mean_and_variance_test.c index f45591a169d879..0d8c2451a85885 100644 --- 
a/lib/math/mean_and_variance_test.c +++ b/lib/math/mean_and_variance_test.c @@ -30,53 +30,59 @@ static void mean_and_variance_basic_test(struct kunit *test) static void mean_and_variance_weighted_test(struct kunit *test) { - struct mean_and_variance_weighted s = { .weight = 2 }; + struct mean_and_variance_weighted s = { }; - mean_and_variance_weighted_update(&s, 10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); + mean_and_variance_weighted_update(&s, 10, false, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0); - mean_and_variance_weighted_update(&s, 20); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 12); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); + mean_and_variance_weighted_update(&s, 20, true, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 12); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18); - mean_and_variance_weighted_update(&s, 30); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 16); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); + mean_and_variance_weighted_update(&s, 30, true, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 16); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72); - s = (struct mean_and_variance_weighted) { .weight = 2 }; + s = (struct mean_and_variance_weighted) { }; - mean_and_variance_weighted_update(&s, -10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); + mean_and_variance_weighted_update(&s, -10, false, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0); - mean_and_variance_weighted_update(&s, -20); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -12); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); + mean_and_variance_weighted_update(&s, -20, true, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -12); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18); - mean_and_variance_weighted_update(&s, -30); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -16); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); + mean_and_variance_weighted_update(&s, -30, true, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -16); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72); } static void mean_and_variance_weighted_advanced_test(struct kunit *test) { - struct mean_and_variance_weighted s = { .weight = 8 }; + struct mean_and_variance_weighted s = { }; + bool initted = false; s64 i; - for (i = 10; i <= 100; i += 10) - mean_and_variance_weighted_update(&s, i); + for (i = 10; i <= 100; i += 10) { + mean_and_variance_weighted_update(&s, i, initted, 8); + initted = true; + } - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 11); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), 11); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107); - s = (struct mean_and_variance_weighted) { .weight = 8 }; + s = (struct 
mean_and_variance_weighted) { }; + initted = false; - for (i = -10; i >= -100; i -= 10) - mean_and_variance_weighted_update(&s, i); + for (i = -10; i >= -100; i -= 10) { + mean_and_variance_weighted_update(&s, i, initted, 8); + initted = true; + } - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -11); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), -11); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107); } static void do_mean_and_variance_test(struct kunit *test, @@ -91,26 +97,26 @@ static void do_mean_and_variance_test(struct kunit *test, s64 *weighted_stddev) { struct mean_and_variance mv = {}; - struct mean_and_variance_weighted vw = { .weight = weight }; + struct mean_and_variance_weighted vw = { }; for (unsigned i = 0; i < initial_n; i++) { mean_and_variance_update(&mv, initial_value); - mean_and_variance_weighted_update(&vw, initial_value); + mean_and_variance_weighted_update(&vw, initial_value, false, weight); KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value); KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), initial_value); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),0); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), initial_value); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),0); } for (unsigned i = 0; i < n; i++) { mean_and_variance_update(&mv, data[i]); - mean_and_variance_weighted_update(&vw, data[i]); + mean_and_variance_weighted_update(&vw, data[i], true, weight); KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]); KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), weighted_mean[i]); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),weighted_stddev[i]); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), weighted_mean[i]); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),weighted_stddev[i]); } KUNIT_EXPECT_EQ(test, mv.n, initial_n + n); diff --git a/lib/time_stats.c b/lib/time_stats.c index f4a21409006bd3..0fb3d854e503b9 100644 --- a/lib/time_stats.c +++ b/lib/time_stats.c @@ -70,13 +70,15 @@ static inline void time_stats_update_one(struct time_stats *stats, u64 start, u64 end) { u64 duration, freq; + bool initted = stats->last_event != 0; if (time_after64(end, start)) { struct quantiles *quantiles = time_stats_to_quantiles(stats); duration = end - start; mean_and_variance_update(&stats->duration_stats, duration); - mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); + mean_and_variance_weighted_update(&stats->duration_stats_weighted, + duration, initted, TIME_STATS_MV_WEIGHT); stats->max_duration = max(stats->max_duration, duration); stats->min_duration = min(stats->min_duration, duration); stats->total_duration += duration; @@ -88,7 +90,8 @@ static inline void time_stats_update_one(struct time_stats *stats, if (stats->last_event && time_after64(end, stats->last_event)) { freq = end - stats->last_event; mean_and_variance_update(&stats->freq_stats, freq); - mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq); + mean_and_variance_weighted_update(&stats->freq_stats_weighted, + freq, initted, TIME_STATS_MV_WEIGHT); stats->max_freq = max(stats->max_freq, freq); stats->min_freq = 
min(stats->min_freq, freq); } @@ -121,15 +124,11 @@ void __time_stats_update(struct time_stats *stats, u64 start, u64 end) { unsigned long flags; - WARN_ONCE(!stats->duration_stats_weighted.weight || - !stats->freq_stats_weighted.weight, - "uninitialized time_stats"); - if (!stats->buffer) { spin_lock_irqsave(&stats->lock, flags); time_stats_update_one(stats, start, end); - if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 && + if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 && stats->duration_stats.n > 1024) stats->buffer = alloc_percpu_gfp(struct time_stat_buffer, @@ -219,12 +218,12 @@ void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats, seq_buf_printf(out, " mean: "); seq_buf_time_units_aligned(out, d_mean); - seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); + seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); seq_buf_printf(out, "\n"); seq_buf_printf(out, " stddev: "); seq_buf_time_units_aligned(out, d_stddev); - seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); + seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); seq_buf_printf(out, "\n"); seq_buf_printf(out, "time between events\n"); @@ -239,12 +238,12 @@ void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats, seq_buf_printf(out, " mean: "); seq_buf_time_units_aligned(out, f_mean); - seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); + seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); seq_buf_printf(out, "\n"); seq_buf_printf(out, " stddev: "); seq_buf_time_units_aligned(out, f_stddev); - seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); + seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); seq_buf_printf(out, "\n"); if (quantiles) { @@ -276,8 +275,6 @@ EXPORT_SYMBOL_GPL(time_stats_exit); void time_stats_init(struct time_stats *stats) { memset(stats, 0, sizeof(*stats)); - stats->duration_stats_weighted.weight = 8; - stats->freq_stats_weighted.weight = 8; stats->min_duration = U64_MAX; stats->min_freq = U64_MAX; stats->start_time = local_clock() & ~TIME_STATS_HAVE_QUANTILES; From 8c2edac14ff923a1d9fe4cefa4290a0ab4587db4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Feb 2024 13:48:21 -0800 Subject: [PATCH 0512/1406] time_stats: shrink time_stat_buffer for better alignment Shrink this percpu object by one array element so that the object size becomes exactly 512 bytes. This will lead to more efficient memory use, hopefully. Signed-off-by: Darrick J. Wong Signed-off-by: Kent Overstreet --- include/linux/time_stats.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/time_stats.h b/include/linux/time_stats.h index dc539123f7997d..b3c810fff963aa 100644 --- a/include/linux/time_stats.h +++ b/include/linux/time_stats.h @@ -63,7 +63,7 @@ struct time_stat_buffer { struct time_stat_buffer_entry { u64 start; u64 end; - } entries[32]; + } entries[31]; }; struct time_stats { From 2d6329ba4a7e2dfb398b24009aac1ea4a2241302 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Mon, 5 Feb 2024 17:22:27 -0800 Subject: [PATCH 0513/1406] time_stats: report information in json format Export json versions of time statistics information. Given the tabular nature of the numbers exposed, this will make it a lot easier for higher (than C) level languages (e.g. python) to import information without needing to write yet another clumsy string parser. Signed-off-by: Darrick J. Wong Signed-off-by: Kent Overstreet --- include/linux/time_stats.h | 2 + lib/time_stats.c | 87 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/include/linux/time_stats.h b/include/linux/time_stats.h index b3c810fff963aa..4e1f5485ed0390 100644 --- a/include/linux/time_stats.h +++ b/include/linux/time_stats.h @@ -156,6 +156,8 @@ static inline bool track_event_change(struct time_stats *stats, bool v) struct seq_buf; void time_stats_to_seq_buf(struct seq_buf *, struct time_stats *, const char *epoch_name, unsigned int flags); +void time_stats_to_json(struct seq_buf *, struct time_stats *, + const char *epoch_name, unsigned int flags); void time_stats_exit(struct time_stats *); void time_stats_init(struct time_stats *); diff --git a/lib/time_stats.c b/lib/time_stats.c index 0fb3d854e503b9..c0f209dd9f6dd8 100644 --- a/lib/time_stats.c +++ b/lib/time_stats.c @@ -266,6 +266,93 @@ void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats, } EXPORT_SYMBOL_GPL(time_stats_to_seq_buf); +void time_stats_to_json(struct seq_buf *out, struct time_stats *stats, + const char *epoch_name, unsigned int flags) +{ + struct quantiles *quantiles = time_stats_to_quantiles(stats); + s64 f_mean = 0, d_mean = 0; + u64 f_stddev = 0, d_stddev = 0; + + if (stats->buffer) { + int cpu; + + spin_lock_irq(&stats->lock); + for_each_possible_cpu(cpu) + __time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu)); + spin_unlock_irq(&stats->lock); + } + + if (stats->freq_stats.n) { + /* avoid divide by zero */ + f_mean = mean_and_variance_get_mean(stats->freq_stats); + f_stddev = mean_and_variance_get_stddev(stats->freq_stats); + d_mean = mean_and_variance_get_mean(stats->duration_stats); + d_stddev = mean_and_variance_get_stddev(stats->duration_stats); + } else if (flags & TIME_STATS_PRINT_NO_ZEROES) { + /* unless we didn't want zeroes anyway */ + return; + } + + seq_buf_printf(out, "{\n"); + seq_buf_printf(out, " \"epoch\": \"%s\",\n", epoch_name); + seq_buf_printf(out, " \"count\": %llu,\n", stats->duration_stats.n); + + seq_buf_printf(out, " \"duration_ns\": {\n"); + seq_buf_printf(out, " \"min\": %llu,\n", stats->min_duration); + seq_buf_printf(out, " \"max\": %llu,\n", stats->max_duration); + seq_buf_printf(out, " \"total\": %llu,\n", stats->total_duration); + seq_buf_printf(out, " \"mean\": %llu,\n", d_mean); + seq_buf_printf(out, " \"stddev\": %llu\n", d_stddev); + seq_buf_printf(out, " },\n"); + + d_mean = mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT); + d_stddev = mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT); + + seq_buf_printf(out, " \"duration_ewma_ns\": {\n"); + seq_buf_printf(out, " \"mean\": %llu,\n", d_mean); + seq_buf_printf(out, " \"stddev\": %llu\n", d_stddev); + seq_buf_printf(out, " },\n"); + + seq_buf_printf(out, " \"frequency_ns\": {\n"); + seq_buf_printf(out, " \"min\": %llu,\n", stats->min_freq); + seq_buf_printf(out, " \"max\": %llu,\n", stats->max_freq); + seq_buf_printf(out, " \"mean\": %llu,\n", f_mean); + seq_buf_printf(out, " \"stddev\": %llu\n", 
f_stddev); + seq_buf_printf(out, " },\n"); + + f_mean = mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT); + f_stddev = mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT); + + seq_buf_printf(out, " \"frequency_ewma_ns\": {\n"); + seq_buf_printf(out, " \"mean\": %llu,\n", f_mean); + seq_buf_printf(out, " \"stddev\": %llu\n", f_stddev); + + if (quantiles) { + u64 last_q = 0; + + /* close frequency_ewma_ns but signal more items */ + seq_buf_printf(out, " },\n"); + + seq_buf_printf(out, " \"quantiles_ns\": [\n"); + eytzinger0_for_each(i, NR_QUANTILES) { + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + + u64 q = max(quantiles->entries[i].m, last_q); + seq_buf_printf(out, " %llu", q); + if (!is_last) + seq_buf_printf(out, ", "); + last_q = q; + } + seq_buf_printf(out, " ]\n"); + } else { + /* close frequency_ewma_ns without dumping further */ + seq_buf_printf(out, " }\n"); + } + + seq_buf_printf(out, "}\n"); +} +EXPORT_SYMBOL_GPL(time_stats_to_json); + void time_stats_exit(struct time_stats *stats) { free_percpu(stats->buffer); From e72d7904d4e2051b54fd1b1957d68f1d2c879baa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Feb 2024 21:56:47 -0500 Subject: [PATCH 0514/1406] time_stats: Kill TIME_STATS_HAVE_QUANTILES We have 4 spare bytes next to the spinlock, no need for bit stuffing Signed-off-by: Kent Overstreet --- include/linux/time_stats.h | 19 +++++-------------- lib/time_stats.c | 4 ++-- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/include/linux/time_stats.h b/include/linux/time_stats.h index 4e1f5485ed0390..6df2b34aa274bd 100644 --- a/include/linux/time_stats.h +++ b/include/linux/time_stats.h @@ -68,6 +68,7 @@ struct time_stat_buffer { struct time_stats { spinlock_t lock; + bool have_quantiles; /* all fields are in nanoseconds */ u64 min_duration; u64 max_duration; @@ -87,12 +88,6 @@ struct time_stats { struct mean_and_variance_weighted freq_stats_weighted; struct time_stat_buffer __percpu *buffer; -/* - * Is this really a struct time_stats_quantiled? Hide this flag in the least - * significant bit of the start time to avoid blowing up the structure size. - */ -#define TIME_STATS_HAVE_QUANTILES (1ULL << 0) - u64 start_time; }; @@ -103,13 +98,9 @@ struct time_stats_quantiles { static inline struct quantiles *time_stats_to_quantiles(struct time_stats *stats) { - struct time_stats_quantiles *statq; - - if (!(stats->start_time & TIME_STATS_HAVE_QUANTILES)) - return NULL; - - statq = container_of(stats, struct time_stats_quantiles, stats); - return &statq->quantiles; + return stats->have_quantiles + ? 
&container_of(stats, struct time_stats_quantiles, stats)->quantiles + : NULL; } void __time_stats_clear_buffer(struct time_stats *, struct time_stat_buffer *); @@ -169,7 +160,7 @@ static inline void time_stats_quantiles_exit(struct time_stats_quantiles *statq) static inline void time_stats_quantiles_init(struct time_stats_quantiles *statq) { time_stats_init(&statq->stats); - statq->stats.start_time |= TIME_STATS_HAVE_QUANTILES; + statq->stats.have_quantiles = true; memset(&statq->quantiles, 0, sizeof(statq->quantiles)); } diff --git a/lib/time_stats.c b/lib/time_stats.c index c0f209dd9f6dd8..0b90c80cba9f17 100644 --- a/lib/time_stats.c +++ b/lib/time_stats.c @@ -164,7 +164,7 @@ static void seq_buf_time_units_aligned(struct seq_buf *out, u64 ns) static inline u64 time_stats_lifetime(const struct time_stats *stats) { - return local_clock() - (stats->start_time & ~TIME_STATS_HAVE_QUANTILES); + return local_clock() - stats->start_time; } void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats, @@ -364,7 +364,7 @@ void time_stats_init(struct time_stats *stats) memset(stats, 0, sizeof(*stats)); stats->min_duration = U64_MAX; stats->min_freq = U64_MAX; - stats->start_time = local_clock() & ~TIME_STATS_HAVE_QUANTILES; + stats->start_time = local_clock(); spin_lock_init(&stats->lock); } EXPORT_SYMBOL_GPL(time_stats_init); From 859770f4d71d004d97252604f574d1a33b1ef55c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 1 Feb 2024 06:28:41 -0500 Subject: [PATCH 0515/1406] mempool: kvmalloc pool Add mempool_init_kvmalloc_pool() and mempool_create_kvmalloc_pool(), which wrap kvmalloc() instead of kmalloc() - kmalloc() with a vmalloc() fallback. This is part of a bcachefs cleanup - dropping an internal kvpmalloc() helper (which predates kvmalloc()) along with mempool helpers; this replaces the bcachefs-private kvpmalloc_pool. Signed-off-by: Kent Overstreet Cc: linux-mm@kvack.org --- include/linux/mempool.h | 13 +++++++++++++ mm/mempool.c | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 7be1e32e6d421d..16c5cc807ff6b4 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -95,6 +95,19 @@ static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size) (void *) size); } +void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data); +void mempool_kvfree(void *element, void *pool_data); + +static inline int mempool_init_kvmalloc_pool(mempool_t *pool, int min_nr, size_t size) +{ + return mempool_init(pool, min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); +} + +static inline mempool_t *mempool_create_kvmalloc_pool(int min_nr, size_t size) +{ + return mempool_create(min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); +} + /* * A mempool_alloc_t and mempool_free_t for a simple page allocator that * allocates pages of the order specified by pool_data diff --git a/mm/mempool.c b/mm/mempool.c index dbbf0e9fb42467..076c736f5f1ff8 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -590,6 +590,19 @@ void mempool_kfree(void *element, void *pool_data) } EXPORT_SYMBOL(mempool_kfree); +void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data) +{ + size_t size = (size_t)pool_data; + return kvmalloc(size, gfp_mask); +} +EXPORT_SYMBOL(mempool_kvmalloc); + +void mempool_kvfree(void *element, void *pool_data) +{ + kvfree(element); +} +EXPORT_SYMBOL(mempool_kvfree); + /* * A simple mempool-backed page allocator that allocates pages * of the order specified by pool_data. 
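For illustration, a hypothetical user of the new helpers might look like the following sketch; 'my_pool', MY_BUF_SIZE, and the element count are invented for the example, and error handling is trimmed to the essentials:

/* Hypothetical caller of the kvmalloc mempool helpers; names and sizes
 * are illustrative, not taken from any in-tree user. */
#include <linux/mempool.h>
#include <linux/gfp.h>

#define MY_BUF_SIZE (64 * 1024)	/* big enough that kvmalloc() may fall back to vmalloc() */

static mempool_t my_pool;

static int my_pool_setup(void)
{
	/* reserve one element up front so allocation can make forward progress under pressure */
	return mempool_init_kvmalloc_pool(&my_pool, 1, MY_BUF_SIZE);
}

static void my_pool_use(void)
{
	void *buf = mempool_alloc(&my_pool, GFP_KERNEL);

	/* ... fill and consume buf ... */

	mempool_free(buf, &my_pool);
}

static void my_pool_teardown(void)
{
	mempool_exit(&my_pool);
}

The point of the kvmalloc() variant is that element sizes well past the kmalloc() comfort zone keep working: the slab allocation is attempted first, vmalloc() quietly takes over for large or fragmented cases, and the mempool reserve still guarantees an element when reclaim stalls.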
From 7df4c6331b3fcdd04987906cbbee8ef1d8602663 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 1 Feb 2024 06:35:46 -0500 Subject: [PATCH 0516/1406] bcachefs: kill kvpmalloc() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 8 +++---- fs/bcachefs/btree_gc.c | 6 ++---- fs/bcachefs/btree_io.c | 4 ++-- fs/bcachefs/btree_journal_iter.c | 4 +--- fs/bcachefs/buckets.c | 29 ++++++++++--------------- fs/bcachefs/compress.c | 14 ++++++------- fs/bcachefs/debug.c | 6 +++--- fs/bcachefs/ec.c | 4 ++-- fs/bcachefs/fifo.h | 4 ++-- fs/bcachefs/journal.c | 4 ++-- fs/bcachefs/journal_io.c | 15 +++++++------ fs/bcachefs/super.c | 8 +++---- fs/bcachefs/util.c | 22 ------------------- fs/bcachefs/util.h | 36 ++------------------------------ 14 files changed, 49 insertions(+), 115 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 9dcc4f9334bf22..9b7ea1227069e6 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -60,7 +60,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) clear_btree_node_just_written(b); - kvpfree(b->data, btree_buf_bytes(b)); + kvfree(b->data); b->data = NULL; #ifdef __KERNEL__ kvfree(b->aux_data); @@ -94,7 +94,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { BUG_ON(b->data || b->aux_data); - b->data = kvpmalloc(btree_buf_bytes(b), gfp); + b->data = kvmalloc(btree_buf_bytes(b), gfp); if (!b->data) return -BCH_ERR_ENOMEM_btree_node_mem_alloc; #ifdef __KERNEL__ @@ -107,7 +107,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->aux_data = NULL; #endif if (!b->aux_data) { - kvpfree(b->data, btree_buf_bytes(b)); + kvfree(b->data); b->data = NULL; return -BCH_ERR_ENOMEM_btree_node_mem_alloc; } @@ -408,7 +408,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) if (c->verify_data) list_move(&c->verify_data->list, &bc->live); - kvpfree(c->verify_ondisk, c->opts.btree_node_size); + kvfree(c->verify_ondisk); for (i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 642f4c929b3c4e..eb92526bb9b64c 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1193,9 +1193,7 @@ static void bch2_gc_free(struct bch_fs *c) genradix_free(&c->gc_stripes); for_each_member_device(c, ca) { - kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), - sizeof(struct bucket_array) + - ca->mi.nbuckets * sizeof(struct bucket)); + kvfree(rcu_dereference_protected(ca->buckets_gc, 1)); ca->buckets_gc = NULL; free_percpu(ca->usage_gc); @@ -1494,7 +1492,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) { for_each_member_device(c, ca) { - struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + + struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket), GFP_KERNEL|__GFP_ZERO); if (!buckets) { diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index a56dcabb7ace7a..61b6093805eaf2 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -103,7 +103,7 @@ static void btree_bounce_free(struct bch_fs *c, size_t size, if (used_mempool) mempool_free(p, &c->btree_bounce_pool); else - vpfree(p, size); + kvfree(p); } static void *btree_bounce_alloc(struct bch_fs *c, size_t size, @@ -115,7 +115,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size, BUG_ON(size > 
c->opts.btree_node_size); *used_mempool = false; - p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); + p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT); if (!p) { *used_mempool = true; p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index b7ac93c8fdd8ab..3da65562fdb042 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -447,9 +447,7 @@ void bch2_journal_entries_free(struct bch_fs *c) struct genradix_iter iter; genradix_for_each(&c->journal_entries, iter, i) - if (*i) - kvpfree(*i, offsetof(struct journal_replay, j) + - vstruct_bytes(&(*i)->j)); + kvfree(*i); genradix_free(&c->journal_entries); } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 54f7826ac49874..7dca10ba70d253 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1335,7 +1335,7 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu) struct bucket_gens *buckets = container_of(rcu, struct bucket_gens, rcu); - kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets); + kvfree(buckets); } int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) @@ -1345,16 +1345,16 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bool resize = ca->bucket_gens != NULL; int ret; - if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, - GFP_KERNEL|__GFP_ZERO))) { + if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets, + GFP_KERNEL|__GFP_ZERO))) { ret = -BCH_ERR_ENOMEM_bucket_gens; goto err; } if ((c->opts.buckets_nouse && - !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * - sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO)))) { + !(buckets_nouse = kvmalloc(BITS_TO_LONGS(nbuckets) * + sizeof(unsigned long), + GFP_KERNEL|__GFP_ZERO)))) { ret = -BCH_ERR_ENOMEM_buckets_nouse; goto err; } @@ -1397,8 +1397,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ret = 0; err: - kvpfree(buckets_nouse, - BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); + kvfree(buckets_nouse); if (bucket_gens) call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); @@ -1407,27 +1406,21 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) void bch2_dev_buckets_free(struct bch_dev *ca) { - unsigned i; - - kvpfree(ca->buckets_nouse, - BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); - kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), - sizeof(struct bucket_gens) + ca->mi.nbuckets); + kvfree(ca->buckets_nouse); + kvfree(rcu_dereference_protected(ca->bucket_gens, 1)); - for (i = 0; i < ARRAY_SIZE(ca->usage); i++) + for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) free_percpu(ca->usage[i]); kfree(ca->usage_base); } int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { - unsigned i; - ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); if (!ca->usage_base) return -BCH_ERR_ENOMEM_usage_init; - for (i = 0; i < ARRAY_SIZE(ca->usage); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) { ca->usage[i] = alloc_percpu(struct bch_dev_usage); if (!ca->usage[i]) return -BCH_ERR_ENOMEM_usage_init; diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 33df8cf86bd8f8..1410365a889156 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -601,13 +601,13 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) return 0; if (!mempool_initialized(&c->compression_bounce[READ]) && - 
mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], - 1, c->opts.encoded_extent_max)) + mempool_init_kvmalloc_pool(&c->compression_bounce[READ], + 1, c->opts.encoded_extent_max)) return -BCH_ERR_ENOMEM_compression_bounce_read_init; if (!mempool_initialized(&c->compression_bounce[WRITE]) && - mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], - 1, c->opts.encoded_extent_max)) + mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE], + 1, c->opts.encoded_extent_max)) return -BCH_ERR_ENOMEM_compression_bounce_write_init; for (i = compression_types; @@ -622,15 +622,15 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) if (mempool_initialized(&c->compress_workspace[i->type])) continue; - if (mempool_init_kvpmalloc_pool( + if (mempool_init_kvmalloc_pool( &c->compress_workspace[i->type], 1, i->compress_workspace)) return -BCH_ERR_ENOMEM_compression_workspace_init; } if (!mempool_initialized(&c->decompress_workspace) && - mempool_init_kvpmalloc_pool(&c->decompress_workspace, - 1, decompress_workspace_size)) + mempool_init_kvmalloc_pool(&c->decompress_workspace, + 1, decompress_workspace_size)) return -BCH_ERR_ENOMEM_decompression_workspace_init; return 0; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 7bdba8507fc93c..b1f147e6be4d5c 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) mutex_lock(&c->verify_lock); if (!c->verify_ondisk) { - c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); + c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!c->verify_ondisk) goto out; } @@ -199,7 +199,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, return; } - n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); + n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!n_ondisk) { prt_printf(out, "memory allocation failure\n"); goto out; @@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, out: if (bio) bio_put(bio); - kvpfree(n_ondisk, btree_buf_bytes(b)); + kvfree(n_ondisk); percpu_ref_put(&ca->io_ref); } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d503af2700247d..b98e2c2b8bf06f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -504,7 +504,7 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) unsigned i; for (i = 0; i < s->v.nr_blocks; i++) { - kvpfree(buf->data[i], buf->size << 9); + kvfree(buf->data[i]); buf->data[i] = NULL; } } @@ -531,7 +531,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf, memset(buf->valid, 0xFF, sizeof(buf->valid)); for (i = 0; i < v->nr_blocks; i++) { - buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL); + buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL); if (!buf->data[i]) goto err; } diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h index 66b945be10c230..d8153fe27037ef 100644 --- a/fs/bcachefs/fifo.h +++ b/fs/bcachefs/fifo.h @@ -24,12 +24,12 @@ struct { \ (fifo)->mask = (fifo)->size \ ? 
roundup_pow_of_two((fifo)->size) - 1 \ : 0; \ - (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \ + (fifo)->data = kvmalloc(fifo_buf_size(fifo), (_gfp)); \ }) #define free_fifo(fifo) \ do { \ - kvpfree((fifo)->data, fifo_buf_size(fifo)); \ + kvfree((fifo)->data); \ (fifo)->data = NULL; \ } while (0) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index c040f69dfb5c70..214c8030048292 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1342,7 +1342,7 @@ void bch2_fs_journal_exit(struct journal *j) darray_exit(&j->early_journal_entries); for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) - kvpfree(j->buf[i].data, j->buf[i].buf_size); + kvfree(j->buf[i].data); free_fifo(&j->pin); } @@ -1371,7 +1371,7 @@ int bch2_fs_journal_init(struct journal *j) for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) { j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; - j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); + j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL); if (!j->buf[i].data) return -BCH_ERR_ENOMEM_journal_buf; j->buf[i].idx = i; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 7c2321c5af2ac3..16c1249c84e09e 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -84,8 +84,7 @@ static void __journal_replay_free(struct bch_fs *c, BUG_ON(*p != i); *p = NULL; - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); + kvfree(i); } static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) @@ -196,7 +195,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, goto out; } replace: - i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); + i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); if (!i) return -BCH_ERR_ENOMEM_journal_entry_add; @@ -965,11 +964,11 @@ static int journal_read_buf_realloc(struct journal_read_buf *b, return -BCH_ERR_ENOMEM_journal_read_buf_realloc; new_size = roundup_pow_of_two(new_size); - n = kvpmalloc(new_size, GFP_KERNEL); + n = kvmalloc(new_size, GFP_KERNEL); if (!n) return -BCH_ERR_ENOMEM_journal_read_buf_realloc; - kvpfree(b->data, b->size); + kvfree(b->data); b->data = n; b->size = new_size; return 0; @@ -1195,7 +1194,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; out: bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); - kvpfree(buf.data, buf.size); + kvfree(buf.data); percpu_ref_put(&ca->io_ref); closure_return(cl); return; @@ -1576,7 +1575,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) return; - new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); + new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN); if (!new_buf) return; @@ -1587,7 +1586,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) swap(buf->buf_size, new_size); spin_unlock(&j->lock); - kvpfree(new_buf, new_size); + kvfree(new_buf); } static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 30f8b6e9af38e9..f45b68a962d0df 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -576,7 +576,7 @@ static void __bch2_fs_free(struct bch_fs *c) destroy_workqueue(c->btree_update_wq); bch2_free_super(&c->disk_sb); - kvpfree(c, sizeof(*c)); + kvfree(c); module_put(THIS_MODULE); } @@ -715,7 +715,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts 
opts) unsigned i, iter_size; int ret = 0; - c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); + c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); if (!c) { c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc); goto out; @@ -882,8 +882,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || !(c->online_reserved = alloc_percpu(u64)) || - mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, - c->opts.btree_node_size) || + mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, + c->opts.btree_node_size) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, sizeof(u64), GFP_KERNEL))) { diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 4c3e19d562852e..539735033947b9 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -707,28 +707,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) } } -static void mempool_free_vp(void *element, void *pool_data) -{ - size_t size = (size_t) pool_data; - - vpfree(element, size); -} - -static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) -{ - size_t size = (size_t) pool_data; - - return vpmalloc(size, gfp_mask); -} - -int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) -{ - return size < PAGE_SIZE - ? mempool_init_kmalloc_pool(pool, min_nr, size) - : mempool_init(pool, min_nr, mempool_alloc_vp, - mempool_free_vp, (void *) size); -} - #if 0 void eytzinger1_test(void) { diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index cf8d16a9116223..c4cd32a2aeb28e 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -53,38 +53,6 @@ static inline size_t buf_pages(void *p, size_t len) PAGE_SIZE); } -static inline void vpfree(void *p, size_t size) -{ - if (is_vmalloc_addr(p)) - vfree(p); - else - free_pages((unsigned long) p, get_order(size)); -} - -static inline void *vpmalloc(size_t size, gfp_t gfp_mask) -{ - return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, - get_order(size)) ?: - __vmalloc(size, gfp_mask); -} - -static inline void kvpfree(void *p, size_t size) -{ - if (size < PAGE_SIZE) - kfree(p); - else - vpfree(p, size); -} - -static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) -{ - return size < PAGE_SIZE - ? kmalloc(size, gfp_mask) - : vpmalloc(size, gfp_mask); -} - -int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); - #define HEAP(type) \ struct { \ size_t size, used; \ @@ -97,13 +65,13 @@ struct { \ ({ \ (heap)->used = 0; \ (heap)->size = (_size); \ - (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\ + (heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\ (gfp)); \ }) #define free_heap(heap) \ do { \ - kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \ + kvfree((heap)->data); \ (heap)->data = NULL; \ } while (0) From d2793f77fbcdf7beb05443cb4312ebc300c52bb2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 Feb 2024 20:19:49 -0500 Subject: [PATCH 0517/1406] bcachefs: thread_with_stdio: eliminate double buffering The output buffer lock has to be a spinlock so that we can write to it from interrupt context, so we can't use a direct copy_to_user; this switches thread_with_file_read() to use fault_in_writeable() and copy_to_user_nofault(), similar to how thread_with_file_write() works. 
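The resulting read-side pattern, sketched here with a hypothetical buffer type rather than the real bcachefs structures, is: fault the destination pages in while no lock is held, then copy under the spinlock with the _nofault variant, and let the outer loop retry if the pages were reclaimed in between:

/* Condensed sketch of the fault-in-then-copy-nofault pattern; 'struct obuf'
 * is a stand-in for the real output buffer, not the bcachefs code. */
#include <linux/minmax.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/uaccess.h>

struct obuf {
	spinlock_t	lock;	/* may be taken from interrupt context */
	size_t		pos;	/* bytes of buffered output */
	char		*buf;
};

static ssize_t obuf_read(struct obuf *o, char __user *ubuf, size_t len)
{
	size_t copied = 0, b;

	while (len && o->pos) {
		/* can't take a page fault under the spinlock, so fault in first */
		if (fault_in_writeable(ubuf, len) == len)
			return copied ?: -EFAULT;

		spin_lock_irq(&o->lock);
		b = min_t(size_t, len, o->pos);

		/* may still fail if the page was reclaimed; the loop retries */
		if (b && !copy_to_user_nofault(ubuf, o->buf, b)) {
			memmove(o->buf, o->buf + b, o->pos - b);
			o->pos -= b;
			ubuf += b;
			len -= b;
			copied += b;
		}
		spin_unlock_irq(&o->lock);
	}

	return copied;
}

This trades the double buffer for a narrow race window that is handled by retrying, mirroring how the write side already pairs fault_in_readable()-style prefaulting with copy_from_user_nofault().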
Signed-off-by: Kent Overstreet --- fs/bcachefs/thread_with_file.c | 58 +++++++++++----------------------- fs/bcachefs/thread_with_file.h | 1 - 2 files changed, 18 insertions(+), 41 deletions(-) diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index 9220d7de10db67..8c3afb4c3204fe 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -67,16 +67,15 @@ int bch2_run_thread_with_file(struct thread_with_file *thr, static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr) { - return thr->stdio.output_buf.pos || - thr->output2.nr || - thr->thr.done; + return thr->stdio.output_buf.pos || thr->thr.done; } -static ssize_t thread_with_stdio_read(struct file *file, char __user *buf, +static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf, size_t len, loff_t *ppos) { struct thread_with_stdio *thr = container_of(file->private_data, struct thread_with_stdio, thr); + struct printbuf *buf = &thr->stdio.output_buf; size_t copied = 0, b; int ret = 0; @@ -89,44 +88,25 @@ static ssize_t thread_with_stdio_read(struct file *file, char __user *buf, if (ret) return ret; - if (thr->thr.done) - return 0; - - while (len) { - ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos); - if (ret) - break; - - spin_lock_irq(&thr->stdio.output_lock); - b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos); - - memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b); - memmove(thr->stdio.output_buf.buf, - thr->stdio.output_buf.buf + b, - thr->stdio.output_buf.pos - b); - - thr->output2.nr += b; - thr->stdio.output_buf.pos -= b; - spin_unlock_irq(&thr->stdio.output_lock); - - b = min(len, thr->output2.nr); - if (!b) - break; - - b -= copy_to_user(buf, thr->output2.data, b); - if (!b) { + while (len && buf->pos) { + if (fault_in_writeable(ubuf, len) == len) { ret = -EFAULT; break; } - copied += b; - buf += b; - len -= b; - - memmove(thr->output2.data, - thr->output2.data + b, - thr->output2.nr - b); - thr->output2.nr -= b; + spin_lock_irq(&thr->stdio.output_lock); + b = min_t(size_t, len, buf->pos); + + if (b && !copy_to_user_nofault(ubuf, buf->buf, b)) { + memmove(buf->buf, + buf->buf + b, + buf->pos - b); + buf->pos -= b; + ubuf += b; + len -= b; + copied += b; + } + spin_unlock_irq(&thr->stdio.output_lock); } return copied ?: ret; @@ -140,7 +120,6 @@ static int thread_with_stdio_release(struct inode *inode, struct file *file) bch2_thread_with_file_exit(&thr->thr); printbuf_exit(&thr->stdio.input_buf); printbuf_exit(&thr->stdio.output_buf); - darray_exit(&thr->output2); thr->exit(thr); return 0; } @@ -245,7 +224,6 @@ int bch2_run_thread_with_stdio(struct thread_with_stdio *thr, spin_lock_init(&thr->stdio.output_lock); init_waitqueue_head(&thr->stdio.output_wait); - darray_init(&thr->output2); thr->exit = exit; return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn); diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h index 05879c5048c875..b5098b52db709b 100644 --- a/fs/bcachefs/thread_with_file.h +++ b/fs/bcachefs/thread_with_file.h @@ -20,7 +20,6 @@ int bch2_run_thread_with_file(struct thread_with_file *, struct thread_with_stdio { struct thread_with_file thr; struct stdio_redirect stdio; - DARRAY(char) output2; void (*exit)(struct thread_with_stdio *); }; From e10c0e70c691961600a0ea8cb557b0a1f465578d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 Feb 2024 22:20:40 -0500 Subject: [PATCH 0518/1406] bcachefs: thread_with_stdio: convert to 
darray - eliminate the dependency on printbufs, so that we can lift thread_with_file for use in xfs - add a nonblocking parameter to stdio_redirect_printf(), and either block if the buffer is full or drop it on the floor - don't buffer infinitely Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 9 +- fs/bcachefs/thread_with_file.c | 229 +++++++++++++++++---------- fs/bcachefs/thread_with_file.h | 7 +- fs/bcachefs/thread_with_file_types.h | 15 +- 4 files changed, 160 insertions(+), 100 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index f45b68a962d0df..6edb2ef0e03ef6 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -56,6 +56,7 @@ #include "super.h" #include "super-io.h" #include "sysfs.h" +#include "thread_with_file.h" #include "trace.h" #include @@ -95,16 +96,10 @@ void __bch2_print(struct bch_fs *c, const char *fmt, ...) if (likely(!stdio)) { vprintk(fmt, args); } else { - unsigned long flags; - if (fmt[0] == KERN_SOH[0]) fmt += 2; - spin_lock_irqsave(&stdio->output_lock, flags); - prt_vprintf(&stdio->output_buf, fmt, args); - spin_unlock_irqrestore(&stdio->output_lock, flags); - - wake_up(&stdio->output_wait); + bch2_stdio_redirect_vprintf(stdio, true, fmt, args); } va_end(args); } diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index 8c3afb4c3204fe..ca81d3fec3eef8 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -2,7 +2,6 @@ #ifndef NO_BCACHEFS_FS #include "bcachefs.h" -#include "printbuf.h" #include "thread_with_file.h" #include @@ -65,48 +64,74 @@ int bch2_run_thread_with_file(struct thread_with_file *thr, return ret; } -static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr) +/* stdio_redirect */ + +static bool stdio_redirect_has_input(struct stdio_redirect *stdio) +{ + return stdio->input.buf.nr || stdio->done; +} + +static bool stdio_redirect_has_output(struct stdio_redirect *stdio) { - return thr->stdio.output_buf.pos || thr->thr.done; + return stdio->output.buf.nr || stdio->done; } +#define WRITE_BUFFER 4096 + +static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio) +{ + return stdio->input.buf.nr < WRITE_BUFFER || stdio->done; +} + +static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio) +{ + return stdio->output.buf.nr < WRITE_BUFFER || stdio->done; +} + +static void stdio_buf_init(struct stdio_buf *buf) +{ + spin_lock_init(&buf->lock); + init_waitqueue_head(&buf->wait); + darray_init(&buf->buf); +} + +/* thread_with_stdio */ + static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf, size_t len, loff_t *ppos) { struct thread_with_stdio *thr = container_of(file->private_data, struct thread_with_stdio, thr); - struct printbuf *buf = &thr->stdio.output_buf; + struct stdio_buf *buf = &thr->stdio.output; size_t copied = 0, b; int ret = 0; - if ((file->f_flags & O_NONBLOCK) && - !thread_with_stdio_has_output(thr)) + if (!(file->f_flags & O_NONBLOCK)) { + ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio)); + if (ret) + return ret; + } else if (!stdio_redirect_has_output(&thr->stdio)) return -EAGAIN; - ret = wait_event_interruptible(thr->stdio.output_wait, - thread_with_stdio_has_output(thr)); - if (ret) - return ret; - - while (len && buf->pos) { + while (len && buf->buf.nr) { if (fault_in_writeable(ubuf, len) == len) { ret = -EFAULT; break; } - spin_lock_irq(&thr->stdio.output_lock); - b = min_t(size_t, len, buf->pos); + spin_lock_irq(&buf->lock); + b = min_t(size_t, len, 
buf->buf.nr); - if (b && !copy_to_user_nofault(ubuf, buf->buf, b)) { - memmove(buf->buf, - buf->buf + b, - buf->pos - b); - buf->pos -= b; + if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) { ubuf += b; len -= b; copied += b; + buf->buf.nr -= b; + memmove(buf->buf.data, + buf->buf.data + b, + buf->buf.nr); } - spin_unlock_irq(&thr->stdio.output_lock); + spin_unlock_irq(&buf->lock); } return copied ?: ret; @@ -118,25 +143,18 @@ static int thread_with_stdio_release(struct inode *inode, struct file *file) container_of(file->private_data, struct thread_with_stdio, thr); bch2_thread_with_file_exit(&thr->thr); - printbuf_exit(&thr->stdio.input_buf); - printbuf_exit(&thr->stdio.output_buf); + darray_exit(&thr->stdio.input.buf); + darray_exit(&thr->stdio.output.buf); thr->exit(thr); return 0; } -#define WRITE_BUFFER 4096 - -static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr) -{ - return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done; -} - static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf, size_t len, loff_t *ppos) { struct thread_with_stdio *thr = container_of(file->private_data, struct thread_with_stdio, thr); - struct printbuf *buf = &thr->stdio.input_buf; + struct stdio_buf *buf = &thr->stdio.input; size_t copied = 0; ssize_t ret = 0; @@ -152,29 +170,29 @@ static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubu break; } - spin_lock(&thr->stdio.input_lock); - if (buf->pos < WRITE_BUFFER) - bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos)); - b = min(len, printbuf_remaining_size(buf)); + spin_lock(&buf->lock); + if (buf->buf.nr < WRITE_BUFFER) + darray_make_room_gfp(&buf->buf, min(b, WRITE_BUFFER - buf->buf.nr), __GFP_NOWARN); + b = min(len, darray_room(buf->buf)); - if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) { - ubuf += b; - len -= b; - copied += b; - buf->pos += b; + if (b && !copy_from_user_nofault(&buf->buf.data[buf->buf.nr], ubuf, b)) { + buf->buf.nr += b; + ubuf += b; + len -= b; + copied += b; } - spin_unlock(&thr->stdio.input_lock); + spin_unlock(&buf->lock); if (b) { - wake_up(&thr->stdio.input_wait); + wake_up(&buf->wait); } else { if ((file->f_flags & O_NONBLOCK)) { ret = -EAGAIN; break; } - ret = wait_event_interruptible(thr->stdio.input_wait, - thread_with_stdio_has_input_space(thr)); + ret = wait_event_interruptible(buf->wait, + stdio_redirect_has_input_space(&thr->stdio)); if (ret) break; } @@ -188,14 +206,14 @@ static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_stru struct thread_with_stdio *thr = container_of(file->private_data, struct thread_with_stdio, thr); - poll_wait(file, &thr->stdio.output_wait, wait); - poll_wait(file, &thr->stdio.input_wait, wait); + poll_wait(file, &thr->stdio.output.wait, wait); + poll_wait(file, &thr->stdio.input.wait, wait); __poll_t mask = 0; - if (thread_with_stdio_has_output(thr)) + if (stdio_redirect_has_output(&thr->stdio)) mask |= EPOLLIN; - if (thread_with_stdio_has_input_space(thr)) + if (stdio_redirect_has_input_space(&thr->stdio)) mask |= EPOLLOUT; if (thr->thr.done) mask |= EPOLLHUP|EPOLLERR; @@ -203,75 +221,112 @@ static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_stru } static const struct file_operations thread_with_stdio_fops = { - .release = thread_with_stdio_release, + .llseek = no_llseek, .read = thread_with_stdio_read, .write = thread_with_stdio_write, .poll = thread_with_stdio_poll, - .llseek = no_llseek, + .release = thread_with_stdio_release, 
}; int bch2_run_thread_with_stdio(struct thread_with_stdio *thr, void (*exit)(struct thread_with_stdio *), int (*fn)(void *)) { - thr->stdio.input_buf = PRINTBUF; - thr->stdio.input_buf.atomic++; - spin_lock_init(&thr->stdio.input_lock); - init_waitqueue_head(&thr->stdio.input_wait); - - thr->stdio.output_buf = PRINTBUF; - thr->stdio.output_buf.atomic++; - spin_lock_init(&thr->stdio.output_lock); - init_waitqueue_head(&thr->stdio.output_wait); - + stdio_buf_init(&thr->stdio.input); + stdio_buf_init(&thr->stdio.output); thr->exit = exit; return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn); } -int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *buf, size_t len) +int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len) { - wait_event(stdio->input_wait, - stdio->input_buf.pos || stdio->done); + struct stdio_buf *buf = &stdio->input; + wait_event(buf->wait, stdio_redirect_has_input(stdio)); if (stdio->done) return -1; - spin_lock(&stdio->input_lock); - int ret = min(len, stdio->input_buf.pos); - stdio->input_buf.pos -= ret; - memcpy(buf, stdio->input_buf.buf, ret); - memmove(stdio->input_buf.buf, - stdio->input_buf.buf + ret, - stdio->input_buf.pos); - spin_unlock(&stdio->input_lock); + spin_lock(&buf->lock); + int ret = min(len, buf->buf.nr); + buf->buf.nr -= ret; + memcpy(ubuf, buf->buf.data, ret); + memmove(buf->buf.data, + buf->buf.data + ret, + buf->buf.nr); + spin_unlock(&buf->lock); - wake_up(&stdio->input_wait); + wake_up(&buf->wait); return ret; } -int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len) +int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len) { - wait_event(stdio->input_wait, - stdio->input_buf.pos || stdio->done); + struct stdio_buf *buf = &stdio->input; + wait_event(buf->wait, stdio_redirect_has_input(stdio)); if (stdio->done) return -1; - spin_lock(&stdio->input_lock); - int ret = min(len, stdio->input_buf.pos); - char *n = memchr(stdio->input_buf.buf, '\n', ret); - if (n) - ret = min(ret, n + 1 - stdio->input_buf.buf); - stdio->input_buf.pos -= ret; - memcpy(buf, stdio->input_buf.buf, ret); - memmove(stdio->input_buf.buf, - stdio->input_buf.buf + ret, - stdio->input_buf.pos); - spin_unlock(&stdio->input_lock); - - wake_up(&stdio->input_wait); + spin_lock(&buf->lock); + int ret = min(len, buf->buf.nr); + char *n = memchr(buf->buf.data, '\n', ret); + if (!n) + ret = min(ret, n + 1 - buf->buf.data); + buf->buf.nr -= ret; + memcpy(ubuf, buf->buf.data, ret); + memmove(buf->buf.data, + buf->buf.data + ret, + buf->buf.nr); + spin_unlock(&buf->lock); + + wake_up(&buf->wait); return ret; } +__printf(3, 0) +static void bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args) +{ + size_t len; + + do { + va_list args2; + va_copy(args2, args); + + len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2); + } while (len + 1 > darray_room(*out) && !darray_make_room_gfp(out, len + 1, gfp)); + + out->nr += min(len, darray_room(*out)); +} + +void bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking, + const char *fmt, va_list args) +{ + struct stdio_buf *buf = &stdio->output; + unsigned long flags; + + if (!nonblocking) + wait_event(buf->wait, stdio_redirect_has_output_space(stdio)); + else if (!stdio_redirect_has_output_space(stdio)) + return; + if (stdio->done) + return; + + spin_lock_irqsave(&buf->lock, flags); + bch2_darray_vprintf(&buf->buf, nonblocking ? 
__GFP_NOWARN : GFP_KERNEL, fmt, args); + spin_unlock_irqrestore(&buf->lock, flags); + + wake_up(&buf->wait); +} + +void bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking, + const char *fmt, ...) +{ + + va_list args; + va_start(args, fmt); + bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args); + va_end(args); +} + #endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h index b5098b52db709b..4243c7c5ad3f3a 100644 --- a/fs/bcachefs/thread_with_file.h +++ b/fs/bcachefs/thread_with_file.h @@ -27,8 +27,8 @@ static inline void thread_with_stdio_done(struct thread_with_stdio *thr) { thr->thr.done = true; thr->stdio.done = true; - wake_up(&thr->stdio.input_wait); - wake_up(&thr->stdio.output_wait); + wake_up(&thr->stdio.input.wait); + wake_up(&thr->stdio.output.wait); } int bch2_run_thread_with_stdio(struct thread_with_stdio *, @@ -37,4 +37,7 @@ int bch2_run_thread_with_stdio(struct thread_with_stdio *, int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t); +__printf(3, 0) void bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list); +__printf(3, 4) void bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...); + #endif /* _BCACHEFS_THREAD_WITH_FILE_H */ diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h index 90b5e645e98ce5..e0daf4eec341e0 100644 --- a/fs/bcachefs/thread_with_file_types.h +++ b/fs/bcachefs/thread_with_file_types.h @@ -2,14 +2,21 @@ #ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H #define _BCACHEFS_THREAD_WITH_FILE_TYPES_H +#include "darray.h" + +struct stdio_buf { + spinlock_t lock; + wait_queue_head_t wait; + darray_char buf; +}; + struct stdio_redirect { - spinlock_t output_lock; - wait_queue_head_t output_wait; - struct printbuf output_buf; + struct stdio_buf input; + struct stdio_buf output; spinlock_t input_lock; wait_queue_head_t input_wait; - struct printbuf input_buf; + darray_char input_buf; bool done; }; From b15b3c0b538db2ee6d8eb57db292253c84194f4b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 Feb 2024 22:49:34 -0500 Subject: [PATCH 0519/1406] bcachefs: thread_with_stdio: kill thread_with_stdio_done() Move the cleanup code to a wrapper function, where we can call it after the thread_with_stdio fn exits. 
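
For illustration, a caller now looks roughly like this (struct my_job and its
helpers are hypothetical, not actual tree code):

  struct my_job {
          struct thread_with_stdio        thr;
          /* ... */
  };

  static void my_job_exit(struct thread_with_stdio *stdio)
  {
          kfree(container_of(stdio, struct my_job, thr));
  }

  static void my_job_fn(struct thread_with_stdio *stdio)
  {
          /*
           * do the work; no explicit "done" signalling here - the kthread
           * wrapper marks the thread done after this returns
           */
  }

  /* in the ioctl handler; returns a file descriptor on success: */
  fd = bch2_run_thread_with_stdio(&job->thr, my_job_exit, my_job_fn);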
Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 14 ++++---------- fs/bcachefs/thread_with_file.c | 20 +++++++++++++++++--- fs/bcachefs/thread_with_file.h | 11 ++--------- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 226b39c176673a..11711f54057e14 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -155,17 +155,14 @@ static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) kfree(thr); } -static int bch2_fsck_offline_thread_fn(void *arg) +static void bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) { - struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); + struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts); thr->thr.thr.ret = PTR_ERR_OR_ZERO(c); if (!thr->thr.thr.ret) bch2_fs_stop(c); - - thread_with_stdio_done(&thr->thr); - return 0; } static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) @@ -763,9 +760,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, return ret; } -static int bch2_fsck_online_thread_fn(void *arg) +static void bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) { - struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); + struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); struct bch_fs *c = thr->c; c->stdio_filter = current; @@ -793,11 +790,8 @@ static int bch2_fsck_online_thread_fn(void *arg) c->stdio_filter = NULL; c->opts.fix_errors = old_fix_errors; - thread_with_stdio_done(&thr->thr); - up(&c->online_fsck_mutex); bch2_ro_ref_put(c); - return 0; } static long bch2_ioctl_fsck_online(struct bch_fs *c, diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index ca81d3fec3eef8..eb8ab4c47a94ba 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -228,15 +228,29 @@ static const struct file_operations thread_with_stdio_fops = { .release = thread_with_stdio_release, }; +static int thread_with_stdio_fn(void *arg) +{ + struct thread_with_stdio *thr = arg; + + thr->fn(thr); + + thr->thr.done = true; + thr->stdio.done = true; + wake_up(&thr->stdio.input.wait); + wake_up(&thr->stdio.output.wait); + return 0; +} + int bch2_run_thread_with_stdio(struct thread_with_stdio *thr, void (*exit)(struct thread_with_stdio *), - int (*fn)(void *)) + void (*fn)(struct thread_with_stdio *)) { stdio_buf_init(&thr->stdio.input); stdio_buf_init(&thr->stdio.output); - thr->exit = exit; + thr->exit = exit; + thr->fn = fn; - return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn); + return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn); } int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len) diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h index 4243c7c5ad3f3a..66212fcae226aa 100644 --- a/fs/bcachefs/thread_with_file.h +++ b/fs/bcachefs/thread_with_file.h @@ -21,19 +21,12 @@ struct thread_with_stdio { struct thread_with_file thr; struct stdio_redirect stdio; void (*exit)(struct thread_with_stdio *); + void (*fn)(struct thread_with_stdio *); }; -static inline void thread_with_stdio_done(struct thread_with_stdio *thr) -{ - thr->thr.done = true; - thr->stdio.done = true; - wake_up(&thr->stdio.input.wait); - wake_up(&thr->stdio.output.wait); -} - int bch2_run_thread_with_stdio(struct thread_with_stdio *, void (*exit)(struct 
thread_with_stdio *), - int (*fn)(void *)); + void (*fn)(struct thread_with_stdio *)); int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t); From 9adef8513518cda077335dd1fe0e0107dda9795d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 Feb 2024 22:56:16 -0500 Subject: [PATCH 0520/1406] bcachefs: thread_with_stdio: fix bch2_stdio_redirect_readline() This fixes a bug where we'd return data without waiting for a newline, if data was present but a newline was not. Signed-off-by: Kent Overstreet --- fs/bcachefs/thread_with_file.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index eb8ab4c47a94ba..830efb06ef0be7 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -277,25 +277,36 @@ int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t le int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len) { struct stdio_buf *buf = &stdio->input; - + size_t copied = 0; + ssize_t ret = 0; +again: wait_event(buf->wait, stdio_redirect_has_input(stdio)); - if (stdio->done) - return -1; + if (stdio->done) { + ret = -1; + goto out; + } spin_lock(&buf->lock); - int ret = min(len, buf->buf.nr); - char *n = memchr(buf->buf.data, '\n', ret); - if (!n) - ret = min(ret, n + 1 - buf->buf.data); - buf->buf.nr -= ret; - memcpy(ubuf, buf->buf.data, ret); + size_t b = min(len, buf->buf.nr); + char *n = memchr(buf->buf.data, '\n', b); + if (n) + b = min_t(size_t, b, n + 1 - buf->buf.data); + buf->buf.nr -= b; + memcpy(ubuf, buf->buf.data, b); memmove(buf->buf.data, - buf->buf.data + ret, + buf->buf.data + b, buf->buf.nr); + ubuf += b; + len -= b; + copied += b; spin_unlock(&buf->lock); wake_up(&buf->wait); - return ret; + + if (!n && len) + goto again; +out: + return copied ?: ret; } __printf(3, 0) From 892071117264ab6e005a93940bcff27226f3f158 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Feb 2024 15:43:16 -0500 Subject: [PATCH 0521/1406] bcachefs: Thread with file documentation Signed-off-by: Kent Overstreet --- fs/bcachefs/thread_with_file.c | 15 ++++++++------- fs/bcachefs/thread_with_file.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index 830efb06ef0be7..dde9679b68b425 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -76,16 +76,16 @@ static bool stdio_redirect_has_output(struct stdio_redirect *stdio) return stdio->output.buf.nr || stdio->done; } -#define WRITE_BUFFER 4096 +#define STDIO_REDIRECT_BUFSIZE 4096 static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio) { - return stdio->input.buf.nr < WRITE_BUFFER || stdio->done; + return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; } static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio) { - return stdio->output.buf.nr < WRITE_BUFFER || stdio->done; + return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; } static void stdio_buf_init(struct stdio_buf *buf) @@ -171,11 +171,12 @@ static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubu } spin_lock(&buf->lock); - if (buf->buf.nr < WRITE_BUFFER) - darray_make_room_gfp(&buf->buf, min(b, WRITE_BUFFER - buf->buf.nr), __GFP_NOWARN); + if (buf->buf.nr < STDIO_REDIRECT_BUFSIZE) 
+ darray_make_room_gfp(&buf->buf, + min(b, STDIO_REDIRECT_BUFSIZE - buf->buf.nr), GFP_NOWAIT); b = min(len, darray_room(buf->buf)); - if (b && !copy_from_user_nofault(&buf->buf.data[buf->buf.nr], ubuf, b)) { + if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) { buf->buf.nr += b; ubuf += b; len -= b; @@ -338,7 +339,7 @@ void bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking, return; spin_lock_irqsave(&buf->lock, flags); - bch2_darray_vprintf(&buf->buf, nonblocking ? __GFP_NOWARN : GFP_KERNEL, fmt, args); + bch2_darray_vprintf(&buf->buf, nonblocking ? GFP_NOWAIT : GFP_KERNEL, fmt, args); spin_unlock_irqrestore(&buf->lock, flags); wake_up(&buf->wait); diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h index 66212fcae226aa..f06f8ff19a790a 100644 --- a/fs/bcachefs/thread_with_file.h +++ b/fs/bcachefs/thread_with_file.h @@ -4,6 +4,38 @@ #include "thread_with_file_types.h" +/* + * Thread with file: Run a kthread and connect it to a file descriptor, so that + * it can be interacted with via fd read/write methods and closing the file + * descriptor stops the kthread. + * + * We have two different APIs: + * + * thread_with_file, the low level version. + * You get to define the full file_operations, including your release function, + * which means that you must call bch2_thread_with_file_exit() from your + * .release method + * + * thread_with_stdio, the higher level version + * This implements full piping of input and output, including .poll. + * + * Notes on behaviour: + * - kthread shutdown behaves like writing or reading from a pipe that has been + * closed + * - Input and output buffers are 4096 bytes, although buffers may in some + * situations slightly exceed that limit so as to avoid chopping off a + * message in the middle in nonblocking mode. + * - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations - + * should be fine but might change in future revisions. + * - Output buffer may grow past 4096 bytes to deal with messages that are + * bigger than 4096 bytes + * - Writing may be done blocking or nonblocking; in nonblocking mode, we only + * drop entire messages. + * + * To write, use stdio_redirect_printf() + * To read, use stdio_redirect_read() or stdio_redirect_readline() + */ + struct task_struct; struct thread_with_file { From 6af7c3eb8b735214257d0363a3e55757de63ea89 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Feb 2024 00:15:20 -0500 Subject: [PATCH 0522/1406] darray: lift from bcachefs dynamic arrays - inspired from CCAN darrays, basically c++ stl vectors. Used by thread_with_stdio, which is also being lifted from bcachefs for xfs. 
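
A minimal usage sketch (darray_push() and friends live in the parts of the
header the rename carries over unchanged):

  DARRAY(int) xs = {};
  int *x, ret = 0, i;

  for (i = 0; i < 100; i++) {
          ret = darray_push(&xs, i);      /* 0 on success, -ENOMEM on failure */
          if (ret)
                  goto out;
  }

  darray_for_each(xs, x)
          pr_info("%i\n", *x);
  out:
  darray_exit(&xs);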
Signed-off-by: Kent Overstreet --- MAINTAINERS | 7 +++ fs/bcachefs/Makefile | 1 - fs/bcachefs/btree_types.h | 2 +- fs/bcachefs/btree_update.c | 2 + fs/bcachefs/btree_write_buffer_types.h | 2 +- fs/bcachefs/fsck.c | 2 +- fs/bcachefs/journal_io.h | 2 +- fs/bcachefs/journal_sb.c | 2 +- fs/bcachefs/sb-downgrade.c | 3 +- fs/bcachefs/sb-errors_types.h | 2 +- fs/bcachefs/sb-members.h | 2 +- fs/bcachefs/subvolume.h | 1 - fs/bcachefs/subvolume_types.h | 2 +- fs/bcachefs/thread_with_file_types.h | 2 +- fs/bcachefs/util.h | 29 +----------- {fs/bcachefs => include/linux}/darray.h | 59 ++++++++++++++++--------- include/linux/darray_types.h | 22 +++++++++ lib/Makefile | 2 +- {fs/bcachefs => lib}/darray.c | 12 ++++- 19 files changed, 94 insertions(+), 62 deletions(-) rename {fs/bcachefs => include/linux}/darray.h (66%) create mode 100644 include/linux/darray_types.h rename {fs/bcachefs => lib}/darray.c (56%) diff --git a/MAINTAINERS b/MAINTAINERS index f2face46f3650d..cec0ca4e108eb2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5806,6 +5806,13 @@ F: net/ax25/ax25_out.c F: net/ax25/ax25_timer.c F: net/ax25/sysctl_net_ax25.c +DARRAY +M: Kent Overstreet +L: linux-bcachefs@vger.kernel.org +S: Maintained +F: include/linux/darray.h +F: include/linux/darray_types.h + DATA ACCESS MONITOR M: SeongJae Park L: damon@lists.linux.dev diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index b11ba74b8ad41a..bb17d146b0900b 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -27,7 +27,6 @@ bcachefs-y := \ checksum.o \ clock.o \ compress.o \ - darray.o \ debug.o \ dirent.o \ disk_groups.o \ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 4a5a64499eb766..0d5eecbd3e9cfb 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -2,12 +2,12 @@ #ifndef _BCACHEFS_BTREE_TYPES_H #define _BCACHEFS_BTREE_TYPES_H +#include #include #include #include "btree_key_cache_types.h" #include "buckets_types.h" -#include "darray.h" #include "errcode.h" #include "journal_types.h" #include "replicas_types.h" diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index c3ff365acce9af..e5193116b092f6 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -14,6 +14,8 @@ #include "snapshot.h" #include "trace.h" +#include + static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, const struct btree_insert_entry *r) { diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h index 9b9433de9c3686..5f248873087c30 100644 --- a/fs/bcachefs/btree_write_buffer_types.h +++ b/fs/bcachefs/btree_write_buffer_types.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H #define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H -#include "darray.h" +#include #include "journal_types.h" #define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4 diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 10d144c5a37a5c..a4b44c4f9bdc28 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -5,7 +5,6 @@ #include "btree_cache.h" #include "btree_update.h" #include "buckets.h" -#include "darray.h" #include "dirent.h" #include "error.h" #include "fs-common.h" @@ -18,6 +17,7 @@ #include "xattr.h" #include +#include #include /* struct qstr */ /* diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 1f395f43cf76f3..f18b90000cc5db 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_JOURNAL_IO_H #define _BCACHEFS_JOURNAL_IO_H -#include "darray.h" +#include struct journal_ptr { 
bool csum_good; diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index ae4fb8c3a2bc26..156691c203bef6 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -2,8 +2,8 @@ #include "bcachefs.h" #include "journal_sb.h" -#include "darray.h" +#include #include /* BCH_SB_FIELD_journal: */ diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 441dcb1bf160e9..626eaaea5b01d7 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -6,12 +6,13 @@ */ #include "bcachefs.h" -#include "darray.h" #include "recovery.h" #include "sb-downgrade.h" #include "sb-errors.h" #include "super-io.h" +#include + #define RECOVERY_PASS_ALL_FSCK BIT_ULL(63) /* diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index dbfd91ab86cfae..cadf12ce917393 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_SB_ERRORS_TYPES_H #define _BCACHEFS_SB_ERRORS_TYPES_H -#include "darray.h" +#include #define BCH_SB_ERRS() \ x(clean_but_journal_not_empty, 0) \ diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index be0a941832715a..e4d4d842229a6b 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_SB_MEMBERS_H #define _BCACHEFS_SB_MEMBERS_H -#include "darray.h" +#include extern char * const bch2_member_error_strs[]; diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index a6f56f66e27cb7..3ca1d183369c5f 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -2,7 +2,6 @@ #ifndef _BCACHEFS_SUBVOLUME_H #define _BCACHEFS_SUBVOLUME_H -#include "darray.h" #include "subvolume_types.h" enum bkey_invalid_flags; diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index ae644adfc39168..40f16e3a6dd04f 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_SUBVOLUME_TYPES_H #define _BCACHEFS_SUBVOLUME_TYPES_H -#include "darray.h" +#include typedef DARRAY(u32) snapshot_id_list; diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h index e0daf4eec341e0..41990756aa261d 100644 --- a/fs/bcachefs/thread_with_file_types.h +++ b/fs/bcachefs/thread_with_file_types.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H #define _BCACHEFS_THREAD_WITH_FILE_TYPES_H -#include "darray.h" +#include struct stdio_buf { spinlock_t lock; diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index c4cd32a2aeb28e..1b3aced8d83caf 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -5,23 +5,22 @@ #include #include #include +#include #include #include #include -#include #include #include #include #include #include +#include #include #include #include #include #include -#include "darray.h" - struct closure; #ifdef CONFIG_BCACHEFS_DEBUG @@ -630,30 +629,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes) memset(s + bytes, c, rem); } -/* just the memmove, doesn't update @_nr */ -#define __array_insert_item(_array, _nr, _pos) \ - memmove(&(_array)[(_pos) + 1], \ - &(_array)[(_pos)], \ - sizeof((_array)[0]) * ((_nr) - (_pos))) - -#define array_insert_item(_array, _nr, _pos, _new_item) \ -do { \ - __array_insert_item(_array, _nr, _pos); \ - (_nr)++; \ - (_array)[(_pos)] = (_new_item); \ -} while (0) - -#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ -do { \ - (_nr) -= (_nr_to_remove); \ - memmove(&(_array)[(_pos)], \ - &(_array)[(_pos) + (_nr_to_remove)], \ - sizeof((_array)[0]) 
* ((_nr) - (_pos))); \ -} while (0) - -#define array_remove_item(_array, _nr, _pos) \ - array_remove_items(_array, _nr, _pos, 1) - static inline void __move_gap(void *array, size_t element_size, size_t nr, size_t size, size_t old_gap, size_t new_gap) diff --git a/fs/bcachefs/darray.h b/include/linux/darray.h similarity index 66% rename from fs/bcachefs/darray.h rename to include/linux/darray.h index 4b340d13caace0..ff167eb795f22e 100644 --- a/fs/bcachefs/darray.h +++ b/include/linux/darray.h @@ -1,34 +1,26 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DARRAY_H -#define _BCACHEFS_DARRAY_H +/* + * (C) 2022-2024 Kent Overstreet + */ +#ifndef _LINUX_DARRAY_H +#define _LINUX_DARRAY_H /* - * Dynamic arrays: + * Dynamic arrays * * Inspired by CCAN's darray */ +#include #include -#define DARRAY_PREALLOCATED(_type, _nr) \ -struct { \ - size_t nr, size; \ - _type *data; \ - _type preallocated[_nr]; \ -} - -#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0) - -typedef DARRAY(char) darray_char; -typedef DARRAY(char *) darray_str; - -int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t); +int __darray_resize_slowpath(darray_char *, size_t, size_t, gfp_t); static inline int __darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) { return unlikely(new_size > d->size) - ? __bch2_darray_resize(d, element_size, new_size, gfp) + ? __darray_resize_slowpath(d, element_size, new_size, gfp) : 0; } @@ -69,6 +61,28 @@ static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, #define darray_first(_d) ((_d).data[0]) #define darray_last(_d) ((_d).data[(_d).nr - 1]) +/* Insert/remove items into the middle of a darray: */ + +#define array_insert_item(_array, _nr, _pos, _new_item) \ +do { \ + memmove(&(_array)[(_pos) + 1], \ + &(_array)[(_pos)], \ + sizeof((_array)[0]) * ((_nr) - (_pos))); \ + (_nr)++; \ + (_array)[(_pos)] = (_new_item); \ +} while (0) + +#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ +do { \ + (_nr) -= (_nr_to_remove); \ + memmove(&(_array)[(_pos)], \ + &(_array)[(_pos) + (_nr_to_remove)], \ + sizeof((_array)[0]) * ((_nr) - (_pos))); \ +} while (0) + +#define array_remove_item(_array, _nr, _pos) \ + array_remove_items(_array, _nr, _pos, 1) + #define darray_insert_item(_d, pos, _item) \ ({ \ size_t _pos = (pos); \ @@ -79,10 +93,15 @@ static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, _ret; \ }) +#define darray_remove_items(_d, _pos, _nr_to_remove) \ + array_remove_items((_d)->data, (_d)->nr, (_pos) - (_d)->data, _nr_to_remove) + #define darray_remove_item(_d, _pos) \ - array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data) + darray_remove_items(_d, _pos, 1) + +/* Iteration: */ -#define __darray_for_each(_d, _i) \ +#define __darray_for_each(_d, _i) \ for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++) #define darray_for_each(_d, _i) \ @@ -106,4 +125,4 @@ do { \ darray_init(_d); \ } while (0) -#endif /* _BCACHEFS_DARRAY_H */ +#endif /* _LINUX_DARRAY_H */ diff --git a/include/linux/darray_types.h b/include/linux/darray_types.h new file mode 100644 index 00000000000000..a400a0c3600d83 --- /dev/null +++ b/include/linux/darray_types.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * (C) 2022-2024 Kent Overstreet + */ +#ifndef _LINUX_DARRAY_TYpES_H +#define _LINUX_DARRAY_TYpES_H + +#include + +#define DARRAY_PREALLOCATED(_type, _nr) \ +struct { \ + size_t nr, size; \ + _type *data; \ + _type preallocated[_nr]; \ +} + +#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0) 
+ +typedef DARRAY(char) darray_char; +typedef DARRAY(char *) darray_str; + +#endif /* _LINUX_DARRAY_TYpES_H */ diff --git a/lib/Makefile b/lib/Makefile index 57858997c87aa1..830907bb8fc85e 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -48,7 +48,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \ bsearch.o find_bit.o llist.o lwq.o memweight.o kfifo.o \ percpu-refcount.o rhashtable.o base64.o \ once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \ - generic-radix-tree.o bitmap-str.o + generic-radix-tree.o bitmap-str.o darray.o obj-$(CONFIG_STRING_SELFTEST) += test_string.o obj-y += string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o diff --git a/fs/bcachefs/darray.c b/lib/darray.c similarity index 56% rename from fs/bcachefs/darray.c rename to lib/darray.c index ac35b8b705ae1c..7cb064f14b3911 100644 --- a/fs/bcachefs/darray.c +++ b/lib/darray.c @@ -1,10 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 +/* + * (C) 2022-2024 Kent Overstreet + */ +#include #include +#include #include -#include "darray.h" -int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) +int __darray_resize_slowpath(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) { if (new_size > d->size) { new_size = roundup_pow_of_two(new_size); @@ -22,3 +26,7 @@ int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, g return 0; } +EXPORT_SYMBOL_GPL(__darray_resize_slowpath); + +MODULE_AUTHOR("Kent Overstreet"); +MODULE_LICENSE("GPL"); From eb27c48f290cfe7b3ccdd0ce5efd966c46f2727e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Feb 2024 00:51:23 -0500 Subject: [PATCH 0523/1406] thread_with_file: Lift from bcachefs thread_with_file and thread_with_stdio are abstractions for connecting kthreads to file descriptors, which is handy for all sorts of things - the running kthread has its lifetime connected to the file descriptor, which means an asynchronous job running in the kernel can easily exit in response to a ctrl-c, and the file descriptor also provides a communications channel. Signed-off-by: Kent Overstreet --- MAINTAINERS | 9 + fs/bcachefs/Kconfig | 1 + fs/bcachefs/Makefile | 1 - fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/chardev.c | 10 +- fs/bcachefs/error.c | 4 +- fs/bcachefs/super.c | 4 +- .../linux}/thread_with_file.h | 35 +- .../linux}/thread_with_file_types.h | 8 +- lib/Kconfig | 3 + lib/Makefile | 1 + {fs/bcachefs => lib}/thread_with_file.c | 326 +++++++++--------- 12 files changed, 212 insertions(+), 192 deletions(-) rename {fs/bcachefs => include/linux}/thread_with_file.h (63%) rename {fs/bcachefs => include/linux}/thread_with_file_types.h (64%) rename {fs/bcachefs => lib}/thread_with_file.c (79%) diff --git a/MAINTAINERS b/MAINTAINERS index cec0ca4e108eb2..97d85d9480835a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21879,6 +21879,15 @@ F: Documentation/userspace-api/media/drivers/thp7312.rst F: drivers/media/i2c/thp7312.c F: include/uapi/linux/thp7312.h +THREAD WITH FILE +M: Kent Overstreet +M: Darrick J. 
Wong +L: linux-bcachefs@vger.kernel.org +S: Maintained +F: include/linux/thread_with_file.h +F: include/linux/thread_with_file_types.h +F: lib/thread_with_file.c THUNDERBOLT DMA TRAFFIC TEST DRIVER M: Isaac Hazan L: linux-usb@vger.kernel.org diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 8c587ddd2f85ef..08073d76e5a428 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -25,6 +25,7 @@ config BCACHEFS_FS select SRCU select SYMBOLIC_ERRNAME select TIME_STATS + select THREAD_WITH_FILE help The bcachefs filesystem - a modern, copy on write filesystem, with support for multiple devices, compression, checksumming, etc. diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index bb17d146b0900b..d335b6572d72da 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -80,7 +80,6 @@ bcachefs-y := \ super-io.o \ sysfs.o \ tests.o \ - thread_with_file.o \ trace.o \ two_state_shared_lock.o \ util.o \ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 04e4a65909a4f6..5f801256e8740a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -200,6 +200,7 @@ #include #include #include +#include #include #include #include @@ -466,7 +467,6 @@ enum bch_time_stats { #include "replicas_types.h" #include "subvolume_types.h" #include "super_types.h" -#include "thread_with_file_types.h" /* Number of nodes btree coalesce will try to coalesce at once */ #define GC_MERGE_NODES 4U diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 11711f54057e14..4cbda66bb6e0fa 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -11,7 +11,6 @@ #include "replicas.h" #include "super.h" #include "super-io.h" -#include "thread_with_file.h" #include #include @@ -20,6 +19,7 @@ #include #include #include +#include #include __must_check @@ -217,7 +217,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); - ret = bch2_run_thread_with_stdio(&thr->thr, + ret = run_thread_with_stdio(&thr->thr, bch2_fsck_thread_exit, bch2_fsck_offline_thread_fn); err: @@ -422,7 +422,7 @@ static int bch2_data_job_release(struct inode *inode, struct file *file) { struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); - bch2_thread_with_file_exit(&ctx->thr); + thread_with_file_exit(&ctx->thr); kfree(ctx); return 0; } @@ -472,7 +472,7 @@ static long bch2_ioctl_data(struct bch_fs *c, ctx->c = c; ctx->arg = arg; - ret = bch2_run_thread_with_file(&ctx->thr, + ret = run_thread_with_file(&ctx->thr, &bcachefs_data_ops, bch2_data_thread); if (ret < 0) @@ -834,7 +834,7 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c, goto err; } - ret = bch2_run_thread_with_stdio(&thr->thr, + ret = run_thread_with_stdio(&thr->thr, bch2_fsck_thread_exit, bch2_fsck_online_thread_fn); err: diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index d32c8bebe46c32..70a12539597407 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -2,7 +2,7 @@ #include "bcachefs.h" #include "error.h" #include "super.h" -#include "thread_with_file.h" +#include #define FSCK_ERR_RATELIMIT_NR 10 @@ -105,7 +105,7 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c) do { bch2_print(c, " (y,n, or Y,N for all errors of this type) "); - int r = bch2_stdio_redirect_readline(stdio, buf, sizeof(buf) - 1); + int r = stdio_redirect_readline(stdio, buf, sizeof(buf) - 1); if (r < 0) return YN_NO; buf[r] = '\0'; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index
6edb2ef0e03ef6..8c6caebf843174 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -56,7 +56,6 @@ #include "super.h" #include "super-io.h" #include "sysfs.h" -#include "thread_with_file.h" #include "trace.h" #include @@ -68,6 +67,7 @@ #include #include #include +#include #include MODULE_LICENSE("GPL"); @@ -99,7 +99,7 @@ void __bch2_print(struct bch_fs *c, const char *fmt, ...) if (fmt[0] == KERN_SOH[0]) fmt += 2; - bch2_stdio_redirect_vprintf(stdio, true, fmt, args); + stdio_redirect_vprintf(stdio, true, fmt, args); } va_end(args); } diff --git a/fs/bcachefs/thread_with_file.h b/include/linux/thread_with_file.h similarity index 63% rename from fs/bcachefs/thread_with_file.h rename to include/linux/thread_with_file.h index f06f8ff19a790a..54091f7ff3383d 100644 --- a/fs/bcachefs/thread_with_file.h +++ b/include/linux/thread_with_file.h @@ -1,8 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_THREAD_WITH_FILE_H -#define _BCACHEFS_THREAD_WITH_FILE_H +/* + * (C) 2022-2024 Kent Overstreet + */ +#ifndef _LINUX_THREAD_WITH_FILE_H +#define _LINUX_THREAD_WITH_FILE_H -#include "thread_with_file_types.h" +#include /* * Thread with file: Run a kthread and connect it to a file descriptor, so that @@ -13,7 +16,7 @@ * * thread_with_file, the low level version. * You get to define the full file_operations, including your release function, - * which means that you must call bch2_thread_with_file_exit() from your + * which means that you must call thread_with_file_exit() from your * .release method * * thread_with_stdio, the higher level version @@ -44,10 +47,10 @@ struct thread_with_file { bool done; }; -void bch2_thread_with_file_exit(struct thread_with_file *); -int bch2_run_thread_with_file(struct thread_with_file *, - const struct file_operations *, - int (*fn)(void *)); +void thread_with_file_exit(struct thread_with_file *); +int run_thread_with_file(struct thread_with_file *, + const struct file_operations *, + int (*fn)(void *)); struct thread_with_stdio { struct thread_with_file thr; @@ -56,13 +59,13 @@ struct thread_with_stdio { void (*fn)(struct thread_with_stdio *); }; -int bch2_run_thread_with_stdio(struct thread_with_stdio *, - void (*exit)(struct thread_with_stdio *), - void (*fn)(struct thread_with_stdio *)); -int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); -int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t); +int run_thread_with_stdio(struct thread_with_stdio *, + void (*exit)(struct thread_with_stdio *), + void (*fn)(struct thread_with_stdio *)); +int stdio_redirect_read(struct stdio_redirect *, char *, size_t); +int stdio_redirect_readline(struct stdio_redirect *, char *, size_t); -__printf(3, 0) void bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list); -__printf(3, 4) void bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...); +__printf(3, 0) void stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list); +__printf(3, 4) void stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...); -#endif /* _BCACHEFS_THREAD_WITH_FILE_H */ +#endif /* _LINUX_THREAD_WITH_FILE_H */ diff --git a/fs/bcachefs/thread_with_file_types.h b/include/linux/thread_with_file_types.h similarity index 64% rename from fs/bcachefs/thread_with_file_types.h rename to include/linux/thread_with_file_types.h index 41990756aa261d..98d0ad12532216 100644 --- a/fs/bcachefs/thread_with_file_types.h +++ b/include/linux/thread_with_file_types.h @@ -1,8 +1,10 @@ /* 
SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H -#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H +#ifndef _LINUX_THREAD_WITH_FILE_TYPES_H +#define _LINUX_THREAD_WITH_FILE_TYPES_H #include +#include +#include struct stdio_buf { spinlock_t lock; @@ -20,4 +22,4 @@ struct stdio_redirect { bool done; }; -#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */ +#endif /* _LINUX_THREAD_WITH_FILE_TYPES_H */ diff --git a/lib/Kconfig b/lib/Kconfig index 3ba8b965f8c7ec..9258d04e939db2 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -789,3 +789,6 @@ config FIRMWARE_TABLE config TIME_STATS tristate select MEAN_AND_VARIANCE + +config THREAD_WITH_FILE + tristate diff --git a/lib/Makefile b/lib/Makefile index 830907bb8fc85e..e77304f69df031 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -371,6 +371,7 @@ obj-$(CONFIG_SBITMAP) += sbitmap.o obj-$(CONFIG_PARMAN) += parman.o obj-$(CONFIG_TIME_STATS) += time_stats.o +obj-$(CONFIG_THREAD_WITH_FILE) += thread_with_file.o obj-y += group_cpus.o diff --git a/fs/bcachefs/thread_with_file.c b/lib/thread_with_file.c similarity index 79% rename from fs/bcachefs/thread_with_file.c rename to lib/thread_with_file.c index dde9679b68b425..092996ca43fe74 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/lib/thread_with_file.c @@ -1,26 +1,160 @@ // SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "thread_with_file.h" - +/* + * (C) 2022-2024 Kent Overstreet + */ #include +#include #include #include +#include #include #include +#include + +/* stdio_redirect */ + +#define STDIO_REDIRECT_BUFSIZE 4096 + +static bool stdio_redirect_has_input(struct stdio_redirect *stdio) +{ + return stdio->input.buf.nr || stdio->done; +} + +static bool stdio_redirect_has_output(struct stdio_redirect *stdio) +{ + return stdio->output.buf.nr || stdio->done; +} + +static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio) +{ + return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; +} + +static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio) +{ + return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; +} + +static void stdio_buf_init(struct stdio_buf *buf) +{ + spin_lock_init(&buf->lock); + init_waitqueue_head(&buf->wait); + darray_init(&buf->buf); +} + +int stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len) +{ + struct stdio_buf *buf = &stdio->input; + + wait_event(buf->wait, stdio_redirect_has_input(stdio)); + if (stdio->done) + return -1; + + spin_lock(&buf->lock); + int ret = min(len, buf->buf.nr); + memcpy(ubuf, buf->buf.data, ret); + darray_remove_items(&buf->buf, buf->buf.data, ret); + spin_unlock(&buf->lock); + + wake_up(&buf->wait); + return ret; +} +EXPORT_SYMBOL_GPL(stdio_redirect_read); + +int stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len) +{ + struct stdio_buf *buf = &stdio->input; + size_t copied = 0; + ssize_t ret = 0; +again: + wait_event(buf->wait, stdio_redirect_has_input(stdio)); + if (stdio->done) { + ret = -1; + goto out; + } + + spin_lock(&buf->lock); + size_t b = min(len, buf->buf.nr); + char *n = memchr(buf->buf.data, '\n', b); + if (n) + b = min_t(size_t, b, n + 1 - buf->buf.data); + memcpy(ubuf, buf->buf.data, b); + darray_remove_items(&buf->buf, buf->buf.data, b); + ubuf += b; + len -= b; + copied += b; + spin_unlock(&buf->lock); + + wake_up(&buf->wait); + + if (!n && len) + goto again; +out: + return copied ?: ret; +} +EXPORT_SYMBOL_GPL(stdio_redirect_readline); + +__printf(3, 0) +static void 
darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args) +{ + size_t len; + + do { + va_list args2; + va_copy(args2, args); + + len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2); + } while (len + 1 > darray_room(*out) && !darray_make_room_gfp(out, len + 1, gfp)); + + out->nr += min(len, darray_room(*out)); +} + +void stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking, + const char *fmt, va_list args) +{ + struct stdio_buf *buf = &stdio->output; + unsigned long flags; + + if (!nonblocking) + wait_event(buf->wait, stdio_redirect_has_output_space(stdio)); + else if (!stdio_redirect_has_output_space(stdio)) + return; + if (stdio->done) + return; + + spin_lock_irqsave(&buf->lock, flags); + darray_vprintf(&buf->buf, nonblocking ? GFP_NOWAIT : GFP_KERNEL, fmt, args); + spin_unlock_irqrestore(&buf->lock, flags); + + wake_up(&buf->wait); +} +EXPORT_SYMBOL_GPL(stdio_redirect_vprintf); + +void stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking, + const char *fmt, ...) +{ -void bch2_thread_with_file_exit(struct thread_with_file *thr) + va_list args; + va_start(args, fmt); + stdio_redirect_vprintf(stdio, nonblocking, fmt, args); + va_end(args); +} +EXPORT_SYMBOL_GPL(stdio_redirect_printf); + +/* thread with file: */ + +void thread_with_file_exit(struct thread_with_file *thr) { if (thr->task) { kthread_stop(thr->task); put_task_struct(thr->task); } } +EXPORT_SYMBOL_GPL(thread_with_file_exit); -int bch2_run_thread_with_file(struct thread_with_file *thr, - const struct file_operations *fops, - int (*fn)(void *)) +int run_thread_with_file(struct thread_with_file *thr, + const struct file_operations *fops, + int (*fn)(void *)) { struct file *file = NULL; int ret, fd = -1; @@ -63,37 +197,7 @@ int bch2_run_thread_with_file(struct thread_with_file *thr, kthread_stop(thr->task); return ret; } - -/* stdio_redirect */ - -static bool stdio_redirect_has_input(struct stdio_redirect *stdio) -{ - return stdio->input.buf.nr || stdio->done; -} - -static bool stdio_redirect_has_output(struct stdio_redirect *stdio) -{ - return stdio->output.buf.nr || stdio->done; -} - -#define STDIO_REDIRECT_BUFSIZE 4096 - -static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio) -{ - return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; -} - -static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio) -{ - return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; -} - -static void stdio_buf_init(struct stdio_buf *buf) -{ - spin_lock_init(&buf->lock); - init_waitqueue_head(&buf->wait); - darray_init(&buf->buf); -} +EXPORT_SYMBOL_GPL(run_thread_with_file); /* thread_with_stdio */ @@ -126,10 +230,7 @@ static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf, ubuf += b; len -= b; copied += b; - buf->buf.nr -= b; - memmove(buf->buf.data, - buf->buf.data + b, - buf->buf.nr); + darray_remove_items(&buf->buf, buf->buf.data, b); } spin_unlock_irq(&buf->lock); } @@ -137,18 +238,6 @@ static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf, return copied ?: ret; } -static int thread_with_stdio_release(struct inode *inode, struct file *file) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - - bch2_thread_with_file_exit(&thr->thr); - darray_exit(&thr->stdio.input.buf); - darray_exit(&thr->stdio.output.buf); - thr->exit(thr); - return 0; -} - static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf, size_t len, 
loff_t *ppos) { @@ -221,6 +310,18 @@ static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_stru return mask; } +static int thread_with_stdio_release(struct inode *inode, struct file *file) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + thread_with_file_exit(&thr->thr); + darray_exit(&thr->stdio.input.buf); + darray_exit(&thr->stdio.output.buf); + thr->exit(thr); + return 0; +} + static const struct file_operations thread_with_stdio_fops = { .llseek = no_llseek, .read = thread_with_stdio_read, @@ -242,117 +343,18 @@ static int thread_with_stdio_fn(void *arg) return 0; } -int bch2_run_thread_with_stdio(struct thread_with_stdio *thr, - void (*exit)(struct thread_with_stdio *), - void (*fn)(struct thread_with_stdio *)) +int run_thread_with_stdio(struct thread_with_stdio *thr, + void (*exit)(struct thread_with_stdio *), + void (*fn)(struct thread_with_stdio *)) { stdio_buf_init(&thr->stdio.input); stdio_buf_init(&thr->stdio.output); thr->exit = exit; thr->fn = fn; - return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn); -} - -int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len) -{ - struct stdio_buf *buf = &stdio->input; - - wait_event(buf->wait, stdio_redirect_has_input(stdio)); - if (stdio->done) - return -1; - - spin_lock(&buf->lock); - int ret = min(len, buf->buf.nr); - buf->buf.nr -= ret; - memcpy(ubuf, buf->buf.data, ret); - memmove(buf->buf.data, - buf->buf.data + ret, - buf->buf.nr); - spin_unlock(&buf->lock); - - wake_up(&buf->wait); - return ret; -} - -int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len) -{ - struct stdio_buf *buf = &stdio->input; - size_t copied = 0; - ssize_t ret = 0; -again: - wait_event(buf->wait, stdio_redirect_has_input(stdio)); - if (stdio->done) { - ret = -1; - goto out; - } - - spin_lock(&buf->lock); - size_t b = min(len, buf->buf.nr); - char *n = memchr(buf->buf.data, '\n', b); - if (n) - b = min_t(size_t, b, n + 1 - buf->buf.data); - buf->buf.nr -= b; - memcpy(ubuf, buf->buf.data, b); - memmove(buf->buf.data, - buf->buf.data + b, - buf->buf.nr); - ubuf += b; - len -= b; - copied += b; - spin_unlock(&buf->lock); - - wake_up(&buf->wait); - - if (!n && len) - goto again; -out: - return copied ?: ret; -} - -__printf(3, 0) -static void bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args) -{ - size_t len; - - do { - va_list args2; - va_copy(args2, args); - - len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2); - } while (len + 1 > darray_room(*out) && !darray_make_room_gfp(out, len + 1, gfp)); - - out->nr += min(len, darray_room(*out)); -} - -void bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking, - const char *fmt, va_list args) -{ - struct stdio_buf *buf = &stdio->output; - unsigned long flags; - - if (!nonblocking) - wait_event(buf->wait, stdio_redirect_has_output_space(stdio)); - else if (!stdio_redirect_has_output_space(stdio)) - return; - if (stdio->done) - return; - - spin_lock_irqsave(&buf->lock, flags); - bch2_darray_vprintf(&buf->buf, nonblocking ? GFP_NOWAIT : GFP_KERNEL, fmt, args); - spin_unlock_irqrestore(&buf->lock, flags); - - wake_up(&buf->wait); -} - -void bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking, - const char *fmt, ...) 
-{ - - va_list args; - va_start(args, fmt); - bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args); - va_end(args); + return run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn); } +EXPORT_SYMBOL_GPL(run_thread_with_stdio); -#endif /* NO_BCACHEFS_FS */ +MODULE_AUTHOR("Kent Overstreet"); +MODULE_LICENSE("GPL"); From dbad87bc3422c9f8b50ec0c330a0653a75131e2b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Feb 2024 20:27:06 -0500 Subject: [PATCH 0524/1406] thread_with_stdio: Mark completed in ->release() This fixes stdio_redirect_read() getting stuck, not noticing that the pipe has been closed. Signed-off-by: Kent Overstreet --- lib/thread_with_file.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/lib/thread_with_file.c b/lib/thread_with_file.c index 092996ca43fe74..f4946a437332a4 100644 --- a/lib/thread_with_file.c +++ b/lib/thread_with_file.c @@ -201,6 +201,14 @@ EXPORT_SYMBOL_GPL(run_thread_with_file); /* thread_with_stdio */ +static void thread_with_stdio_done(struct thread_with_stdio *thr) +{ + thr->thr.done = true; + thr->stdio.done = true; + wake_up(&thr->stdio.input.wait); + wake_up(&thr->stdio.output.wait); +} + static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf, size_t len, loff_t *ppos) { @@ -315,6 +323,7 @@ static int thread_with_stdio_release(struct inode *inode, struct file *file) struct thread_with_stdio *thr = container_of(file->private_data, struct thread_with_stdio, thr); + thread_with_stdio_done(thr); thread_with_file_exit(&thr->thr); darray_exit(&thr->stdio.input.buf); darray_exit(&thr->stdio.output.buf); @@ -336,10 +345,7 @@ static int thread_with_stdio_fn(void *arg) thr->fn(thr); - thr->thr.done = true; - thr->stdio.done = true; - wake_up(&thr->stdio.input.wait); - wake_up(&thr->stdio.output.wait); + thread_with_stdio_done(thr); return 0; } From b7e29d5030d0c766fcd6420d4fdd9a7a4bb4e837 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Feb 2024 01:04:38 -0500 Subject: [PATCH 0525/1406] kernel/hung_task.c: export sysctl_hung_task_timeout_secs needed for thread_with_file; also rare but not unheard of to need this in module code, when blocking on user input. one workaround used by some code is wait_event_interruptible() - but that can be buggy if the outer context isn't expecting unwinding. 
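
The intended use is waking up often enough that the detector never sees the
task blocked for the full timeout; the next patch switches thread_with_file's
input waits to exactly this pattern:

  do {
          wait_event_timeout(wq, condition,
                             sysctl_hung_task_timeout_secs * HZ / 2);
  } while (!condition);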
Signed-off-by: Kent Overstreet Cc: Andrew Morton Cc: fuyuanli --- kernel/hung_task.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 9a24574988d230..b2fc2727d65441 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -43,6 +43,7 @@ static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; * Zero means infinite timeout - no checking done: */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; +EXPORT_SYMBOL_GPL(sysctl_hung_task_timeout_secs); /* * Zero (default value) means use sysctl_hung_task_timeout_secs: From ad5ae093f2a22901f51a2133241a02d62d66d9c0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Feb 2024 20:41:34 -0500 Subject: [PATCH 0526/1406] thread_with_stdio: suppress hung task warning Signed-off-by: Kent Overstreet --- lib/thread_with_file.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/lib/thread_with_file.c b/lib/thread_with_file.c index f4946a437332a4..b09dc60ba62804 100644 --- a/lib/thread_with_file.c +++ b/lib/thread_with_file.c @@ -9,6 +9,7 @@ #include #include #include +#include #include /* stdio_redirect */ @@ -46,7 +47,15 @@ int stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len) { struct stdio_buf *buf = &stdio->input; - wait_event(buf->wait, stdio_redirect_has_input(stdio)); + /* + * we're waiting on user input (or for the file descriptor to be + * closed), don't want a hung task warning: + */ + do { + wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio), + sysctl_hung_task_timeout_secs * HZ / 2); + } while (!stdio_redirect_has_input(stdio)); + if (stdio->done) return -1; @@ -67,7 +76,11 @@ int stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len size_t copied = 0; ssize_t ret = 0; again: - wait_event(buf->wait, stdio_redirect_has_input(stdio)); + do { + wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio), + sysctl_hung_task_timeout_secs * HZ / 2); + } while (!stdio_redirect_has_input(stdio)); + if (stdio->done) { ret = -1; goto out; From 649a6a78f617695af3519c4a1b229b740c8a83ba Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Feb 2024 17:24:18 -0500 Subject: [PATCH 0527/1406] bcachefs: Kill more -EIO error codes This converts -EIOs related to btree node errors to private error codes, which will help with some ongoing debugging by giving us better error messages. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 3 +-- fs/bcachefs/btree_cache.c | 6 +++--- fs/bcachefs/btree_gc.c | 4 ++-- fs/bcachefs/btree_io.c | 7 +++---- fs/bcachefs/btree_iter.c | 2 +- fs/bcachefs/btree_types.h | 2 +- fs/bcachefs/btree_update_interior.c | 3 +-- fs/bcachefs/errcode.h | 6 +++++- fs/bcachefs/error.c | 10 ++++++++-- fs/bcachefs/error.h | 2 +- fs/bcachefs/recovery.c | 2 +- 11 files changed, 27 insertions(+), 20 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index b4dc319bcb2bc0..34d20e099dcfd3 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -129,8 +129,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, printbuf_exit(&buf); if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { - bch2_inconsistent_error(c); - return -EIO; + return bch2_inconsistent_error(c) ? 
-BCH_ERR_erofs_unfixed_errors : 0; } else { return 0; } diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 9b7ea1227069e6..37ec3dbde4eee4 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -905,7 +905,7 @@ static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btr if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-EIO); + return ERR_PTR(-BCH_ERR_btree_node_read_error); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -996,7 +996,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-EIO); + return ERR_PTR(-BCH_ERR_btree_node_read_error); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -1079,7 +1079,7 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, if (unlikely(btree_node_read_error(b))) { six_unlock_read(&b->c.lock); - b = ERR_PTR(-EIO); + b = ERR_PTR(-BCH_ERR_btree_node_read_error); goto out; } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index eb92526bb9b64c..6c52f116098f7d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -407,7 +407,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct printbuf_reset(&buf); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); - if (mustfix_fsck_err_on(ret == -EIO, c, + if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), c, btree_node_unreadable, "Topology repair: unreadable btree node at btree %s level %u:\n" " %s", @@ -979,7 +979,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b false); ret = PTR_ERR_OR_ZERO(child); - if (ret == -EIO) { + if (bch2_err_matches(ret, EIO)) { bch2_topology_error(c); if (__fsck_err(c, diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 61b6093805eaf2..86415701b82407 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -581,8 +581,7 @@ static int __btree_err(int ret, break; case -BCH_ERR_btree_node_read_err_bad_node: bch2_print_string_as_lines(KERN_ERR, out.buf); - bch2_topology_error(c); - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO; + ret = bch2_topology_error(c); break; case -BCH_ERR_btree_node_read_err_incompatible: bch2_print_string_as_lines(KERN_ERR, out.buf); @@ -1737,7 +1736,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, list_move(&b->list, &c->btree_cache.freeable); mutex_unlock(&c->btree_cache.lock); - ret = -EIO; + ret = -BCH_ERR_btree_node_read_error; goto err; } @@ -1841,7 +1840,7 @@ static void btree_node_write_work(struct work_struct *work) bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) { - ret = -BCH_ERR_btree_write_all_failed; + ret = -BCH_ERR_btree_node_write_all_failed; goto err; } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 3aac6ed5446ebd..2195e602abf09b 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2303,7 +2303,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) btree_iter_path(trans, iter)->level); if (iter->flags & BTREE_ITER_WITH_JOURNAL) - return bkey_s_c_err(-EIO); + return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index
0d5eecbd3e9cfb..95783ad03fb392 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -727,7 +727,7 @@ struct btree_root { __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); u8 level; u8 alive; - s8 error; + s16 error; }; enum btree_gc_coalesce_fail_reason { diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index efe51d99dce40c..70da4fa25c01e2 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1845,8 +1845,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, __func__, buf1.buf, buf2.buf); printbuf_exit(&buf1); printbuf_exit(&buf2); - bch2_topology_error(c); - ret = -EIO; + ret = bch2_topology_error(c); goto err; } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 3fd33b307a77f9..a82a9d754fdab0 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -178,6 +178,7 @@ x(EINVAL, opt_parse_error) \ x(EINVAL, remove_with_metadata_missing_unimplemented)\ x(EINVAL, remove_would_lose_data) \ + x(EINVAL, btree_iter_with_journal_not_supported) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ @@ -227,7 +228,10 @@ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ x(EIO, btree_node_read_err) \ x(EIO, sb_not_downgraded) \ - x(EIO, btree_write_all_failed) \ + x(EIO, btree_node_write_all_failed) \ + x(EIO, btree_node_read_error) \ + x(EIO, btree_node_read_validate_error) \ + x(EIO, btree_need_topology_repair) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 70a12539597407..8ae95b218e8b0c 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "error.h" +#include "recovery.h" #include "super.h" #include @@ -25,11 +26,16 @@ bool bch2_inconsistent_error(struct bch_fs *c) } } -void bch2_topology_error(struct bch_fs *c) +int bch2_topology_error(struct bch_fs *c) { set_bit(BCH_FS_topology_error, &c->flags); - if (!test_bit(BCH_FS_fsck_running, &c->flags)) + if (!test_bit(BCH_FS_fsck_running, &c->flags)) { bch2_inconsistent_error(c); + return -BCH_ERR_btree_need_topology_repair; + } else { + return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: + -BCH_ERR_btree_node_read_validate_error; + } } void bch2_fatal_error(struct bch_fs *c) diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index fec17d1353d180..94491190e09e9d 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -30,7 +30,7 @@ struct work_struct; bool bch2_inconsistent_error(struct bch_fs *); -void bch2_topology_error(struct bch_fs *); +int bch2_topology_error(struct bch_fs *); #define bch2_fs_inconsistent(c, ...) 
\ ({ \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 21e13bb4335be3..c584945faaa975 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -264,7 +264,7 @@ static int journal_replay_entry_early(struct bch_fs *c, bkey_copy(&r->key, (struct bkey_i *) entry->start); r->error = 0; } else { - r->error = -EIO; + r->error = -BCH_ERR_btree_node_read_error; } r->alive = true; break; From a74a41ae99eaab24b9d0cb6341dc7b2f7a7ef5f8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Feb 2024 22:20:12 -0500 Subject: [PATCH 0528/1406] bcachefs: Check subvol <-> inode pointers in check_subvol() Subvolumes and subvolume root inodes point to each other: this verifies the subvolume -> inode -> subvolume path. Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 2 +- fs/bcachefs/inode.h | 2 ++ fs/bcachefs/subvolume.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index e7ba169c4e5472..dbe37ccc751958 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -324,7 +324,7 @@ int bch2_inode_unpack(struct bkey_s_c k, return bch2_inode_unpack_slowpath(k, unpacked); } -static int bch2_inode_peek_nowarn(struct btree_trans *trans, +int bch2_inode_peek_nowarn(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, subvol_inum inum, unsigned flags) diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index b8da7ff8069d08..9a9353c001c2a5 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -95,6 +95,8 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *); void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); +int bch2_inode_peek_nowarn(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, subvol_inum, unsigned); int bch2_inode_peek(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *, subvol_inum, unsigned); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 7c67c28d3ef88f..e7ee52c39990cc 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -42,6 +42,36 @@ static int check_subvol(struct btree_trans *trans, return ret ?: -BCH_ERR_transaction_restart_nested; } + struct bch_inode_unpacked inode; + struct btree_iter inode_iter = {}; + ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode, + (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) }, + 0); + bch2_trans_iter_exit(trans, &inode_iter); + + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret, c, subvol_to_missing_root, + "subvolume %llu points to missing subvolume root %llu:%u", + k.k->p.offset, le64_to_cpu(subvol.v->inode), + le32_to_cpu(subvol.v->snapshot))) { + ret = bch2_subvolume_delete(trans, iter->pos.offset); + bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); + return ret ?: -BCH_ERR_transaction_restart_nested; + } + + if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset, + c, subvol_root_wrong_bi_subvol, + "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu", + inode.bi_inum, inode_iter.k.p.snapshot, + inode.bi_subvol, subvol.k->p.offset)) { + inode.bi_subvol = subvol.k->p.offset; + ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot)); + if (ret) + goto err; + } + if (!BCH_SUBVOLUME_SNAP(subvol.v)) { u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot)); u32 snapshot_tree; @@ -73,6 +103,7 @@ static int check_subvol(struct btree_trans
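/*
 * The invariant enforced above, in words: subvol.v->inode names the
 * subvolume's root inode, and that inode's bi_subvol must name this
 * subvolume again. A missing root inode deletes the subvolume; a
 * mismatched bi_subvol is rewritten to point back.
 */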
*trans, } } +err: fsck_err: return ret; } From 1dfe3c66dafb2e970bbd8f4e139ef704b89880c4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Feb 2024 22:30:51 -0500 Subject: [PATCH 0529/1406] bcachefs: Check subvol <-> inode pointers in check_inode() Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 25 +++++++++++++++++++++++++ fs/bcachefs/sb-errors_types.h | 4 +++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index a4b44c4f9bdc28..e4a8a14c46bc92 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -923,6 +923,31 @@ static int check_inode(struct btree_trans *trans, do_update = true; } + if (u.bi_subvol) { + struct bch_subvolume s; + + ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (fsck_err_on(ret, + c, inode_bi_subvol_missing, + "inode %llu:%u bi_subvol points to missing subvolume %u", + u.bi_inum, k.k->p.snapshot, u.bi_subvol) || + fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum || + !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot), + k.k->p.snapshot), + c, inode_bi_subvol_wrong, + "inode %llu:%u points to subvol %u, but subvol points to %llu:%u", + u.bi_inum, k.k->p.snapshot, u.bi_subvol, + le64_to_cpu(s.inode), + le32_to_cpu(s.snapshot))) { + u.bi_subvol = 0; + u.bi_parent_subvol = 0; + do_update = true; + } + } + if (do_update) { ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); bch_err_msg(c, ret, "in fsck updating inode"); diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index cadf12ce917393..63f18c7f30885e 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -251,7 +251,9 @@ x(hash_table_key_wrong_offset, 243) \ x(unlinked_inode_not_on_deleted_list, 244) \ x(reflink_p_front_pad_bad, 245) \ - x(journal_entry_dup_same_device, 246) + x(journal_entry_dup_same_device, 246) \ + x(inode_bi_subvol_missing, 247) \ + x(inode_bi_subvol_wrong, 248) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, From f86542a5f0f2c4193cef4bfea1f7daac97a8521c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Feb 2024 23:39:08 -0500 Subject: [PATCH 0530/1406] bcachefs: check_inode_dirent_inode() check that if an inode has a backpointer, the dirent it points to points back to it. We do this in check_dirent_inode_dirent(), but only for inodes that have dirents that point to them - we also have to do the check starting from the inode to catch inodes that don't have dirents that point to them. 
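The shape of that two-way check, as a self-contained sketch with toy types (the ex_* names are hypothetical; the real code does these lookups through a btree_trans against BTREE_ID_dirents):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct ex_inode  { uint64_t inum, dir, dir_offset; };
struct ex_dirent { uint64_t dir, offset, target; };

/* Toy dirent "btree": a flat array standing in for BTREE_ID_dirents. */
static const struct ex_dirent ex_dirents[] = {
	{ .dir = 1, .offset = 100, .target = 42 },
};

static const struct ex_dirent *ex_dirent_at(uint64_t dir, uint64_t off)
{
	for (size_t i = 0; i < sizeof(ex_dirents) / sizeof(ex_dirents[0]); i++)
		if (ex_dirents[i].dir == dir && ex_dirents[i].offset == off)
			return &ex_dirents[i];
	return NULL;
}

/* An inode's backpointer is good iff the dirent it names points back. */
static bool ex_backpointer_ok(const struct ex_inode *ino)
{
	if (!ino->dir && !ino->dir_offset)
		return true;	/* no backpointer recorded, nothing to verify */

	const struct ex_dirent *d = ex_dirent_at(ino->dir, ino->dir_offset);

	return d && d->target == ino->inum;
}

On failure the repair is deliberately minimal: the real code zeroes bi_dir/bi_dir_offset and lets later passes either re-derive the backpointer from a matching dirent or reattach the orphan.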
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 125 ++++++++++++++++++++++++---------- fs/bcachefs/sb-errors_types.h | 4 +- 2 files changed, 92 insertions(+), 37 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e4a8a14c46bc92..75aab077605757 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -767,6 +767,43 @@ static int hash_check_key(struct btree_trans *trans, goto out; } +static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos pos) +{ + return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); +} + +static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, + u32 *snapshot) +{ + if (inode->bi_subvol) { + u64 inum; + int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum); + if (ret) + return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }); + } + + return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot)); +} + +static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, + struct bkey_s_c_dirent d) +{ + return inode->bi_dir == d.k->p.inode && + inode->bi_dir_offset == d.k->p.offset; +} + +static bool dirent_points_to_inode(struct bkey_s_c_dirent d, + struct bch_inode_unpacked *inode) +{ + return d.v->d_type == DT_SUBVOL + ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol + : le64_to_cpu(d.v->d_inum) == inode->bi_inum; +} + static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) { struct btree_iter iter; @@ -779,6 +816,49 @@ static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) return k.k->type == KEY_TYPE_set; } +static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k, + struct bch_inode_unpacked *inode, + u32 inode_snapshot, bool *write_inode) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + struct btree_iter dirent_iter = {}; + struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot); + int ret = bkey_err(d); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret, + c, inode_points_to_missing_dirent, + "inode points to missing dirent\n%s", + (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) || + fsck_err_on(!ret && !dirent_points_to_inode(d, inode), + c, inode_points_to_wrong_dirent, + "inode points to dirent that does not point back:\n%s", + (bch2_bkey_val_to_text(&buf, c, inode_k), + prt_newline(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + /* + * We just clear the backpointer fields for now. If we find a + * dirent that points to this inode in check_dirents(), we'll + * update it then; then when we get to check_path() if the + * backpointer is still 0 we'll reattach it. 
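+ * (Said differently: clearing can only discard information that was
+ * already wrong - a valid dirent re-creates these fields in
+ * check_dirents(), and an inode left with no backpointer is caught
+ * and reattached by check_path().)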
+ */ + inode->bi_dir = 0; + inode->bi_dir_offset = 0; + inode->bi_flags &= ~BCH_INODE_backptr_untrusted; + *write_inode = true; + } + + ret = 0; +fsck_err: + bch2_trans_iter_exit(trans, &dirent_iter); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -923,6 +1003,12 @@ static int check_inode(struct btree_trans *trans, do_update = true; } + if (u.bi_dir || u.bi_dir_offset) { + ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update); + if (ret) + goto err; + } + if (u.bi_subvol) { struct bch_subvolume s; @@ -980,28 +1066,6 @@ int bch2_check_inodes(struct bch_fs *c) return ret; } -static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos pos) -{ - return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); -} - -static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, - struct bkey_s_c_dirent d) -{ - return inode->bi_dir == d.k->p.inode && - inode->bi_dir_offset == d.k->p.offset; -} - -static bool dirent_points_to_inode(struct bkey_s_c_dirent d, - struct bch_inode_unpacked *inode) -{ - return d.v->d_type == DT_SUBVOL - ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol - : le64_to_cpu(d.v->d_inum) == inode->bi_inum; -} - static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; @@ -1489,7 +1553,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) return ret ?: trans_was_restarted(trans, restart_count); } -static int check_inode_backpointer(struct btree_trans *trans, +static int check_dirent_inode_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_dirent d, struct bch_inode_unpacked *target, @@ -1588,7 +1652,7 @@ static int check_dirent_target(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; - ret = check_inode_backpointer(trans, iter, d, target, target_snapshot); + ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot); if (ret) goto err; @@ -2014,18 +2078,7 @@ static int check_path(struct btree_trans *trans, struct bkey_s_c_dirent d; u32 parent_snapshot = snapshot; - if (inode->bi_subvol) { - u64 inum; - - ret = subvol_lookup(trans, inode->bi_parent_subvol, - &parent_snapshot, &inum); - if (ret) - break; - } - - d = dirent_get_by_pos(trans, &dirent_iter, - SPOS(inode->bi_dir, inode->bi_dir_offset, - parent_snapshot)); + d = inode_get_dirent(trans, &dirent_iter, inode, &parent_snapshot); ret = bkey_err(d.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) break; diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 63f18c7f30885e..a55aa9e2810f63 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -253,7 +253,9 @@ x(reflink_p_front_pad_bad, 245) \ x(journal_entry_dup_same_device, 246) \ x(inode_bi_subvol_missing, 247) \ - x(inode_bi_subvol_wrong, 248) + x(inode_bi_subvol_wrong, 248) \ + x(inode_points_to_missing_dirent, 249) \ + x(inode_points_to_wrong_dirent, 250) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, From 97cd5cb1562f7bc9495cb038fc8f943dff2d5c11 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Feb 2024 16:02:08 -0500 Subject: [PATCH 0531/1406] bcachefs: better log message in lookup_inode_for_snapshot() Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 
21 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 75aab077605757..6d2db44bb00f0d 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -564,13 +564,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, } static struct inode_walker_entry * -lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, - u32 snapshot, bool is_whiteout) +lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) { - struct inode_walker_entry *i; - - snapshot = bch2_snapshot_equiv(c, snapshot); + bool is_whiteout = k.k->type == KEY_TYPE_whiteout; + u32 snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); + struct inode_walker_entry *i; __darray_for_each(w->inodes, i) if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot)) goto found; @@ -581,20 +580,24 @@ lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, if (snapshot != i->snapshot && !is_whiteout) { struct inode_walker_entry new = *i; - size_t pos; - int ret; new.snapshot = snapshot; new.count = 0; - bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u", - w->last_pos.inode, snapshot, i->snapshot); + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, k); + + bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" + "unexpected because we should always update the inode when we update a key in that inode\n" + "%s", + w->last_pos.inode, snapshot, i->snapshot, buf.buf); + printbuf_exit(&buf); while (i > w->inodes.data && i[-1].snapshot > snapshot) --i; - pos = i - w->inodes.data; - ret = darray_insert_item(&w->inodes, pos, new); + size_t pos = i - w->inodes.data; + int ret = darray_insert_item(&w->inodes, pos, new); if (ret) return ERR_PTR(ret); @@ -605,21 +608,21 @@ lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, } static struct inode_walker_entry *walk_inode(struct btree_trans *trans, - struct inode_walker *w, struct bpos pos, - bool is_whiteout) + struct inode_walker *w, + struct bkey_s_c k) { - if (w->last_pos.inode != pos.inode) { - int ret = get_inodes_all_snapshots(trans, w, pos.inode); + if (w->last_pos.inode != k.k->p.inode) { + int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode); if (ret) return ERR_PTR(ret); - } else if (bkey_cmp(w->last_pos, pos)) { + } else if (bkey_cmp(w->last_pos, k.k->p)) { darray_for_each(w->inodes, i) i->seen_this_pos = false; } - w->last_pos = pos; + w->last_pos = k.k->p; - return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout); + return lookup_inode_for_snapshot(trans->c, w, k); } static int __get_visible_inodes(struct btree_trans *trans, @@ -1374,7 +1377,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto err; } - i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout); + i = walk_inode(trans, inode, k); ret = PTR_ERR_OR_ZERO(i); if (ret) goto err; @@ -1795,7 +1798,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, BUG_ON(!btree_iter_path(trans, iter)->should_be_locked); - i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout); + i = walk_inode(trans, dir, k); ret = PTR_ERR_OR_ZERO(i); if (ret < 0) goto err; @@ -1922,7 +1925,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, if (ret) return ret; - i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout); + i = walk_inode(trans, inode, k); ret = PTR_ERR_OR_ZERO(i); if (ret) return ret; From df6e05114fa7a9e8004fa028d4cc71f16ad50983 Mon Sep 17 
00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Feb 2024 23:41:46 -0500 Subject: [PATCH 0532/1406] bcachefs: check bi_parent_subvol in check_inode() check for inodes with a nonzero bi_parent_subvol field that aren't actually subvolume roots Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 10 ++++++++++ fs/bcachefs/sb-errors_types.h | 3 ++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 6d2db44bb00f0d..ae7bbf35a3e16b 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1012,6 +1012,16 @@ static int check_inode(struct btree_trans *trans, goto err; } + if (fsck_err_on(u.bi_parent_subvol && + (u.bi_subvol == 0 || + u.bi_subvol == BCACHEFS_ROOT_SUBVOL), + c, inode_bi_parent_subvol_nonzero, + "inode %llu:%u has subvol %u but nonzero parent subvol %u", + u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) { + u.bi_parent_subvol = 0; + do_update = true; + } + if (u.bi_subvol) { struct bch_subvolume s; diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index a55aa9e2810f63..73104908e7c6df 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -255,7 +255,8 @@ x(inode_bi_subvol_missing, 247) \ x(inode_bi_subvol_wrong, 248) \ x(inode_points_to_missing_dirent, 249) \ - x(inode_points_to_wrong_dirent, 250) + x(inode_points_to_wrong_dirent, 250) \ + x(inode_bi_parent_subvol_nonzero, 251) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, From fbf58a785e76977f9d0057ffa23acebb17b84d55 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Feb 2024 23:51:23 -0500 Subject: [PATCH 0533/1406] bcachefs: simplify check_dirent_inode_dirent() Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 114 ++++++++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 58 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index ae7bbf35a3e16b..7c1e74bb761471 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1573,77 +1573,75 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, u32 target_snapshot) { struct bch_fs *c = trans->c; - struct btree_iter bp_iter = { NULL }; struct printbuf buf = PRINTBUF; int ret = 0; + if (inode_points_to_dirent(target, d)) + return 0; + if (!target->bi_dir && !target->bi_dir_offset) { target->bi_dir = d.k->p.inode; target->bi_dir_offset = d.k->p.offset; - - ret = __bch2_fsck_write_inode(trans, target, target_snapshot); - if (ret) - goto err; + return __bch2_fsck_write_inode(trans, target, target_snapshot); } - if (!inode_points_to_dirent(target, d)) { - struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, - SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); - ret = bkey_err(bp_dirent); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - bool backpointer_exists = !ret; - ret = 0; + struct btree_iter bp_iter = { NULL }; + struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, + SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); + ret = bkey_err(bp_dirent); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; - bch2_bkey_val_to_text(&buf, c, d.s_c); - prt_newline(&buf); - if (backpointer_exists) - bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + bool backpointer_exists = !ret; + ret = 0; - if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists, - c, inode_dir_multiple_links, - "directory %llu:%u with multiple links\n%s", - target->bi_inum, target_snapshot, buf.buf)) { - ret = __remove_dirent(trans, d.k->p); - 
goto out; - } + if (fsck_err_on(!backpointer_exists, + c, inode_wrong_backpointer, + "inode %llu:%u has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + target->bi_inum, target_snapshot, + target->bi_dir, + target->bi_dir_offset, + d.k->p.inode, + d.k->p.offset)) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + ret = __bch2_fsck_write_inode(trans, target, target_snapshot); + goto out; + } - /* - * hardlinked file with nlink 0: - * We're just adjusting nlink here so check_nlinks() will pick - * it up, it ignores inodes with nlink 0 - */ - if (fsck_err_on(backpointer_exists && !target->bi_nlink, - c, inode_multiple_links_but_nlink_0, - "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", - target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { - target->bi_nlink++; - target->bi_flags &= ~BCH_INODE_unlinked; - - ret = __bch2_fsck_write_inode(trans, target, target_snapshot); - if (ret) - goto err; - } + bch2_bkey_val_to_text(&buf, c, d.s_c); + prt_newline(&buf); + if (backpointer_exists) + bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + + if (fsck_err_on(backpointer_exists && + (S_ISDIR(target->bi_mode) || + target->bi_subvol), + c, inode_dir_multiple_links, + "%s %llu:%u with multiple links\n%s", + S_ISDIR(target->bi_mode) ? "directory" : "subvolume", + target->bi_inum, target_snapshot, buf.buf)) { + ret = __remove_dirent(trans, d.k->p); + goto out; + } - if (fsck_err_on(!backpointer_exists, - c, inode_wrong_backpointer, - "inode %llu:%u has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - target->bi_inum, target_snapshot, - target->bi_dir, - target->bi_dir_offset, - d.k->p.inode, - d.k->p.offset)) { - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - - ret = __bch2_fsck_write_inode(trans, target, target_snapshot); - if (ret) - goto err; - } + /* + * hardlinked file with nlink 0: + * We're just adjusting nlink here so check_nlinks() will pick + * it up, it ignores inodes with nlink 0 + */ + if (fsck_err_on(backpointer_exists && !target->bi_nlink, + c, inode_multiple_links_but_nlink_0, + "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", + target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { + target->bi_nlink++; + target->bi_flags &= ~BCH_INODE_unlinked; + ret = __bch2_fsck_write_inode(trans, target, target_snapshot); + if (ret) + goto err; } out: err: From 35b42481590bec5c210deca541c05438617b291d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Feb 2024 00:06:14 -0500 Subject: [PATCH 0534/1406] bcachefs: delete duplicated checks in check_dirent_to_subvol() these were already checked in check_subvol() Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 7c1e74bb761471..35f207e1f8b7bc 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1716,8 +1716,8 @@ static int check_dirent_target(struct btree_trans *trans, return ret; } -static int check_subvol_dirent(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c_dirent d) +static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c_dirent d) { struct bch_fs *c = trans->c; struct bch_inode_unpacked subvol_root; @@ -1727,7 +1727,7 @@ static int check_subvol_dirent(struct btree_trans *trans, struct btree_iter *ite int ret = 0; ret = subvol_lookup(trans, target_subvol, - 
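/* subvol_lookup() resolves a subvolume id to its snapshot id and root inode number */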
&target_snapshot, &target_inum); + &target_snapshot, &target_inum); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; @@ -1741,25 +1741,6 @@ static int check_subvol_dirent(struct btree_trans *trans, struct btree_iter *ite if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if (fsck_err_on(ret, c, subvol_to_missing_root, - "subvolume %u points to missing subvolume root %llu", - target_subvol, - target_inum)) { - bch_err(c, "repair not implemented yet"); - return -EINVAL; - } - - if (fsck_err_on(subvol_root.bi_subvol != target_subvol, - c, subvol_root_wrong_bi_subvol, - "subvol root %llu has wrong bi_subvol field: got %u, should be %u", - target_inum, - subvol_root.bi_subvol, target_subvol)) { - subvol_root.bi_subvol = target_subvol; - ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); - if (ret) - return ret; - } - ret = check_dirent_target(trans, iter, d, &subvol_root, target_snapshot); if (ret) @@ -1852,7 +1833,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, d = bkey_s_c_to_dirent(k); if (d.v->d_type == DT_SUBVOL) { - ret = check_subvol_dirent(trans, iter, d); + ret = check_dirent_to_subvol(trans, iter, d); if (ret) goto err; } else { From 142df888d819826bdf9800330e4dfd3afc5a7037 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Feb 2024 00:23:25 -0500 Subject: [PATCH 0535/1406] bcachefs: check inode->bi_parent_subvol against dirent Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 35 +++++++++++++---------------------- fs/bcachefs/sb-errors_types.h | 4 ++-- 2 files changed, 15 insertions(+), 24 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 35f207e1f8b7bc..9dd99854808b4c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1015,7 +1015,7 @@ static int check_inode(struct btree_trans *trans, if (fsck_err_on(u.bi_parent_subvol && (u.bi_subvol == 0 || u.bi_subvol == BCACHEFS_ROOT_SUBVOL), - c, inode_bi_parent_subvol_nonzero, + c, inode_bi_parent_nonzero, "inode %llu:%u has subvol %u but nonzero parent subvol %u", u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) { u.bi_parent_subvol = 0; @@ -1688,27 +1688,6 @@ static int check_dirent_target(struct btree_trans *trans, d = dirent_i_to_s_c(n); } - - if (fsck_err_on(d.v->d_type == DT_SUBVOL && - target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol), - c, dirent_d_parent_subvol_wrong, - "dirent has wrong d_parent_subvol field: got %u, should be %u", - le32_to_cpu(d.v->d_parent_subvol), - target->bi_parent_subvol)) { - n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); - - ret = bch2_trans_update(trans, iter, &n->k_i, 0); - if (ret) - goto err; - - d = dirent_i_to_s_c(n); - } err: fsck_err: printbuf_exit(&buf); @@ -1721,6 +1700,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * { struct bch_fs *c = trans->c; struct bch_inode_unpacked subvol_root; + u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); u32 target_snapshot; u64 target_inum; @@ -1741,6 +1721,17 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * if (ret && !bch2_err_matches(ret, ENOENT)) return ret; + if (fsck_err_on(parent_subvol != subvol_root.bi_parent_subvol, + c, inode_bi_parent_wrong, + "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", + target_inum, + 
subvol_root.bi_parent_subvol, parent_subvol)) { + subvol_root.bi_parent_subvol = parent_subvol; + ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); + if (ret) + return ret; + } + ret = check_dirent_target(trans, iter, d, &subvol_root, target_snapshot); if (ret) diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 73104908e7c6df..f24c828019d89b 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -231,7 +231,7 @@ x(dirent_name_dot_or_dotdot, 223) \ x(dirent_name_has_slash, 224) \ x(dirent_d_type_wrong, 225) \ - x(dirent_d_parent_subvol_wrong, 226) \ + x(inode_bi_parent_wrong, 226) \ x(dirent_in_missing_dir_inode, 227) \ x(dirent_in_non_dir_inode, 228) \ x(dirent_to_missing_inode, 229) \ @@ -256,7 +256,7 @@ x(inode_bi_subvol_wrong, 248) \ x(inode_points_to_missing_dirent, 249) \ x(inode_points_to_wrong_dirent, 250) \ - x(inode_bi_parent_subvol_nonzero, 251) + x(inode_bi_parent_nonzero, 251) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, From ecdef1f1f15d5c7f8c44b54b21271bf268169e34 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Feb 2024 00:45:09 -0500 Subject: [PATCH 0536/1406] bcachefs: check dirent->d_parent_subvol Check that d_parent_subvol makes sense - the dirent's snapshot must be visible in d_parent_subvol (i.e. an ancestor of d_parent_subvol's snapshot) in order to be visible. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 61 ++++++++++++++++++++++++++++++++--- fs/bcachefs/sb-errors_types.h | 4 ++- 2 files changed, 60 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9dd99854808b4c..9a9d3cfa26f768 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1695,6 +1695,31 @@ static int check_dirent_target(struct btree_trans *trans, return ret; } +/* find a subvolume that's a descendent of @snapshot: */ +static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_subvolume) + continue; + + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) { + bch2_trans_iter_exit(trans, &iter); + *subvolid = k.k->p.offset; + goto found; + } + } + if (!ret) + ret = -ENOENT; +found: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_dirent d) { @@ -1702,18 +1727,44 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * struct bch_inode_unpacked subvol_root; u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); - u32 target_snapshot; - u64 target_inum; + u32 target_snapshot, parent_snapshot; + u64 target_inum, parent_inum; + struct printbuf buf = PRINTBUF; int ret = 0; + ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol, + "dirent parent_subvol points to missing subvolume\n%s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || + fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot), + c, dirent_not_visible_in_parent_subvol, + "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", 
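/*
 * The visibility rule being enforced: a dirent created in snapshot S is
 * visible in a subvolume V only if S is an ancestor of V's snapshot -
 * which is what the bch2_snapshot_is_ancestor() test above encodes.
 */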
+ parent_snapshot, + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + u32 new_parent_subvol; + ret = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); + if (ret) + goto err; + + struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); + ret = PTR_ERR_OR_ZERO(new_dirent); + if (ret) + goto err; + + new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol); + } + ret = subvol_lookup(trans, target_subvol, &target_snapshot, &target_inum); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; if (fsck_err_on(ret, c, dirent_to_missing_subvol, - "dirent points to missing subvolume %u", - le32_to_cpu(d.v->d_child_subvol))) + "dirent points to missing subvolume\n%s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) return __remove_dirent(trans, d.k->p); ret = lookup_inode(trans, target_inum, @@ -1736,7 +1787,9 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * target_snapshot); if (ret) return ret; +err: fsck_err: + printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index f24c828019d89b..833555676f76c4 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -256,7 +256,9 @@ x(inode_bi_subvol_wrong, 248) \ x(inode_points_to_missing_dirent, 249) \ x(inode_points_to_wrong_dirent, 250) \ - x(inode_bi_parent_nonzero, 251) + x(inode_bi_parent_nonzero, 251) \ + x(dirent_to_missing_parent_subvol, 252) \ + x(dirent_not_visible_in_parent_subvol, 253) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, From 9184295145cd2f8deffc4ffad392c623ebeb1ea7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Jan 2024 14:57:58 -0500 Subject: [PATCH 0537/1406] bcachefs: Repair subvol dirents that point to non subvols when repair switches d_type to or from DT_SUBVOL, we need to update the target accordingly Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9a9d3cfa26f768..15da85629c6f4b 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1681,6 +1681,12 @@ static int check_dirent_target(struct btree_trans *trans, bkey_reassemble(&n->k_i, d.s_c); n->v.d_type = inode_d_type(target); + if (n->v.d_type == DT_SUBVOL) { + n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); + } else { + n->v.d_inum = cpu_to_le64(target->bi_inum); + } ret = bch2_trans_update(trans, iter, &n->k_i, 0); if (ret) From 1d065323f8eae2a3a27b8ff998c39e44fd7b3efb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 Jan 2024 15:12:28 -0500 Subject: [PATCH 0538/1406] bcachefs: bch_subvolume::parent -> creation_parent bit of renaming, prep for adding a fs path parent Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume.c | 24 ++++++++++++------------ fs/bcachefs/subvolume_format.h | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index e7ee52c39990cc..a0be103b48fe03 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -143,8 +143,8 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, le64_to_cpu(s.v->inode), le32_to_cpu(s.v->snapshot)); - if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent)) - prt_printf(out, " parent %u", le32_to_cpu(s.v->parent)); + if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) + prt_printf(out, " 
creation_parent %u", le32_to_cpu(s.v->creation_parent)); } static __always_inline int @@ -228,8 +228,8 @@ static int bch2_subvolume_reparent(struct btree_trans *trans, if (k.k->type != KEY_TYPE_subvolume) return 0; - if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) && - le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent) + if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, creation_parent) && + le32_to_cpu(bkey_s_c_to_subvolume(k).v->creation_parent) != old_parent) return 0; s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume); @@ -237,7 +237,7 @@ static int bch2_subvolume_reparent(struct btree_trans *trans, if (ret) return ret; - s->v.parent = cpu_to_le32(new_parent); + s->v.creation_parent = cpu_to_le32(new_parent); return 0; } @@ -260,7 +260,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_subvolume_reparent(trans, &iter, k, - subvolid_to_delete, le32_to_cpu(s.parent))); + subvolid_to_delete, le32_to_cpu(s.creation_parent))); } /* @@ -447,12 +447,12 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, if (ret) goto err; - new_subvol->v.flags = 0; - new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); - new_subvol->v.inode = cpu_to_le64(inode); - new_subvol->v.parent = cpu_to_le32(src_subvolid); - new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); - new_subvol->v.otime.hi = 0; + new_subvol->v.flags = 0; + new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); + new_subvol->v.inode = cpu_to_le64(inode); + new_subvol->v.creation_parent = cpu_to_le32(src_subvolid); + new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); + new_subvol->v.otime.hi = 0; SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h index af79134b07d6ad..b81cf0c6119d87 100644 --- a/fs/bcachefs/subvolume_format.h +++ b/fs/bcachefs/subvolume_format.h @@ -19,7 +19,7 @@ struct bch_subvolume { * This is _not_ necessarily the subvolume of the directory containing * this subvolume: */ - __le32 parent; + __le32 creation_parent; __le32 pad; bch_le128 otime; }; From 58675aa7f8cb67b2547f51d0eef8803a46940602 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Feb 2024 19:52:37 -0500 Subject: [PATCH 0539/1406] bcachefs: Fix path where dirent -> subvol missing and we don't fix Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 15da85629c6f4b..60de75a8ee4f5b 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1768,10 +1768,14 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if (fsck_err_on(ret, c, dirent_to_missing_subvol, - "dirent points to missing subvolume\n%s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) - return __remove_dirent(trans, d.k->p); + if (ret) { + if (fsck_err(c, dirent_to_missing_subvol, + "dirent points to missing subvolume\n%s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) + return __remove_dirent(trans, d.k->p); + ret = 0; + goto out; + } ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); @@ -1793,6 +1797,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * target_snapshot); if (ret) return 
ret; +out: err: fsck_err: printbuf_exit(&buf); From 6ce46919a562442109061772b65dff75a815c799 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Feb 2024 22:52:40 -0500 Subject: [PATCH 0540/1406] bcachefs: Pass inode bkey to check_path() prep work for improving logging/error messages Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 55 +++++++++++++++++++++------------------------ fs/bcachefs/inode.h | 14 ++++++++++++ 2 files changed, 40 insertions(+), 29 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 60de75a8ee4f5b..c93558ff7a4854 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2108,51 +2108,51 @@ static int path_down(struct bch_fs *c, pathbuf *p, * * XXX: we should also be verifying that inodes are in the right subvolumes */ -static int check_path(struct btree_trans *trans, - pathbuf *p, - struct bch_inode_unpacked *inode, - u32 snapshot) +static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k) { struct bch_fs *c = trans->c; + struct bch_inode_unpacked inode; + u32 snapshot = bch2_snapshot_equiv(c, inode_k.k->p.snapshot); int ret = 0; - snapshot = bch2_snapshot_equiv(c, snapshot); p->nr = 0; - while (!(inode->bi_inum == BCACHEFS_ROOT_INO && - inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) { + BUG_ON(bch2_inode_unpack(inode_k, &inode)); + + while (!(inode.bi_inum == BCACHEFS_ROOT_INO && + inode.bi_subvol == BCACHEFS_ROOT_SUBVOL)) { struct btree_iter dirent_iter; struct bkey_s_c_dirent d; u32 parent_snapshot = snapshot; - d = inode_get_dirent(trans, &dirent_iter, inode, &parent_snapshot); + d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); ret = bkey_err(d.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) break; - if (!ret && !dirent_points_to_inode(d, inode)) { + if (!ret && !dirent_points_to_inode(d, &inode)) { bch2_trans_iter_exit(trans, &dirent_iter); ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; } if (bch2_err_matches(ret, ENOENT)) { - if (fsck_err(c, inode_unreachable, + if (fsck_err(c, inode_unreachable, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", - inode->bi_inum, snapshot, - bch2_d_type_str(inode_d_type(inode)), - inode->bi_nlink, - inode->bi_dir, - inode->bi_dir_offset)) - ret = reattach_inode(trans, inode, snapshot); + inode.bi_inum, snapshot, + bch2_d_type_str(inode_d_type(&inode)), + inode.bi_nlink, + inode.bi_dir, + inode.bi_dir_offset)) + ret = reattach_inode(trans, &inode, snapshot); break; } bch2_trans_iter_exit(trans, &dirent_iter); - if (!S_ISDIR(inode->bi_mode)) + if (!S_ISDIR(inode.bi_mode)) break; - ret = path_down(c, p, inode->bi_inum, snapshot); + ret = path_down(c, p, inode.bi_inum, snapshot); if (ret) { bch_err(c, "memory allocation failure"); return ret; @@ -2160,7 +2160,7 @@ static int check_path(struct btree_trans *trans, snapshot = parent_snapshot; - ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); + ret = lookup_inode(trans, inode.bi_dir, &inode, &snapshot); if (ret) { /* Should have been caught in dirents pass */ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -2168,26 +2168,26 @@ static int check_path(struct btree_trans *trans, break; } - if (path_is_dup(p, inode->bi_inum, snapshot)) { + if (path_is_dup(p, inode.bi_inum, snapshot)) { /* XXX print path */ bch_err(c, "directory structure loop"); darray_for_each(*p, i) pr_err("%llu:%u", i->inum, i->snapshot); - pr_err("%llu:%u", inode->bi_inum, snapshot); + pr_err("%llu:%u", inode.bi_inum, snapshot); if (!fsck_err(c, dir_loop, "directory structure loop")) return 0; - 
ret = remove_backpointer(trans, inode); + ret = remove_backpointer(trans, &inode); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err_msg(c, ret, "removing dirent"); if (ret) break; - ret = reattach_inode(trans, inode, snapshot); + ret = reattach_inode(trans, &inode, snapshot); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_msg(c, ret, "reattaching inode %llu", inode->bi_inum); + bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); break; } } @@ -2203,7 +2203,6 @@ static int check_path(struct btree_trans *trans, */ int bch2_check_directory_structure(struct bch_fs *c) { - struct bch_inode_unpacked u; pathbuf path = { 0, }; int ret; @@ -2216,12 +2215,10 @@ int bch2_check_directory_structure(struct bch_fs *c) if (!bkey_is_inode(k.k)) continue; - BUG_ON(bch2_inode_unpack(k, &u)); - - if (u.bi_flags & BCH_INODE_unlinked) + if (bch2_inode_flags(k) & BCH_INODE_unlinked) continue; - check_path(trans, &path, &u, iter.pos.snapshot); + check_path(trans, &path, k); }))); darray_exit(&path); diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 9a9353c001c2a5..056298050550f9 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -177,6 +177,20 @@ static inline u8 inode_d_type(struct bch_inode_unpacked *inode) return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode); } +static inline u32 bch2_inode_flags(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_inode: + return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); + case KEY_TYPE_inode_v2: + return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); + case KEY_TYPE_inode_v3: + return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); + default: + return 0; + } +} + /* i_nlink: */ static inline unsigned nlink_bias(umode_t mode) From 21ac914de5098bb51491a0eac6fa3c03ce1fef66 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Feb 2024 23:08:21 -0500 Subject: [PATCH 0541/1406] bcachefs: check_path() now prints full inode when reattaching Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index c93558ff7a4854..ce4d556198e3da 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2111,7 +2111,9 @@ static int path_down(struct bch_fs *c, pathbuf *p, static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k) { struct bch_fs *c = trans->c; + struct btree_iter inode_iter = {}; struct bch_inode_unpacked inode; + struct printbuf buf = PRINTBUF; u32 snapshot = bch2_snapshot_equiv(c, inode_k.k->p.snapshot); int ret = 0; @@ -2137,14 +2139,12 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino if (bch2_err_matches(ret, ENOENT)) { if (fsck_err(c, inode_unreachable, - "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", - inode.bi_inum, snapshot, - bch2_d_type_str(inode_d_type(&inode)), - inode.bi_nlink, - inode.bi_dir, - inode.bi_dir_offset)) + "unreachable inode\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, inode_k), + buf.buf))) ret = reattach_inode(trans, &inode, snapshot); - break; + goto out; } bch2_trans_iter_exit(trans, &dirent_iter); @@ -2160,7 +2160,12 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino snapshot = parent_snapshot; - ret = lookup_inode(trans, inode.bi_dir, &inode, &snapshot); + bch2_trans_iter_exit(trans, &inode_iter); + inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, + 
SPOS(0, inode.bi_dir, snapshot), 0); + ret = bkey_err(inode_k) ?: + !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode + : bch2_inode_unpack(inode_k, &inode); if (ret) { /* Should have been caught in dirents pass */ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -2168,6 +2173,8 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino break; } + snapshot = inode_k.k->p.snapshot; + if (path_is_dup(p, inode.bi_inum, snapshot)) { /* XXX print path */ bch_err(c, "directory structure loop"); @@ -2191,7 +2198,10 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino break; } } +out: fsck_err: + bch2_trans_iter_exit(trans, &inode_iter); + printbuf_exit(&buf); bch_err_fn(c, ret); return ret; } From 7b3056905ac92117a346e80c8557d1e0e2a44ca5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Feb 2024 16:04:50 -0500 Subject: [PATCH 0542/1406] bcachefs: Correctly reattach subvolumes Subvolumes need special handling to reattach - we always reattach them in the root subvolume's lost+found, and they need a slightly different kind of dirent. Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 8 ++++---- fs/bcachefs/dirent.h | 2 +- fs/bcachefs/fsck.c | 28 +++++++++++++++++++++++----- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 97773cffccae8d..52b350f8a3f108 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -201,17 +201,17 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, } int bch2_dirent_create_snapshot(struct btree_trans *trans, - u64 dir, u32 snapshot, + u32 dir_subvol, u64 dir, u32 snapshot, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, bch_str_hash_flags_t str_hash_flags) { - subvol_inum zero_inum = { 0 }; + subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir }; struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, zero_inum, type, name, dst_inum); + dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -220,7 +220,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, dirent->k.p.snapshot = snapshot; ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, - zero_inum, snapshot, + dir_inum, snapshot, &dirent->k_i, str_hash_flags, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); *dir_offset = dirent->k.p.offset; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index f1dd7208a58e05..34cb8e18eaf897 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -35,7 +35,7 @@ static inline unsigned dirent_val_u64s(unsigned len) int bch2_dirent_read_target(struct btree_trans *, subvol_inum, struct bkey_s_c_dirent, subvol_inum *); -int bch2_dirent_create_snapshot(struct btree_trans *, u64, u32, +int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, bch_str_hash_flags_t); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index ce4d556198e3da..809b611e80ea8a 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -252,7 +252,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, goto err; ret = bch2_dirent_create_snapshot(trans, - root_inode.bi_inum, snapshot, &root_hash_info, + 0, root_inode.bi_inum, snapshot, &root_hash_info, mode_to_type(lostfound->bi_mode), &lostfound_str, lostfound->bi_inum, @@ -275,9 +275,24 @@ static int 
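/*
 * Summary of the change below: a subvolume root is reattached under the
 * root subvolume's lost+found with a "subvol-%u" name and a DT_SUBVOL
 * dirent, while an ordinary inode still lands in its own snapshot's
 * lost+found under its inode number.
 */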
reattach_inode(struct btree_trans *trans, char name_buf[20]; struct qstr name; u64 dir_offset = 0; + u32 dirent_snapshot = inode_snapshot; int ret; - ret = lookup_lostfound(trans, inode_snapshot, &lostfound); + if (inode->bi_subvol) { + inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; + + u64 root_inum; + ret = subvol_lookup(trans, inode->bi_parent_subvol, + &dirent_snapshot, &root_inum); + if (ret) + return ret; + + snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol); + } else { + snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); + } + + ret = lookup_lostfound(trans, dirent_snapshot, &lostfound); if (ret) return ret; @@ -291,14 +306,16 @@ static int reattach_inode(struct btree_trans *trans, dir_hash = bch2_hash_info_init(trans->c, &lostfound); - snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); name = (struct qstr) QSTR(name_buf); ret = bch2_dirent_create_snapshot(trans, - lostfound.bi_inum, inode_snapshot, + inode->bi_parent_subvol, lostfound.bi_inum, + dirent_snapshot, &dir_hash, inode_d_type(inode), - &name, inode->bi_inum, &dir_offset, + &name, + inode->bi_subvol ?: inode->bi_inum, + &dir_offset, BCH_HASH_SET_MUST_CREATE); if (ret) return ret; @@ -2138,6 +2155,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino } if (bch2_err_matches(ret, ENOENT)) { + ret = 0; if (fsck_err(c, inode_unreachable, "unreachable inode\n%s", (printbuf_reset(&buf), From 021038f00806283f0683547c08f5fbbb18418c55 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Feb 2024 19:10:19 -0500 Subject: [PATCH 0543/1406] bcachefs: bch2_btree_bit_mod -> bch2_btree_bit_mod_buffered Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.c | 4 ++-- fs/bcachefs/btree_update.h | 4 ++-- fs/bcachefs/buckets.c | 3 ++- fs/bcachefs/inode.c | 5 +++-- fs/bcachefs/lru.c | 4 ++-- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index e5193116b092f6..badaa479cc6754 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -787,8 +787,8 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, return ret; } -int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, - struct bpos pos, bool set) +int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, + struct bpos pos, bool set) { struct bkey_i k; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index b9382b7b288b6a..75ffd82e0fc493 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -62,12 +62,12 @@ int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, unsigned, u64 *); -int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); +int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool); static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans, enum btree_id btree, struct bpos pos) { - return bch2_btree_bit_mod(trans, btree, pos, false); + return bch2_btree_bit_mod_buffered(trans, btree, pos, false); } int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 7dca10ba70d253..c2f46b267b3ad5 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1053,7 +1053,8 @@ int bch2_trigger_extent(struct btree_trans *trans, (int) bch2_bkey_needs_rebalance(c, old); if (mod) { - int 
ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new.k->p, mod > 0); + int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, + new.k->p, mod > 0); if (ret) return ret; } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index dbe37ccc751958..414aebe17fd335 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -620,7 +620,8 @@ int bch2_trigger_inode(struct btree_trans *trans, bool old_deleted = bkey_is_deleted_inode(old); bool new_deleted = bkey_is_deleted_inode(new.s_c); if (old_deleted != new_deleted) { - int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new.k->p, new_deleted); + int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, + new.k->p, new_deleted); if (ret) return ret; } @@ -1169,7 +1170,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, bch2_trans_iter_exit(trans, &inode_iter); return ret; delete: - ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false); + ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false); goto out; } diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 7a4ca5a28b3eac..ed7577cdb2124c 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -44,8 +44,8 @@ static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time, bool set) { return time - ? bch2_btree_bit_mod(trans, BTREE_ID_lru, - lru_pos(lru_id, dev_bucket, time), set) + ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, + lru_pos(lru_id, dev_bucket, time), set) : 0; } From 6a72081fb875d341f871d1d38f756726d512a342 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Feb 2024 19:23:56 -0500 Subject: [PATCH 0544/1406] bcachefs: bch2_btree_bit_mod() Provide a non-write buffer version of bch2_btree_bit_mod_buffered(), for the subvolume children btree. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.c | 21 +++++++++++++++++++++ fs/bcachefs/btree_update.h | 1 + 2 files changed, 22 insertions(+) diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index badaa479cc6754..cbb7cf21da5be1 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -787,6 +787,27 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, return ret; } +int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, + struct bpos pos, bool set) +{ + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); + int ret = PTR_ERR_OR_ZERO(k); + if (ret) + return ret; + + bkey_init(&k->k); + k->k.type = set ? 
KEY_TYPE_set : KEY_TYPE_deleted; + k->k.p = pos; + + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, 0); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, struct bpos pos, bool set) { diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 75ffd82e0fc493..cc7c53e83f89dd 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -62,6 +62,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, unsigned, u64 *); +int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool); static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans, From 6180c292cc86b347b62022e37a7f55e9619ec677 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Feb 2024 18:39:42 -0500 Subject: [PATCH 0545/1406] bcachefs: bch_subvolume::fs_path_parent Record the filesystem path hierarchy for subvolumes in bch_subvolume Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/fs-common.c | 32 ++++++++++++++++++++++++++++++++ fs/bcachefs/fsck.c | 33 +++++++++++++++++++++++++------ fs/bcachefs/sb-downgrade.c | 5 ++++- fs/bcachefs/sb-errors_types.h | 4 +++- fs/bcachefs/subvolume.c | 23 +++++++++++++++++++++-- fs/bcachefs/subvolume.h | 3 +-- fs/bcachefs/subvolume_format.h | 2 +- 8 files changed, 91 insertions(+), 14 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 14f613617913e1..772eff5555f716 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -840,7 +840,8 @@ struct bch_sb_field_downgrade { x(snapshot_skiplists, BCH_VERSION(1, 1)) \ x(deleted_inodes, BCH_VERSION(1, 2)) \ x(rebalance_work, BCH_VERSION(1, 3)) \ - x(member_seq, BCH_VERSION(1, 4)) + x(member_seq, BCH_VERSION(1, 4)) \ + x(subvolume_fs_parent, BCH_VERSION(1, 5)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 523507e38887bf..2aa3881105972b 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -107,6 +107,7 @@ int bch2_create_trans(struct btree_trans *trans, u32 new_subvol, dir_snapshot; ret = bch2_subvolume_create(trans, new_inode->bi_inum, + dir.subvol, snapshot_src.subvol, &new_subvol, &snapshot, (flags & BCH_CREATE_SNAPSHOT_RO) != 0); @@ -349,6 +350,22 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, return ret; } +static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent) +{ + struct btree_iter iter; + struct bkey_i_subvolume *s = + bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, subvol), + BTREE_ITER_CACHED, subvolume); + int ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + s->v.fs_path_parent = cpu_to_le32(new_parent); + bch2_trans_iter_exit(trans, &iter); + return 0; +} + int bch2_rename_trans(struct btree_trans *trans, subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u, subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u, @@ -410,6 +427,21 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } + if (src_inode_u->bi_subvol && + dst_dir.subvol != src_inode_u->bi_parent_subvol) { + ret =
subvol_update_parent(trans, src_inode_u->bi_subvol, dst_dir.subvol); + if (ret) + goto err; + } + + if (mode == BCH_RENAME_EXCHANGE && + dst_inode_u->bi_subvol && + src_dir.subvol != dst_inode_u->bi_parent_subvol) { + ret = subvol_update_parent(trans, dst_inode_u->bi_subvol, src_dir.subvol); + if (ret) + goto err; + } + /* Can't move across subvolumes, unless it's a subvolume root: */ if (src_dir.subvol != dst_dir.subvol && (!src_inode_u->bi_subvol || diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 809b611e80ea8a..3f74b6769a3838 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1747,11 +1747,12 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * struct bkey_s_c_dirent d) { struct bch_fs *c = trans->c; + struct btree_iter subvol_iter = {}; struct bch_inode_unpacked subvol_root; u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); - u32 target_snapshot, parent_snapshot; - u64 target_inum, parent_inum; + u32 parent_snapshot; + u64 parent_inum; struct printbuf buf = PRINTBUF; int ret = 0; @@ -1780,8 +1781,11 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol); } - ret = subvol_lookup(trans, target_subvol, - &target_snapshot, &target_inum); + struct bkey_s_c_subvolume s = + bch2_bkey_get_iter_typed(trans, &subvol_iter, + BTREE_ID_subvolumes, POS(0, target_subvol), + 0, subvolume); + ret = bkey_err(s.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; @@ -1794,8 +1798,24 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * goto out; } - ret = lookup_inode(trans, target_inum, - &subvol_root, &target_snapshot); + if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol, + c, subvol_fs_path_parent_wrong, + "subvol with wrong fs_path_parent, should be %u\n%s", + parent_subvol, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + struct bkey_i_subvolume *n = + bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + n->v.fs_path_parent = cpu_to_le32(parent_subvol); + } + + u64 target_inum = le64_to_cpu(s.v->inode); + u32 target_snapshot = le32_to_cpu(s.v->snapshot); + + ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; @@ -1817,6 +1837,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * out: err: fsck_err: + bch2_trans_iter_exit(trans, &subvol_iter); printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 626eaaea5b01d7..4d49037a0bdd00 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -46,7 +46,10 @@ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \ x(rebalance_work, \ - BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) + BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \ + x(subvolume_fs_parent, \ + BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \ + BCH_FSCK_ERR_subvol_fs_path_parent_wrong) #define DOWNGRADE_TABLE() diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 833555676f76c4..dbbe2b7ce79981 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -258,7 +258,9 @@ x(inode_points_to_wrong_dirent, 250) \ x(inode_bi_parent_nonzero, 251) \ x(dirent_to_missing_parent_subvol, 252) \ -
x(dirent_not_visible_in_parent_subvol, 253) + x(dirent_not_visible_in_parent_subvol, 253) \ + x(subvol_fs_path_parent_wrong, 254) \ + x(subvol_root_fs_path_parent_nonzero, 255) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index a0be103b48fe03..d365e84367a3d8 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -20,6 +20,7 @@ static int check_subvol(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bkey_s_c_subvolume subvol; struct bch_snapshot snapshot; + struct printbuf buf = PRINTBUF; unsigned snapid; int ret = 0; @@ -42,6 +43,20 @@ static int check_subvol(struct btree_trans *trans, return ret ?: -BCH_ERR_transaction_restart_nested; } + if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL && + subvol.v->fs_path_parent, + c, subvol_root_fs_path_parent_nonzero, + "root subvolume has nonzero fs_path_parent\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + struct bkey_i_subvolume *n = + bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + n->v.fs_path_parent = 0; + } + struct bch_inode_unpacked inode; struct btree_iter inode_iter = {}; ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode, @@ -102,9 +117,9 @@ static int check_subvol(struct btree_trans *trans, SET_BCH_SUBVOLUME_SNAP(&s->v, true); } } - err: fsck_err: + printbuf_exit(&buf); return ret; } @@ -143,8 +158,10 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, le64_to_cpu(s.v->inode), le32_to_cpu(s.v->snapshot)); - if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) + if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) { prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent)); + prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent)); + } } static __always_inline int @@ -391,6 +408,7 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) } int bch2_subvolume_create(struct btree_trans *trans, u64 inode, + u32 parent_subvolid, u32 src_subvolid, u32 *new_subvolid, u32 *new_snapshotid, @@ -451,6 +469,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); new_subvol->v.inode = cpu_to_le64(inode); new_subvol->v.creation_parent = cpu_to_le32(src_subvolid); + new_subvol->v.fs_path_parent = cpu_to_le32(parent_subvolid); new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); new_subvol->v.otime.hi = 0; diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 3ca1d183369c5f..4c9825e9bd1873 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -29,8 +29,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *); void bch2_delete_dead_snapshots_async(struct bch_fs *); int bch2_subvolume_unlink(struct btree_trans *, u32); -int bch2_subvolume_create(struct btree_trans *, u64, u32, - u32 *, u32 *, bool); +int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool); int bch2_fs_subvolumes_init(struct bch_fs *); diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h index b81cf0c6119d87..e029df7ba89f52 100644 --- a/fs/bcachefs/subvolume_format.h +++ b/fs/bcachefs/subvolume_format.h @@ -20,7 +20,7 @@ struct bch_subvolume { * this subvolume: */ __le32 creation_parent; - __le32 pad; + __le32 fs_path_parent; bch_le128 otime; }; From 519192e6b19ff8b2551b2dcd25e2fe089ee357e2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet 
Date: Sun, 21 Jan 2024 06:00:07 -0500 Subject: [PATCH 0546/1406] bcachefs: BTREE_ID_subvolume_children Add a btree to record parent -> child subvolume relationships, according to the filesystem hierarchy. The subvolume_children btree is a bitset btree: if a bit is set at pos p, that means p.offset is a child of subvolume p.inode. This will be used for efficiently listing subvolumes, as well as recursive deletion. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/bcachefs_format.h | 7 ++- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/recovery_types.h | 1 + fs/bcachefs/sb-downgrade.c | 5 +- fs/bcachefs/sb-errors_types.h | 4 +- fs/bcachefs/subvolume.c | 98 +++++++++++++++++++++++++++++++++++ fs/bcachefs/subvolume.h | 4 ++ 8 files changed, 117 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 5f801256e8740a..3b48c5e133b5b5 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -505,6 +505,7 @@ enum gc_phase { GC_PHASE_BTREE_deleted_inodes, GC_PHASE_BTREE_logged_ops, GC_PHASE_BTREE_rebalance_work, + GC_PHASE_BTREE_subvolume_children, GC_PHASE_PENDING_DELETE, }; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 772eff5555f716..1bb24aa7352800 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -841,7 +841,8 @@ struct bch_sb_field_downgrade { x(deleted_inodes, BCH_VERSION(1, 2)) \ x(rebalance_work, BCH_VERSION(1, 3)) \ x(member_seq, BCH_VERSION(1, 4)) \ - x(subvolume_fs_parent, BCH_VERSION(1, 5)) + x(subvolume_fs_parent, BCH_VERSION(1, 5)) \ + x(btree_subvolume_children, BCH_VERSION(1, 6)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1489,7 +1490,9 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_logged_op_truncate)| \ BIT_ULL(KEY_TYPE_logged_op_finsert)) \ x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ - BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) + BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ + x(subvolume_children, 19, 0, \ + BIT_ULL(KEY_TYPE_set)) enum btree_id { #define x(name, nr, ...)
BTREE_ID_##name = nr, diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 95783ad03fb392..f163257724a9bf 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -654,6 +654,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type); BIT_ULL(BKEY_TYPE_inodes)| \ BIT_ULL(BKEY_TYPE_stripes)| \ BIT_ULL(BKEY_TYPE_reflink)| \ + BIT_ULL(BKEY_TYPE_subvolumes)| \ BIT_ULL(BKEY_TYPE_btree)) #define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \ diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h index fa0c8efd2a1b42..f0fc1dbb723929 100644 --- a/fs/bcachefs/recovery_types.h +++ b/fs/bcachefs/recovery_types.h @@ -34,6 +34,7 @@ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ + x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ x(fs_upgrade_for_subvolumes, 22, 0) \ x(resume_logged_ops, 23, PASS_ALWAYS) \ diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 4d49037a0bdd00..3337419faeff3b 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -49,7 +49,10 @@ BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \ x(subvolume_fs_parent, \ BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \ - BCH_FSCK_ERR_subvol_fs_path_parent_wrong) + BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \ + x(btree_subvolume_children, \ + BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \ + BCH_FSCK_ERR_subvol_children_not_set) #define DOWNGRADE_TABLE() diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index dbbe2b7ce79981..1530bd35b94572 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -260,7 +260,9 @@ x(dirent_to_missing_parent_subvol, 252) \ x(dirent_not_visible_in_parent_subvol, 253) \ x(subvol_fs_path_parent_wrong, 254) \ - x(subvol_root_fs_path_parent_nonzero, 255) + x(subvol_root_fs_path_parent_nonzero, 255) \ + x(subvol_children_not_set, 256) \ + x(subvol_children_bad, 257) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index d365e84367a3d8..68be3a450ca12f 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -13,12 +13,24 @@ static int bch2_subvolume_delete(struct btree_trans *, u32); +static struct bpos subvolume_children_pos(struct bkey_s_c k) +{ + if (k.k->type != KEY_TYPE_subvolume) + return POS_MIN; + + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + if (!s.v->fs_path_parent) + return POS_MIN; + return POS(le32_to_cpu(s.v->fs_path_parent), s.k->p.offset); +} + static int check_subvol(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { struct bch_fs *c = trans->c; struct bkey_s_c_subvolume subvol; + struct btree_iter subvol_children_iter = {}; struct bch_snapshot snapshot; struct printbuf buf = PRINTBUF; unsigned snapid; @@ -57,6 +69,28 @@ static int check_subvol(struct btree_trans *trans, n->v.fs_path_parent = 0; } + if (subvol.v->fs_path_parent) { + struct bpos pos = subvolume_children_pos(k); + + struct bkey_s_c subvol_children_k = + bch2_bkey_get_iter(trans, &subvol_children_iter, + BTREE_ID_subvolume_children, pos, 0); + ret = bkey_err(subvol_children_k); + if (ret) + goto err; + + if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set, + c, subvol_children_not_set, + "subvolume not set in subvolume_children btree at %llu:%llu\n%s", + pos.inode, pos.offset, + (printbuf_reset(&buf), + 
bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true); + if (ret) + goto err; + } + } + struct bch_inode_unpacked inode; struct btree_iter inode_iter = {}; ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode, @@ -119,6 +153,7 @@ static int check_subvol(struct btree_trans *trans, } err: fsck_err: + bch2_trans_iter_exit(trans, &subvol_children_iter); printbuf_exit(&buf); return ret; } @@ -134,6 +169,42 @@ int bch2_check_subvols(struct bch_fs *c) return ret; } +static int check_subvol_child(struct btree_trans *trans, + struct btree_iter *child_iter, + struct bkey_s_c child_k) +{ + struct bch_fs *c = trans->c; + struct bch_subvolume s; + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset), + 0, subvolume, &s); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret || + le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode, + c, subvol_children_bad, + "incorrect entry in subvolume_children btree %llu:%llu", + child_k.k->p.inode, child_k.k->p.offset)) { + ret = bch2_btree_delete_at(trans, child_iter, 0); + if (ret) + goto err; + } +err: +fsck_err: + return ret; +} + +int bch2_check_subvol_children(struct bch_fs *c) +{ + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_subvol_child(trans, &iter, k))); + bch_err_fn(c, ret); + return 0; +} + /* Subvolumes: */ int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k, @@ -164,6 +235,33 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, } } +static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set) +{ + return !bpos_eq(pos, POS_MIN) + ? 
bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, set) + : 0; +} + +int bch2_subvolume_trigger(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) +{ + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + struct bpos children_pos_old = subvolume_children_pos(old); + struct bpos children_pos_new = subvolume_children_pos(new.s_c); + + if (!bpos_eq(children_pos_old, children_pos_new)) { + int ret = subvolume_children_mod(trans, children_pos_old, false) ?: + subvolume_children_mod(trans, children_pos_new, true); + if (ret) + return ret; + } + } + + return 0; +} + static __always_inline int bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, bool inconsistent_if_not_found, diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 4c9825e9bd1873..b6f1dfca7b80c4 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -7,14 +7,18 @@ enum bkey_invalid_flags; int bch2_check_subvols(struct bch_fs *); +int bch2_check_subvol_children(struct bch_fs *); int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ .key_invalid = bch2_subvolume_invalid, \ .val_to_text = bch2_subvolume_to_text, \ + .trigger = bch2_subvolume_trigger, \ .min_val_size = 16, \ }) From e258b85f1c3c9122fe4592a0cf99669c60df35e1 Mon Sep 17 00:00:00 2001 From: Tim Schumacher Date: Fri, 26 Jan 2024 17:25:23 +0100 Subject: [PATCH 0547/1406] efivarfs: Request at most 512 bytes for variable names Work around a quirk in a few old (2011-ish) UEFI implementations, where a call to `GetNextVariableName` with a buffer size larger than 512 bytes will always return `EFI_INVALID_PARAMETER`. There is some lore around EFI variable names being up to 1024 bytes in size, but this has no basis in the UEFI specification, and the upper bounds are typically platform specific, and apply to the entire variable (name plus payload). Given that Linux does not permit creating files with names longer than NAME_MAX (255) bytes, 512 bytes (== 256 UTF-16 characters) is a reasonable limit. Cc: # 6.1+ Signed-off-by: Tim Schumacher Signed-off-by: Ard Biesheuvel --- fs/efivarfs/vars.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c index 114ff0fd4e5573..2ebf74169072aa 100644 --- a/fs/efivarfs/vars.c +++ b/fs/efivarfs/vars.c @@ -373,7 +373,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, struct list_head *), void *data, bool duplicates, struct list_head *head) { - unsigned long variable_name_size = 1024; + unsigned long variable_name_size = 512; efi_char16_t *variable_name; efi_status_t status; efi_guid_t vendor_guid; @@ -390,12 +390,13 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, goto free; /* - * Per EFI spec, the maximum storage allocated for both - * the variable name and variable data is 1024 bytes. + * A small set of old UEFI implementations reject sizes + * above a certain threshold, the lowest seen in the wild + * is 512.
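 *
 * Editor's illustration only, not part of this patch: on such quirky
 * firmware, a hypothetical probe sequence would behave roughly as:
 *
 *   len = 1024; GetNextVariableName(&len, name, &guid);
 *       -> EFI_INVALID_PARAMETER   (len > 512 is always rejected)
 *   len = 512;  GetNextVariableName(&len, name, &guid);
 *       -> EFI_SUCCESS, or EFI_BUFFER_TOO_SMALL with the required len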
*/ do { - variable_name_size = 1024; + variable_name_size = 512; status = efivar_get_next_variable(&variable_name_size, variable_name, @@ -432,8 +433,13 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, break; case EFI_NOT_FOUND: break; + case EFI_BUFFER_TOO_SMALL: + pr_warn("efivars: Variable name size exceeds maximum (%lu > 512)\n", + variable_name_size); + status = EFI_NOT_FOUND; + break; default: - printk(KERN_WARNING "efivars: get_next_variable: status=%lx\n", + pr_warn("efivars: get_next_variable: status=%lx\n", status); status = EFI_NOT_FOUND; break; From 4f86ec909904869fc8b122e8471dd1798b24df7f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Feb 2024 13:08:22 -0800 Subject: [PATCH 0548/1406] rcu-tasks: Add data to eliminate RCU-tasks/do_exit() deadlocks Holding a mutex across synchronize_rcu_tasks() and acquiring that same mutex in code called from do_exit() after its call to exit_tasks_rcu_start() but before its call to exit_tasks_rcu_stop() results in deadlock. This is by design, because tasks that are far enough into do_exit() are no longer present on the tasks list, making it a bit difficult for RCU Tasks to find them, let alone wait on them to do a voluntary context switch. However, such deadlocks are becoming more frequent. In addition, lockdep currently does not detect such deadlocks and they can be difficult to reproduce. In addition, if a task voluntarily context switches during that time (for example, if it blocks acquiring a mutex), then this task is in an RCU Tasks quiescent state. And with some adjustments, RCU Tasks could just as well take advantage of that fact. This commit therefore adds the data structures that will be needed to rely on these quiescent states and to eliminate these deadlocks. Link: https://lore.kernel.org/all/20240118021842.290665-1-chenzhongjin@huawei.com/ Reported-by: Chen Zhongjin Reported-by: Yang Jihong Signed-off-by: Paul E. McKenney Tested-by: Yang Jihong Tested-by: Chen Zhongjin Signed-off-by: Boqun Feng --- include/linux/sched.h | 2 ++ kernel/rcu/tasks.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index ffe8f618ab8697..5eeebed2dd9ba2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -858,6 +858,8 @@ struct task_struct { u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; struct list_head rcu_tasks_holdout_list; + int rcu_tasks_exit_cpu; + struct list_head rcu_tasks_exit_list; #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 732ad5b39946a5..b7d5f27570532e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -32,6 +32,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @rtp_irq_work: IRQ work queue for deferred wakeups. * @barrier_q_head: RCU callback for barrier operation. * @rtp_blkd_tasks: List of tasks blocked as readers. + * @rtp_exit_list: List of tasks in the latter portion of do_exit(). * @cpu: CPU number corresponding to this entry. * @rtpp: Pointer to the rcu_tasks structure. */ @@ -46,6 +47,7 @@ struct rcu_tasks_percpu { struct irq_work rtp_irq_work; struct rcu_head barrier_q_head; struct list_head rtp_blkd_tasks; + struct list_head rtp_exit_list; int cpu; struct rcu_tasks *rtpp; }; From a9c5e47e6864a41d11c340440af3db26ed4373c2 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 5 Feb 2024 13:10:19 -0800 Subject: [PATCH 0549/1406] rcu-tasks: Initialize data to eliminate RCU-tasks/do_exit() deadlocks Holding a mutex across synchronize_rcu_tasks() and acquiring that same mutex in code called from do_exit() after its call to exit_tasks_rcu_start() but before its call to exit_tasks_rcu_stop() results in deadlock. This is by design, because tasks that are far enough into do_exit() are no longer present on the tasks list, making it a bit difficult for RCU Tasks to find them, let alone wait on them to do a voluntary context switch. However, such deadlocks are becoming more frequent. In addition, lockdep currently does not detect such deadlocks and they can be difficult to reproduce. In addition, if a task voluntarily context switches during that time (for example, if it blocks acquiring a mutex), then this task is in an RCU Tasks quiescent state. And with some adjustments, RCU Tasks could just as well take advantage of that fact. This commit therefore initializes the data structures that will be needed to rely on these quiescent states and to eliminate these deadlocks. Link: https://lore.kernel.org/all/20240118021842.290665-1-chenzhongjin@huawei.com/ Reported-by: Chen Zhongjin Reported-by: Yang Jihong Signed-off-by: Paul E. McKenney Tested-by: Yang Jihong Tested-by: Chen Zhongjin Signed-off-by: Boqun Feng --- init/init_task.c | 1 + kernel/fork.c | 1 + kernel/rcu/tasks.h | 2 ++ 3 files changed, 4 insertions(+) diff --git a/init/init_task.c b/init/init_task.c index 7ecb458eb3da60..4daee6d761c86c 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -147,6 +147,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { .rcu_tasks_holdout = false, .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list), .rcu_tasks_idle_cpu = -1, + .rcu_tasks_exit_list = LIST_HEAD_INIT(init_task.rcu_tasks_exit_list), #endif #ifdef CONFIG_TASKS_TRACE_RCU .trc_reader_nesting = 0, diff --git a/kernel/fork.c b/kernel/fork.c index 0d944e92a43ffa..af7203be1d2d19 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1976,6 +1976,7 @@ static inline void rcu_copy_process(struct task_struct *p) p->rcu_tasks_holdout = false; INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); p->rcu_tasks_idle_cpu = -1; + INIT_LIST_HEAD(&p->rcu_tasks_exit_list); #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU p->trc_reader_nesting = 0; diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index b7d5f27570532e..4a5d562e318927 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -277,6 +277,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp) rtpcp->rtpp = rtp; if (!rtpcp->rtp_blkd_tasks.next) INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); + if (!rtpcp->rtp_exit_list.next) + INIT_LIST_HEAD(&rtpcp->rtp_exit_list); } pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name, From 599014e2d604771195c586b04e63cf6e94c28c88 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 2 Feb 2024 11:28:45 -0800 Subject: [PATCH 0550/1406] rcu-tasks: Maintain lists to eliminate RCU-tasks/do_exit() deadlocks This commit continues the elimination of deadlocks involving do_exit() and RCU tasks by causing exit_tasks_rcu_start() to add the current task to a per-CPU list and causing exit_tasks_rcu_stop() to remove the current task from whatever list it is on. These lists will be used to track tasks that are exiting, while still accounting for any RCU-tasks quiescent states that these tasks pass though. [ paulmck: Apply Frederic Weisbecker feedback. 
] Link: https://lore.kernel.org/all/20240118021842.290665-1-chenzhongjin@huawei.com/ Reported-by: Chen Zhongjin Reported-by: Yang Jihong Signed-off-by: Paul E. McKenney Tested-by: Yang Jihong Tested-by: Chen Zhongjin Signed-off-by: Boqun Feng --- kernel/rcu/tasks.h | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 4a5d562e318927..68a8adf7de8e98 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1151,25 +1151,48 @@ struct task_struct *get_rcu_tasks_gp_kthread(void) EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread); /* - * Contribute to protect against tasklist scan blind spot while the - * task is exiting and may be removed from the tasklist. See - * corresponding synchronize_srcu() for further details. + * Protect against tasklist scan blind spot while the task is exiting and + * may be removed from the tasklist. Do this by adding the task to yet + * another list. + * + * Note that the task will remove itself from this list, so there is no + * need for get_task_struct(), except in the case where rcu_tasks_pertask() + * adds it to the holdout list, in which case rcu_tasks_pertask() supplies + * the needed get_task_struct(). */ -void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) +void exit_tasks_rcu_start(void) { - current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + struct task_struct *t = current; + + WARN_ON_ONCE(!list_empty(&t->rcu_tasks_exit_list)); + preempt_disable(); + rtpcp = this_cpu_ptr(rcu_tasks.rtpcpu); + t->rcu_tasks_exit_cpu = smp_processor_id(); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + if (!rtpcp->rtp_exit_list.next) + INIT_LIST_HEAD(&rtpcp->rtp_exit_list); + list_add(&t->rcu_tasks_exit_list, &rtpcp->rtp_exit_list); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + preempt_enable(); } /* - * Contribute to protect against tasklist scan blind spot while the - * task is exiting and may be removed from the tasklist. See - * corresponding synchronize_srcu() for further details. + * Remove the task from the "yet another list" because do_exit() is now + * non-preemptible, allowing synchronize_rcu() to wait beyond this point. */ -void exit_tasks_rcu_stop(void) __releases(&tasks_rcu_exit_srcu) +void exit_tasks_rcu_stop(void) { + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; struct task_struct *t = current; - __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx); + WARN_ON_ONCE(list_empty(&t->rcu_tasks_exit_list)); + rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, t->rcu_tasks_exit_cpu); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + list_del_init(&t->rcu_tasks_exit_list); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); } /* From 8668fc7236696b05c4f90fa6521abf19afb531b4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 2 Feb 2024 11:49:06 -0800 Subject: [PATCH 0551/1406] rcu-tasks: Eliminate deadlocks involving do_exit() and RCU tasks Holding a mutex across synchronize_rcu_tasks() and acquiring that same mutex in code called from do_exit() after its call to exit_tasks_rcu_start() but before its call to exit_tasks_rcu_stop() results in deadlock. This is by design, because tasks that are far enough into do_exit() are no longer present on the tasks list, making it a bit difficult for RCU Tasks to find them, let alone wait on them to do a voluntary context switch. However, such deadlocks are becoming more frequent. 
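The shape of the deadlock is easy to sketch. The following is an illustration only, with made-up names (demo_mutex and the two call paths are hypothetical placeholders, not code from this series):

	static DEFINE_MUTEX(demo_mutex);

	/* Updater: holds demo_mutex across a tasks-RCU grace period. */
	static void updater(void)
	{
		mutex_lock(&demo_mutex);
		synchronize_rcu_tasks();	/* waits on all tasks, including exiting ones */
		mutex_unlock(&demo_mutex);
	}

	/* Called from do_exit() between exit_tasks_rcu_start() and exit_tasks_rcu_stop(). */
	static void exiting_task_path(void)
	{
		mutex_lock(&demo_mutex);	/* blocks behind updater(), which in turn */
		mutex_unlock(&demo_mutex);	/* is waiting on this very task: deadlock */
	}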
In addition, lockdep currently does not detect such deadlocks and they can be difficult to reproduce. In addition, if a task voluntarily context switches during that time (for example, if it blocks acquiring a mutex), then this task is in an RCU Tasks quiescent state. And with some adjustments, RCU Tasks could just as well take advantage of that fact. This commit therefore eliminates these deadlock by replacing the SRCU-based wait for do_exit() completion with per-CPU lists of tasks currently exiting. A given task will be on one of these per-CPU lists for the same period of time that this task would previously have been in the previous SRCU read-side critical section. These lists enable RCU Tasks to find the tasks that have already been removed from the tasks list, but that must nevertheless be waited upon. The RCU Tasks grace period gathers any of these do_exit() tasks that it must wait on, and adds them to the list of holdouts. Per-CPU locking and get_task_struct() are used to synchronize addition to and removal from these lists. Link: https://lore.kernel.org/all/20240118021842.290665-1-chenzhongjin@huawei.com/ Reported-by: Chen Zhongjin Reported-by: Yang Jihong Signed-off-by: Paul E. McKenney Tested-by: Yang Jihong Tested-by: Chen Zhongjin Signed-off-by: Boqun Feng --- kernel/rcu/tasks.h | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 68a8adf7de8e98..4dc355b2ac2290 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -146,8 +146,6 @@ static struct rcu_tasks rt_name = \ } #ifdef CONFIG_TASKS_RCU -/* Track exiting tasks in order to allow them to be waited for. */ -DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); /* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */ static void tasks_rcu_exit_srcu_stall(struct timer_list *unused); @@ -855,10 +853,12 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // number of voluntary context switches, and add that task to the // holdout list. // rcu_tasks_postscan(): -// Invoke synchronize_srcu() to ensure that all tasks that were -// in the process of exiting (and which thus might not know to -// synchronize with this RCU Tasks grace period) have completed -// exiting. +// Gather per-CPU lists of tasks in do_exit() to ensure that all +// tasks that were in the process of exiting (and which thus might +// not know to synchronize with this RCU Tasks grace period) have +// completed exiting. The synchronize_rcu() in rcu_tasks_postgp() +// will take care of any tasks stuck in the non-preemptible region +// of do_exit() following its call to exit_tasks_rcu_stop(). // check_all_holdout_tasks(), repeatedly until holdout list is empty: // Scans the holdout list, attempting to identify a quiescent state // for each task on the list. If there is a quiescent state, the @@ -871,8 +871,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // with interrupts disabled. // // For each exiting task, the exit_tasks_rcu_start() and -// exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU -// read-side critical sections waited for by rcu_tasks_postscan(). +// exit_tasks_rcu_finish() functions add and remove, respectively, the +// current task to a per-CPU list of tasks that rcu_tasks_postscan() must +// wait on. This is necessary because rcu_tasks_postscan() must wait on +// tasks that have already been removed from the global list of tasks. 
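//
// Rough sketch of that handshake (names as used in this series; layout
// illustrative only):
//
//	exit_tasks_rcu_start():
//		rtpcp = this_cpu_ptr(rcu_tasks.rtpcpu);
//		raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
//		list_add(&current->rcu_tasks_exit_list, &rtpcp->rtp_exit_list);
//		raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
//
//	rcu_tasks_postscan():
//		for_each_possible_cpu(cpu)
//			scan that CPU's ->rtp_exit_list under the same lock,
//			treating each still-exiting task as a potential holdout.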
// // Pre-grace-period update-side code is ordered before the grace // via the raw_spin_lock.*rcu_node(). Pre-grace-period read-side code @@ -936,9 +938,13 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) } } +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks"); + /* Processing between scanning taskslist and draining the holdout list. */ static void rcu_tasks_postscan(struct list_head *hop) { + int cpu; int rtsi = READ_ONCE(rcu_task_stall_info); if (!IS_ENABLED(CONFIG_TINY_RCU)) { @@ -952,9 +958,9 @@ static void rcu_tasks_postscan(struct list_head *hop) * this, divide the fragile exit path part in two intersecting * read side critical sections: * - * 1) An _SRCU_ read side starting before calling exit_notify(), - * which may remove the task from the tasklist, and ending after - * the final preempt_disable() call in do_exit(). + * 1) A task_struct list addition before calling exit_notify(), + * which may remove the task from the tasklist, with the + * removal after the final preempt_disable() call in do_exit(). * * 2) An _RCU_ read side starting with the final preempt_disable() * call in do_exit() and ending with the final call to schedule() @@ -963,7 +969,17 @@ static void rcu_tasks_postscan(struct list_head *hop) * This handles the part 1). And postgp will handle part 2) with a * call to synchronize_rcu(). */ - synchronize_srcu(&tasks_rcu_exit_srcu); + + for_each_possible_cpu(cpu) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, cpu); + struct task_struct *t; + + raw_spin_lock_irq_rcu_node(rtpcp); + list_for_each_entry(t, &rtpcp->rtp_exit_list, rcu_tasks_exit_list) + if (list_empty(&t->rcu_tasks_holdout_list)) + rcu_tasks_pertask(t, hop); + raw_spin_unlock_irq_rcu_node(rtpcp); + } if (!IS_ENABLED(CONFIG_TINY_RCU)) del_timer_sync(&tasks_rcu_exit_srcu_stall_timer); @@ -1031,7 +1047,6 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp) * * In addition, this synchronize_rcu() waits for exiting tasks * to complete their final preempt_disable() region of execution, - * cleaning up after synchronize_srcu(&tasks_rcu_exit_srcu), * enforcing the whole region before tasklist removal until * the final schedule() with TASK_DEAD state to be an RCU TASKS * read side critical section. @@ -1039,9 +1054,6 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp) synchronize_rcu(); } -void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); -DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks"); - static void tasks_rcu_exit_srcu_stall(struct timer_list *unused) { #ifndef CONFIG_TINY_RCU From c484f412b31da1c1b5cfe58132e1e8a9d5c4bcd6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Feb 2024 06:10:26 -0800 Subject: [PATCH 0552/1406] rcu-tasks: Maintain real-time response in rcu_tasks_postscan() The current code will scan the entirety of each per-CPU list of exiting tasks in ->rtp_exit_list with interrupts disabled. This is normally just fine, because each CPU typically won't have very many tasks in this state. However, if a large number of tasks block late in do_exit(), these lists could be arbitrarily long. Low probability, perhaps, but it really could happen. This commit therefore occasionally re-enables interrupts while traversing these lists, inserting a dummy element to hold the current place in the list. 
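The placeholder idiom itself generalizes beyond RCU Tasks; here is a minimal stand-alone sketch with generic names (walk_with_pauses, head, and lock are placeholders, not the patch's code):

	#include <linux/list.h>
	#include <linux/sched.h>
	#include <linux/spinlock.h>

	/*
	 * Walk a lock-protected list, periodically dropping the lock. A bare
	 * list_head marks our place; per the reasoning above this is safe
	 * only if nothing else traverses the list while we pause.
	 */
	static void walk_with_pauses(struct list_head *head, spinlock_t *lock)
	{
		struct list_head cursor;
		struct list_head *p, *n;

		spin_lock_irq(lock);
		list_for_each_safe(p, n, head) {
			/* ... process list_entry(p, ...) here ... */

			list_add(&cursor, p);		/* hold our place after p */
			spin_unlock_irq(lock);
			cond_resched();			/* let other work run */
			spin_lock_irq(lock);
			n = cursor.next;		/* resume where we left off */
			list_del(&cursor);
		}
		spin_unlock_irq(lock);
	}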
In kernels built with CONFIG_PREEMPT_RT=y, this re-enabling happens after each list element is processed, otherwise every one-to-two jiffies. Signed-off-by: Paul E. McKenney Cc: Thomas Gleixner Cc: Sebastian Siewior Cc: Anna-Maria Behnsen Cc: Steven Rostedt Signed-off-by: Boqun Feng --- kernel/rcu/tasks.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 4dc355b2ac2290..866743e0796f47 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -971,13 +971,32 @@ static void rcu_tasks_postscan(struct list_head *hop) */ for_each_possible_cpu(cpu) { + unsigned long j = jiffies + 1; struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, cpu); struct task_struct *t; + struct task_struct *t1; + struct list_head tmp; raw_spin_lock_irq_rcu_node(rtpcp); - list_for_each_entry(t, &rtpcp->rtp_exit_list, rcu_tasks_exit_list) + list_for_each_entry_safe(t, t1, &rtpcp->rtp_exit_list, rcu_tasks_exit_list) { if (list_empty(&t->rcu_tasks_holdout_list)) rcu_tasks_pertask(t, hop); + + // RT kernels need frequent pauses, otherwise + // pause at least once per pair of jiffies. + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && time_before(jiffies, j)) + continue; + + // Keep our place in the list while pausing. + // Nothing else traverses this list, so adding a + // bare list_head is OK. + list_add(&tmp, &t->rcu_tasks_exit_list); + raw_spin_unlock_irq_rcu_node(rtpcp); + cond_resched(); // For CONFIG_PREEMPT=n kernels + raw_spin_lock_irq_rcu_node(rtpcp); + list_del(&tmp); + j = jiffies + 1; + } raw_spin_unlock_irq_rcu_node(rtpcp); } From 54315b0dd6a1045ca924056379bcc0e1f6e43b87 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 25 Jan 2024 07:46:44 -0600 Subject: [PATCH 0553/1406] nvmem: fixed-cell: Simplify nested if/then schema There's no reason to have a nested if/then schema as checking for compatible being present and containing 'mac-base' can all be done in one 'if' schema. Signed-off-by: Rob Herring Signed-off-by: Srinivas Kandagatla --- .../bindings/nvmem/layouts/fixed-cell.yaml | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/Documentation/devicetree/bindings/nvmem/layouts/fixed-cell.yaml b/Documentation/devicetree/bindings/nvmem/layouts/fixed-cell.yaml index ac2381e6602790..8b3826243dddfc 100644 --- a/Documentation/devicetree/bindings/nvmem/layouts/fixed-cell.yaml +++ b/Documentation/devicetree/bindings/nvmem/layouts/fixed-cell.yaml @@ -36,20 +36,18 @@ properties: allOf: - if: + properties: + compatible: + contains: + const: mac-base required: [ compatible ] then: - if: - properties: - compatible: - contains: - const: mac-base - then: - properties: - "#nvmem-cell-cells": - description: The first argument is a MAC address offset. - const: 1 - required: - - "#nvmem-cell-cells" + properties: + "#nvmem-cell-cells": + description: The first argument is a MAC address offset. + const: 1 + required: + - "#nvmem-cell-cells" required: - reg From 2799a2abaf01df35ddf6dc5241a6bedbcf48072e Mon Sep 17 00:00:00 2001 From: William-tw Lin Date: Fri, 22 Dec 2023 16:07:39 +0800 Subject: [PATCH 0554/1406] nvmem: mtk-efuse: Register MediaTek socinfo driver from efuse The socinfo driver reads chip information from eFuses and does not need any devicetree node. Register it from mtk-efuse. While at it, also add the name for this driver's nvmem_config. 
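The general shape here, a parent driver spawning an auxiliary platform device that has no devicetree node of its own, is sketched below with placeholder names (parent_probe, parent_remove, "some-child-driver" are illustrative assumptions); the diff that follows applies this same pattern to mtk-efuse:

	#include <linux/err.h>
	#include <linux/platform_device.h>

	static int parent_probe(struct platform_device *pdev)
	{
		struct platform_device *child;

		/* No DT node needed: the child is instantiated by driver name alone. */
		child = platform_device_register_data(&pdev->dev, "some-child-driver",
						      PLATFORM_DEVID_AUTO, NULL, 0);
		if (IS_ERR(child))
			dev_info(&pdev->dev, "child device will be unavailable\n");

		platform_set_drvdata(pdev, child);
		return 0;
	}

	static void parent_remove(struct platform_device *pdev)
	{
		struct platform_device *child = platform_get_drvdata(pdev);

		if (!IS_ERR_OR_NULL(child))
			platform_device_unregister(child);
	}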
Signed-off-by: William-tw Lin Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Srinivas Kandagatla --- drivers/nvmem/mtk-efuse.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/nvmem/mtk-efuse.c b/drivers/nvmem/mtk-efuse.c index 84f05b40a4112e..f5bebcecf9bd31 100644 --- a/drivers/nvmem/mtk-efuse.c +++ b/drivers/nvmem/mtk-efuse.c @@ -68,6 +68,7 @@ static int mtk_efuse_probe(struct platform_device *pdev) struct nvmem_config econfig = {}; struct mtk_efuse_priv *priv; const struct mtk_efuse_pdata *pdata; + struct platform_device *socinfo; priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); if (!priv) @@ -85,11 +86,20 @@ static int mtk_efuse_probe(struct platform_device *pdev) econfig.size = resource_size(res); econfig.priv = priv; econfig.dev = dev; + econfig.name = "mtk-efuse"; if (pdata->uses_post_processing) econfig.fixup_dt_cell_info = &mtk_efuse_fixup_dt_cell_info; nvmem = devm_nvmem_register(dev, &econfig); + if (IS_ERR(nvmem)) + return PTR_ERR(nvmem); - return PTR_ERR_OR_ZERO(nvmem); + socinfo = platform_device_register_data(&pdev->dev, "mtk-socinfo", + PLATFORM_DEVID_AUTO, NULL, 0); + if (IS_ERR(socinfo)) + dev_info(dev, "MediaTek SoC Information will be unavailable\n"); + + platform_set_drvdata(pdev, socinfo); + return 0; } static const struct mtk_efuse_pdata mtk_mt8186_efuse_pdata = { @@ -108,8 +118,17 @@ static const struct of_device_id mtk_efuse_of_match[] = { }; MODULE_DEVICE_TABLE(of, mtk_efuse_of_match); +static void mtk_efuse_remove(struct platform_device *pdev) +{ + struct platform_device *socinfo = platform_get_drvdata(pdev); + + if (!IS_ERR_OR_NULL(socinfo)) + platform_device_unregister(socinfo); +} + static struct platform_driver mtk_efuse_driver = { .probe = mtk_efuse_probe, + .remove_new = mtk_efuse_remove, .driver = { .name = "mediatek,efuse", .of_match_table = mtk_efuse_of_match, From edf748c73b305dd3bea5ce3bb1f4774a44d0a4e1 Mon Sep 17 00:00:00 2001 From: Praveen Teja Kundanala Date: Mon, 8 Jan 2024 10:56:16 +0530 Subject: [PATCH 0555/1406] dt-bindings: nvmem: Convert xlnx,zynqmp-nvmem.txt to yaml Convert the xlnx,zynqmp-nvmem.txt to yaml. Signed-off-by: Praveen Teja Kundanala Reviewed-by: Krzysztof Kozlowski Signed-off-by: Srinivas Kandagatla --- .../bindings/nvmem/xlnx,zynqmp-nvmem.txt | 46 ------------------- .../bindings/nvmem/xlnx,zynqmp-nvmem.yaml | 42 +++++++++++++++++ 2 files changed, 42 insertions(+), 46 deletions(-) delete mode 100644 Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.txt create mode 100644 Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.yaml diff --git a/Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.txt b/Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.txt deleted file mode 100644 index 4881561b3a02ac..00000000000000 --- a/Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.txt +++ /dev/null @@ -1,46 +0,0 @@ --------------------------------------------------------------------------- -= Zynq UltraScale+ MPSoC nvmem firmware driver binding = --------------------------------------------------------------------------- -The nvmem_firmware node provides access to the hardware related data -like soc revision, IDCODE... etc, By using the firmware interface. 
- -Required properties: -- compatible: should be "xlnx,zynqmp-nvmem-fw" - -= Data cells = -Are child nodes of silicon id, bindings of which as described in -bindings/nvmem/nvmem.txt - -------- - Example -------- -firmware { - zynqmp_firmware: zynqmp-firmware { - compatible = "xlnx,zynqmp-firmware"; - method = "smc"; - - nvmem_firmware { - compatible = "xlnx,zynqmp-nvmem-fw"; - #address-cells = <1>; - #size-cells = <1>; - - /* Data cells */ - soc_revision: soc_revision { - reg = <0x0 0x4>; - }; - }; - }; -}; - -= Data consumers = -Are device nodes which consume nvmem data cells. - -For example: - pcap { - ... - - nvmem-cells = <&soc_revision>; - nvmem-cell-names = "soc_revision"; - - ... - }; diff --git a/Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.yaml b/Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.yaml new file mode 100644 index 00000000000000..917c40d5c382f4 --- /dev/null +++ b/Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.yaml @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/nvmem/xlnx,zynqmp-nvmem.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Zynq UltraScale+ MPSoC Non Volatile Memory interface + +description: | + The ZynqMP MPSoC provides access to the hardware related data + like SOC revision, IDCODE and specific purpose efuses. + +maintainers: + - Kalyani Akula + - Praveen Teja Kundanala + +allOf: + - $ref: nvmem.yaml# + +properties: + compatible: + const: xlnx,zynqmp-nvmem-fw + +required: + - compatible + +unevaluatedProperties: false + +examples: + - | + nvmem { + compatible = "xlnx,zynqmp-nvmem-fw"; + nvmem-layout { + compatible = "fixed-layout"; + #address-cells = <1>; + #size-cells = <1>; + + soc_revision: soc-revision@0 { + reg = <0x0 0x4>; + }; + }; + }; From bbe357faf632af9b4bbf4ef7c4c95ed8184ebeeb Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 9 Jan 2024 22:37:39 +0100 Subject: [PATCH 0556/1406] dt-bindings: nvmem: add common definition of nvmem-cell-cells MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Linux kernel NVMEM consumer bindings define phandle to NVMEM cells ("nvmem-cells"), thus we also want a common definition of the property defining the number of cells encoding that specifier, so the '#nvmem-cell-cells' property is described in one place. Suggested-by: Rob Herring Reported-by: Michael Walle Closes: https://github.com/devicetree-org/dt-schema/pull/89 Reported-by: Rafał Miłecki Closes: https://lore.kernel.org/linux-arm-kernel/20221121105830.7411-1-zajec5@gmail.com/#r Closes: https://lore.kernel.org/all/bdf7751b-0421-485d-8382-26c084f09d7d@gmail.com/ Signed-off-by: Krzysztof Kozlowski Reviewed-by: Rob Herring Signed-off-by: Srinivas Kandagatla --- .../bindings/nvmem/nvmem-provider.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 Documentation/devicetree/bindings/nvmem/nvmem-provider.yaml diff --git a/Documentation/devicetree/bindings/nvmem/nvmem-provider.yaml b/Documentation/devicetree/bindings/nvmem/nvmem-provider.yaml new file mode 100644 index 00000000000000..4009a9a03841ed --- /dev/null +++ b/Documentation/devicetree/bindings/nvmem/nvmem-provider.yaml @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/nvmem/nvmem-provider.yaml# +$schema: http://devicetree.org/meta-schemas/base.yaml# + +title: NVMEM (Non Volatile Memory) Provider + +maintainers: + - Srinivas Kandagatla + +select: true + +properties: + 
'#nvmem-cell-cells': + enum: [0, 1] + +additionalProperties: true From bbe357faf632af9b4bbf4ef7c4c95ed8184ebeeb Mon Sep 17 00:00:00 2001 From: Praveen Teja Kundanala Date: Fri, 2 Feb 2024 17:08:40 +0530 Subject: [PATCH 0557/1406] firmware: xilinx: Add ZynqMP efuse access API Add zynqmp_pm_efuse_access API in the ZynqMP firmware for read/write access of efuse memory. Signed-off-by: Praveen Teja Kundanala Acked-by: Michal Simek Signed-off-by: Srinivas Kandagatla --- drivers/firmware/xilinx/zynqmp.c | 25 +++++++++++++++++++++++++ include/linux/firmware/xlnx-zynqmp.h | 8 ++++++++ 2 files changed, 33 insertions(+) diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 79789f0563f6a3..9bc45357e1a803 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -3,6 +3,7 @@ * Xilinx Zynq MPSoC Firmware layer * * Copyright (C) 2014-2022 Xilinx, Inc. + * Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. * * Michal Simek * Davorin Mista @@ -1384,6 +1385,30 @@ int zynqmp_pm_aes_engine(const u64 address, u32 *out) } EXPORT_SYMBOL_GPL(zynqmp_pm_aes_engine); +/** + * zynqmp_pm_efuse_access - Provides access to efuse memory. + * @address: Address of the efuse params structure + * @out: Returned output value + * + * Return: Returns status, either success or error code. + */ +int zynqmp_pm_efuse_access(const u64 address, u32 *out) +{ + u32 ret_payload[PAYLOAD_ARG_CNT]; + int ret; + + if (!out) + return -EINVAL; + + ret = zynqmp_pm_invoke_fn(PM_EFUSE_ACCESS, ret_payload, 2, + upper_32_bits(address), + lower_32_bits(address)); + *out = ret_payload[1]; + + return ret; +} +EXPORT_SYMBOL_GPL(zynqmp_pm_efuse_access); + /** * zynqmp_pm_sha_hash - Access the SHA engine to calculate the hash * @address: Address of the data/ Address of output buffer where diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 9a7e527392512c..1a069a56c961b6 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -3,6 +3,7 @@ * Xilinx Zynq MPSoC Firmware layer * * Copyright (C) 2014-2021 Xilinx + * Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. 
* * Michal Simek * Davorin Mista @@ -171,6 +172,7 @@ enum pm_api_id { PM_CLOCK_GETPARENT = 44, PM_FPGA_READ = 46, PM_SECURE_AES = 47, + PM_EFUSE_ACCESS = 53, PM_FEATURE_CHECK = 63, }; @@ -562,6 +564,7 @@ int zynqmp_pm_set_requirement(const u32 node, const u32 capabilities, const u32 qos, const enum zynqmp_pm_request_ack ack); int zynqmp_pm_aes_engine(const u64 address, u32 *out); +int zynqmp_pm_efuse_access(const u64 address, u32 *out); int zynqmp_pm_sha_hash(const u64 address, const u32 size, const u32 flags); int zynqmp_pm_fpga_load(const u64 address, const u32 size, const u32 flags); int zynqmp_pm_fpga_get_status(u32 *value); @@ -749,6 +752,11 @@ static inline int zynqmp_pm_aes_engine(const u64 address, u32 *out) return -ENODEV; } +static inline int zynqmp_pm_efuse_access(const u64 address, u32 *out) +{ + return -ENODEV; +} + static inline int zynqmp_pm_sha_hash(const u64 address, const u32 size, const u32 flags) { From 80414036ccf4d116cb2a19764c72993240a72484 Mon Sep 17 00:00:00 2001 From: Praveen Teja Kundanala Date: Fri, 2 Feb 2024 17:08:41 +0530 Subject: [PATCH 0558/1406] nvmem: zynqmp_nvmem: zynqmp_nvmem_probe cleanup - Remove static nvmem_config declaration - Remove zynqmp_nvmem_data Signed-off-by: Praveen Teja Kundanala Acked-by: Kalyani Akula Signed-off-by: Srinivas Kandagatla --- drivers/nvmem/zynqmp_nvmem.c | 37 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/drivers/nvmem/zynqmp_nvmem.c b/drivers/nvmem/zynqmp_nvmem.c index 7f15aa89a9d091..391d8e88b2705d 100644 --- a/drivers/nvmem/zynqmp_nvmem.c +++ b/drivers/nvmem/zynqmp_nvmem.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2019 Xilinx, Inc. + * Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. */ #include @@ -11,36 +12,25 @@ #define SILICON_REVISION_MASK 0xF -struct zynqmp_nvmem_data { - struct device *dev; - struct nvmem_device *nvmem; -}; static int zynqmp_nvmem_read(void *context, unsigned int offset, void *val, size_t bytes) { + struct device *dev = context; int ret; - int idcode, version; - struct zynqmp_nvmem_data *priv = context; + int idcode; + int version; ret = zynqmp_pm_get_chipid(&idcode, &version); if (ret < 0) return ret; - dev_dbg(priv->dev, "Read chipid val %x %x\n", idcode, version); + dev_dbg(dev, "Read chipid val %x %x\n", idcode, version); *(int *)val = version & SILICON_REVISION_MASK; return 0; } -static struct nvmem_config econfig = { - .name = "zynqmp-nvmem", - .owner = THIS_MODULE, - .word_size = 1, - .size = 1, - .read_only = true, -}; - static const struct of_device_id zynqmp_nvmem_match[] = { { .compatible = "xlnx,zynqmp-nvmem-fw", }, { /* sentinel */ }, @@ -50,21 +40,18 @@ MODULE_DEVICE_TABLE(of, zynqmp_nvmem_match); static int zynqmp_nvmem_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct zynqmp_nvmem_data *priv; + struct nvmem_config econfig = {}; - priv = devm_kzalloc(dev, sizeof(struct zynqmp_nvmem_data), GFP_KERNEL); - if (!priv) - return -ENOMEM; - - priv->dev = dev; + econfig.name = "zynqmp-nvmem"; + econfig.owner = THIS_MODULE; + econfig.word_size = 1; + econfig.size = 1; econfig.dev = dev; econfig.add_legacy_fixed_of_cells = true; + econfig.read_only = true; econfig.reg_read = zynqmp_nvmem_read; - econfig.priv = priv; - - priv->nvmem = devm_nvmem_register(dev, &econfig); - return PTR_ERR_OR_ZERO(priv->nvmem); + return PTR_ERR_OR_ZERO(devm_nvmem_register(dev, &econfig)); } static struct platform_driver zynqmp_nvmem_driver = { From 30c624c97d08d37765ca3a23762f8c79bd6ab26e Mon 
Sep 17 00:00:00 2001 From: Praveen Teja Kundanala Date: Fri, 2 Feb 2024 17:08:42 +0530 Subject: [PATCH 0559/1406] nvmem: zynqmp_nvmem: Add support to access efuse Add support to read/write the efuse memory map of the ZynqMP. The offsets in the ZynqMP efuse memory map are: 0x0 - SOC version (read only); 0xC - 0xFC - ZynqMP specific purpose efuses; 0x100 - 0x17F - Physical Unclonable Function (PUF) efuses repurposed as user efuses. Signed-off-by: Praveen Teja Kundanala Acked-by: Kalyani Akula Signed-off-by: Srinivas Kandagatla --- drivers/nvmem/zynqmp_nvmem.c | 186 +++++++++++++++++++++++++++++++++-- 1 file changed, 176 insertions(+), 10 deletions(-) diff --git a/drivers/nvmem/zynqmp_nvmem.c b/drivers/nvmem/zynqmp_nvmem.c index 391d8e88b2705d..8682adaacd692d 100644 --- a/drivers/nvmem/zynqmp_nvmem.c +++ b/drivers/nvmem/zynqmp_nvmem.c @@ -4,6 +4,7 @@ * Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. */ +#include #include #include #include @@ -11,24 +12,189 @@ #include #define SILICON_REVISION_MASK 0xF +#define P_USER_0_64_UPPER_MASK GENMASK(31, 16) +#define P_USER_127_LOWER_4_BIT_MASK GENMASK(3, 0) +#define WORD_INBYTES 4 +#define SOC_VER_SIZE 0x4 +#define EFUSE_MEMORY_SIZE 0x177 +#define UNUSED_SPACE 0x8 +#define ZYNQMP_NVMEM_SIZE (SOC_VER_SIZE + UNUSED_SPACE + \ + EFUSE_MEMORY_SIZE) +#define SOC_VERSION_OFFSET 0x0 +#define EFUSE_START_OFFSET 0xC +#define EFUSE_END_OFFSET 0xFC +#define EFUSE_PUF_START_OFFSET 0x100 +#define EFUSE_PUF_MID_OFFSET 0x140 +#define EFUSE_PUF_END_OFFSET 0x17F +#define EFUSE_NOT_ENABLED 29 +/* + * efuse access type + */ +enum efuse_access { + EFUSE_READ = 0, + EFUSE_WRITE }; + +/** + * struct xilinx_efuse - the basic structure + * @src: address of the buffer to store the data to be written/read + * @size: read/write word count + * @offset: read/write offset + * @flag: 0 - represents efuse read and 1 - represents efuse write + * @pufuserfuse: 0 - represents non-puf efuses, offset is used for read/write + * 1 - represents puf user fuse row number. + * + * This structure stores all the required details to + * read/write efuse memory.
+ */ +struct xilinx_efuse { + u64 src; + u32 size; + u32 offset; + enum efuse_access flag; + u32 pufuserfuse; +}; + +static int zynqmp_efuse_access(void *context, unsigned int offset, + void *val, size_t bytes, enum efuse_access flag, + unsigned int pufflag) +{ + struct device *dev = context; + struct xilinx_efuse *efuse; + dma_addr_t dma_addr; + dma_addr_t dma_buf; + size_t words = bytes / WORD_INBYTES; + int ret; + int value; + char *data; + + if (bytes % WORD_INBYTES != 0) { + dev_err(dev, "Bytes requested should be word aligned\n"); + return -EOPNOTSUPP; + } + + if (pufflag == 0 && offset % WORD_INBYTES) { + dev_err(dev, "Offset requested should be word aligned\n"); + return -EOPNOTSUPP; + } + + if (pufflag == 1 && flag == EFUSE_WRITE) { + memcpy(&value, val, bytes); + if ((offset == EFUSE_PUF_START_OFFSET || + offset == EFUSE_PUF_MID_OFFSET) && + value & P_USER_0_64_UPPER_MASK) { + dev_err(dev, "Only lower 4 bytes are allowed to be programmed in P_USER_0 & P_USER_64\n"); + return -EOPNOTSUPP; + } + + if (offset == EFUSE_PUF_END_OFFSET && + (value & P_USER_127_LOWER_4_BIT_MASK)) { + dev_err(dev, "Only MSB 28 bits are allowed to be programmed for P_USER_127\n"); + return -EOPNOTSUPP; + } + } + + efuse = dma_alloc_coherent(dev, sizeof(struct xilinx_efuse), + &dma_addr, GFP_KERNEL); + if (!efuse) + return -ENOMEM; -static int zynqmp_nvmem_read(void *context, unsigned int offset, - void *val, size_t bytes) + data = dma_alloc_coherent(dev, sizeof(bytes), + &dma_buf, GFP_KERNEL); + if (!data) { + ret = -ENOMEM; + goto efuse_data_fail; + } + + if (flag == EFUSE_WRITE) { + memcpy(data, val, bytes); + efuse->flag = EFUSE_WRITE; + } else { + efuse->flag = EFUSE_READ; + } + + efuse->src = dma_buf; + efuse->size = words; + efuse->offset = offset; + efuse->pufuserfuse = pufflag; + + zynqmp_pm_efuse_access(dma_addr, (u32 *)&ret); + if (ret != 0) { + if (ret == EFUSE_NOT_ENABLED) { + dev_err(dev, "efuse access is not enabled\n"); + ret = -EOPNOTSUPP; + } else { + dev_err(dev, "Error in efuse read %x\n", ret); + ret = -EPERM; + } + goto efuse_access_err; + } + + if (flag == EFUSE_READ) + memcpy(val, data, bytes); +efuse_access_err: + dma_free_coherent(dev, sizeof(bytes), + data, dma_buf); +efuse_data_fail: + dma_free_coherent(dev, sizeof(struct xilinx_efuse), + efuse, dma_addr); + + return ret; +} + +static int zynqmp_nvmem_read(void *context, unsigned int offset, void *val, size_t bytes) { struct device *dev = context; int ret; + int pufflag = 0; int idcode; int version; - ret = zynqmp_pm_get_chipid(&idcode, &version); - if (ret < 0) - return ret; + if (offset >= EFUSE_PUF_START_OFFSET && offset <= EFUSE_PUF_END_OFFSET) + pufflag = 1; + + switch (offset) { + /* Soc version offset is zero */ + case SOC_VERSION_OFFSET: + if (bytes != SOC_VER_SIZE) + return -EOPNOTSUPP; + + ret = zynqmp_pm_get_chipid((u32 *)&idcode, (u32 *)&version); + if (ret < 0) + return ret; + + dev_dbg(dev, "Read chipid val %x %x\n", idcode, version); + *(int *)val = version & SILICON_REVISION_MASK; + break; + /* Efuse offset starts from 0xc */ + case EFUSE_START_OFFSET ... EFUSE_END_OFFSET: + case EFUSE_PUF_START_OFFSET ... 
EFUSE_PUF_END_OFFSET: + ret = zynqmp_efuse_access(context, offset, val, + bytes, EFUSE_READ, pufflag); + break; + default: + *(u32 *)val = 0xDEADBEEF; + ret = 0; + break; + } + + return ret; +} + +static int zynqmp_nvmem_write(void *context, + unsigned int offset, void *val, size_t bytes) +{ + int pufflag = 0; + + if (offset < EFUSE_START_OFFSET || offset > EFUSE_PUF_END_OFFSET) + return -EOPNOTSUPP; - dev_dbg(dev, "Read chipid val %x %x\n", idcode, version); - *(int *)val = version & SILICON_REVISION_MASK; + if (offset >= EFUSE_PUF_START_OFFSET && offset <= EFUSE_PUF_END_OFFSET) + pufflag = 1; - return 0; + return zynqmp_efuse_access(context, offset, + val, bytes, EFUSE_WRITE, pufflag); } static const struct of_device_id zynqmp_nvmem_match[] = { @@ -45,11 +211,11 @@ static int zynqmp_nvmem_probe(struct platform_device *pdev) econfig.name = "zynqmp-nvmem"; econfig.owner = THIS_MODULE; econfig.word_size = 1; - econfig.size = 1; + econfig.size = ZYNQMP_NVMEM_SIZE; econfig.dev = dev; econfig.add_legacy_fixed_of_cells = true; - econfig.read_only = true; econfig.reg_read = zynqmp_nvmem_read; + econfig.reg_write = zynqmp_nvmem_write; return PTR_ERR_OR_ZERO(devm_nvmem_register(dev, &econfig)); } From e58d9a5f317aa67f4befb5d69177d68ce5b385d2 Mon Sep 17 00:00:00 2001 From: Praveen Teja Kundanala Date: Fri, 2 Feb 2024 17:08:43 +0530 Subject: [PATCH 0560/1406] MAINTAINERS: Add maintainers for ZynqMP NVMEM driver Add maintainers for ZynqMP NVMEM driver and driver document. Signed-off-by: Praveen Teja Kundanala Signed-off-by: Srinivas Kandagatla --- MAINTAINERS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8d1052fa6a6924..b3103e03015ebb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24233,6 +24233,14 @@ M: Harsha S: Maintained F: drivers/crypto/xilinx/zynqmp-sha.c +XILINX ZYNQMP NVMEM DRIVER +M: Praveen Teja Kundanala +M: Kalyani Akula +R: Michal Simek +S: Maintained +F: Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.yaml +F: drivers/nvmem/zynqmp_nvmem.c + XILLYBUS DRIVER M: Eli Billauer L: linux-kernel@vger.kernel.org From 2c8df24cc166478910c4e9e870adf44d157330fa Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Tue, 30 Jan 2024 17:56:53 +0800 Subject: [PATCH 0561/1406] nvmem: mtk-efuse: Drop NVMEM device name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MT8183 has not one but two efuse devices. The static name and ID causes the second efuse device to fail to probe, due to duplicate sysfs entries. With the rework of the mtk-socinfo driver, lookup by name is no longer necessary. The custom name can simply be dropped. Signed-off-by: Chen-Yu Tsai Reviewed-by: AngeloGioacchino Del Regno Tested-by: Nícolas F. R. A. 
Prado Signed-off-by: Srinivas Kandagatla --- drivers/nvmem/mtk-efuse.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvmem/mtk-efuse.c b/drivers/nvmem/mtk-efuse.c index f5bebcecf9bd31..9caf0466734101 100644 --- a/drivers/nvmem/mtk-efuse.c +++ b/drivers/nvmem/mtk-efuse.c @@ -86,7 +86,6 @@ static int mtk_efuse_probe(struct platform_device *pdev) econfig.size = resource_size(res); econfig.priv = priv; econfig.dev = dev; - econfig.name = "mtk-efuse"; if (pdata->uses_post_processing) econfig.fixup_dt_cell_info = &mtk_efuse_fixup_dt_cell_info; nvmem = devm_nvmem_register(dev, &econfig); From edf2f6fb51bcd3c94ab2729e691f7c5f42935dfa Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 14 Feb 2024 19:26:32 +0200 Subject: [PATCH 0562/1406] kernel.h: Move upper_*_bits() and lower_*_bits() to wordpart.h The wordpart.h header collects APIs related to handling parts of a word (usually at byte granularity). The upper_*_bits() and lower_*_bits() macros are good candidates to be moved there. This helps to clean up the header dependency hell around kernel.h, as the latter gathers completely unrelated stuff together and slows down compilation (especially when it's included into other headers). Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240214172752.3605073-1-andriy.shevchenko@linux.intel.com Reviewed-by: Randy Dunlap Signed-off-by: Kees Cook --- include/linux/kernel.h | 30 ++---------------------------- include/linux/wordpart.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f4a1d582b79d24..86dd8939c2cd5e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -33,6 +33,8 @@ #include #include #include +#include + #include #include @@ -52,34 +54,6 @@ } \ ) -/** - * upper_32_bits - return bits 32-63 of a number - * @n: the number we're accessing - * - * A basic shift-right of a 64- or 32-bit quantity. Use this to suppress - * the "right shift count >= width of type" warning when that quantity is - * 32-bits. - */ -#define upper_32_bits(n) ((u32)(((n) >> 16) >> 16)) - -/** - * lower_32_bits - return bits 0-31 of a number - * @n: the number we're accessing - */ -#define lower_32_bits(n) ((u32)((n) & 0xffffffff)) - -/** - * upper_16_bits - return bits 16-31 of a number - * @n: the number we're accessing - */ -#define upper_16_bits(n) ((u16)((n) >> 16)) - -/** - * lower_16_bits - return bits 0-15 of a number - * @n: the number we're accessing - */ -#define lower_16_bits(n) ((u16)((n) & 0xffff)) - struct completion; struct user; diff --git a/include/linux/wordpart.h b/include/linux/wordpart.h index c9e6bd773ebd4e..f6f8f83b15b043 100644 --- a/include/linux/wordpart.h +++ b/include/linux/wordpart.h @@ -2,6 +2,35 @@ #ifndef _LINUX_WORDPART_H #define _LINUX_WORDPART_H + +/** + * upper_32_bits - return bits 32-63 of a number + * @n: the number we're accessing + * + * A basic shift-right of a 64- or 32-bit quantity. Use this to suppress + * the "right shift count >= width of type" warning when that quantity is + * 32-bits.
+ */ +#define upper_32_bits(n) ((u32)(((n) >> 16) >> 16)) + +/** + * lower_32_bits - return bits 0-31 of a number + * @n: the number we're accessing + */ +#define lower_32_bits(n) ((u32)((n) & 0xffffffff)) + +/** + * upper_16_bits - return bits 16-31 of a number + * @n: the number we're accessing + */ +#define upper_16_bits(n) ((u16)((n) >> 16)) + +/** + * lower_16_bits - return bits 0-15 of a number + * @n: the number we're accessing + */ +#define lower_16_bits(n) ((u16)((n) & 0xffff)) + /** * REPEAT_BYTE - repeat the value @x multiple times as an unsigned long value * @x: value to repeat From 22fded4cb822b72943d53dcf001b46576084b8ce Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 14 Feb 2024 11:29:24 -0800 Subject: [PATCH 0563/1406] hwmon: (pmbus/tda38640) Use PMBUS_REGULATOR_ONE to declare regulator If a chip only provides a single regulator, it should be named 'vout' and not 'vout0'. Declare regulator using PMBUS_REGULATOR_ONE() to make that happen. Cc: Conor Dooley Cc: Naresh Solanki Cc: Patrick Rudolph Signed-off-by: Guenter Roeck --- drivers/hwmon/pmbus/tda38640.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/pmbus/tda38640.c b/drivers/hwmon/pmbus/tda38640.c index 09cd114b173688..c31889a036f01e 100644 --- a/drivers/hwmon/pmbus/tda38640.c +++ b/drivers/hwmon/pmbus/tda38640.c @@ -15,7 +15,7 @@ #include "pmbus.h" static const struct regulator_desc __maybe_unused tda38640_reg_desc[] = { - PMBUS_REGULATOR("vout", 0), + PMBUS_REGULATOR_ONE("vout"), }; struct tda38640_data { From 94320aff72270667ee44db0709e5da2d256dd030 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 14 Feb 2024 11:32:00 -0800 Subject: [PATCH 0564/1406] hwmon: (pmbus/lm25066) Use PMBUS_REGULATOR_ONE to declare regulator If a chip only provides a single regulator, it should be named 'vout' and not 'vout0'. Declare regulator using PMBUS_REGULATOR_ONE() to make that happen. Cc: Conor Dooley Cc: Naresh Solanki Cc: Zev Weiss Signed-off-by: Guenter Roeck --- drivers/hwmon/pmbus/lm25066.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/pmbus/lm25066.c b/drivers/hwmon/pmbus/lm25066.c index 3a20df5a43ec8d..cfffa4cdc0df91 100644 --- a/drivers/hwmon/pmbus/lm25066.c +++ b/drivers/hwmon/pmbus/lm25066.c @@ -437,7 +437,7 @@ static int lm25066_write_word_data(struct i2c_client *client, int page, int reg, #if IS_ENABLED(CONFIG_SENSORS_LM25066_REGULATOR) static const struct regulator_desc lm25066_reg_desc[] = { - PMBUS_REGULATOR("vout", 0), + PMBUS_REGULATOR_ONE("vout"), }; #endif From 74d0d066bfaf25d2b25dc0e09f7af009534108a9 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 14 Feb 2024 11:33:44 -0800 Subject: [PATCH 0565/1406] hwmon: (pmbus/ir38064) Use PMBUS_REGULATOR_ONE to declare regulator If a chip only provides a single regulator, it should be named 'vout' and not 'vout0'. Declare regulator using PMBUS_REGULATOR_ONE() to make that happen. 
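As an illustrative aside (not part of the patch itself): the practical difference between the two declaration styles is only the resulting regulator name, since PMBUS_REGULATOR() appends the index to the name while PMBUS_REGULATOR_ONE() takes the name as-is. A simplified sketch:

	/* multi-output chip: each output gets an index suffix ("vout0", "vout1") */
	static const struct regulator_desc multi_reg_desc[] = {
		PMBUS_REGULATOR("vout", 0),
		PMBUS_REGULATOR("vout", 1),
	};

	/* single-output chip: plain "vout", no suffix */
	static const struct regulator_desc single_reg_desc[] = {
		PMBUS_REGULATOR_ONE("vout"),
	};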
Cc: Conor Dooley Cc: Naresh Solanki Cc: Patrick Rudolph Signed-off-by: Guenter Roeck --- drivers/hwmon/pmbus/ir38064.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/pmbus/ir38064.c b/drivers/hwmon/pmbus/ir38064.c index 04185be3fdb6d0..69e18cb468f67e 100644 --- a/drivers/hwmon/pmbus/ir38064.c +++ b/drivers/hwmon/pmbus/ir38064.c @@ -22,7 +22,7 @@ #if IS_ENABLED(CONFIG_SENSORS_IR38064_REGULATOR) static const struct regulator_desc ir38064_reg_desc[] = { - PMBUS_REGULATOR("vout", 0), + PMBUS_REGULATOR_ONE("vout"), }; #endif /* CONFIG_SENSORS_IR38064_REGULATOR */ From ff2cde4819b33b413d947fe31782ecf5690f2e22 Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Wed, 14 Feb 2024 15:36:43 +0100 Subject: [PATCH 0566/1406] hwmon: (axi-fan-control) Use device firmware agnostic API Don't use OF APIs directly; use the device property APIs instead. In addition, this makes the probe() code neater and also allows us to move the of_device_id table to its natural place. While at it, make sure to explicitly include mod_devicetable.h for the of_device_id table. Signed-off-by: Nuno Sa Link: https://lore.kernel.org/r/20240214-axi-fan-control-no-of-v1-1-43ca656fe2e3@analog.com Signed-off-by: Guenter Roeck --- drivers/hwmon/axi-fan-control.c | 39 +++++++++++++++++---------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/drivers/hwmon/axi-fan-control.c b/drivers/hwmon/axi-fan-control.c index 19b9bf3d75ef94..8dfe3b6c5a177c 100644 --- a/drivers/hwmon/axi-fan-control.c +++ b/drivers/hwmon/axi-fan-control.c @@ -13,8 +13,9 @@ #include #include #include -#include +#include #include +#include /* register map */ #define ADI_REG_RSTN 0x0080 @@ -368,12 +369,12 @@ static irqreturn_t axi_fan_control_irq_handler(int irq, void *data) } static int axi_fan_control_init(struct axi_fan_control_data *ctl, - const struct device_node *np) + const struct device *dev) { int ret; /* get fan pulses per revolution */ - ret = of_property_read_u32(np, "pulses-per-revolution", &ctl->ppr); + ret = device_property_read_u32(dev, "pulses-per-revolution", &ctl->ppr); if (ret) return ret; @@ -443,25 +444,16 @@ static struct attribute *axi_fan_control_attrs[] = { }; ATTRIBUTE_GROUPS(axi_fan_control); -static const u32 version_1_0_0 = ADI_AXI_PCORE_VER(1, 0, 'a'); - -static const struct of_device_id axi_fan_control_of_match[] = { - { .compatible = "adi,axi-fan-control-1.00.a", - .data = (void *)&version_1_0_0}, - {}, -}; -MODULE_DEVICE_TABLE(of, axi_fan_control_of_match); - static int axi_fan_control_probe(struct platform_device *pdev) { struct axi_fan_control_data *ctl; struct clk *clk; - const struct of_device_id *id; + const unsigned int *id; const char *name = "axi_fan_control"; u32 version; int ret; - id = of_match_node(axi_fan_control_of_match, pdev->dev.of_node); + id = device_get_match_data(&pdev->dev); if (!id) return -EINVAL; @@ -485,18 +477,18 @@ static int axi_fan_control_probe(struct platform_device *pdev) version = axi_ioread(ADI_AXI_REG_VERSION, ctl); if (ADI_AXI_PCORE_VER_MAJOR(version) != - ADI_AXI_PCORE_VER_MAJOR((*(u32 *)id->data))) { + ADI_AXI_PCORE_VER_MAJOR((*id))) { dev_err(&pdev->dev, "Major version mismatch. Expected %d.%.2d.%c, Reported %d.%.2d.%c\n", - ADI_AXI_PCORE_VER_MAJOR((*(u32 *)id->data)), - ADI_AXI_PCORE_VER_MINOR((*(u32 *)id->data)), - ADI_AXI_PCORE_VER_PATCH((*(u32 *)id->data)), + ADI_AXI_PCORE_VER_MAJOR(*id), + ADI_AXI_PCORE_VER_MINOR(*id), + ADI_AXI_PCORE_VER_PATCH(*id), ADI_AXI_PCORE_VER_MAJOR(version), ADI_AXI_PCORE_VER_MINOR(version), ADI_AXI_PCORE_VER_PATCH(version)); return -ENODEV; } - ret = axi_fan_control_init(ctl, pdev->dev.of_node); + ret = axi_fan_control_init(ctl, &pdev->dev); if (ret) { dev_err(&pdev->dev, "Failed to initialize device\n"); return ret; @@ -527,6 +519,15 @@ static int axi_fan_control_probe(struct platform_device *pdev) return 0; } +static const u32 version_1_0_0 = ADI_AXI_PCORE_VER(1, 0, 'a'); + +static const struct of_device_id axi_fan_control_of_match[] = { + { .compatible = "adi,axi-fan-control-1.00.a", + .data = (void *)&version_1_0_0}, + {}, +}; +MODULE_DEVICE_TABLE(of, axi_fan_control_of_match); + static struct platform_driver axi_fan_control_driver = { .driver = { .name = "axi_fan_control_driver", From b7ed8218572b734b59a25789b51d7456792f6f0a Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Wed, 14 Feb 2024 15:36:44 +0100 Subject: [PATCH 0567/1406] hwmon: (axi-fan-control) Make use of sysfs_emit() Use sysfs_emit() instead of calling sprintf() directly. Signed-off-by: Nuno Sa Link: https://lore.kernel.org/r/20240214-axi-fan-control-no-of-v1-2-43ca656fe2e3@analog.com Signed-off-by: Guenter Roeck --- drivers/hwmon/axi-fan-control.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/axi-fan-control.c b/drivers/hwmon/axi-fan-control.c index 8dfe3b6c5a177c..efd42a4f59516f 100644 --- a/drivers/hwmon/axi-fan-control.c +++ b/drivers/hwmon/axi-fan-control.c @@ -84,7 +84,7 @@ static ssize_t axi_fan_control_show(struct device *dev, struct device_attribute temp = DIV_ROUND_CLOSEST_ULL(temp * 509314ULL, 65535) - 280230; - return sprintf(buf, "%u\n", temp); + return sysfs_emit(buf, "%u\n", temp); } static ssize_t axi_fan_control_store(struct device *dev, struct device_attribute *da, From 07a0e923d38fee1bec42d7a3afc64f3e70dc440d Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Wed, 14 Feb 2024 15:36:45 +0100 Subject: [PATCH 0568/1406] hwmon: (axi-fan-control) Make use of dev_err_probe() Use dev_err_probe() to slightly simplify printing errors during probe. No functional changes intended. Signed-off-by: Nuno Sa Link: https://lore.kernel.org/r/20240214-axi-fan-control-no-of-v1-3-43ca656fe2e3@analog.com Signed-off-by: Guenter Roeck --- drivers/hwmon/axi-fan-control.c | 40 +++++++++++++++------------------ 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/drivers/hwmon/axi-fan-control.c b/drivers/hwmon/axi-fan-control.c index efd42a4f59516f..35c862eb158b09 100644 --- a/drivers/hwmon/axi-fan-control.c +++ b/drivers/hwmon/axi-fan-control.c @@ -466,10 +466,9 @@ static int axi_fan_control_probe(struct platform_device *pdev) return PTR_ERR(ctl->base); clk = devm_clk_get_enabled(&pdev->dev, NULL); - if (IS_ERR(clk)) { - dev_err(&pdev->dev, "clk_get failed with %ld\n", PTR_ERR(clk)); - return PTR_ERR(clk); - } + if (IS_ERR(clk)) + return dev_err_probe(&pdev->dev, PTR_ERR(clk), + "clk_get failed\n"); ctl->clk_rate = clk_get_rate(clk); if (!ctl->clk_rate) @@ -477,22 +476,20 @@ static int axi_fan_control_probe(struct platform_device *pdev) version = axi_ioread(ADI_AXI_REG_VERSION, ctl); if (ADI_AXI_PCORE_VER_MAJOR(version) != - ADI_AXI_PCORE_VER_MAJOR((*id))) { - dev_err(&pdev->dev, "Major version mismatch. 
Expected %d.%.2d.%c, Reported %d.%.2d.%c\n", - ADI_AXI_PCORE_VER_MAJOR(*id), - ADI_AXI_PCORE_VER_MINOR(*id), - ADI_AXI_PCORE_VER_PATCH(*id), - ADI_AXI_PCORE_VER_MAJOR(version), - ADI_AXI_PCORE_VER_MINOR(version), - ADI_AXI_PCORE_VER_PATCH(version)); - return -ENODEV; - } + ADI_AXI_PCORE_VER_MAJOR((*id))) + return dev_err_probe(&pdev->dev, -ENODEV, + "Major version mismatch. Expected %d.%.2d.%c, Reported %d.%.2d.%c\n", + ADI_AXI_PCORE_VER_MAJOR(*id), + ADI_AXI_PCORE_VER_MINOR(*id), + ADI_AXI_PCORE_VER_PATCH(*id), + ADI_AXI_PCORE_VER_MAJOR(version), + ADI_AXI_PCORE_VER_MINOR(version), + ADI_AXI_PCORE_VER_PATCH(version)); ret = axi_fan_control_init(ctl, &pdev->dev); - if (ret) { - dev_err(&pdev->dev, "Failed to initialize device\n"); - return ret; - } + if (ret) + return dev_err_probe(&pdev->dev, ret, + "Failed to initialize device\n"); ctl->hdev = devm_hwmon_device_register_with_info(&pdev->dev, name, @@ -511,10 +508,9 @@ static int axi_fan_control_probe(struct platform_device *pdev) axi_fan_control_irq_handler, IRQF_ONESHOT | IRQF_TRIGGER_HIGH, pdev->driver_override, ctl); - if (ret) { - dev_err(&pdev->dev, "failed to request an irq, %d", ret); - return ret; - } + if (ret) + return dev_err_probe(&pdev->dev, ret, + "failed to request an irq\n"); return 0; } From 3fab8a74c71a4ba32b2fa1dca7340f9107ff8dfc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 12 Feb 2024 12:19:04 +0100 Subject: [PATCH 0569/1406] i2c: pasemi: split driver into two separate modules On powerpc, it is possible to compile test both the new apple (arm) and old pasemi (powerpc) drivers for the i2c hardware at the same time, which leads to a warning about linking the same object file twice: scripts/Makefile.build:244: drivers/i2c/busses/Makefile: i2c-pasemi-core.o is added to multiple modules: i2c-apple i2c-pasemi Rework the driver to have an explicit helper module, letting Kbuild take care of whether this should be built-in or a loadable driver. 
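For reference, the resulting layout follows the usual pattern for shared kernel helpers: the common object becomes a standalone module that exports its entry points, and both drivers link against it. A rough sketch of the shape (not the literal pasemi code):

	/* i2c-pasemi-core.c: now a module of its own */
	int pasemi_i2c_common_probe(struct pasemi_smbus *smbus)
	{
		/* ... shared controller setup ... */
		return 0;
	}
	EXPORT_SYMBOL_GPL(pasemi_i2c_common_probe);

	MODULE_LICENSE("GPL");

	/*
	 * i2c-pasemi-pci.c and i2c-pasemi-platform.c each call
	 * pasemi_i2c_common_probe(); Kbuild resolves the symbol through
	 * the core module instead of linking i2c-pasemi-core.o into both
	 * i2c-pasemi and i2c-apple.
	 */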
Fixes: 9bc5f4f660ff ("i2c: pasemi: Split pci driver to its own file") Signed-off-by: Arnd Bergmann Reviewed-by: Sven Peter Signed-off-by: Andi Shyti --- drivers/i2c/busses/Makefile | 6 ++---- drivers/i2c/busses/i2c-pasemi-core.c | 6 ++++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile index 3757b9391e60ae..aa0ee8ecd6f2f5 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -90,10 +90,8 @@ obj-$(CONFIG_I2C_NPCM) += i2c-npcm7xx.o obj-$(CONFIG_I2C_OCORES) += i2c-ocores.o obj-$(CONFIG_I2C_OMAP) += i2c-omap.o obj-$(CONFIG_I2C_OWL) += i2c-owl.o -i2c-pasemi-objs := i2c-pasemi-core.o i2c-pasemi-pci.o -obj-$(CONFIG_I2C_PASEMI) += i2c-pasemi.o -i2c-apple-objs := i2c-pasemi-core.o i2c-pasemi-platform.o -obj-$(CONFIG_I2C_APPLE) += i2c-apple.o +obj-$(CONFIG_I2C_PASEMI) += i2c-pasemi-core.o i2c-pasemi-pci.o +obj-$(CONFIG_I2C_APPLE) += i2c-pasemi-core.o i2c-pasemi-platform.o obj-$(CONFIG_I2C_PCA_PLATFORM) += i2c-pca-platform.o obj-$(CONFIG_I2C_PNX) += i2c-pnx.o obj-$(CONFIG_I2C_PXA) += i2c-pxa.o diff --git a/drivers/i2c/busses/i2c-pasemi-core.c b/drivers/i2c/busses/i2c-pasemi-core.c index 7d54a9f34c74b5..bd8becbdeeb28f 100644 --- a/drivers/i2c/busses/i2c-pasemi-core.c +++ b/drivers/i2c/busses/i2c-pasemi-core.c @@ -369,6 +369,7 @@ int pasemi_i2c_common_probe(struct pasemi_smbus *smbus) return 0; } +EXPORT_SYMBOL_GPL(pasemi_i2c_common_probe); irqreturn_t pasemi_irq_handler(int irq, void *dev_id) { @@ -378,3 +379,8 @@ irqreturn_t pasemi_irq_handler(int irq, void *dev_id) complete(&smbus->irq_completion); return IRQ_HANDLED; } +EXPORT_SYMBOL_GPL(pasemi_irq_handler); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Olof Johansson "); +MODULE_DESCRIPTION("PA Semi PWRficient SMBus driver"); From eb9f7f654f251b57db310eab90bbae5876898ae3 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Wed, 14 Feb 2024 15:59:39 +0100 Subject: [PATCH 0570/1406] i2c: i801: Fix block process call transactions According to the Intel datasheets, software must reset the block buffer index twice for block process call transactions: once before writing the outgoing data to the buffer, and once again before reading the incoming data from the buffer. The driver is currently missing the second reset, causing the wrong portion of the block buffer to be read. 
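Condensed, the corrected sequence resets the buffer index once per data direction instead of once per transaction (register accessors as in the driver):

	/* before writing the outgoing block */
	outb_p(len, SMBHSTDAT0(priv));
	inb_p(SMBHSTCNT(priv));		/* reset the data buffer index */
	for (i = 0; i < len; i++)
		outb_p(data->block[i + 1], SMBBLKDAT(priv));

	/* ... the transaction runs ... */

	/* before reading the incoming block */
	data->block[0] = len;
	inb_p(SMBHSTCNT(priv));		/* reset the data buffer index */
	for (i = 0; i < len; i++)
		data->block[i + 1] = inb_p(SMBBLKDAT(priv));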
Signed-off-by: Jean Delvare Reported-by: Piotr Zakowski Closes: https://lore.kernel.org/linux-i2c/20240213120553.7b0ab120@endymion.delvare/ Fixes: 315cd67c9453 ("i2c: i801: Add Block Write-Block Read Process Call support") Reviewed-by: Alexander Sverdlin Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-i801.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index 3932e8d96a1717..2c36b36d7d516c 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -498,11 +498,10 @@ static int i801_block_transaction_by_block(struct i801_priv *priv, /* Set block buffer mode */ outb_p(inb_p(SMBAUXCTL(priv)) | SMBAUXCTL_E32B, SMBAUXCTL(priv)); - inb_p(SMBHSTCNT(priv)); /* reset the data buffer index */ - if (read_write == I2C_SMBUS_WRITE) { len = data->block[0]; outb_p(len, SMBHSTDAT0(priv)); + inb_p(SMBHSTCNT(priv)); /* reset the data buffer index */ for (i = 0; i < len; i++) outb_p(data->block[i+1], SMBBLKDAT(priv)); } @@ -520,6 +519,7 @@ static int i801_block_transaction_by_block(struct i801_priv *priv, } data->block[0] = len; + inb_p(SMBHSTCNT(priv)); /* reset the data buffer index */ for (i = 0; i < len; i++) data->block[i + 1] = inb_p(SMBBLKDAT(priv)); } From 1d044941d53855ca06e4fa34936ff7273c8641dd Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Tue, 13 Feb 2024 11:46:25 -0300 Subject: [PATCH 0571/1406] tee: make tee_bus_type const Since commit d492cc2573a0 ("driver core: device.h: make struct bus_type a const *"), the driver core can properly handle a constant struct bus_type, so move the tee_bus_type variable to be a constant structure as well, placing it into read-only memory which cannot be modified at runtime. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B.
Marliere Reviewed-by: Sumit Garg Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jens Wiklander --- drivers/tee/tee_core.c | 2 +- include/linux/tee_drv.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index 792d6fae4354d5..e59c20d74b36ae 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -1226,7 +1226,7 @@ static int tee_client_device_uevent(const struct device *dev, return add_uevent_var(env, "MODALIAS=tee:%pUb", dev_id); } -struct bus_type tee_bus_type = { +const struct bus_type tee_bus_type = { .name = "tee", .match = tee_client_device_match, .uevent = tee_client_device_uevent, diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 911ddf92dcee75..71632e3c5f18fb 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -482,7 +482,7 @@ static inline bool tee_param_is_memref(struct tee_param *param) } } -extern struct bus_type tee_bus_type; +extern const struct bus_type tee_bus_type; /** * struct tee_client_device - tee based device From 2bd6f4d99e1256ef5041a2d83e376bead6a2fe9f Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 15 Feb 2024 07:41:09 +0100 Subject: [PATCH 0572/1406] docs: translations: use attribute to store current language Akira Yokosawa reported [1] that the "translations" extension we added in commit 7418ec5b151f ("docs: translations: add translations links when they exist") broke the build on Sphinx versions v6.1.3 through 7.1.2 (possibly others) with the following error: Exception occurred: File "/usr/lib/python3.12/site-packages/sphinx/util/nodes.py", line 624, in _copy_except__document newnode = self.__class__(rawsource=self.rawsource, **self.attributes) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ TypeError: LanguagesNode.__init__() missing 1 required positional argument: 'current_language' The full traceback has been saved in /tmp/sphinx-err-7xmwytuu.log, if you want to report the issue to the developers. Solve this problem by making 'current_language' a true element attribute of the LanguagesNode element, which is probably the more correct way to do it anyway. Tested on Sphinx 2.x, 3.x, 6.x, and 7.x. 
[1]: https://lore.kernel.org/all/54a56c2e-a27c-45a0-b712-02a7bc7d2673@gmail.com/ Fixes: 7418ec5b151f ("docs: translations: add translations links when they exist") Reported-by: Akira Yokosawa Signed-off-by: Vegard Nossum Closes: https://lore.kernel.org/all/54a56c2e-a27c-45a0-b712-02a7bc7d2673@gmail.com/ Tested-by: Akira Yokosawa # Sphinx 4.3.2, 5.3.0 and 6.2.1 Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240215064109.1193556-1-vegard.nossum@oracle.com --- Documentation/sphinx/translations.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/Documentation/sphinx/translations.py b/Documentation/sphinx/translations.py index 47161e6eba9976..32c2b32b2b5ee9 100644 --- a/Documentation/sphinx/translations.py +++ b/Documentation/sphinx/translations.py @@ -29,10 +29,7 @@ } class LanguagesNode(nodes.Element): - def __init__(self, current_language, *args, **kwargs): - super().__init__(*args, **kwargs) - - self.current_language = current_language + pass class TranslationsTransform(Transform): default_priority = 900 @@ -49,7 +46,8 @@ def apply(self): # normalize docname to be the untranslated one docname = os.path.join(*components[2:]) - new_nodes = LanguagesNode(all_languages[this_lang_code]) + new_nodes = LanguagesNode() + new_nodes['current_language'] = all_languages[this_lang_code] for lang_code, lang_name in all_languages.items(): if lang_code == this_lang_code: @@ -84,7 +82,7 @@ def process_languages(app, doctree, docname): html_content = app.builder.templates.render('translations.html', context={ - 'current_language': node.current_language, + 'current_language': node['current_language'], 'languages': languages, }) From 0a2d3ce0031f504b2e3ad47625e149ad5759ad33 Mon Sep 17 00:00:00 2001 From: Artur Rojek Date: Sun, 11 Feb 2024 20:34:51 +0100 Subject: [PATCH 0573/1406] sh: hd64461: Make setup_hd64461 static Enforce internal linkage for setup_hd64461. This fixes the following error: arch/sh/cchips/hd6446x/hd64461.c:75:12: error: no previous prototype for 'setup_hd64461' [-Werror=missing-prototypes] Signed-off-by: Artur Rojek Reviewed-by: John Paul Adrian Glaubitz Link: https://lore.kernel.org/r/20240211193451.106795-1-contact@artur-rojek.eu Signed-off-by: John Paul Adrian Glaubitz --- arch/sh/cchips/hd6446x/hd64461.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/cchips/hd6446x/hd64461.c b/arch/sh/cchips/hd6446x/hd64461.c index f3fba967445aca..81764882d87d37 100644 --- a/arch/sh/cchips/hd6446x/hd64461.c +++ b/arch/sh/cchips/hd6446x/hd64461.c @@ -72,7 +72,7 @@ static void hd64461_irq_demux(struct irq_desc *desc) } } -int __init setup_hd64461(void) +static int __init setup_hd64461(void) { int irq_base, i; From f1fa94763a55c00ff775cbeacd58e7bb15964dd7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Feb 2024 21:01:04 -0500 Subject: [PATCH 0574/1406] bcachefs: Check for subvolume children when deleting subvolumes Recursively destroying subvolumes isn't allowed yet. 
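The check itself is small: peek the subvolume_children btree for any key belonging to the subvolume, and have the unlink path treat a hit as an error. Simplified caller pattern, taken from the diff below:

	/* refuse to unlink a subvolume that still has child subvolumes */
	if (inode_u->bi_subvol) {
		ret = bch2_subvol_has_children(trans, inode_u->bi_subvol);
		if (ret)	/* -BCH_ERR_ENOTEMPTY_subvol_not_empty */
			goto err;
	}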
Fixes: https://github.com/koverstreet/bcachefs/issues/634 Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 2 +- fs/bcachefs/errcode.h | 2 ++ fs/bcachefs/fs-common.c | 23 +++++++++++++++-------- fs/bcachefs/inode.c | 3 ++- fs/bcachefs/subvolume.c | 13 +++++++++++++ fs/bcachefs/subvolume.h | 1 + 6 files changed, 34 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 52b350f8a3f108..b5ee11b50f5c09 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -522,7 +522,7 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot) SPOS(dir, 0, snapshot), POS(dir, U64_MAX), 0, k, ret) if (k.k->type == KEY_TYPE_dirent) { - ret = -ENOTEMPTY; + ret = -BCH_ERR_ENOTEMPTY_dir_not_empty; break; } bch2_trans_iter_exit(trans, &iter); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index a82a9d754fdab0..fe3fc14d3c9a19 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -109,6 +109,8 @@ x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ x(ENOENT, ENOENT_dev_not_found) \ x(ENOENT, ENOENT_dev_idx_not_found) \ + x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ + x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ x(0, open_buckets_empty) \ x(0, freelist_empty) \ x(BCH_ERR_freelist_empty, no_buckets_found) \ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 2aa3881105972b..624e6f963240f8 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -243,7 +243,7 @@ int bch2_unlink_trans(struct btree_trans *trans, struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *inode_u, const struct qstr *name, - bool deleting_snapshot) + bool deleting_subvol) { struct bch_fs *c = trans->c; struct btree_iter dir_iter = { NULL }; @@ -271,18 +271,25 @@ int bch2_unlink_trans(struct btree_trans *trans, if (ret) goto err; - if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) { + if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) { ret = bch2_empty_dir_trans(trans, inum); if (ret) goto err; } - if (deleting_snapshot && !inode_u->bi_subvol) { + if (deleting_subvol && !inode_u->bi_subvol) { ret = -BCH_ERR_ENOENT_not_subvol; goto err; } - if (deleting_snapshot || inode_u->bi_subvol) { + if (inode_u->bi_subvol) { + /* Recursive subvolume destroy not allowed (yet?) 
*/ + ret = bch2_subvol_has_children(trans, inode_u->bi_subvol); + if (ret) + goto err; + } + + if (deleting_subvol || inode_u->bi_subvol) { ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol); if (ret) goto err; @@ -479,10 +486,10 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } - if (S_ISDIR(dst_inode_u->bi_mode) && - bch2_empty_dir_trans(trans, dst_inum)) { - ret = -ENOTEMPTY; - goto err; + if (S_ISDIR(dst_inode_u->bi_mode)) { + ret = bch2_empty_dir_trans(trans, dst_inum); + if (ret) + goto err; } } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 414aebe17fd335..f9a566c52d61c7 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1118,7 +1118,8 @@ static int may_delete_deleted_inode(struct btree_trans *trans, if (S_ISDIR(inode.bi_mode)) { ret = bch2_empty_dir_snapshot(trans, pos.offset, pos.snapshot); - if (fsck_err_on(ret == -ENOTEMPTY, c, deleted_inode_is_dir, + if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY), + c, deleted_inode_is_dir, "non empty directory %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 68be3a450ca12f..ce7aed12194238 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -262,6 +262,19 @@ int bch2_subvolume_trigger(struct btree_trans *trans, return 0; } +int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) +{ + struct btree_iter iter; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0); + struct bkey_s_c k = bch2_btree_iter_peek(&iter); + bch2_trans_iter_exit(trans, &iter); + + return bkey_err(k) ?: k.k && k.k->p.inode == subvol + ? -BCH_ERR_ENOTEMPTY_subvol_not_empty + : 0; +} + static __always_inline int bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, bool inconsistent_if_not_found, diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index b6f1dfca7b80c4..4045a180154a36 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -22,6 +22,7 @@ int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, .min_val_size = 16, \ }) +int bch2_subvol_has_children(struct btree_trans *, u32); int bch2_subvolume_get(struct btree_trans *, unsigned, bool, int, struct bch_subvolume *); int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); From 0c6c5d8b2ecb2443f1e737d5ae7276b914be40d4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 23 Jan 2024 00:01:07 -0500 Subject: [PATCH 0575/1406] bcachefs: Pin btree cache in ram for random access in fsck Various phases of fsck involve checking references from one btree to another: this means doing a sequential scan of one btree, and then mostly random access into the second. This is particularly painful for checking extents <-> backpointers; we can prefetch btree node access on the sequential scan, but not on the random access portion, and this is particularly painful on spinning rust, where we'd like to keep the pipeline fairly full of btree node reads so that the elevator can reduce seeking. This patch implements prefetching and pinning of the portion of the btree that we'll be doing random access to. We already calculate how much of the random access btree will fit in memory so it's a fairly straightforward change. This will put more pressure on system memory usage, so we introduce a new option, fsck_memory_usage_percent, which is the percentage of total system ram that fsck is allowed to pin. 
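A rough worked example of the sizing, assuming the default 256 KiB btree node size: on a 16 GiB machine with the default fsck_memory_usage_percent of 50, fsck may pin about 8 GiB of nodes, i.e. roughly 32768 nodes per pass. The budget itself is just a percentage of total RAM (variable names here are illustrative):

	/* e.g. 16 GiB RAM, fsck_memory_usage_percent = 50:
	 *   budget = 16 GiB * 50 / 100          =  8 GiB
	 *   nodes  =  8 GiB / 256 KiB per node  = 32768
	 */
	u64 budget = div_u64(total_ram_bytes * fsck_memory_usage_percent, 100);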
Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 137 +++++++++++++------------------------ fs/bcachefs/bbpos_types.h | 2 +- fs/bcachefs/btree_cache.c | 13 ++++ fs/bcachefs/btree_types.h | 6 ++ fs/bcachefs/opts.h | 5 ++ 5 files changed, 72 insertions(+), 91 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 34d20e099dcfd3..23fe9378fb40aa 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -552,60 +552,61 @@ static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) }; } -static size_t btree_nodes_fit_in_ram(struct bch_fs *c) +static u64 mem_may_pin_bytes(struct bch_fs *c) { struct sysinfo i; - u64 mem_bytes; - si_meminfo(&i); - mem_bytes = i.totalram * i.mem_unit; - return div_u64(mem_bytes >> 1, c->opts.btree_node_size); + + u64 mem_bytes = i.totalram * i.mem_unit; + return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100); +} + +static size_t btree_nodes_fit_in_ram(struct bch_fs *c) +{ + return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size); } static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, - unsigned btree_leaf_mask, - unsigned btree_interior_mask, + u64 btree_leaf_mask, + u64 btree_interior_mask, struct bbpos start, struct bbpos *end) { - struct btree_iter iter; - struct bkey_s_c k; - size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); - enum btree_id btree; + struct bch_fs *c = trans->c; + s64 mem_may_pin = mem_may_pin_bytes(c); int ret = 0; - for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) { - unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2; + btree_interior_mask |= btree_leaf_mask; + + c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask; + c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask; + c->btree_cache.pinned_nodes_start = start; + c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX; + + for (enum btree_id btree = start.btree; + btree < BTREE_ID_NR && !ret; + btree++) { + unsigned depth = ((1U << btree) & btree_leaf_mask) ? 0 : 1; + struct btree_iter iter; + struct btree *b; if (!((1U << btree) & btree_leaf_mask) && !((1U << btree) & btree_interior_mask)) continue; - bch2_trans_node_iter_init(trans, &iter, btree, - btree == start.btree ? start.pos : POS_MIN, - 0, depth, 0); - /* - * for_each_btree_key_contineu() doesn't check the return value - * from bch2_btree_iter_advance(), which is needed when - * iterating over interior nodes where we'll see keys at - * SPOS_MAX: - */ - do { - k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0); - ret = bkey_err(k); - if (!k.k || ret) - break; - - --btree_nodes; - if (!btree_nodes) { - *end = BBPOS(btree, k.k->p); + __for_each_btree_node(trans, iter, btree, + btree == start.btree ? start.pos : POS_MIN, + 0, depth, BTREE_ITER_PREFETCH, b, ret) { + mem_may_pin -= btree_buf_bytes(b); + if (mem_may_pin <= 0) { + c->btree_cache.pinned_nodes_end = *end = + BBPOS(btree, b->key.k.p); bch2_trans_iter_exit(trans, &iter); return 0; } - } while (bch2_btree_iter_advance(&iter)); + } bch2_trans_iter_exit(trans, &iter); } - *end = BBPOS_MAX; return ret; } @@ -663,62 +664,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, return 0; } -static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c, - struct bpos bucket) -{ - return bch2_dev_exists2(c, bucket.inode) - ? 
bucket_pos_to_bp(c, bucket, 0) - : bucket; -} - -static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, - struct bpos start, struct bpos *end) -{ - struct btree_iter alloc_iter; - struct btree_iter bp_iter; - struct bkey_s_c alloc_k, bp_k; - size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); - bool alloc_end = false, bp_end = false; - int ret = 0; - - bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc, - start, 0, 1, 0); - bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0); - while (1) { - alloc_k = !alloc_end - ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0) - : bkey_s_c_null; - bp_k = !bp_end - ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0) - : bkey_s_c_null; - - ret = bkey_err(alloc_k) ?: bkey_err(bp_k); - if ((!alloc_k.k && !bp_k.k) || ret) { - *end = SPOS_MAX; - break; - } - - --btree_nodes; - if (!btree_nodes) { - *end = alloc_k.k ? alloc_k.k->p : SPOS_MAX; - break; - } - - if (bpos_lt(alloc_iter.pos, SPOS_MAX) && - bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) { - if (!bch2_btree_iter_advance(&alloc_iter)) - alloc_end = true; - } else { - if (!bch2_btree_iter_advance(&bp_iter)) - bp_end = true; - } - } - bch2_trans_iter_exit(trans, &bp_iter); - bch2_trans_iter_exit(trans, &alloc_iter); - return ret; -} - int bch2_check_extents_to_backpointers(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); @@ -729,10 +674,16 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) bkey_init(&s.last_flushed.k->k); while (1) { - ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end); + struct bbpos end; + ret = bch2_get_btree_in_memory_pos(trans, + BIT_ULL(BTREE_ID_backpointers), + BIT_ULL(BTREE_ID_backpointers), + BBPOS(BTREE_ID_backpointers, s.bucket_start), &end); if (ret) break; + s.bucket_end = end.pos; + if ( bpos_eq(s.bucket_start, POS_MIN) && !bpos_eq(s.bucket_end, SPOS_MAX)) bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", @@ -760,6 +711,9 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) bch2_trans_put(trans); bch2_bkey_buf_exit(&s.last_flushed, c); + c->btree_cache.pinned_nodes_leaf_mask = 0; + c->btree_cache.pinned_nodes_interior_mask = 0; + bch_err_fn(c, ret); return ret; } @@ -865,6 +819,9 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) } bch2_trans_put(trans); + c->btree_cache.pinned_nodes_leaf_mask = 0; + c->btree_cache.pinned_nodes_interior_mask = 0; + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h index 5198e94cf3b89c..f63893344f80aa 100644 --- a/fs/bcachefs/bbpos_types.h +++ b/fs/bcachefs/bbpos_types.h @@ -13,6 +13,6 @@ static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) } #define BBPOS_MIN BBPOS(0, POS_MIN) -#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) +#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, SPOS_MAX) #endif /* _BCACHEFS_BBPOS_TYPES_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 37ec3dbde4eee4..799750464969a5 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bbpos.h" #include "bkey_buf.h" #include "btree_cache.h" #include "btree_io.h" @@ -208,6 +209,18 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) int ret = 0; lockdep_assert_held(&bc->lock); + + struct bbpos pos = 
BBPOS(b->c.btree_id, b->key.k.p); + + u64 mask = b->c.level + ? bc->pinned_nodes_interior_mask + : bc->pinned_nodes_leaf_mask; + + if ((mask & BIT_ULL(b->c.btree_id)) && + bbpos_cmp(bc->pinned_nodes_start, pos) < 0 && + bbpos_cmp(bc->pinned_nodes_end, pos) >= 0) + return -BCH_ERR_ENOMEM_btree_node_reclaim; + wait_on_io: if (b->flags & ((1U << BTREE_NODE_dirty)| (1U << BTREE_NODE_read_in_flight)| diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index f163257724a9bf..b2ebf143c3b7d6 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -6,6 +6,7 @@ #include #include +#include "bbpos_types.h" #include "btree_key_cache_types.h" #include "buckets_types.h" #include "errcode.h" @@ -173,6 +174,11 @@ struct btree_cache { */ struct task_struct *alloc_lock; struct closure_waitlist alloc_wait; + + struct bbpos pinned_nodes_start; + struct bbpos pinned_nodes_end; + u64 pinned_nodes_leaf_mask; + u64 pinned_nodes_interior_mask; }; struct btree_node_iter { diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 9a4b7faa376503..f8c2341e8d3d32 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -332,6 +332,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Run fsck on mount") \ + x(fsck_memory_usage_percent, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_UINT(20, 70), \ + BCH2_NO_SB_OPT, 50, \ + NULL, "Maximum percentage of system ram fsck is allowed to pin")\ x(fix_errors, u8, \ OPT_FS|OPT_MOUNT, \ OPT_FN(bch2_opt_fix_errors), \ From c42d097fff7551b7194875f8253320d43b1680bd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Feb 2024 20:15:03 -0500 Subject: [PATCH 0576/1406] bcachefs: Save key_cache_path in peek_slot() When bch2_btree_iter_peek_slot() clones the iterator to search for the next key, and then discovers that the key from the cloned iterator is the key we want to return - we also want to save the iter->key_cache_path as well, for the update path. 
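Schematically, peek_slot() clones the iterator, peeks with the clone, and, when the clone's key turns out to be the one to return, must now hand the clone's key cache path back along with the key. Condensed from the function around the one-line fix below:

	bch2_trans_copy_iter(&iter2, iter);
	k = bch2_btree_iter_peek_upto(&iter2, end);

	if (k.k && !bkey_err(k)) {
		/* keep the clone's key_cache_path for the update path */
		swap(iter->key_cache_path, iter2.key_cache_path);
		iter->k = iter2.k;
		k.k = &iter->k;
	}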
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 2195e602abf09b..3139646f659708 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2501,6 +2501,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) k = bch2_btree_iter_peek_upto(&iter2, end); if (k.k && !bkey_err(k)) { + swap(iter->key_cache_path, iter2.key_cache_path); iter->k = iter2.k; k.k = &iter->k; } From 527c703fb297b56629ef1e92460c932220068360 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Feb 2024 20:16:41 -0500 Subject: [PATCH 0577/1406] bcachefs: Track iter->ip_allocated at bch2_trans_copy_iter() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 3139646f659708..07b1de5cdee6e6 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2761,6 +2761,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) struct btree_trans *trans = src->trans; *dst = *src; +#ifdef TRACK_PATH_ALLOCATED + dst->ip_allocated = _RET_IP_; +#endif if (src->path) __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT); if (src->update_path) From ab41b35b22fd604001cee22dad2efd12ba2a1858 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Feb 2024 15:17:14 -0500 Subject: [PATCH 0578/1406] bcachefs: Use kvzalloc() when dynamically allocating btree paths This silences a mm/page_alloc.c warning about allocating more than a page with GFP_NOFAIL - and there's no reason for this to not have a vmalloc fallback anyway. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 07b1de5cdee6e6..20b32c71b20af9 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1520,7 +1520,7 @@ static noinline void btree_paths_realloc(struct btree_trans *trans) { unsigned nr = trans->nr_paths * 2; - void *p = kzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) + + void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) + sizeof(struct btree_trans_paths) + nr * sizeof(struct btree_path) + nr * sizeof(btree_path_idx_t) + 8 + @@ -3087,7 +3087,7 @@ void bch2_trans_put(struct btree_trans *trans) trans->paths = NULL; if (paths_allocated != trans->_paths_allocated) - kfree_rcu_mightsleep(paths_allocated); + kvfree_rcu_mightsleep(paths_allocated); if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) mempool_free(trans->mem, &c->btree_trans_mem_pool); From a249625492a65fe7eb79f9c5b931ad10fe3736a0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Feb 2024 15:19:22 -0500 Subject: [PATCH 0579/1406] bcachefs: Improve error messages in device remove path Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 8c6caebf843174..742de2f383c562 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1592,27 +1592,27 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) __bch2_dev_read_only(c, ca); ret = bch2_dev_data_drop(c, ca->dev_idx, flags); - bch_err_msg(ca, ret, "dropping data"); + bch_err_msg(ca, ret, "bch2_dev_data_drop()"); if (ret) goto err; ret = bch2_dev_remove_alloc(c, ca); - bch_err_msg(ca, ret, "deleting alloc info"); + bch_err_msg(ca, 
ret, "bch2_dev_remove_alloc()"); if (ret) goto err; ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); - bch_err_msg(ca, ret, "flushing journal"); + bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()"); if (ret) goto err; ret = bch2_journal_flush(&c->journal); - bch_err(ca, "journal error"); + bch_err_msg(ca, ret, "bch2_journal_flush()"); if (ret) goto err; ret = bch2_replicas_gc2(c); - bch_err_msg(ca, ret, "in replicas_gc2()"); + bch_err_msg(ca, ret, "bch2_replicas_gc2()"); if (ret) goto err; From 873da86d4817022921a01e2d483bd0fed9183cac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Feb 2024 17:15:29 -0500 Subject: [PATCH 0580/1406] bcachefs: bch2_print_opts() Make sure early error messages get redirected, for kernel-fsck-from-userland. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 +++ fs/bcachefs/super-io.c | 13 +++++++------ fs/bcachefs/super.c | 17 +++++++++++++++++ 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 3b48c5e133b5b5..4d04e9c04dc348 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -267,6 +267,9 @@ do { \ #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") +__printf(2, 3) +void bch2_print_opts(struct bch_opts *, const char *, ...); + __printf(2, 3) void __bch2_print(struct bch_fs *c, const char *fmt, ...); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 36988add581fb5..a3a9e85ab03c0b 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -717,6 +717,7 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts, if (IS_ERR(sb->bdev_handle)) { ret = PTR_ERR(sb->bdev_handle); + prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret)); goto err; } sb->bdev = sb->bdev_handle->bdev; @@ -743,9 +744,9 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts, prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n", path, err.buf); if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg) - printk(KERN_INFO "%s", err2.buf); + bch2_print_opts(opts, KERN_INFO "%s", err2.buf); else - printk(KERN_ERR "%s", err2.buf); + bch2_print_opts(opts, KERN_ERR "%s", err2.buf); printbuf_exit(&err2); printbuf_reset(&err); @@ -808,16 +809,16 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts, ret = bch2_sb_validate(sb, &err, READ); if (ret) { - printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n", - path, err.buf); + bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n", + path, err.buf); goto err_no_print; } out: printbuf_exit(&err); return ret; err: - printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n", - path, err.buf); + bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n", + path, err.buf); err_no_print: bch2_free_super(sb); goto out; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 742de2f383c562..a7f9de220d903f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -87,6 +87,23 @@ const char * const bch2_fs_flag_strs[] = { NULL }; +void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...) +{ + struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio; + + va_list args; + va_start(args, fmt); + if (likely(!stdio)) { + vprintk(fmt, args); + } else { + if (fmt[0] == KERN_SOH[0]) + fmt += 2; + + stdio_redirect_vprintf(stdio, true, fmt, args); + } + va_end(args); +} + void __bch2_print(struct bch_fs *c, const char *fmt, ...) 
{ struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); From d741c84848a7c2d81b8f60b592622723b2d2b8c1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 7 Feb 2024 11:43:32 -0800 Subject: [PATCH 0581/1406] thread_with_file: allow creation of readonly files Create a new run_thread_with_stdout function that opens a file in O_RDONLY mode so that the kernel can write things to userspace but userspace cannot write to the kernel. This will be used to convey xfs health event information to userspace. Signed-off-by: Darrick J. Wong Signed-off-by: Kent Overstreet --- include/linux/thread_with_file.h | 3 +++ lib/thread_with_file.c | 36 ++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/include/linux/thread_with_file.h b/include/linux/thread_with_file.h index 54091f7ff3383d..5f7e85bc8322b4 100644 --- a/include/linux/thread_with_file.h +++ b/include/linux/thread_with_file.h @@ -62,6 +62,9 @@ struct thread_with_stdio { int run_thread_with_stdio(struct thread_with_stdio *, void (*exit)(struct thread_with_stdio *), void (*fn)(struct thread_with_stdio *)); +int run_thread_with_stdout(struct thread_with_stdio *, + void (*exit)(struct thread_with_stdio *), + void (*fn)(struct thread_with_stdio *)); int stdio_redirect_read(struct stdio_redirect *, char *, size_t); int stdio_redirect_readline(struct stdio_redirect *, char *, size_t); diff --git a/lib/thread_with_file.c b/lib/thread_with_file.c index b09dc60ba62804..71028611b8d59e 100644 --- a/lib/thread_with_file.c +++ b/lib/thread_with_file.c @@ -344,6 +344,22 @@ static int thread_with_stdio_release(struct inode *inode, struct file *file) return 0; } +static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + poll_wait(file, &thr->stdio.output.wait, wait); + + __poll_t mask = 0; + + if (stdio_redirect_has_output(&thr->stdio)) + mask |= EPOLLIN; + if (thr->thr.done) + mask |= EPOLLHUP|EPOLLERR; + return mask; +} + static const struct file_operations thread_with_stdio_fops = { .llseek = no_llseek, .read = thread_with_stdio_read, @@ -352,6 +368,13 @@ static const struct file_operations thread_with_stdio_fops = { .release = thread_with_stdio_release, }; +static const struct file_operations thread_with_stdout_fops = { + .llseek = no_llseek, + .read = thread_with_stdio_read, + .poll = thread_with_stdout_poll, + .release = thread_with_stdio_release, +}; + static int thread_with_stdio_fn(void *arg) { struct thread_with_stdio *thr = arg; @@ -375,5 +398,18 @@ int run_thread_with_stdio(struct thread_with_stdio *thr, } EXPORT_SYMBOL_GPL(run_thread_with_stdio); +int run_thread_with_stdout(struct thread_with_stdio *thr, + void (*exit)(struct thread_with_stdio *), + void (*fn)(struct thread_with_stdio *)) +{ + stdio_buf_init(&thr->stdio.input); + stdio_buf_init(&thr->stdio.output); + thr->exit = exit; + thr->fn = fn; + + return run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn); +} +EXPORT_SYMBOL_GPL(run_thread_with_stdout); + MODULE_AUTHOR("Kent Overstreet"); MODULE_LICENSE("GPL"); From 45439fea63e241dd34e5ed9fdc600787d5898792 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 7 Feb 2024 11:39:03 -0800 Subject: [PATCH 0582/1406] thread_with_file: fix various printf problems Experimentally fix some problems with stdio_redirect_vprintf by creating a MOO variant with which we can experiment. 
We can't do a GFP_KERNEL allocation while holding the spinlock, and I don't like how the printf function can silently truncate the output if memory allocation fails. Signed-off-by: Darrick J. Wong Signed-off-by: Kent Overstreet --- include/linux/thread_with_file.h | 4 +-- lib/thread_with_file.c | 55 +++++++++++++++++++++----------- 2 files changed, 39 insertions(+), 20 deletions(-) diff --git a/include/linux/thread_with_file.h b/include/linux/thread_with_file.h index 5f7e85bc8322b4..7b133a15d3540e 100644 --- a/include/linux/thread_with_file.h +++ b/include/linux/thread_with_file.h @@ -68,7 +68,7 @@ int run_thread_with_stdout(struct thread_with_stdio *, int stdio_redirect_read(struct stdio_redirect *, char *, size_t); int stdio_redirect_readline(struct stdio_redirect *, char *, size_t); -__printf(3, 0) void stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list); -__printf(3, 4) void stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...); +__printf(3, 0) ssize_t stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list); +__printf(3, 4) ssize_t stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...); #endif /* _LINUX_THREAD_WITH_FILE_H */ diff --git a/lib/thread_with_file.c b/lib/thread_with_file.c index 71028611b8d59e..70a805ef017f96 100644 --- a/lib/thread_with_file.c +++ b/lib/thread_with_file.c @@ -108,49 +108,68 @@ int stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len EXPORT_SYMBOL_GPL(stdio_redirect_readline); __printf(3, 0) -static void darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args) +static ssize_t darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args) { - size_t len; + ssize_t ret; do { va_list args2; - va_copy(args2, args); + size_t len; + va_copy(args2, args); len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2); - } while (len + 1 > darray_room(*out) && !darray_make_room_gfp(out, len + 1, gfp)); + if (len + 1 <= darray_room(*out)) { + out->nr += len; + return len; + } - out->nr += min(len, darray_room(*out)); + ret = darray_make_room_gfp(out, len + 1, gfp); + } while (ret == 0); + + return ret; } -void stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking, - const char *fmt, va_list args) +ssize_t stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking, + const char *fmt, va_list args) { struct stdio_buf *buf = &stdio->output; unsigned long flags; + ssize_t ret; - if (!nonblocking) - wait_event(buf->wait, stdio_redirect_has_output_space(stdio)); - else if (!stdio_redirect_has_output_space(stdio)) - return; - if (stdio->done) - return; - +again: spin_lock_irqsave(&buf->lock, flags); - darray_vprintf(&buf->buf, nonblocking ? GFP_NOWAIT : GFP_KERNEL, fmt, args); + ret = darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args); spin_unlock_irqrestore(&buf->lock, flags); + if (ret < 0) { + if (nonblocking) + return -EAGAIN; + + ret = wait_event_interruptible(buf->wait, + stdio_redirect_has_output_space(stdio)); + if (ret) + return ret; + goto again; + } + wake_up(&buf->wait); + return ret; + } EXPORT_SYMBOL_GPL(stdio_redirect_vprintf); -void stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking, - const char *fmt, ...) +ssize_t stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking, + const char *fmt, ...) 
{ va_list args; + ssize_t ret; + va_start(args, fmt); - stdio_redirect_vprintf(stdio, nonblocking, fmt, args); + ret = stdio_redirect_vprintf(stdio, nonblocking, fmt, args); va_end(args); + + return ret; } EXPORT_SYMBOL_GPL(stdio_redirect_printf); From 41fabbfc56cb7e6e891ef05042f668a7473a55a2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sat, 10 Feb 2024 11:23:01 -0800 Subject: [PATCH 0583/1406] thread_with_file: create ops structure for thread_with_stdio Create an ops structure so we can add more file-based functionality in the next few patches. Signed-off-by: Darrick J. Wong Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 18 ++++++++++++------ include/linux/thread_with_file.h | 16 ++++++++++------ lib/thread_with_file.c | 16 ++++++---------- 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 4cbda66bb6e0fa..a2f30f45f93f75 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -165,6 +165,11 @@ static void bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) bch2_fs_stop(c); } +static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { + .exit = bch2_fsck_thread_exit, + .fn = bch2_fsck_offline_thread_fn, +}; + static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) { struct bch_ioctl_fsck_offline arg; @@ -217,9 +222,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); - ret = run_thread_with_stdio(&thr->thr, - bch2_fsck_thread_exit, - bch2_fsck_offline_thread_fn); + ret = run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops); err: if (ret < 0) { if (thr) @@ -794,6 +797,11 @@ static void bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) bch2_ro_ref_put(c); } +static const struct thread_with_stdio_ops bch2_online_fsck_ops = { + .exit = bch2_fsck_thread_exit, + .fn = bch2_fsck_online_thread_fn, +}; + static long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) { @@ -834,9 +842,7 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c, goto err; } - ret = run_thread_with_stdio(&thr->thr, - bch2_fsck_thread_exit, - bch2_fsck_online_thread_fn); + ret = run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); err: if (ret < 0) { bch_err_fn(c, ret); diff --git a/include/linux/thread_with_file.h b/include/linux/thread_with_file.h index 7b133a15d3540e..445b1b12a0bd61 100644 --- a/include/linux/thread_with_file.h +++ b/include/linux/thread_with_file.h @@ -52,19 +52,23 @@ int run_thread_with_file(struct thread_with_file *, const struct file_operations *, int (*fn)(void *)); +struct thread_with_stdio; + +struct thread_with_stdio_ops { + void (*exit)(struct thread_with_stdio *); + void (*fn)(struct thread_with_stdio *); +}; + struct thread_with_stdio { struct thread_with_file thr; struct stdio_redirect stdio; - void (*exit)(struct thread_with_stdio *); - void (*fn)(struct thread_with_stdio *); + const struct thread_with_stdio_ops *ops; }; int run_thread_with_stdio(struct thread_with_stdio *, - void (*exit)(struct thread_with_stdio *), - void (*fn)(struct thread_with_stdio *)); + const struct thread_with_stdio_ops *); int run_thread_with_stdout(struct thread_with_stdio *, - void (*exit)(struct thread_with_stdio *), - void (*fn)(struct thread_with_stdio *)); + const struct thread_with_stdio_ops *); int stdio_redirect_read(struct stdio_redirect *, char *, size_t); int stdio_redirect_readline(struct stdio_redirect *, char *, size_t); 
diff --git a/lib/thread_with_file.c b/lib/thread_with_file.c index 70a805ef017f96..2edf33c3e7dc53 100644 --- a/lib/thread_with_file.c +++ b/lib/thread_with_file.c @@ -359,7 +359,7 @@ static int thread_with_stdio_release(struct inode *inode, struct file *file) thread_with_file_exit(&thr->thr); darray_exit(&thr->stdio.input.buf); darray_exit(&thr->stdio.output.buf); - thr->exit(thr); + thr->ops->exit(thr); return 0; } @@ -398,33 +398,29 @@ static int thread_with_stdio_fn(void *arg) { struct thread_with_stdio *thr = arg; - thr->fn(thr); + thr->ops->fn(thr); thread_with_stdio_done(thr); return 0; } int run_thread_with_stdio(struct thread_with_stdio *thr, - void (*exit)(struct thread_with_stdio *), - void (*fn)(struct thread_with_stdio *)) + const struct thread_with_stdio_ops *ops) { stdio_buf_init(&thr->stdio.input); stdio_buf_init(&thr->stdio.output); - thr->exit = exit; - thr->fn = fn; + thr->ops = ops; return run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn); } EXPORT_SYMBOL_GPL(run_thread_with_stdio); int run_thread_with_stdout(struct thread_with_stdio *thr, - void (*exit)(struct thread_with_stdio *), - void (*fn)(struct thread_with_stdio *)) + const struct thread_with_stdio_ops *ops) { stdio_buf_init(&thr->stdio.input); stdio_buf_init(&thr->stdio.output); - thr->exit = exit; - thr->fn = fn; + thr->ops = ops; return run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn); } From f7db35a6d6bde8d971c6959c322bb3388a02a7db Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sat, 10 Feb 2024 11:32:20 -0800 Subject: [PATCH 0584/1406] thread_with_file: allow ioctls against these files Make it so that a thread_with_stdio user can handle ioctls against the file descriptor. Signed-off-by: Darrick J. Wong Signed-off-by: Kent Overstreet --- include/linux/thread_with_file.h | 1 + lib/thread_with_file.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/include/linux/thread_with_file.h b/include/linux/thread_with_file.h index 445b1b12a0bd61..33770938d5d9a8 100644 --- a/include/linux/thread_with_file.h +++ b/include/linux/thread_with_file.h @@ -57,6 +57,7 @@ struct thread_with_stdio; struct thread_with_stdio_ops { void (*exit)(struct thread_with_stdio *); void (*fn)(struct thread_with_stdio *); + long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long); }; struct thread_with_stdio { diff --git a/lib/thread_with_file.c b/lib/thread_with_file.c index 2edf33c3e7dc53..8b129744a48a33 100644 --- a/lib/thread_with_file.c +++ b/lib/thread_with_file.c @@ -379,12 +379,23 @@ static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_str return mask; } +static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + if (thr->ops->unlocked_ioctl) + return thr->ops->unlocked_ioctl(thr, cmd, p); + return -ENOTTY; +} + static const struct file_operations thread_with_stdio_fops = { .llseek = no_llseek, .read = thread_with_stdio_read, .write = thread_with_stdio_write, .poll = thread_with_stdio_poll, .release = thread_with_stdio_release, + .unlocked_ioctl = thread_with_stdio_ioctl, }; static const struct file_operations thread_with_stdout_fops = { @@ -392,6 +403,7 @@ static const struct file_operations thread_with_stdout_fops = { .read = thread_with_stdio_read, .poll = thread_with_stdout_poll, .release = thread_with_stdio_release, + .unlocked_ioctl = thread_with_stdio_ioctl, }; 
static int thread_with_stdio_fn(void *arg) From 2e113aa5413606cc54c3fc5e17cb6dd444d3c64f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Feb 2024 20:26:09 -0500 Subject: [PATCH 0585/1406] thread_with_file: Fix missing va_end() Fixes: https://lore.kernel.org/linux-bcachefs/202402131603.E953E2CF@keescook/T/#u Reported-by: coverity scan Signed-off-by: Kent Overstreet --- lib/thread_with_file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/thread_with_file.c b/lib/thread_with_file.c index 8b129744a48a33..37a1ea22823cae 100644 --- a/lib/thread_with_file.c +++ b/lib/thread_with_file.c @@ -118,6 +118,8 @@ static ssize_t darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_l va_copy(args2, args); len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2); + va_end(args2); + if (len + 1 <= darray_room(*out)) { out->nr += len; return len; From fa5c69daa8321b65fdc3dd0f0b177568ce5c994f Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 15 Feb 2024 12:16:05 -0500 Subject: [PATCH 0586/1406] bcachefs: fix iov_iter count underflow on sub-block dio read bch2_direct_IO_read() checks the request offset and size for sector alignment and then falls through to a couple calculations to shrink the size of the request based on the inode size. The problem is that these checks round up to the fs block size, which runs the risk of underflowing iter->count if the block size happens to be large enough. This is triggered by fstest generic/361 with a 4k block size, which subsequently leads to a crash. To avoid this crash, check that the shorten length doesn't exceed the overall length of the iter. Signed-off-by: Brian Foster Reviewed-by: Su Yue Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-direct.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index e3b219e19e1008..33cb6da3a5ad28 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -88,6 +88,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) return ret; shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); + if (shorten >= iter->count) + shorten = 0; iter->count -= shorten; bio = bio_alloc_bioset(NULL, From c5327d8acaa1dd1bc4b9d9317541dcf2ebafee96 Mon Sep 17 00:00:00 2001 From: Guoyu Ou Date: Tue, 13 Feb 2024 16:20:04 +0800 Subject: [PATCH 0587/1406] bcachefs: skip invisible entries in empty subvolume checking When we are checking whether a subvolume is empty in the specified snapshot, entries that do not belong to this subvolume should be skipped. This fixes the following case: $ bcachefs subvolume create ./sub $ cd sub $ bcachefs subvolume create ./sub2 $ bcachefs subvolume snapshot . ./snap $ ls -a snap . .. $ rmdir snap rmdir: failed to remove 'snap': Directory not empty As Kent suggested, we pass 0 in may_delete_deleted_inode() to ignore subvols in the subvol we are checking, because inode.bi_subvol is only set on subvolume roots, and we can't go through every inode in the subvolume and change bi_subvol when taking a snapshot. It makes the check less strict, but that's ok, the rest of fsck will still catch it. 
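In effect, the emptiness scan now filters out directory entries owned by a different subvolume. The heart of the change is one skip at the top of the loop body; a condensed sketch using the names from the diff below:

	if (k.k->type == KEY_TYPE_dirent) {
		struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

		/* Dirents owned by another subvolume are invisible here; skip them. */
		if (d.v->d_type == DT_SUBVOL &&
		    le32_to_cpu(d.v->d_parent_subvol) != subvol)
			continue;
		ret = -BCH_ERR_ENOTEMPTY_dir_not_empty;	/* any visible entry: not empty */
		break;
	}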
Signed-off-by: Guoyu Ou Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 7 +++++-- fs/bcachefs/dirent.h | 2 +- fs/bcachefs/inode.c | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index b5ee11b50f5c09..d37bd07afbfe40 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -512,7 +512,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, return ret; } -int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot) +int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot) { struct btree_iter iter; struct bkey_s_c k; @@ -522,6 +522,9 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot) SPOS(dir, 0, snapshot), POS(dir, U64_MAX), 0, k, ret) if (k.k->type == KEY_TYPE_dirent) { + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol) + continue; ret = -BCH_ERR_ENOTEMPTY_dir_not_empty; break; } @@ -535,7 +538,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) u32 snapshot; return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?: - bch2_empty_dir_snapshot(trans, dir.inum, snapshot); + bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot); } int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 34cb8e18eaf897..bee55cca2aa0dd 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -69,7 +69,7 @@ u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, const struct bch_hash_info *, const struct qstr *, subvol_inum *); -int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32); +int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32); int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index f9a566c52d61c7..a3139bb66f77e2 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1117,7 +1117,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, goto out; if (S_ISDIR(inode.bi_mode)) { - ret = bch2_empty_dir_snapshot(trans, pos.offset, pos.snapshot); + ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot); if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY), c, deleted_inode_is_dir, "non empty directory %llu:%u in deleted_inodes btree", From a49db63526134bb3cb0641b1057b38eb2a2615d9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 Feb 2024 21:42:10 -0500 Subject: [PATCH 0588/1406] bcachefs: bch2_trigger_alloc() handles state changes better bch2_trigger_alloc() kicks off certain tasks on bucket state changes; e.g. triggering the bucket discard worker and the invalidate worker. We've observed the discard worker running too often - most runs it doesn't do any work, according to the tracepoint - so clearly, we're kicking it off too often. This adds an explicit statechange() macro to make these checks more precise. 
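In other words, statechange() turns these level-triggered checks into edge-triggered ones: its predicate is evaluated against both the old and the new alloc state, and the whole expression is true only on a false-to-true transition. A condensed sketch of the macros and one user (from the diff below; outer parentheses added here for clarity):

	#define eval_state(_a, expr)	({ const struct bch_alloc_v4 *a = _a; expr; })
	#define statechange(expr)	(!eval_state(old_a, expr) && eval_state(new_a, expr))
	#define bucket_flushed(a)	(!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)

	/* Wakes freelist waiters only when a bucket first becomes free and flushed: */
	if (statechange(a->data_type == BCH_DATA_free && bucket_flushed(a)))
		closure_wake_up(&c->freelist_wait);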
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index fd3e175d834232..c7be6afe89553b 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -860,23 +860,28 @@ int bch2_trigger_alloc(struct btree_trans *trans, *bucket_gen(ca, new.k->p.offset) = new_a->gen; bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false); + percpu_up_read(&c->mark_lock); + +#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) +#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) +#define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk) - if (new_a->data_type == BCH_DATA_free && - (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk)) + if (statechange(a->data_type == BCH_DATA_free && + bucket_flushed(a))) closure_wake_up(&c->freelist_wait); - if (new_a->data_type == BCH_DATA_need_discard && - (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk)) + if (statechange(a->data_type == BCH_DATA_need_discard && + bucket_flushed(a)) && + !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset)) bch2_do_discards(c); - if (old_a->data_type != BCH_DATA_cached && - new_a->data_type == BCH_DATA_cached && + if (statechange(a->data_type == BCH_DATA_cached) && + !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) bch2_do_invalidates(c); - if (new_a->data_type == BCH_DATA_need_gc_gens) + if (statechange(a->data_type == BCH_DATA_need_gc_gens)) bch2_do_gc_gens(c); - percpu_up_read(&c->mark_lock); } if ((flags & BTREE_TRIGGER_GC) && From 8bbccdd8c264ccac9d6b7c3beafec093c42996a7 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Sat, 3 Feb 2024 21:26:40 +0000 Subject: [PATCH 0589/1406] cache: ax45mp_cache: Align end size to cache boundary in ax45mp_dma_cache_wback() Align the end size to the cache boundary size in the ax45mp_dma_cache_wback() callback, as is done in the ax45mp_dma_cache_inv() callback. Additionally, return early when start == end. Fixes: d34599bcd2e4 ("cache: Add L2 cache management for Andes AX45MP RISC-V core") Reported-by: Pavel Machek Signed-off-by: Lad Prabhakar Signed-off-by: Conor Dooley --- drivers/cache/ax45mp_cache.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/cache/ax45mp_cache.c b/drivers/cache/ax45mp_cache.c index 57186c58dc849c..1d7dd3d2c101cd 100644 --- a/drivers/cache/ax45mp_cache.c +++ b/drivers/cache/ax45mp_cache.c @@ -129,8 +129,12 @@ static void ax45mp_dma_cache_wback(phys_addr_t paddr, size_t size) unsigned long line_size; unsigned long flags; + if (unlikely(start == end)) + return; + line_size = ax45mp_priv.ax45mp_cache_line_size; start = start & (~(line_size - 1)); + end = ((end + line_size - 1) & (~(line_size - 1))); local_irq_save(flags); ax45mp_cpu_dcache_wb_range(start, end); local_irq_restore(flags); From bd107d86bb292a25feca32a8115152608172ff98 Mon Sep 17 00:00:00 2001 From: Okan Akyuz Date: Fri, 16 Feb 2024 10:52:12 +0300 Subject: [PATCH 0590/1406] hwmon: (max6620) Update broken Datasheet URL in driver documentation The URL for the MAX6620 datasheet has changed. Update it to reflect the current location.
Signed-off-by: Okan Akyuz Link: https://lore.kernel.org/r/20240216075212.69118-1-okanakyuz@okanakyuz.com [groeck: Updated subject and patch description] Signed-off-by: Guenter Roeck --- Documentation/hwmon/max6620.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/hwmon/max6620.rst b/Documentation/hwmon/max6620.rst index 84c1c44d3de4de..d70173bf0242c1 100644 --- a/Documentation/hwmon/max6620.rst +++ b/Documentation/hwmon/max6620.rst @@ -11,7 +11,7 @@ Supported chips: Addresses scanned: none - Datasheet: http://pdfserv.maxim-ic.com/en/ds/MAX6620.pdf + Datasheet: https://www.analog.com/media/en/technical-documentation/data-sheets/max6620.pdf Authors: - L\. Grunenberg From b6dce0452a0276339392bc5eeb722370a466ba25 Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Fri, 9 Feb 2024 16:50:34 +0100 Subject: [PATCH 0591/1406] counter: fix privdata alignment Aligning to the L1 cache does not guarantee the same alignment as kmallocing an object [1]. Furthermore, in some platforms, that alignment is not sufficient for DMA safety (in case someone wants to have a DMA safe buffer in privdata) [2]. Some time ago, we had the same fixes in IIO. [1]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/base/devres.c#n35 [2]: https://lore.kernel.org/linux-iio/20220508175712.647246-2-jic23@kernel.org/ Fixes: c18e2760308e ("counter: Provide alternative counter registration functions") Signed-off-by: Nuno Sa Link: https://lore.kernel.org/r/20240209-counter-align-fix-v2-1-5777ea0a2722@analog.com Signed-off-by: William Breathitt Gray --- drivers/counter/counter-core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/counter/counter-core.c b/drivers/counter/counter-core.c index f6a939d5117176..29df0985f2ba08 100644 --- a/drivers/counter/counter-core.c +++ b/drivers/counter/counter-core.c @@ -31,10 +31,11 @@ struct counter_device_allochelper { struct counter_device counter; /* - * This is cache line aligned to ensure private data behaves like if it - * were kmalloced separately. + * This ensures private data behaves like if it were kmalloced + * separately. Also ensures the minimum alignment for safe DMA + * operations (which may or may not mean cache alignment). */ - unsigned long privdata[] ____cacheline_aligned; + unsigned long privdata[] __aligned(ARCH_DMA_MINALIGN); }; static void counter_device_release(struct device *dev) From 5141fa1ec23fdbe755552e78ef066f632e3226a3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 16 Feb 2024 19:11:34 -0800 Subject: [PATCH 0592/1406] cxl/acpi: Fix load failures due to single window creation failure The expectation is that cxl_parse_cfmws() continues in the face of failure as evidenced by code like: cxlrd = cxl_root_decoder_alloc(root_port, ways, cxl_calc_hb); if (IS_ERR(cxlrd)) return 0; There are other error paths in that function which mistakenly follow idiomatic expectations and return an error when they should not. Most of those mistakes are innocuous checks that hardly ever fail in practice. However, a recent change succeeded in making the implementation more fragile by applying an idiomatic, but still wrong "fix" [1]. In this failure case the kernel reports: cxl root0: Failed to populate active decoder targets cxl_acpi ACPI0017:00: Failed to add decode range: [mem 0x00000000-0x7fffffff flags 0x200] ...which is a real issue with that one window (to be fixed separately), but ends up failing the entirety of cxl_acpi_probe().
Undo that recent breakage while also removing the confusion about ignoring errors. Update all exit paths to return an error per typical expectations and let an outer wrapper function handle dropping the error. Fixes: 91019b5bc7c2 ("cxl/acpi: Return 'rc' instead of '0' in cxl_parse_cfmws()") [1] Cc: Cc: Breno Leitao Cc: Alison Schofield Cc: Vishal Verma Signed-off-by: Dan Williams --- drivers/cxl/acpi.c | 45 +++++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index dcf2b39e104882..53d2dff0c7a33a 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -316,31 +316,27 @@ static const struct cxl_root_ops acpi_root_ops = { .qos_class = cxl_acpi_qos_class, }; -static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, - const unsigned long end) +static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, + struct cxl_cfmws_context *ctx) { int target_map[CXL_DECODER_MAX_INTERLEAVE]; - struct cxl_cfmws_context *ctx = arg; struct cxl_port *root_port = ctx->root_port; struct resource *cxl_res = ctx->cxl_res; struct cxl_cxims_context cxims_ctx; struct cxl_root_decoder *cxlrd; struct device *dev = ctx->dev; - struct acpi_cedt_cfmws *cfmws; cxl_calc_hb_fn cxl_calc_hb; struct cxl_decoder *cxld; unsigned int ways, i, ig; struct resource *res; int rc; - cfmws = (struct acpi_cedt_cfmws *) header; - rc = cxl_acpi_cfmws_verify(dev, cfmws); if (rc) { dev_err(dev, "CFMWS range %#llx-%#llx not registered\n", cfmws->base_hpa, cfmws->base_hpa + cfmws->window_size - 1); - return 0; + return rc; } rc = eiw_to_ways(cfmws->interleave_ways, &ways); @@ -376,7 +372,7 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, cxlrd = cxl_root_decoder_alloc(root_port, ways, cxl_calc_hb); if (IS_ERR(cxlrd)) - return 0; + return PTR_ERR(cxlrd); cxld = &cxlrd->cxlsd.cxld; cxld->flags = cfmws_to_decoder_flags(cfmws->restrictions); @@ -420,16 +416,7 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, put_device(&cxld->dev); else rc = cxl_decoder_autoremove(dev, cxld); - if (rc) { - dev_err(dev, "Failed to add decode range: %pr", res); - return rc; - } - dev_dbg(dev, "add: %s node: %d range [%#llx - %#llx]\n", - dev_name(&cxld->dev), - phys_to_target_node(cxld->hpa_range.start), - cxld->hpa_range.start, cxld->hpa_range.end); - - return 0; + return rc; err_insert: kfree(res->name); @@ -438,6 +425,28 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, return -ENOMEM; } +static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, + const unsigned long end) +{ + struct acpi_cedt_cfmws *cfmws = (struct acpi_cedt_cfmws *)header; + struct cxl_cfmws_context *ctx = arg; + struct device *dev = ctx->dev; + int rc; + + dev_dbg(dev, "decode range: node: %d range [%#llx - %#llx]\n", + phys_to_target_node(cfmws->base_hpa), cfmws->base_hpa, + cfmws->base_hpa + cfmws->window_size - 1); + rc = __cxl_parse_cfmws(cfmws, ctx); + if (rc) + dev_err(dev, + "Failed to add decode range: [%#llx - %#llx] (%d)\n", + cfmws->base_hpa, + cfmws->base_hpa + cfmws->window_size - 1, rc); + + /* never fail cxl_acpi load for a single window failure */ + return 0; +} + __mock struct acpi_device *to_cxl_host_bridge(struct device *host, struct device *dev) { From e6b33455c319e77613f44862d9d19ef63d208862 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 17 Feb 2024 08:16:51 -0800 Subject: [PATCH 0593/1406] MAINTAINERS: Drop redundant hwmon entries I am listed as
maintainer of several individual hardware monitoring drivers and for the hardware monitoring subsystem itself. That is redundant and just bloats the MAINTAINERS file. Drop all the redundant entries. Signed-off-by: Guenter Roeck --- MAINTAINERS | 108 ---------------------------------------------------- 1 file changed, 108 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index afe08a63f7a968..9c14c97d1056ea 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10493,22 +10493,6 @@ L: linux-fbdev@vger.kernel.org S: Orphan F: drivers/video/fbdev/imsttfb.c -INA209 HARDWARE MONITOR DRIVER -M: Guenter Roeck -L: linux-hwmon@vger.kernel.org -S: Maintained -F: Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml -F: Documentation/hwmon/ina209.rst -F: drivers/hwmon/ina209.c - -INA2XX HARDWARE MONITOR DRIVER -M: Guenter Roeck -L: linux-hwmon@vger.kernel.org -S: Maintained -F: Documentation/hwmon/ina2xx.rst -F: drivers/hwmon/ina2xx.c -F: include/linux/platform_data/ina2xx.h - INDEX OF FURTHER KERNEL DOCUMENTATION M: Carlos Bilbao S: Maintained @@ -11484,14 +11468,6 @@ S: Maintained F: arch/x86/include/asm/jailhouse_para.h F: arch/x86/kernel/jailhouse.c -JC42.4 TEMPERATURE SENSOR DRIVER -M: Guenter Roeck -L: linux-hwmon@vger.kernel.org -S: Maintained -F: Documentation/devicetree/bindings/hwmon/jedec,jc42.yaml -F: Documentation/hwmon/jc42.rst -F: drivers/hwmon/jc42.c - JFS FILESYSTEM M: Dave Kleikamp L: jfs-discussion@lists.sourceforge.net @@ -12557,13 +12533,6 @@ F: Documentation/hwmon/lm90.rst F: drivers/hwmon/lm90.c F: include/dt-bindings/thermal/lm90.h -LM95234 HARDWARE MONITOR DRIVER -M: Guenter Roeck -L: linux-hwmon@vger.kernel.org -S: Maintained -F: Documentation/hwmon/lm95234.rst -F: drivers/hwmon/lm95234.c - LME2510 MEDIA DRIVER M: Malcolm Priestley L: linux-media@vger.kernel.org @@ -12767,13 +12736,6 @@ W: https://ez.analog.com/linux-software-drivers F: Documentation/devicetree/bindings/iio/temperature/adi,ltc2983.yaml F: drivers/iio/temperature/ltc2983.c -LTC4261 HARDWARE MONITOR DRIVER -M: Guenter Roeck -L: linux-hwmon@vger.kernel.org -S: Maintained -F: Documentation/hwmon/ltc4261.rst -F: drivers/hwmon/ltc4261.c - LTC4282 HARDWARE MONITOR DRIVER M: Nuno Sa L: linux-hwmon@vger.kernel.org @@ -13129,13 +13091,6 @@ S: Maintained F: Documentation/hwmon/max15301.rst F: drivers/hwmon/pmbus/max15301.c -MAX16065 HARDWARE MONITOR DRIVER -M: Guenter Roeck -L: linux-hwmon@vger.kernel.org -S: Maintained -F: Documentation/hwmon/max16065.rst -F: drivers/hwmon/max16065.c - MAX2175 SDR TUNER DRIVER M: Ramesh Shanmugasundaram L: linux-media@vger.kernel.org @@ -13160,15 +13115,6 @@ S: Orphan F: Documentation/hwmon/max6650.rst F: drivers/hwmon/max6650.c -MAX6697 HARDWARE MONITOR DRIVER -M: Guenter Roeck -L: linux-hwmon@vger.kernel.org -S: Maintained -F: Documentation/devicetree/bindings/hwmon/max6697.txt -F: Documentation/hwmon/max6697.rst -F: drivers/hwmon/max6697.c -F: include/linux/platform_data/max6697.h - MAX9286 QUAD GMSL DESERIALIZER DRIVER M: Jacopo Mondi M: Kieran Bingham @@ -15075,15 +15021,6 @@ M: Samuel Mendoza-Jonas S: Maintained F: net/ncsi/ -NCT6775 HARDWARE MONITOR DRIVER - CORE & PLATFORM DRIVER -M: Guenter Roeck -L: linux-hwmon@vger.kernel.org -S: Maintained -F: Documentation/hwmon/nct6775.rst -F: drivers/hwmon/nct6775-core.c -F: drivers/hwmon/nct6775-platform.c -F: drivers/hwmon/nct6775.h - NCT6775 HARDWARE MONITOR DRIVER - I2C DRIVER M: Zev Weiss L: linux-hwmon@vger.kernel.org @@ -17450,35 +17387,6 @@ S: Maintained F: Documentation/hwmon/pm6764tr.rst F: drivers/hwmon/pmbus/pm6764tr.c -PMBUS 
HARDWARE MONITORING DRIVERS -M: Guenter Roeck -L: linux-hwmon@vger.kernel.org -S: Maintained -W: http://hwmon.wiki.kernel.org/ -W: http://www.roeck-us.net/linux/drivers/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/groeck/linux-staging.git -F: Documentation/devicetree/bindings/hwmon/ltc2978.txt -F: Documentation/devicetree/bindings/hwmon/max31785.txt -F: Documentation/hwmon/adm1275.rst -F: Documentation/hwmon/ibm-cffps.rst -F: Documentation/hwmon/ir35221.rst -F: Documentation/hwmon/lm25066.rst -F: Documentation/hwmon/ltc2978.rst -F: Documentation/hwmon/ltc3815.rst -F: Documentation/hwmon/max16064.rst -F: Documentation/hwmon/max20751.rst -F: Documentation/hwmon/max31785.rst -F: Documentation/hwmon/max34440.rst -F: Documentation/hwmon/max8688.rst -F: Documentation/hwmon/pmbus-core.rst -F: Documentation/hwmon/pmbus.rst -F: Documentation/hwmon/tps40422.rst -F: Documentation/hwmon/ucd9000.rst -F: Documentation/hwmon/ucd9200.rst -F: Documentation/hwmon/zl6100.rst -F: drivers/hwmon/pmbus/ -F: include/linux/pmbus.h - PMC SIERRA MaxRAID DRIVER L: linux-scsi@vger.kernel.org S: Orphan @@ -22182,22 +22090,6 @@ F: drivers/mmc/host/renesas_sdhi* F: drivers/mmc/host/tmio_mmc* F: include/linux/mfd/tmio.h -TMP401 HARDWARE MONITOR DRIVER -M: Guenter Roeck -L: linux-hwmon@vger.kernel.org -S: Maintained -F: Documentation/devicetree/bindings/hwmon/ti,tmp401.yaml -F: Documentation/hwmon/tmp401.rst -F: drivers/hwmon/tmp401.c - -TMP464 HARDWARE MONITOR DRIVER -M: Guenter Roeck -L: linux-hwmon@vger.kernel.org -S: Maintained -F: Documentation/devicetree/bindings/hwmon/ti,tmp464.yaml -F: Documentation/hwmon/tmp464.rst -F: drivers/hwmon/tmp464.c - TMP513 HARDWARE MONITOR DRIVER M: Eric Tremblay L: linux-hwmon@vger.kernel.org From 84fb722f6ba71fea13603c246ac07779f5a8a5e6 Mon Sep 17 00:00:00 2001 From: Zhenhua Huang Date: Tue, 2 Jan 2024 18:19:37 +0800 Subject: [PATCH 0594/1406] fs/proc: remove redundant comments from /proc/bootconfig commit 717c7c894d4b ("fs/proc: Add boot loader arguments as comment to /proc/bootconfig") adds bootloader argument comments into /proc/bootconfig. /proc/bootconfig shows boot_command_line[] multiple times following every xbc key value pair, that's duplicated and not necessary. Remove redundant ones. Output before and after the fix is like: key1 = value1 *bootloader argument comments* key2 = value2 *bootloader argument comments* key3 = value3 *bootloader argument comments* ... key1 = value1 key2 = value2 key3 = value3 *bootloader argument comments* ... Fixes: 717c7c894d4b ("fs/proc: Add boot loader arguments as comment to /proc/bootconfig") Signed-off-by: Zhenhua Huang Signed-off-by: Paul E. McKenney --- fs/proc/bootconfig.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c index 902b326e1e5607..e5635a6b127b0b 100644 --- a/fs/proc/bootconfig.c +++ b/fs/proc/bootconfig.c @@ -62,12 +62,12 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size) break; dst += ret; } - if (ret >= 0 && boot_command_line[0]) { - ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n", - boot_command_line); - if (ret > 0) - dst += ret; - } + } + if (ret >= 0 && boot_command_line[0]) { + ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n", + boot_command_line); + if (ret > 0) + dst += ret; } out: kfree(key); From e01bd0d9d1b864ebf7e86330557519320896b5bb Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 30 Jan 2024 06:08:32 -0800 Subject: [PATCH 0595/1406] Documentation/atomic_t: Emphasize that failed atomic operations give no ordering The ORDERING section of Documentation/atomic_t.txt can easily be read as saying that conditional atomic RMW operations that fail are ordered when those operations have the _acquire() or _release() suffixes. This is not the case, therefore update this section to make it clear that failed conditional atomic RMW operations provide no ordering. Reported-by: Anna-Maria Behnsen Signed-off-by: Paul E. McKenney Cc: Alan Stern Cc: Will Deacon Cc: Peter Zijlstra Cc: Boqun Feng Cc: Nicholas Piggin Cc: David Howells Cc: Jade Alglave Cc: Luc Maranget Cc: "Paul E. McKenney" Cc: Akira Yokosawa Cc: Daniel Lustig Cc: Joel Fernandes Cc: Mark Rutland Cc: Jonathan Corbet Cc: Cc: Acked-by: Andrea Parri Acked-by: Mark Rutland --- Documentation/atomic_t.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/atomic_t.txt b/Documentation/atomic_t.txt index d7adc6d543db4f..bee3b1bca9a7b4 100644 --- a/Documentation/atomic_t.txt +++ b/Documentation/atomic_t.txt @@ -171,14 +171,14 @@ The rule of thumb: - RMW operations that are conditional are unordered on FAILURE, otherwise the above rules apply. -Except of course when an operation has an explicit ordering like: +Except of course when a successful operation has an explicit ordering like: {}_relaxed: unordered {}_acquire: the R of the RMW (or atomic_read) is an ACQUIRE {}_release: the W of the RMW (or atomic_set) is a RELEASE Where 'unordered' is against other memory locations. Address dependencies are -not defeated. +not defeated. Conditional operations are still unordered on FAILURE. Fully ordered primitives are ordered against everything prior and everything subsequent. Therefore a fully ordered primitive is like having an smp_mb() From 62c0504a55ad374933b49cc5f22466ca56084541 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 31 Jan 2024 04:15:42 -0800 Subject: [PATCH 0596/1406] scftorture: Increase memory provided to guest OS The tradition, extending back almost a full year, has been 2GB plus an additional number of GBs equal to the number of CPUs divided by sixteen. This tradition has served scftorture well, even the CONFIG_PREEMPT=y version running KASAN within guest OSes having 40 CPUs. However, this test recently started OOMing on larger systems, and this commit therefore gives this test an additional GB of memory. It is quite possible that further testing on larger systems will show a need to decrease the divisor from 16 to (say) 8, but that is a change to make once it has been demonstrated to be required. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/torture.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index d5a0d8a33c27ba..154ef81386485f 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -425,7 +425,7 @@ fi if test "$do_scftorture" = "yes" then # Scale memory based on the number of CPUs. 
- scfmem=$((2+HALF_ALLOTED_CPUS/16)) + scfmem=$((3+HALF_ALLOTED_CPUS/16)) torture_bootargs="scftorture.nthreads=$HALF_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1" torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory ${scfmem}G --trust-make fi From c511d769ffe42137871cb33bee4b807d96e240b4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Feb 2024 10:51:28 -0800 Subject: [PATCH 0597/1406] Documentation/litmus-tests: Add locking tests to README This commit documents the litmus tests in the "locking" directory. Signed-off-by: Paul E. McKenney Cc: Alan Stern Cc: Will Deacon Cc: Peter Zijlstra Cc: Boqun Feng Cc: Nicholas Piggin Cc: David Howells Cc: Jade Alglave Cc: Luc Maranget Cc: "Paul E. McKenney" Cc: Akira Yokosawa Cc: Daniel Lustig Cc: Joel Fernandes Cc: Mark Rutland Cc: Jonathan Corbet Cc: Cc: --- Documentation/litmus-tests/README | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/Documentation/litmus-tests/README b/Documentation/litmus-tests/README index 658d37860d3974..5c8915e6fb6848 100644 --- a/Documentation/litmus-tests/README +++ b/Documentation/litmus-tests/README @@ -22,6 +22,35 @@ Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus NOTE: Require herd7 7.56 or later which supports "(void)expr". +locking (/locking directory) +---------------------------- + +DCL-broken.litmus + Demonstrates that double-checked locking needs more than just + the obvious lock acquisitions and releases. + +DCL-fixed.litmus + Demonstrates corrected double-checked locking that uses + smp_store_release() and smp_load_acquire() in addition to the + obvious lock acquisitions and releases. + +RM-broken.litmus + Demonstrates problems with "roach motel" locking, where code is + freely moved into lock-based critical sections. This example also + shows how to use the "filter" clause to discard executions that + would be excluded by other code not modeled in the litmus test. + Note also that this "roach motel" optimization is emulated by + physically moving P1()'s two reads from x under the lock. + + What is a roach motel? This is from an old advertisement for + a cockroach trap, much later featured in one of the "Men in + Black" movies. "The roaches check in. They don't check out." + +RM-fixed.litmus + The counterpart to RM-broken.litmus, showing P0()'s two loads from + x safely outside of the critical section. + + RCU (/rcu directory) -------------------- From beef86616fa759a2b9d7f76a47b5f89c191f90fd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Feb 2024 11:17:54 -0800 Subject: [PATCH 0598/1406] Documentation/litmus-tests: Demonstrate unordered failing cmpxchg This commit adds four litmus tests showing that a failing cmpxchg() operation is unordered unless followed by an smp_mb__after_atomic() operation. Suggested-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Cc: Alan Stern Cc: Will Deacon Cc: Peter Zijlstra Cc: Boqun Feng Cc: Nicholas Piggin Cc: David Howells Cc: Jade Alglave Cc: Luc Maranget Cc: "Paul E. 
McKenney" Cc: Akira Yokosawa Cc: Daniel Lustig Cc: Joel Fernandes Cc: Mark Rutland Cc: Jonathan Corbet Cc: Cc: --- Documentation/litmus-tests/README | 48 ++++++++++++------- .../atomic/cmpxchg-fail-ordered-1.litmus | 34 +++++++++++++ .../atomic/cmpxchg-fail-ordered-2.litmus | 30 ++++++++++++ .../atomic/cmpxchg-fail-unordered-1.litmus | 33 +++++++++++++ .../atomic/cmpxchg-fail-unordered-2.litmus | 30 ++++++++++++ 5 files changed, 159 insertions(+), 16 deletions(-) create mode 100644 Documentation/litmus-tests/atomic/cmpxchg-fail-ordered-1.litmus create mode 100644 Documentation/litmus-tests/atomic/cmpxchg-fail-ordered-2.litmus create mode 100644 Documentation/litmus-tests/atomic/cmpxchg-fail-unordered-1.litmus create mode 100644 Documentation/litmus-tests/atomic/cmpxchg-fail-unordered-2.litmus diff --git a/Documentation/litmus-tests/README b/Documentation/litmus-tests/README index 5c8915e6fb6848..6c666f3422ea3a 100644 --- a/Documentation/litmus-tests/README +++ b/Documentation/litmus-tests/README @@ -21,34 +21,50 @@ Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus Test that atomic_set() cannot break the atomicity of atomic RMWs. NOTE: Require herd7 7.56 or later which supports "(void)expr". +cmpxchg-fail-ordered-1.litmus + Demonstrate that a failing cmpxchg() operation acts as a full barrier + when followed by smp_mb__after_atomic(). + +cmpxchg-fail-ordered-2.litmus + Demonstrate that a failing cmpxchg() operation acts as an acquire + operation when followed by smp_mb__after_atomic(). + +cmpxchg-fail-unordered-1.litmus + Demonstrate that a failing cmpxchg() operation does not act as a + full barrier. + +cmpxchg-fail-unordered-2.litmus + Demonstrate that a failing cmpxchg() operation does not act as an + acquire operation. + locking (/locking directory) ---------------------------- DCL-broken.litmus - Demonstrates that double-checked locking needs more than just - the obvious lock acquisitions and releases. + Demonstrates that double-checked locking needs more than just + the obvious lock acquisitions and releases. DCL-fixed.litmus - Demonstrates corrected double-checked locking that uses - smp_store_release() and smp_load_acquire() in addition to the - obvious lock acquisitions and releases. + Demonstrates corrected double-checked locking that uses + smp_store_release() and smp_load_acquire() in addition to the + obvious lock acquisitions and releases. RM-broken.litmus - Demonstrates problems with "roach motel" locking, where code is - freely moved into lock-based critical sections. This example also - shows how to use the "filter" clause to discard executions that - would be excluded by other code not modeled in the litmus test. - Note also that this "roach motel" optimization is emulated by - physically moving P1()'s two reads from x under the lock. + Demonstrates problems with "roach motel" locking, where code is + freely moved into lock-based critical sections. This example also + shows how to use the "filter" clause to discard executions that + would be excluded by other code not modeled in the litmus test. + Note also that this "roach motel" optimization is emulated by + physically moving P1()'s two reads from x under the lock. - What is a roach motel? This is from an old advertisement for - a cockroach trap, much later featured in one of the "Men in - Black" movies. "The roaches check in. They don't check out." + What is a roach motel? This is from an old advertisement for + a cockroach trap, much later featured in one of the "Men in + Black" movies. "The roaches check in. 
They don't check out."" RM-fixed.litmus - The counterpart to RM-broken.litmus, showing P0()'s two loads from - x safely outside of the critical section. + The counterpart to RM-broken.litmus, showing P0()'s two loads from + x safely outside of the critical section. RCU (/rcu directory) diff --git a/Documentation/litmus-tests/atomic/cmpxchg-fail-ordered-1.litmus b/Documentation/litmus-tests/atomic/cmpxchg-fail-ordered-1.litmus new file mode 100644 index 00000000000000..3df1d140b189bb --- /dev/null +++ b/Documentation/litmus-tests/atomic/cmpxchg-fail-ordered-1.litmus @@ -0,0 +1,34 @@ +C cmpxchg-fail-ordered-1 + +(* + * Result: Never + * + * Demonstrate that a failing cmpxchg() operation will act as a full + * barrier when followed by smp_mb__after_atomic(). + *) + +{} + +P0(int *x, int *y, int *z) +{ + int r0; + int r1; + + WRITE_ONCE(*x, 1); + r1 = cmpxchg(z, 1, 0); + smp_mb__after_atomic(); + r0 = READ_ONCE(*y); +} + +P1(int *x, int *y, int *z) +{ + int r0; + int r1; + + WRITE_ONCE(*y, 1); + r1 = cmpxchg(z, 1, 0); + smp_mb__after_atomic(); + r0 = READ_ONCE(*x); +} + +locations[0:r1;1:r1] +exists (0:r0=0 /\ 1:r0=0) diff --git a/Documentation/litmus-tests/atomic/cmpxchg-fail-ordered-2.litmus b/Documentation/litmus-tests/atomic/cmpxchg-fail-ordered-2.litmus new file mode 100644 index 00000000000000..54146044a16f6d --- /dev/null +++ b/Documentation/litmus-tests/atomic/cmpxchg-fail-ordered-2.litmus @@ -0,0 +1,30 @@ +C cmpxchg-fail-ordered-2 + +(* + * Result: Never + * + * Demonstrate use of smp_mb__after_atomic() to make a failing cmpxchg + * operation have acquire ordering. + *) + +{} + +P0(int *x, int *y) +{ + int r1; + + WRITE_ONCE(*x, 1); + r1 = cmpxchg(y, 0, 1); +} + +P1(int *x, int *y) +{ + int r1; + int r2; + + r1 = cmpxchg(y, 0, 1); + smp_mb__after_atomic(); + r2 = READ_ONCE(*x); +} + +exists (0:r1=0 /\ 1:r1=1 /\ 1:r2=0) diff --git a/Documentation/litmus-tests/atomic/cmpxchg-fail-unordered-1.litmus b/Documentation/litmus-tests/atomic/cmpxchg-fail-unordered-1.litmus new file mode 100644 index 00000000000000..a727ce23b1a6ea --- /dev/null +++ b/Documentation/litmus-tests/atomic/cmpxchg-fail-unordered-1.litmus @@ -0,0 +1,33 @@ +C cmpxchg-fail-unordered-1 + +(* + * Result: Sometimes + * + * Demonstrate that a failing cmpxchg() operation does not act as a + * full barrier. (In contrast, a successful cmpxchg() does act as a + * full barrier.) + *) + +{} + +P0(int *x, int *y, int *z) +{ + int r0; + int r1; + + WRITE_ONCE(*x, 1); + r1 = cmpxchg(z, 1, 0); + r0 = READ_ONCE(*y); +} + +P1(int *x, int *y, int *z) +{ + int r0; + int r1; + + WRITE_ONCE(*y, 1); + r1 = cmpxchg(z, 1, 0); + r0 = READ_ONCE(*x); +} + +locations[0:r1;1:r1] +exists (0:r0=0 /\ 1:r0=0) diff --git a/Documentation/litmus-tests/atomic/cmpxchg-fail-unordered-2.litmus b/Documentation/litmus-tests/atomic/cmpxchg-fail-unordered-2.litmus new file mode 100644 index 00000000000000..a245bac55b578d --- /dev/null +++ b/Documentation/litmus-tests/atomic/cmpxchg-fail-unordered-2.litmus @@ -0,0 +1,30 @@ +C cmpxchg-fail-unordered-2 + +(* + * Result: Sometimes + * + * Demonstrate that a failing cmpxchg() operation does not act as either + * an acquire or a release operation. (In contrast, a successful cmpxchg() + * does act as both an acquire and a release operation.)
+ *) + +{} + +P0(int *x, int *y) +{ + int r1; + + WRITE_ONCE(*x, 1); + r1 = cmpxchg(y, 0, 1); +} + +P1(int *x, int *y) +{ + int r1; + int r2; + + r1 = cmpxchg(y, 0, 1); + r2 = READ_ONCE(*x); +} + +exists (0:r1=0 /\ 1:r1=1 /\ 1:r2=0) From 66aa67decf0c1a6daccf912925ebec59edb9f529 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Jan 2024 15:55:12 -0800 Subject: [PATCH 0599/1406] tsc: Check for sockets instead of CPUs to make code match comment The unsynchronized_tsc() eventually checks num_possible_cpus(), and if the system is non-Intel and the number of possible CPUs is greater than one, assumes that TSCs are unsynchronized. This despite the comment saying "assume multi socket systems are not synchronized", that is, socket rather than CPU. This behavior was preserved by commit 8fbbc4b45ce3 ("x86: merge tsc_init and clocksource code") and by the previous relevant commit 7e69f2b1ead2 ("clocksource: Remove the update callback"). The clocksource drivers were added by commit 5d0cf410e94b ("Time: i386 Clocksource Drivers") back in 2006, and the comment still said "socket" rather than "CPU". Therefore, bravely (and perhaps foolishly) make the code match the comment. Note that it is possible to bypass both code and comment by booting with tsc=reliable, but this also disables the clocksource watchdog, which is undesirable when trust in the TSC is strictly limited. Reported-by: Zhengxu Chen Reported-by: Danielle Costantino Signed-off-by: Paul E. McKenney Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Feng Tang Cc: Waiman Long Cc: John Stultz Cc: Signed-off-by: Boqun Feng --- arch/x86/kernel/tsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 15f97c0abc9d09..d45084c6a15ed3 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1287,7 +1287,7 @@ int unsynchronized_tsc(void) */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { /* assume multi socket systems are not synchronized: */ - if (num_possible_cpus() > 1) + if (nr_online_nodes > 1) return 1; } From 75af5180a24a61f80cd1689103a66ed5113c1fd5 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 7 Feb 2024 08:52:35 -0800 Subject: [PATCH 0600/1406] x86/nmi: Fix "in NMI handler" check Commit 344da544f177 ("x86/nmi: Print reasons why backtrace NMIs are ignored") creates a super nice framework to diagnose NMIs. Every time exc_nmi() is called, it increments a per_cpu counter (nsp->idt_nmi_seq). At its exit, it also increments the same counter. Looking at this counter, you can see how many times that function was called (dividing by 2), and, if the function is still being executed, by checking the idt_nmi_seq's last bit. On the check side (nmi_backtrace_stall_check()), that variable is queried to check if the NMI is still being executed, but there is a mistake in the bitwise operation. That code wants to check if the last bit of the idt_nmi_seq is set or not, but does the opposite and checks all the other bits, which will always be true after the first exc_nmi() executed successfully. This appends the misleading string to the dump "(CPU currently in NMI handler function)". Fix it by checking the last bit, and if it is set, append the string. Fixes: 344da544f177 ("x86/nmi: Print reasons why backtrace NMIs are ignored") Signed-off-by: Breno Leitao Signed-off-by: Paul E.
McKenney --- arch/x86/kernel/nmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 17e955ab69feda..6e738ad474dcf2 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -639,7 +639,7 @@ void nmi_backtrace_stall_check(const struct cpumask *btp) msgp = nmi_check_stall_msg[idx]; if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1)) modp = ", but OK because ignore_nmis was set"; - if (nmi_seq & ~0x1) + if (nmi_seq & 0x1) msghp = " (CPU currently in NMI handler function)"; else if (nsp->idt_nmi_seq_snap + 1 == nmi_seq) msghp = " (CPU exited one NMI handler function)"; From 667cae3b49d06f4c45ebe5e3ace34a22ad677fe2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 9 Feb 2024 03:11:25 -0800 Subject: [PATCH 0601/1406] rcutorture: Disable tracing to permit Tasks Rude RCU testing Now that the KPROBES, TRACING, BLK_DEV_IO_TRACE, and UPROBE_EVENTS Kconfig options select the TASKS_TRACE_RCU option, the torture.sh tests of enabling exactly one of the RCU Tasks flavors fail. This commit therefore disables these options to allow this testing to succeed. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/torture.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index 154ef81386485f..60be37951b7b67 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -391,7 +391,7 @@ __EOF__ forceflavor="`echo $flavor | sed -e 's/^CONFIG/CONFIG_FORCE/'`" deselectedflavors="`grep -v $flavor $T/rcutasksflavors | tr '\012' ' ' | tr -s ' ' | sed -e 's/ *$//'`" echo " --- Running RCU Tasks Trace flavor $flavor `date`" >> $rtfdir/log - tools/testing/selftests/rcutorture/bin/kvm.sh --datestamp "$ds/results-rcutasksflavors/$flavor" --buildonly --configs "TINY01 TREE04" --kconfig "CONFIG_RCU_EXPERT=y CONFIG_RCU_SCALE_TEST=y $forceflavor=y $deselectedflavors" --trust-make > $T/$flavor.out 2>&1 + tools/testing/selftests/rcutorture/bin/kvm.sh --datestamp "$ds/results-rcutasksflavors/$flavor" --buildonly --configs "TINY01 TREE04" --kconfig "CONFIG_RCU_EXPERT=y CONFIG_RCU_SCALE_TEST=y CONFIG_KPROBES=n CONFIG_RCU_TRACE=n CONFIG_TRACING=n CONFIG_BLK_DEV_IO_TRACE=n CONFIG_UPROBE_EVENTS=n $forceflavor=y $deselectedflavors" --trust-make > $T/$flavor.out 2>&1 retcode=$? if test "$retcode" -ne 0 then From 6691ef5c3344bbd631f88525cb2515ac35b02f85 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 Feb 2024 14:45:53 -0800 Subject: [PATCH 0602/1406] rcu: Update lockdep while in RCU read-side critical section With Ankur's lazy-/auto-preemption patches applied and with a lazy-preemptible kernel in combination with a non-preemptible RCU, lockdep sometimes complains about context switches within RCU read-side critical sections. This is a false positive due to rcu_read_unlock() updating lockdep state too late: __release(RCU); __rcu_read_unlock(); // Context switch here results in lockdep false positive!!! rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */ Although this complaint could also happen with preemptible RCU in a preemptible kernel, the odds of that happening are quite low. In contrast, with non-preemptible RCU, a long critical section has a high probability of performing a context switch from the preempt_enable() in __rcu_read_unlock().
The fix is straightforward: move the rcu_lock_release() within
rcu_read_unlock() to obtain the reverse order from that of
rcu_read_lock():

	rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
	__release(RCU);
	__rcu_read_unlock();

This commit makes this change.

Co-developed-by: Frederic Weisbecker
Signed-off-by: Frederic Weisbecker
Co-developed-by: Uladzislau Rezki (Sony)
Signed-off-by: Uladzislau Rezki (Sony)
Co-developed-by: Joel Fernandes (Google)
Signed-off-by: Joel Fernandes (Google)
Co-developed-by: Boqun Feng
Signed-off-by: Boqun Feng
Signed-off-by: Paul E. McKenney
Cc: Ankur Arora
Cc: Thomas Gleixner
---
 include/linux/rcupdate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 16f519914415eb..1d36c199994e2f 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -778,9 +778,9 @@ static inline void rcu_read_unlock(void)
 {
 	RCU_LOCKDEP_WARN(!rcu_is_watching(),
 			 "rcu_read_unlock() used illegally while idle");
+	rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
 	__release(RCU);
 	__rcu_read_unlock();
-	rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
 }
 
 /**

From b21a705516888ca33ff6b4e6be4945f93821aafb Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Wed, 14 Feb 2024 15:33:55 -0800
Subject: [PATCH 0603/1406] rcu: Make TINY_RCU depend on !PREEMPT_RCU rather
 than !PREEMPTION

Right now, TINY_RCU depends on (!PREEMPTION && !SMP), which has served
the kernel well for many years due to the fact that PREEMPT_RCU is
normally a synonym for PREEMPTION.  But with the advent of lazy
preemption, it will be possible to have non-preemptible RCU in a
preemptible kernel, so that kernels could be built with PREEMPT_RCU=n
and PREEMPTION=y.

This commit therefore makes TINY_RCU depend on (!PREEMPT_RCU && !SMP),
thus allowing for a non-preemptible RCU in preemptible kernels.

Signed-off-by: Paul E. McKenney
Cc: Ankur Arora
Cc: Thomas Gleixner
---
 kernel/rcu/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index e7d2dd2675931f..7dca0138260c35 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -31,7 +31,7 @@ config PREEMPT_RCU
 
 config TINY_RCU
 	bool
-	default y if !PREEMPTION && !SMP
+	default y if !PREEMPT_RCU && !SMP
 	help
 	  This option selects the RCU implementation that is
 	  designed for UP systems from which real-time response

From 15fb43be0d2b881f61fea77ddbd53509d031394b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Thu, 15 Feb 2024 09:04:30 -0800
Subject: [PATCH 0604/1406] srcu: Make Tiny SRCU explicitly disable preemption

Because Tiny SRCU is used only in kernels built with either
CONFIG_PREEMPT_NONE=y or CONFIG_PREEMPT_VOLUNTARY=y, there has not been
any need for Tiny SRCU to explicitly disable preemption.  However, the
prospect of lazy preemption changes that, and the lazy-preemption
patches do result in rcutorture runs finding both too-short grace
periods and grace-period hangs for Tiny SRCU.

This commit therefore adds the needed preempt_disable() and
preempt_enable() calls to Tiny SRCU.

Signed-off-by: Paul E. McKenney
Cc: Ankur Arora
Cc: Thomas Gleixner
---
 include/linux/srcutiny.h |  2 ++
 kernel/rcu/srcutiny.c    | 31 ++++++++++++++++++++++++++-----
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index 447133171d95fd..4d96bbdb45f086 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -64,8 +64,10 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp)
 {
 	int idx;
 
+	preempt_disable();  // Needed for PREEMPT_AUTO
 	idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1;
 	WRITE_ONCE(ssp->srcu_lock_nesting[idx], READ_ONCE(ssp->srcu_lock_nesting[idx]) + 1);
+	preempt_enable();
 	return idx;
 }
 
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index c38e5933a5d693..5afd5cf494dba3 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -96,9 +96,12 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
  */
 void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
 {
-	int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
+	int newval;
 
+	preempt_disable();  // Needed for PREEMPT_AUTO
+	newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
 	WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
+	preempt_enable();
 	if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task())
 		swake_up_one(&ssp->srcu_wq);
 }
@@ -117,8 +120,11 @@ void srcu_drive_gp(struct work_struct *wp)
 	struct srcu_struct *ssp;
 
 	ssp = container_of(wp, struct srcu_struct, srcu_work);
-	if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
+	preempt_disable();  // Needed for PREEMPT_AUTO
+	if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) {
+		preempt_enable();
 		return; /* Already running or nothing to do. */
+	}
 
 	/* Remove recently arrived callbacks and wait for readers. */
 	WRITE_ONCE(ssp->srcu_gp_running, true);
@@ -130,9 +136,12 @@ void srcu_drive_gp(struct work_struct *wp)
 	idx = (ssp->srcu_idx & 0x2) / 2;
 	WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
 	WRITE_ONCE(ssp->srcu_gp_waiting, true);  /* srcu_read_unlock() wakes! */
+	preempt_enable();
 	swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx]));
+	preempt_disable();  // Needed for PREEMPT_AUTO
 	WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
 	WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
+	preempt_enable();
 
 	/* Invoke the callbacks we removed above. */
 	while (lh) {
@@ -150,8 +159,11 @@
 	 * at interrupt level, but the ->srcu_gp_running checks will
 	 * straighten that out.
 	 */
+	preempt_disable();  // Needed for PREEMPT_AUTO
 	WRITE_ONCE(ssp->srcu_gp_running, false);
-	if (ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
+	idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max));
+	preempt_enable();
+	if (idx)
 		schedule_work(&ssp->srcu_work);
 }
 EXPORT_SYMBOL_GPL(srcu_drive_gp);
@@ -160,9 +172,12 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
 {
 	unsigned long cookie;
 
+	preempt_disable();  // Needed for PREEMPT_AUTO
 	cookie = get_state_synchronize_srcu(ssp);
-	if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie))
+	if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) {
+		preempt_enable();
 		return;
+	}
 	WRITE_ONCE(ssp->srcu_idx_max, cookie);
 	if (!READ_ONCE(ssp->srcu_gp_running)) {
 		if (likely(srcu_init_done))
@@ -170,6 +185,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
 		else if (list_empty(&ssp->srcu_work.entry))
 			list_add(&ssp->srcu_work.entry, &srcu_boot_list);
 	}
+	preempt_enable();
 }
 
 /*
@@ -183,11 +199,13 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
 	rhp->func = func;
 	rhp->next = NULL;
+	preempt_disable();  // Needed for PREEMPT_AUTO
 	local_irq_save(flags);
 	*ssp->srcu_cb_tail = rhp;
 	ssp->srcu_cb_tail = &rhp->next;
 	local_irq_restore(flags);
 	srcu_gp_start_if_needed(ssp);
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(call_srcu);
 
@@ -241,9 +259,12 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
  */
 unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
 {
-	unsigned long ret = get_state_synchronize_srcu(ssp);
+	unsigned long ret;
 
+	preempt_disable();  // Needed for PREEMPT_AUTO
+	ret = get_state_synchronize_srcu(ssp);
 	srcu_gp_start_if_needed(ssp);
+	preempt_enable();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);

From 7e9591a1f5ca98fe0683b8fa052e4c47bdbde6b8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Thu, 15 Feb 2024 16:44:48 -0800
Subject: [PATCH 0605/1406] ftrace: Asynchronous grace period for
 register_ftrace_direct()

When running heavy test workloads with KASAN enabled, RCU Tasks grace
periods can extend for many tens of seconds, significantly slowing
trace registration.  Therefore, make the registration-side RCU Tasks
grace period be asynchronous via call_rcu_tasks().

Reported-by: Jakub Kicinski
Reported-by: Alexei Starovoitov
Reported-by: Chris Mason
Signed-off-by: Paul E. McKenney
Cc: Steven Rostedt
Cc: Masami Hiramatsu
Cc: Mark Rutland
Cc: Mathieu Desnoyers
Cc:
---
 kernel/trace/ftrace.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b01ae7d3602181..2da4eaa2777d68 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5353,6 +5353,13 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
 	}
 }
 
+static void register_ftrace_direct_cb(struct rcu_head *rhp)
+{
+	struct ftrace_hash *fhp = container_of(rhp, struct ftrace_hash, rcu);
+
+	free_ftrace_hash(fhp);
+}
+
 /**
  * register_ftrace_direct - Call a custom trampoline directly
  * for multiple functions registered in @ops
@@ -5451,10 +5458,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
  out_unlock:
 	mutex_unlock(&direct_mutex);
 
-	if (free_hash && free_hash != EMPTY_HASH) {
-		synchronize_rcu_tasks();
-		free_ftrace_hash(free_hash);
-	}
+	if (free_hash && free_hash != EMPTY_HASH)
+		call_rcu_tasks(&free_hash->rcu, register_ftrace_direct_cb);
 
 	if (new_hash)
 		free_ftrace_hash(new_hash);

From 234942a05e40bb8f3eb88a040457cb438a4046f8 Mon Sep 17 00:00:00 2001
From: Zenghui Yu
Date: Fri, 16 Feb 2024 23:44:55 +0800
Subject: [PATCH 0606/1406] doc: Remove references to arrayRCU.rst

arrayRCU.rst has been removed since commit ef2555cf68c3 ("doc: Remove
arrayRCU.rst") but is still referenced by whatisRCU.rst.  Update it to
reflect the current state of the documentation.

Signed-off-by: Zenghui Yu
Signed-off-by: Paul E. McKenney
---
 Documentation/RCU/whatisRCU.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst
index 872ac665223fbd..94838c65c7d971 100644
--- a/Documentation/RCU/whatisRCU.rst
+++ b/Documentation/RCU/whatisRCU.rst
@@ -427,7 +427,7 @@ their assorted primitives.
 
 This section shows a simple use of the core RCU API to protect a
 global pointer to a dynamically allocated structure.  More-typical
-uses of RCU may be found in listRCU.rst, arrayRCU.rst, and NMI-RCU.rst.
+uses of RCU may be found in listRCU.rst and NMI-RCU.rst.
 ::
 
 	struct foo {
@@ -510,8 +510,8 @@ So, to sum up:
 	data item.
 
 See checklist.rst for additional rules to follow when using RCU.
-And again, more-typical uses of RCU may be found in listRCU.rst,
-arrayRCU.rst, and NMI-RCU.rst.
+And again, more-typical uses of RCU may be found in listRCU.rst
+and NMI-RCU.rst.
 
 .. _4_whatisRCU:

From b4c7a9cd36e1d1eb1ce43b4329e359a00d75a355 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Sat, 17 Feb 2024 17:57:35 -0800
Subject: [PATCH 0607/1406] rcutorture: Enable RCU priority boosting for
 TREE09

The TREE09 rcutorture scenario exhausts memory from time to time, and
this is due to a reader being preempted and blocking grace periods,
thus preventing recycling of the memory used in callback-flooding
tests.  This commit therefore enables RCU priority boosting and sets
the boosting delay to 100 milliseconds after grace-period start.

Signed-off-by: Paul E. McKenney
---
 tools/testing/selftests/rcutorture/configs/rcu/TREE09 | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 b/tools/testing/selftests/rcutorture/configs/rcu/TREE09
index fc45645bb5f421..9ecd1b4e653d3f 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE09
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE09
@@ -10,8 +10,9 @@ CONFIG_NO_HZ_FULL=n
 CONFIG_RCU_TRACE=n
 CONFIG_RCU_NOCB_CPU=n
 CONFIG_DEBUG_LOCK_ALLOC=n
-CONFIG_RCU_BOOST=n
+CONFIG_RCU_BOOST=y
+CONFIG_RCU_BOOST_DELAY=100
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
-#CHECK#CONFIG_RCU_EXPERT=n
+CONFIG_RCU_EXPERT=y
 CONFIG_KPROBES=n
 CONFIG_FTRACE=n

From 7ab7acb68adf053c78a2cdf32bf1a3dce95912ec Mon Sep 17 00:00:00 2001
From: Max Filippov
Date: Sat, 17 Feb 2024 05:15:42 -0800
Subject: [PATCH 0608/1406] xtensa: fix MAKE_PC_FROM_RA second argument

Xtensa has a two-argument MAKE_PC_FROM_RA macro to convert a0 to an
actual return address because, when the windowed ABI is used, the
call{,x}{4,8,12} opcodes stuff the encoded window size into the top
2 bits of the register that becomes the return address in the called
function.  The second argument of that macro is supposed to be an
address having these 2 topmost bits set correctly, but the comment
suggested that that could be the stack address.  However the stack
doesn't have to be in the same 1GByte region as the code, especially
in noMMU XIP configurations.

Fix the comment and use either _text or regs->pc as the second argument
for the MAKE_PC_FROM_RA macro.

Cc: stable@vger.kernel.org
Signed-off-by: Max Filippov
---
 arch/xtensa/include/asm/processor.h | 8 ++++----
 arch/xtensa/include/asm/ptrace.h    | 2 +-
 arch/xtensa/kernel/process.c        | 5 +++--
 arch/xtensa/kernel/stacktrace.c     | 3 ++-
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h
index d008a153a2b9f7..7ed1a2085bd728 100644
--- a/arch/xtensa/include/asm/processor.h
+++ b/arch/xtensa/include/asm/processor.h
@@ -115,9 +115,9 @@
 #define MAKE_RA_FOR_CALL(ra,ws)   (((ra) & 0x3fffffff) | (ws) << 30)
 
 /* Convert return address to a valid pc
- * Note: We assume that the stack pointer is in the same 1GB ranges as the ra
+ * Note: 'text' is the address within the same 1GB range as the ra
 */
-#define MAKE_PC_FROM_RA(ra,sp)    (((ra) & 0x3fffffff) | ((sp) & 0xc0000000))
+#define MAKE_PC_FROM_RA(ra, text) (((ra) & 0x3fffffff) | ((unsigned long)(text) & 0xc0000000))
 
 #elif defined(__XTENSA_CALL0_ABI__)
 
@@ -127,9 +127,9 @@
 #define MAKE_RA_FOR_CALL(ra, ws)   (ra)
 
 /* Convert return address to a valid pc
- * Note: We assume that the stack pointer is in the same 1GB ranges as the ra
+ * Note: 'text' is not used as 'ra' is always the full address
 */
-#define MAKE_PC_FROM_RA(ra, sp)   (ra)
+#define MAKE_PC_FROM_RA(ra, text)  (ra)
 
 #else
 #error Unsupported Xtensa ABI

diff --git a/arch/xtensa/include/asm/ptrace.h b/arch/xtensa/include/asm/ptrace.h
index a270467556dc84..86c70117371bb7 100644
--- a/arch/xtensa/include/asm/ptrace.h
+++ b/arch/xtensa/include/asm/ptrace.h
@@ -87,7 +87,7 @@ struct pt_regs {
 # define user_mode(regs) (((regs)->ps & 0x00000020)!=0)
 # define instruction_pointer(regs) ((regs)->pc)
 # define return_pointer(regs) (MAKE_PC_FROM_RA((regs)->areg[0], \
-					       (regs)->areg[1]))
+					       (regs)->pc))
 
 # ifndef CONFIG_SMP
 #  define profile_pc(regs) instruction_pointer(regs)

diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index a815577d25fd02..7bd66677f7b6de 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -47,6 +47,7 @@
 #include
 #include
 #include
+#include
 #include
 
 extern void ret_from_fork(void);
@@ -380,7 +381,7 @@ unsigned long __get_wchan(struct task_struct *p)
 	int count = 0;
 
 	sp = p->thread.sp;
-	pc = MAKE_PC_FROM_RA(p->thread.ra, p->thread.sp);
+	pc = MAKE_PC_FROM_RA(p->thread.ra, _text);
 
 	do {
 		if (sp < stack_page + sizeof(struct task_struct) ||
@@ -392,7 +393,7 @@ unsigned long __get_wchan(struct task_struct *p)
 
 		/* Stack layout: sp-4: ra, sp-3: sp' */
 
-		pc = MAKE_PC_FROM_RA(SPILL_SLOT(sp, 0), sp);
+		pc = MAKE_PC_FROM_RA(SPILL_SLOT(sp, 0), _text);
 		sp = SPILL_SLOT(sp, 1);
 	} while (count++ < 16);
 	return 0;

diff --git a/arch/xtensa/kernel/stacktrace.c b/arch/xtensa/kernel/stacktrace.c
index 831ffb648bda7e..ed324fdf2a2f91 100644
--- a/arch/xtensa/kernel/stacktrace.c
+++ b/arch/xtensa/kernel/stacktrace.c
@@ -13,6 +13,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -189,7 +190,7 @@ void walk_stackframe(unsigned long *sp,
 		if (a1 <= (unsigned long)sp)
 			break;
 
-		frame.pc = MAKE_PC_FROM_RA(a0, a1);
+		frame.pc = MAKE_PC_FROM_RA(a0, _text);
 		frame.sp = a1;
 
 		if (fn(&frame, data))

From 1fe7ee4dde82b718ae38ced0f458fadf11632864 Mon Sep 17 00:00:00 2001
From: Zhipeng Lu
Date: Sun, 24 Dec 2023 16:20:33 +0800
Subject: [PATCH 0609/1406] SUNRPC: fix a memleak in gss_import_v2_context

The ctx->mech_used.data allocated by kmemdup is freed neither in
gss_import_v2_context nor in its only caller
gss_krb5_import_sec_context, which frees ctx on error.  Thus, this
patch reworks the final call in gss_import_v2_context so that a failing
gss_krb5_import_ctx_v2 frees ctx->mech_used.data, preventing the
memleak while keeping the same return behaviour.

Fixes: 47d848077629 ("gss_krb5: handle new context format from gssd")
Signed-off-by: Zhipeng Lu
Reviewed-by: Jeff Layton
Signed-off-by: Chuck Lever
---
 net/sunrpc/auth_gss/gss_krb5_mech.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 64cff717c3d9b3..3366505bc669a0 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -398,6 +398,7 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx,
 	u64 seq_send64;
 	int keylen;
 	u32 time32;
+	int ret;
 
 	p = simple_get_bytes(p, end, &ctx->flags, sizeof(ctx->flags));
 	if (IS_ERR(p))
@@ -450,8 +451,16 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx,
 	}
 	ctx->mech_used.len = gss_kerberos_mech.gm_oid.len;
 
-	return gss_krb5_import_ctx_v2(ctx, gfp_mask);
+	ret = gss_krb5_import_ctx_v2(ctx, gfp_mask);
+	if (ret) {
+		p = ERR_PTR(ret);
+		goto out_free;
+	}
 
+	return 0;
+
+out_free:
+	kfree(ctx->mech_used.data);
 out_err:
 	return PTR_ERR(p);
 }

From d2237d3b89327ed525ed7e10885f43f6952fd64b Mon Sep 17 00:00:00 2001
From: Zhipeng Lu
Date: Tue, 2 Jan 2024 13:38:13 +0800
Subject: [PATCH 0610/1406] SUNRPC: fix some memleaks in gssx_dec_option_array

The creds and oa->data need to be freed in the error-handling paths
after their allocation.  So this patch adds these deallocations in the
corresponding paths.
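For context, a hedged sketch of the leak pattern being removed in the
diff below (simplified; the allocations happen earlier in
gssx_dec_option_array()):

	creds = kzalloc(sizeof(struct svc_cred), GFP_KERNEL);
	...
	p = xdr_inline_decode(xdr, 4);
	if (unlikely(p == NULL))
		return -ENOSPC;	/* early return leaked creds and oa->data */

Routing every failure exit through the free_creds/free_oa labels
guarantees both buffers are released exactly once.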
Fixes: 1d658336b05f ("SUNRPC: Add RPC based upcall mechanism for RPCGSS auth")
Signed-off-by: Zhipeng Lu
Signed-off-by: Chuck Lever
---
 net/sunrpc/auth_gss/gss_rpc_xdr.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index d79f12c2550ac3..cb32ab9a839521 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -250,8 +250,8 @@ static int gssx_dec_option_array(struct xdr_stream *xdr,
 
 	creds = kzalloc(sizeof(struct svc_cred), GFP_KERNEL);
 	if (!creds) {
-		kfree(oa->data);
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto free_oa;
 	}
 
 	oa->data[0].option.data = CREDS_VALUE;
@@ -265,29 +265,40 @@ static int gssx_dec_option_array(struct xdr_stream *xdr,
 
 		/* option buffer */
 		p = xdr_inline_decode(xdr, 4);
-		if (unlikely(p == NULL))
-			return -ENOSPC;
+		if (unlikely(p == NULL)) {
+			err = -ENOSPC;
+			goto free_creds;
+		}
 
 		length = be32_to_cpup(p);
 		p = xdr_inline_decode(xdr, length);
-		if (unlikely(p == NULL))
-			return -ENOSPC;
+		if (unlikely(p == NULL)) {
+			err = -ENOSPC;
+			goto free_creds;
+		}
 
 		if (length == sizeof(CREDS_VALUE) &&
 		    memcmp(p, CREDS_VALUE, sizeof(CREDS_VALUE)) == 0) {
 			/* We have creds here. parse them */
 			err = gssx_dec_linux_creds(xdr, creds);
 			if (err)
-				return err;
+				goto free_creds;
 			oa->data[0].value.len = 1; /* presence */
 		} else {
 			/* consume uninteresting buffer */
 			err = gssx_dec_buffer(xdr, &dummy);
 			if (err)
-				return err;
+				goto free_creds;
 		}
 	}
 	return 0;
+
+free_creds:
+	kfree(creds);
+free_oa:
+	kfree(oa->data);
+	oa->data = NULL;
+	return err;
 }
 
 static int gssx_dec_status(struct xdr_stream *xdr,

From 3ce396c72d167d99d659b4ab8436f525ad463edc Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Mon, 1 Jan 2024 11:37:45 -0500
Subject: [PATCH 0611/1406] SUNRPC: Use a static buffer for the checksum
 initialization vector

Allocating and zeroing a buffer during every call to
krb5_etm_checksum() is inefficient.  Instead, set aside a static buffer
that is the maximum crypto block size, and use a portion (or all) of
that.

Reported-by: Markus Elfring
Signed-off-by: Chuck Lever
---
 net/sunrpc/auth_gss/gss_krb5_crypto.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index d2b02710ab0709..b2c1b683a88ee2 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -921,6 +921,8 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, u32 len,
  * Caller provides the truncation length of the output token (h) in
  * cksumout.len.
  *
+ * Note that for RPCSEC, the "initial cipher state" is always all zeroes.
+ *
  * Return values:
  *   %GSS_S_COMPLETE: Digest computed, @cksumout filled in
  *   %GSS_S_FAILURE: Call failed
@@ -931,22 +933,19 @@ u32 krb5_etm_checksum(struct crypto_sync_skcipher *cipher,
 		      int body_offset, struct xdr_netobj *cksumout)
 {
 	unsigned int ivsize = crypto_sync_skcipher_ivsize(cipher);
+	static const u8 iv[GSS_KRB5_MAX_BLOCKSIZE];
 	struct ahash_request *req;
 	struct scatterlist sg[1];
-	u8 *iv, *checksumdata;
 	int err = -ENOMEM;
+	u8 *checksumdata;
 
 	checksumdata = kmalloc(crypto_ahash_digestsize(tfm), GFP_KERNEL);
 	if (!checksumdata)
 		return GSS_S_FAILURE;
-	/* For RPCSEC, the "initial cipher state" is always all zeroes. */
-	iv = kzalloc(ivsize, GFP_KERNEL);
-	if (!iv)
-		goto out_free_mem;
 
 	req = ahash_request_alloc(tfm, GFP_KERNEL);
 	if (!req)
-		goto out_free_mem;
+		goto out_free_cksumdata;
 	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
 	err = crypto_ahash_init(req);
 	if (err)
@@ -970,8 +969,7 @@ u32 krb5_etm_checksum(struct crypto_sync_skcipher *cipher,
 
 out_free_ahash:
 	ahash_request_free(req);
-out_free_mem:
-	kfree(iv);
+out_free_cksumdata:
 	kfree_sensitive(checksumdata);
 	return err ? GSS_S_FAILURE : GSS_S_COMPLETE;
 }

From 23e41e0f2b4d7741ef29341c81edd41e26554ffc Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Fri, 15 Dec 2023 12:18:30 +1100
Subject: [PATCH 0612/1406] nfsd: Don't leave work of closing files to a work
 queue

The work of closing a file can have non-trivial cost.  Doing it in a
separate work queue thread means that cost isn't imposed on the nfsd
threads and an imbalance can be created.  This can result in files
being queued for the work queue more quickly than the work queue can
process them, resulting in unbounded growth of the queue and memory
exhaustion.

To avoid this work imbalance that exhausts memory, this patch moves all
closing of files into the nfsd threads.  This means that when the work
imposes a cost, that cost appears where it would be expected - in the
work of the nfsd thread.  A subsequent patch will ensure the final
__fput() is called in the same (nfsd) thread which calls filp_close().

Files opened for NFSv3 are never explicitly closed by the client and
are kept open by the server in the "filecache", which responds to
memory pressure, is garbage collected even when there is no pressure,
and sometimes closes files when there is particular need such as for
rename.  These files currently have filp_close() called in a dedicated
work queue, so their __fput() can have no effect on nfsd threads.

This patch discards the work queue and instead has each nfsd thread
call filp_close() on as many as 8 files from the filecache each time
it acts on a client request (or finds there are no pending client
requests).  If there are more to be closed, more threads are woken.
This spreads the work of __fput() over multiple threads and imposes
any cost on those threads.

The number 8 is somewhat arbitrary.  It needs to be greater than 1 to
ensure that files are closed more quickly than they can be added to
the cache.  It needs to be small enough to limit the per-request delays
that will be imposed on clients when all threads are busy closing
files.
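A hedged illustration of the resulting bound (the loop shape mirrors
the diff that follows; T stands for the configured thread count):

	/* Each nfsd thread, once per request-loop iteration: */
	for (i = 0; i < 8 && !list_empty(&l->freeme); i++)
		list_move(l->freeme.next, &dispose);	/* at most 8 closes */
	if (!list_empty(&l->freeme))
		svc_wake_up(nn->nfsd_serv);	/* recruit another thread */

	/* So the queue drains at up to 8 * T closes per pass, while no
	 * single RPC waits behind more than 8 close operations. */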
Signed-off-by: NeilBrown
Reviewed-by: Jeff Layton
Signed-off-by: Chuck Lever
---
 fs/nfsd/filecache.c | 67 +++++++++++++++++++++------------------------
 fs/nfsd/filecache.h |  1 +
 fs/nfsd/nfssvc.c    |  2 ++
 3 files changed, 34 insertions(+), 36 deletions(-)

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 9cb7f0c33df587..f8b100bca6e4da 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -61,13 +61,10 @@ static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age);
 static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions);
 
 struct nfsd_fcache_disposal {
-	struct work_struct work;
 	spinlock_t lock;
 	struct list_head freeme;
 };
 
-static struct workqueue_struct *nfsd_filecache_wq __read_mostly;
-
 static struct kmem_cache *nfsd_file_slab;
 static struct kmem_cache *nfsd_file_mark_slab;
 static struct list_lru nfsd_file_lru;
@@ -421,7 +418,37 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose)
 		spin_lock(&l->lock);
 		list_move_tail(&nf->nf_lru, &l->freeme);
 		spin_unlock(&l->lock);
-		queue_work(nfsd_filecache_wq, &l->work);
+		svc_wake_up(nn->nfsd_serv);
 	}
 }
 
+/**
+ * nfsd_file_net_dispose - deal with nfsd_files waiting to be disposed.
+ * @nn: nfsd_net in which to find files to be disposed.
+ *
+ * When files held open for nfsv3 are removed from the filecache, whether
+ * due to memory pressure or garbage collection, they are queued to
+ * a per-net-ns queue.  This function completes the disposal, either
+ * directly or by waking another nfsd thread to help with the work.
+ */
+void nfsd_file_net_dispose(struct nfsd_net *nn)
+{
+	struct nfsd_fcache_disposal *l = nn->fcache_disposal;
+
+	if (!list_empty(&l->freeme)) {
+		LIST_HEAD(dispose);
+		int i;
+
+		spin_lock(&l->lock);
+		for (i = 0; i < 8 && !list_empty(&l->freeme); i++)
+			list_move(l->freeme.next, &dispose);
+		spin_unlock(&l->lock);
+		if (!list_empty(&l->freeme))
+			/* Wake up another thread to share the work
+			 * *before* doing any actual disposing.
+			 */
+			svc_wake_up(nn->nfsd_serv);
+		nfsd_file_dispose_list(&dispose);
+	}
+}
+
@@ -634,27 +661,6 @@ nfsd_file_close_inode_sync(struct inode *inode)
 	flush_delayed_fput();
 }
 
-/**
- * nfsd_file_delayed_close - close unused nfsd_files
- * @work: dummy
- *
- * Scrape the freeme list for this nfsd_net, and then dispose of them
- * all.
- */
-static void
-nfsd_file_delayed_close(struct work_struct *work)
-{
-	LIST_HEAD(head);
-	struct nfsd_fcache_disposal *l = container_of(work,
-			struct nfsd_fcache_disposal, work);
-
-	spin_lock(&l->lock);
-	list_splice_init(&l->freeme, &head);
-	spin_unlock(&l->lock);
-
-	nfsd_file_dispose_list(&head);
-}
-
 static int
 nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg,
 			      void *data)
@@ -717,10 +723,6 @@ nfsd_file_cache_init(void)
 		return ret;
 
 	ret = -ENOMEM;
-	nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", WQ_UNBOUND, 0);
-	if (!nfsd_filecache_wq)
-		goto out;
-
 	nfsd_file_slab = kmem_cache_create("nfsd_file",
 				sizeof(struct nfsd_file), 0, 0, NULL);
 	if (!nfsd_file_slab) {
@@ -735,7 +737,6 @@ nfsd_file_cache_init(void)
 		goto out_err;
 	}
 
-
 	ret = list_lru_init(&nfsd_file_lru);
 	if (ret) {
 		pr_err("nfsd: failed to init nfsd_file_lru: %d\n", ret);
@@ -785,8 +786,6 @@ nfsd_file_cache_init(void)
 	nfsd_file_slab = NULL;
 	kmem_cache_destroy(nfsd_file_mark_slab);
 	nfsd_file_mark_slab = NULL;
-	destroy_workqueue(nfsd_filecache_wq);
-	nfsd_filecache_wq = NULL;
 	rhltable_destroy(&nfsd_file_rhltable);
 	goto out;
 }
@@ -832,7 +831,6 @@ nfsd_alloc_fcache_disposal(void)
 	l = kmalloc(sizeof(*l), GFP_KERNEL);
 	if (!l)
 		return NULL;
-	INIT_WORK(&l->work, nfsd_file_delayed_close);
 	spin_lock_init(&l->lock);
 	INIT_LIST_HEAD(&l->freeme);
 	return l;
@@ -841,7 +839,6 @@ nfsd_alloc_fcache_disposal(void)
 static void
 nfsd_free_fcache_disposal(struct nfsd_fcache_disposal *l)
 {
-	cancel_work_sync(&l->work);
 	nfsd_file_dispose_list(&l->freeme);
 	kfree(l);
 }
@@ -910,8 +907,6 @@ nfsd_file_cache_shutdown(void)
 	fsnotify_wait_marks_destroyed();
 	kmem_cache_destroy(nfsd_file_mark_slab);
 	nfsd_file_mark_slab = NULL;
-	destroy_workqueue(nfsd_filecache_wq);
-	nfsd_filecache_wq = NULL;
 	rhltable_destroy(&nfsd_file_rhltable);
 
 	for_each_possible_cpu(i) {

diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index e54165a3224f0b..c61884def906d0 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -56,6 +56,7 @@ void nfsd_file_cache_shutdown_net(struct net *net);
 void nfsd_file_put(struct nfsd_file *nf);
 struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
 void nfsd_file_close_inode_sync(struct inode *inode);
+void nfsd_file_net_dispose(struct nfsd_net *nn);
 bool nfsd_file_is_cached(struct inode *inode);
 __be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		  unsigned int may_flags, struct nfsd_file **nfp);

diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index a667802e08e75f..9a894c3511baf3 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -941,6 +941,8 @@ nfsd(void *vrqstp)
 		rqstp->rq_server->sv_maxconn = nn->max_connections;
 
 		svc_recv(rqstp);
+
+		nfsd_file_net_dispose(nn);
 	}
 
 	atomic_dec(&nfsdstats.th_cnt);

From 7086d8e69014a940979fd4f69964a7c41611800f Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Fri, 15 Dec 2023 12:18:31 +1100
Subject: [PATCH 0613/1406] nfsd: use __fput_sync() to avoid delayed closing
 of files.

Calling fput() directly or through filp_close() from a kernel thread
like nfsd causes the final __fput() (if necessary) to be called from a
workqueue.  This means that nfsd is not forced to wait for any work to
complete.  If the ->release or ->destroy_inode function is slow for any
reason, this can result in nfsd closing files more quickly than the
workqueue can complete the close and the queue of pending closes can
grow without bound (30 million has been seen at one customer site,
though this was in part due to a slowness in xfs which has since been
fixed).

nfsd does not need this.
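For background, a hedged sketch of the asymmetry described above
(simplified; see fput() and __fput_sync() in fs/file_table.c):

	/* kernel thread such as nfsd */
	fput(filp);		/* returns at once; the final __fput()
				 * is deferred to a workqueue */

	/* what this patch switches the hot paths to */
	get_file(filp);		/* pin so filp_close() is not the last put */
	filp_close(filp, NULL);
	__fput_sync(filp);	/* pay the close cost here and now */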
It is quite appropriate and safe for nfsd to do its own close work.
There is no reason that close should ever wait for nfsd, so no deadlock
can occur.

It should be safe and sensible to change all fput() calls to
__fput_sync().  However in the interests of caution this patch only
changes two - the two that can be most directly affected by client
behaviour and could occur at high frequency.

- the fput() implicit in filp_close() is changed to __fput_sync() by
  calling get_file() first to ensure filp_close() doesn't do the final
  fput() itself.  This is where files opened for IO are closed.

- the fput() in nfsd_readdir() is also changed.  This is where
  directories opened for readdir are closed.

This ensures that minimal fput work is queued to the workqueue.  This
removes the need for the flush_delayed_fput() call in
nfsd_file_close_inode_sync()

Signed-off-by: NeilBrown
Reviewed-by: Jeff Layton
Signed-off-by: Chuck Lever
---
 fs/nfsd/filecache.c |  3 +--
 fs/nfsd/vfs.c       | 42 +++++++++++++++++++++++++++++++++++++-----
 fs/nfsd/vfs.h       |  2 ++
 3 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index f8b100bca6e4da..8d9f7b07e35b39 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -280,7 +280,7 @@ nfsd_file_free(struct nfsd_file *nf)
 	nfsd_file_mark_put(nf->nf_mark);
 	if (nf->nf_file) {
 		nfsd_file_check_write_error(nf);
-		filp_close(nf->nf_file, NULL);
+		nfsd_filp_close(nf->nf_file);
 	}
 
 	/*
@@ -658,7 +658,6 @@ nfsd_file_close_inode_sync(struct inode *inode)
 		list_del_init(&nf->nf_lru);
 		nfsd_file_free(nf);
 	}
-	flush_delayed_fput();
 }
 
 static int

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index b7c7a9273ea01d..f57749cd6f0b1a 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1906,10 +1906,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 	fh_drop_write(ffhp);
 
 	/*
-	 * If the target dentry has cached open files, then we need to try to
-	 * close them prior to doing the rename.  Flushing delayed fput
-	 * shouldn't be done with locks held however, so we delay it until this
-	 * point and then reattempt the whole shebang.
+	 * If the target dentry has cached open files, then we need to
+	 * try to close them prior to doing the rename.  Final fput
+	 * shouldn't be done with locks held however, so we delay it
+	 * until this point and then reattempt the whole shebang.
 	 */
 	if (close_cached) {
 		close_cached = false;
@@ -2177,11 +2177,43 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
 	if (err == nfserr_eof || err == nfserr_toosmall)
 		err = nfs_ok; /* can still be found in ->err */
 out_close:
-	fput(file);
+	nfsd_filp_close(file);
 out:
 	return err;
 }
 
+/**
+ * nfsd_filp_close: close a file synchronously
+ * @fp: the file to close
+ *
+ * nfsd_filp_close() is similar in behaviour to filp_close().
+ * The difference is that if this is the final close on the
+ * file, then that finalisation happens immediately, rather than
+ * being handed over to a work_queue, as is the case for
+ * filp_close().
+ * When a user-space process closes a file (even when using
+ * filp_close()) the finalisation happens before returning to
+ * userspace, so it is effectively synchronous.  When a kernel thread
+ * uses filp_close(), on the other hand, the handling is completely
+ * asynchronous.  This means that any cost imposed by that finalisation
+ * is not imposed on the nfsd thread, and nfsd could potentially
+ * close files more quickly than the work queue finalises the close,
+ * which would lead to unbounded growth in the queue.
+ *
+ * In some contexts it is not safe to synchronously wait for
+ * close finalisation (see comment for __fput_sync()), but nfsd
+ * does not match those contexts.  In particular, it does not, at
+ * the time that this function is called, hold any locks, and no
+ * finalisation of any file, socket, or device driver would have any
+ * cause to wait for nfsd to make progress.
+ */
+void nfsd_filp_close(struct file *fp)
+{
+	get_file(fp);
+	filp_close(fp, NULL);
+	__fput_sync(fp);
+}
+
 /*
  * Get file system stats
  * N.B. After this call fhp needs an fh_put

diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 702fbc4483bf16..1efa4e8dfb0349 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -148,6 +148,8 @@ __be32		nfsd_statfs(struct svc_rqst *, struct svc_fh *,
 __be32		nfsd_permission(struct svc_rqst *, struct svc_export *,
 				struct dentry *, int);
 
+void		nfsd_filp_close(struct file *fp);
+
 static inline int fh_want_write(struct svc_fh *fh)
 {
 	int ret;

From 98be4be88369b5edfab44bdfc083fc7bc3ee71ea Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Wed, 17 Jan 2024 14:48:04 +1100
Subject: [PATCH 0614/1406] nfsd: drop st_mutex and rp_mutex before calling
 move_to_close_lru()

move_to_close_lru() is currently called with ->st_mutex and .rp_mutex
held.  This can lead to a deadlock as move_to_close_lru() waits for
sc_count to drop to 2, and some threads holding a reference might be
waiting for either mutex.  These references will never be dropped so
sc_count will never reach 2.

There can be no harm in dropping ->st_mutex before move_to_close_lru()
because the only place that takes the mutex is
nfsd4_lock_ol_stateid(), and it quickly aborts if sc_type is
NFS4_CLOSED_STID, which it will be before move_to_close_lru() is
called.

Similarly, dropping .rp_mutex is safe after the state is closed and so
no longer usable.  Another way to look at this is that nothing
significant happens between when nfsd4_close() now calls
nfsd4_cstate_clear_replay(), and where nfsd4_proc_compound() calls
nfsd4_cstate_clear_replay() a little later.

See also
https://lore.kernel.org/lkml/4dd1fe21e11344e5969bb112e954affb@jd.com/T/
where this problem was raised but not successfully resolved.
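To make the ordering concrete, a hedged sketch of the cycle described
above (schematic, not verbatim kernel code):

	/*
	 * thread A: nfsd4_close()        thread B: another nfsd op
	 *   mutex_lock(&stp->st_mutex)     takes an sc_count reference
	 *   move_to_close_lru(stp, net)    mutex_lock(&stp->st_mutex)
	 *     waits for sc_count == 2        ...blocks behind A...
	 *
	 * B only drops its reference after st_mutex is released, so
	 * sc_count never reaches 2 and A waits forever.
	 */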
Signed-off-by: NeilBrown
Reviewed-by: Jeff Layton
Signed-off-by: Chuck Lever
---
 fs/nfsd/nfs4state.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 7d6c657e0409dd..b4c2a5f4e10b79 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -6990,7 +6990,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 	return status;
 }
 
-static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
+static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
 {
 	struct nfs4_client *clp = s->st_stid.sc_client;
 	bool unhashed;
@@ -7007,11 +7007,11 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
 		list_for_each_entry(stp, &reaplist, st_locks)
 			nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
 		free_ol_stateid_reaplist(&reaplist);
+		return false;
 	} else {
 		spin_unlock(&clp->cl_lock);
 		free_ol_stateid_reaplist(&reaplist);
-		if (unhashed)
-			move_to_close_lru(s, clp->net);
+		return unhashed;
 	}
 }
 
@@ -7027,6 +7027,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct nfs4_ol_stateid *stp;
 	struct net *net = SVC_NET(rqstp);
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	bool need_move_to_close_list;
 
 	dprintk("NFSD: nfsd4_close on file %pd\n",
 			cstate->current_fh.fh_dentry);
@@ -7049,8 +7050,17 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 */
 	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
 
-	nfsd4_close_open_stateid(stp);
+	need_move_to_close_list = nfsd4_close_open_stateid(stp);
 	mutex_unlock(&stp->st_mutex);
+	if (need_move_to_close_list) {
+		/* Drop the replay mutex early as move_to_close_lru()
+		 * can wait for other threads which hold that mutex.
+		 * This call is idempotent, so the fact that it will
+		 * be called twice is harmless.
+		 */
+		nfsd4_cstate_clear_replay(cstate);
+		move_to_close_lru(stp, net);
+	}
 
 	/* v4.1+ suggests that we send a special stateid in here, since the
 	 * clients should just ignore this anyway. Since this is not useful

From 18427247cf4879e2cb6394b792fd43af7f2b5a3c Mon Sep 17 00:00:00 2001
From: Jorge Mora
Date: Thu, 25 Jan 2024 07:46:54 -0700
Subject: [PATCH 0615/1406] NFSD: fix nfsd4_listxattr_validate_cookie

If LISTXATTRS is sent with a correct cookie but a small maxcount, this
could lead nfsd4_listxattr_validate_cookie() to return
NFS4ERR_BAD_COOKIE.  If maxcount = 20, the second check in the function
gives RHS = 3, thus any cookie larger than 3 returns
NFS4ERR_BAD_COOKIE.

There is no need to validate the cookie on the return XDR buffer since
the attribute referenced by the cookie will be the first in the return
buffer.

Fixes: 23e50fe3a5e6 ("nfsd: implement the xattr functions and en/decode logic")
Signed-off-by: Jorge Mora
Reviewed-by: Jeff Layton
Signed-off-by: Chuck Lever
---
 fs/nfsd/nfs4xdr.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c719c475a068ef..f0be0d6fe63fd2 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -5386,16 +5386,11 @@ nfsd4_listxattr_validate_cookie(struct nfsd4_listxattrs *listxattrs,
 
 	/*
 	 * If the cookie is larger than the maximum number we can fit
-	 * in either the buffer we just got back from vfs_listxattr, or,
-	 * XDR-encoded, in the return buffer, it's invalid.
+	 * in the buffer we just got back from vfs_listxattr, it's invalid.
 	 */
 	if (cookie > (listxattrs->lsxa_len) / (XATTR_USER_PREFIX_LEN + 2))
 		return nfserr_badcookie;
 
-	if (cookie > (listxattrs->lsxa_maxcount /
-		      (XDR_QUADLEN(XATTR_USER_PREFIX_LEN + 2) + 4)))
-		return nfserr_badcookie;
-
 	*offsetp = (u32)cookie;
 	return 0;
 }

From aa5d148198206dc1191174fecd66466ba4b68c55 Mon Sep 17 00:00:00 2001
From: Jorge Mora
Date: Thu, 25 Jan 2024 07:46:12 -0700
Subject: [PATCH 0616/1406] NFSD: change LISTXATTRS cookie encoding to
 big-endian

Function nfsd4_listxattr_validate_cookie() expects the cookie as an
offset into the list, thus it needs to be encoded in big-endian.

Fixes: 23e50fe3a5e6 ("nfsd: implement the xattr functions and en/decode logic")
Signed-off-by: Jorge Mora
Reviewed-by: Jeff Layton
Signed-off-by: Chuck Lever
---
 fs/nfsd/nfs4xdr.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index f0be0d6fe63fd2..5649076df4b4f7 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -5407,6 +5407,7 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
 	u64 cookie;
 	char *sp;
 	__be32 status, tmp;
+	__be64 wire_cookie;
 	__be32 *p;
 	u32 nuser;
 
@@ -5498,7 +5499,8 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
 
 	cookie = offset + count;
 
-	write_bytes_to_xdr_buf(xdr->buf, cookie_offset, &cookie, 8);
+	wire_cookie = cpu_to_be64(cookie);
+	write_bytes_to_xdr_buf(xdr->buf, cookie_offset, &wire_cookie, 8);
 	tmp = cpu_to_be32(count);
 	write_bytes_to_xdr_buf(xdr->buf, count_offset, &tmp, 4);
 out:

From a08c8965e9b17fd96501f91eedbf47be940ec8f6 Mon Sep 17 00:00:00 2001
From: Jorge Mora
Date: Thu, 25 Jan 2024 07:45:28 -0700
Subject: [PATCH 0617/1406] NFSD: fix LISTXATTRS returning a short list with
 eof=TRUE

If the XDR buffer is not large enough to fit all attributes and the
remaining bytes left in the XDR buffer (xdrleft) are equal to the
number of bytes for the current attribute, then the loop will
prematurely exit without setting eof to FALSE.  Also in this case,
adding the eof flag to the buffer will make the reply 4 bytes larger
than lsxa_maxcount.

Need to check if there are enough bytes to fit not only the next
attribute name but also the eof flag.

Fixes: 23e50fe3a5e6 ("nfsd: implement the xattr functions and en/decode logic")
Signed-off-by: Jorge Mora
Reviewed-by: Jeff Layton
Signed-off-by: Chuck Lever
---
 fs/nfsd/nfs4xdr.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 5649076df4b4f7..840ecd7eaf0713 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -5447,7 +5447,8 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
 		slen -= XATTR_USER_PREFIX_LEN;
 		xdrlen = 4 + ((slen + 3) & ~3);
-		if (xdrlen > xdrleft) {
+		/* Check if both entry and eof can fit in the XDR buffer */
+		if (xdrlen + XDR_UNIT > xdrleft) {
 			if (count == 0) {
 				/*
 				 * Can't even fit the first attribute name.

From 986dbea0a704855884dc184b751f688eeb43f6c2 Mon Sep 17 00:00:00 2001
From: Jorge Mora
Date: Thu, 25 Jan 2024 07:42:23 -0700
Subject: [PATCH 0618/1406] NFSD: fix LISTXATTRS returning more bytes than
 maxcount

The maxcount is the maximum number of bytes for the LISTXATTRS4resok
result.  This includes the cookie and the count for the name array,
thus subtract 12 bytes from the maxcount: 8 (cookie) + 4 (array count)
when filling up the name array.
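A worked example of the byte accounting involved here (hedged; assumes
XDR_UNIT == 4 and XATTR_USER_PREFIX_LEN == 5, i.e. "user."):

	/* Reply overhead: 8 (cookie) + 4 (name-array count) = 12 bytes.
	 * With lsxa_maxcount = 20:
	 *   xdrleft = 20 - XDR_UNIT * 3 = 8 bytes left for names;
	 *   a name of slen <= 4 costs 4 + ((slen + 3) & ~3) = 8 bytes,
	 *   so at most one short name fits.
	 * The cookie check removed in patch 0615 computed
	 *   20 / (XDR_QUADLEN(5 + 2) + 4) = 20 / 6 = 3
	 * and so rejected perfectly valid cookies greater than 3. */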
Fixes: 23e50fe3a5e6 ("nfsd: implement the xattr functions and en/decode logic")
Signed-off-by: Jorge Mora
Reviewed-by: Jeff Layton
Signed-off-by: Chuck Lever
---
 fs/nfsd/nfs4xdr.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 840ecd7eaf0713..e3f761cd5ee78d 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -5423,7 +5423,7 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
 	 */
 	cookie_offset = xdr->buf->len;
 	count_offset = cookie_offset + 8;
-	p = xdr_reserve_space(xdr, 12);
+	p = xdr_reserve_space(xdr, XDR_UNIT * 3);
 	if (!p) {
 		status = nfserr_resource;
 		goto out;
@@ -5434,7 +5434,8 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
 	sp = listxattrs->lsxa_buf;
 	nuser = 0;
 
-	xdrleft = listxattrs->lsxa_maxcount;
+	/* Bytes left is maxcount - 8 (cookie) - 4 (array count) */
+	xdrleft = listxattrs->lsxa_maxcount - XDR_UNIT * 3;
 
 	while (left > 0 && xdrleft > 0) {
 		slen = strlen(sp);

From a3db2337ae4d5c1f950362a432e427f1b9f453a0 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 26 Jan 2024 10:39:40 -0500
Subject: [PATCH 0619/1406] sunrpc: don't change ->sv_stats if it doesn't
 exist

We check for the existence of ->sv_stats everywhere except in the core
processing code.  It appears that only nfsd actually exports these
values anywhere; everybody else just has a write-only copy of sv_stats
in their svc_program.  Add a check for ->sv_stats before every
adjustment to allow us to eliminate the stats struct from all the users
who don't report the stats.

Signed-off-by: Josef Bacik
Reviewed-by: Jeff Layton
Signed-off-by: Chuck Lever
---
 net/sunrpc/svc.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index b969e505c7b770..62afc3dbc83717 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1375,7 +1375,8 @@ svc_process_common(struct svc_rqst *rqstp)
 		goto err_bad_proc;
 
 	/* Syntactic check complete */
-	serv->sv_stats->rpccnt++;
+	if (serv->sv_stats)
+		serv->sv_stats->rpccnt++;
 	trace_svc_process(rqstp, progp->pg_name);
 
 	aoffset = xdr_stream_pos(xdr);
@@ -1427,7 +1428,8 @@ svc_process_common(struct svc_rqst *rqstp)
 	goto close_xprt;
 
 err_bad_rpc:
-	serv->sv_stats->rpcbadfmt++;
+	if (serv->sv_stats)
+		serv->sv_stats->rpcbadfmt++;
 	xdr_stream_encode_u32(xdr, RPC_MSG_DENIED);
 	xdr_stream_encode_u32(xdr, RPC_MISMATCH);
 	/* Only RPCv2 supported */
@@ -1438,7 +1440,8 @@ svc_process_common(struct svc_rqst *rqstp)
 err_bad_auth:
 	dprintk("svc: authentication failed (%d)\n",
 		be32_to_cpu(rqstp->rq_auth_stat));
-	serv->sv_stats->rpcbadauth++;
+	if (serv->sv_stats)
+		serv->sv_stats->rpcbadauth++;
 	/* Restore write pointer to location of reply status: */
 	xdr_truncate_encode(xdr, XDR_UNIT * 2);
 	xdr_stream_encode_u32(xdr, RPC_MSG_DENIED);
@@ -1448,7 +1451,8 @@ svc_process_common(struct svc_rqst *rqstp)
 
 err_bad_prog:
 	dprintk("svc: unknown program %d\n", rqstp->rq_prog);
-	serv->sv_stats->rpcbadfmt++;
+	if (serv->sv_stats)
+		serv->sv_stats->rpcbadfmt++;
 	*rqstp->rq_accept_statp = rpc_prog_unavail;
 	goto sendit;
 
@@ -1456,7 +1460,8 @@ svc_process_common(struct svc_rqst *rqstp)
 	svc_printk(rqstp, "unknown version (%d for prog %d, %s)\n",
 		       rqstp->rq_vers, rqstp->rq_prog, progp->pg_name);
 
-	serv->sv_stats->rpcbadfmt++;
+	if (serv->sv_stats)
+		serv->sv_stats->rpcbadfmt++;
 	*rqstp->rq_accept_statp = rpc_prog_mismatch;
 
 	/*
@@ -1470,19 +1475,22 @@ svc_process_common(struct svc_rqst *rqstp)
 
 err_bad_proc:
 	svc_printk(rqstp, "unknown procedure (%d)\n", rqstp->rq_proc);
-	serv->sv_stats->rpcbadfmt++;
+	if (serv->sv_stats)
+		serv->sv_stats->rpcbadfmt++;
 	*rqstp->rq_accept_statp = rpc_proc_unavail;
 	goto sendit;
 
err_garbage_args:
 	svc_printk(rqstp, "failed to decode RPC header\n");
 
-	serv->sv_stats->rpcbadfmt++;
+	if (serv->sv_stats)
+		serv->sv_stats->rpcbadfmt++;
 	*rqstp->rq_accept_statp = rpc_garbage_args;
 	goto sendit;
 
err_system_err:
-	serv->sv_stats->rpcbadfmt++;
+	if (serv->sv_stats)
+		serv->sv_stats->rpcbadfmt++;
 	*rqstp->rq_accept_statp = rpc_system_err;
 	goto sendit;
 }
@@ -1534,7 +1542,8 @@ void svc_process(struct svc_rqst *rqstp)
 out_baddir:
 	svc_printk(rqstp, "bad direction 0x%08x, dropping request\n",
 		   be32_to_cpu(*p));
-	rqstp->rq_server->sv_stats->rpcbadfmt++;
+	if (rqstp->rq_server->sv_stats)
+		rqstp->rq_server->sv_stats->rpcbadfmt++;
 out_drop:
 	svc_drop(rqstp);
 }

From daa8478a253ce050a8e0282f1f21f1cae452b412 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 26 Jan 2024 10:39:41 -0500
Subject: [PATCH 0620/1406] nfsd: stop setting ->pg_stats for unused stats

A lot of places are setting a blank svc_stat in ->pg_stats and never
utilizing these stats.  Remove all of these extra structs as we're not
reporting these stats anywhere.

Signed-off-by: Josef Bacik
Reviewed-by: Jeff Layton
Signed-off-by: Chuck Lever
---
 fs/lockd/svc.c    | 3 ---
 fs/nfs/callback.c | 3 ---
 fs/nfsd/nfssvc.c  | 5 -----
 3 files changed, 11 deletions(-)

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index ce5862482097a1..ab8042a5b895bc 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -710,8 +710,6 @@ static const struct svc_version *nlmsvc_version[] = {
 #endif
 };
 
-static struct svc_stat		nlmsvc_stats;
-
 #define NLM_NRVERS	ARRAY_SIZE(nlmsvc_version)
 static struct svc_program	nlmsvc_program = {
 	.pg_prog		= NLM_PROGRAM,		/* program number */
@@ -719,7 +717,6 @@ static struct svc_program	nlmsvc_program = {
 	.pg_vers		= nlmsvc_version,	/* version table */
 	.pg_name		= "lockd",		/* service name */
 	.pg_class		= "nfsd",		/* share authentication with nfsd */
-	.pg_stats		= &nlmsvc_stats,	/* stats table */
 	.pg_authenticate	= &lockd_authenticate,	/* export authentication */
 	.pg_init_request	= svc_generic_init_request,
 	.pg_rpcbind_set		= svc_generic_rpcbind_set,

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 760d27dd7225e9..8adfcd4c8c1a0a 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -356,15 +356,12 @@ static const struct svc_version *nfs4_callback_version[] = {
 	[4] = &nfs4_callback_version4,
 };
 
-static struct svc_stat nfs4_callback_stats;
-
 static struct svc_program nfs4_callback_program = {
 	.pg_prog = NFS4_CALLBACK,			/* RPC service number */
 	.pg_nvers = ARRAY_SIZE(nfs4_callback_version),	/* Number of entries */
 	.pg_vers = nfs4_callback_version,		/* version table */
 	.pg_name = "NFSv4 callback",			/* service name */
 	.pg_class = "nfs",				/* authentication class */
-	.pg_stats = &nfs4_callback_stats,
 	.pg_authenticate = nfs_callback_authenticate,
 	.pg_init_request = svc_generic_init_request,
 	.pg_rpcbind_set	= svc_generic_rpcbind_set,

diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 9a894c3511baf3..a0b117107e8605 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -80,7 +80,6 @@ unsigned long	nfsd_drc_max_mem;
 unsigned long	nfsd_drc_mem_used;
 
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
-static struct svc_stat	nfsd_acl_svcstats;
 static const struct svc_version *nfsd_acl_version[] = {
 # if defined(CONFIG_NFSD_V2_ACL)
 	[2] = &nfsd_acl_version2,
@@ -99,15 +98,11 @@ static struct svc_program	nfsd_acl_program = {
 	.pg_vers		= nfsd_acl_version,
 	.pg_name		= "nfsacl",
 	.pg_class		= "nfsd",
-	.pg_stats		= &nfsd_acl_svcstats,
 	.pg_authenticate	= &svc_set_client,
 	.pg_init_request	= nfsd_acl_init_request,
 	.pg_rpcbind_set		= nfsd_acl_rpcbind_set,
 };
 
-static struct svc_stat	nfsd_acl_svcstats = {
-	.program	= &nfsd_acl_program,
-};
 #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
 
 static const struct svc_version *nfsd_version[] = {

From 6fd8b770a4e03bdae2dcc9fa648a41b84a966955 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 26 Jan 2024 10:39:42 -0500
Subject: [PATCH 0621/1406] sunrpc: pass in the sv_stats struct through
 svc_create_pooled

Since only one service actually reports the rpc stats there's not much
of a reason to have a pointer to it in the svc_program struct.  Adjust
the svc_create_pooled function to take the sv_stats as an argument and
pass the struct through there as desired instead of getting it from
svc_program->pg_stats.

Signed-off-by: Josef Bacik
Reviewed-by: Jeff Layton
Signed-off-by: Chuck Lever
---
 fs/nfsd/nfssvc.c           |  3 ++-
 include/linux/sunrpc/svc.h |  4 +++-
 net/sunrpc/svc.c           | 12 +++++++-----
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index a0b117107e8605..d640f893021a71 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -661,7 +661,8 @@ int nfsd_create_serv(struct net *net)
 	if (nfsd_max_blksize == 0)
 		nfsd_max_blksize = nfsd_get_default_max_blksize();
 	nfsd_reset_versions(nn);
-	serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, nfsd);
+	serv = svc_create_pooled(&nfsd_program, &nfsd_svcstats,
+				 nfsd_max_blksize, nfsd);
 	if (serv == NULL)
 		return -ENOMEM;
 
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 67cf1c9efd809b..91a653eb3a5073 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -411,7 +411,9 @@ bool		   svc_rqst_replace_page(struct svc_rqst *rqstp,
 void		   svc_rqst_release_pages(struct svc_rqst *rqstp);
 void		   svc_rqst_free(struct svc_rqst *);
 void		   svc_exit_thread(struct svc_rqst *);
-struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
+struct svc_serv *  svc_create_pooled(struct svc_program *prog,
+				     struct svc_stat *stats,
+				     unsigned int bufsize,
 				     int (*threadfn)(void *data));
 int		   svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
 int		   svc_pool_stats_open(struct svc_info *si, struct file *file);

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 62afc3dbc83717..1ce6a3b7175caf 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -451,8 +451,8 @@ __svc_init_bc(struct svc_serv *serv)
  * Create an RPC service
  */
 static struct svc_serv *
-__svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
-	     int (*threadfn)(void *data))
+__svc_create(struct svc_program *prog, struct svc_stat *stats,
+	     unsigned int bufsize, int npools, int (*threadfn)(void *data))
 {
 	struct svc_serv	*serv;
 	unsigned int vers;
@@ -463,7 +463,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 		return NULL;
 	serv->sv_name      = prog->pg_name;
 	serv->sv_program   = prog;
-	serv->sv_stats     = prog->pg_stats;
+	serv->sv_stats     = stats;
 	if (bufsize > RPCSVC_MAXPAYLOAD)
 		bufsize = RPCSVC_MAXPAYLOAD;
 	serv->sv_max_payload = bufsize? bufsize : 4096;
@@ -529,26 +529,28 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 struct svc_serv *svc_create(struct svc_program *prog, unsigned int bufsize,
 			    int (*threadfn)(void *data))
 {
-	return __svc_create(prog, bufsize, 1, threadfn);
+	return __svc_create(prog, NULL, bufsize, 1, threadfn);
 }
 EXPORT_SYMBOL_GPL(svc_create);
 
 /**
  * svc_create_pooled - Create an RPC service with pooled threads
  * @prog: the RPC program the new service will handle
+ * @stats: the stats struct if desired
  * @bufsize: maximum message size for @prog
  * @threadfn: a function to service RPC requests for @prog
  *
 * Returns an instantiated struct svc_serv object or NULL.
 */
 struct svc_serv *svc_create_pooled(struct svc_program *prog,
+				   struct svc_stat *stats,
 				   unsigned int bufsize,
 				   int (*threadfn)(void *data))
 {
 	struct svc_serv *serv;
 	unsigned int npools = svc_pool_map_get();
 
-	serv = __svc_create(prog, bufsize, npools, threadfn);
+	serv = __svc_create(prog, stats, bufsize, npools, threadfn);
 	if (!serv)
 		goto out_err;
 	return serv;

From 38d6f824667518e990024c71bb019525e0be7470 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 26 Jan 2024 10:39:43 -0500
Subject: [PATCH 0622/1406] sunrpc: remove ->pg_stats from svc_program

Now that this isn't used anywhere, remove it.

Signed-off-by: Josef Bacik
Reviewed-by: Jeff Layton
Signed-off-by: Chuck Lever
---
 fs/nfsd/nfssvc.c           | 1 -
 include/linux/sunrpc/svc.h | 1 -
 2 files changed, 2 deletions(-)

diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index d640f893021a71..d98a6abad99010 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -127,7 +127,6 @@ struct svc_program		nfsd_program = {
 	.pg_vers		= nfsd_version,		/* version table */
 	.pg_name		= "nfsd",		/* program name */
 	.pg_class		= "nfsd",		/* authentication class */
-	.pg_stats		= &nfsd_svcstats,	/* version table */
 	.pg_authenticate	= &svc_set_client,	/* export authentication */
 	.pg_init_request	= nfsd_init_request,
 	.pg_rpcbind_set		= nfsd_rpcbind_set,

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 91a653eb3a5073..23617da0e565e7 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -339,7 +339,6 @@ struct svc_program {
 	const struct svc_version **pg_vers;	/* version array */
 	char *			pg_name;	/* service name */
 	char *			pg_class;	/* class name: services sharing authentication */
-	struct svc_stat *	pg_stats;	/* rpc statistics */
 	enum svc_auth_status	(*pg_authenticate)(struct svc_rqst *rqstp);
 	__be32			(*pg_init_request)(struct svc_rqst *,
 						   const struct svc_program *,

From ceee57998de9b45885b7acbf7bf6ad0fcde18fc5 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 26 Jan 2024 10:39:44 -0500
Subject: [PATCH 0623/1406] sunrpc: use the struct net as the svc proc private

nfsd is the only thing using this helper, and it doesn't use the
private currently.  When we switch to per-network namespace stats we
will need the struct net * in order to get to the nfsd_net.  Use the
net as the proc private so we can utilize this when we make the switch
over.
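For illustration, a hedged sketch of the consumer this enables (the
show routine shown is hypothetical, but pde_data() and net_generic()
are the standard accessors):

	static int nfsd_proc_show(struct seq_file *seq, void *v)
	{
		struct net *net = pde_data(file_inode(seq->file));
		struct nfsd_net *nn = net_generic(net, nfsd_net_id);

		/* per-namespace counters are now reachable via nn */
		return 0;
	}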
Signed-off-by: Josef Bacik
Reviewed-by: Jeff Layton
Signed-off-by: Chuck Lever
---
 net/sunrpc/stats.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 65fc1297c6dfa4..383860cb1d5b0f 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -314,7 +314,7 @@ EXPORT_SYMBOL_GPL(rpc_proc_unregister);
 struct proc_dir_entry *
 svc_proc_register(struct net *net, struct svc_stat *statp,
 		  const struct proc_ops *proc_ops)
 {
-	return do_register(net, statp->program->pg_name, statp, proc_ops);
+	return do_register(net, statp->program->pg_name, net, proc_ops);
 }
 EXPORT_SYMBOL_GPL(svc_proc_register);

From 4758da365c12c1129cd79310ce9ca9f382028baa Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 26 Jan 2024 10:39:45 -0500
Subject: [PATCH 0624/1406] nfsd: rename NFSD_NET_* to NFSD_STATS_*

We're going to merge all of the stats into the per-network-namespace
struct in subsequent patches, so rename these nn counters to be
consistent with the rest of the stats.

Signed-off-by: Josef Bacik
Reviewed-by: Jeff Layton
Signed-off-by: Chuck Lever
---
 fs/nfsd/netns.h    | 4 ++--
 fs/nfsd/nfscache.c | 4 ++--
 fs/nfsd/stats.h    | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 74b4360779a112..e3605cb5f044d8 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -26,9 +26,9 @@ struct nfsd4_client_tracking_ops;
 
 enum {
 	/* cache misses due only to checksum comparison failures */
-	NFSD_NET_PAYLOAD_MISSES,
+	NFSD_STATS_PAYLOAD_MISSES,
 	/* amount of memory (in bytes) currently consumed by the DRC */
-	NFSD_NET_DRC_MEM_USAGE,
+	NFSD_STATS_DRC_MEM_USAGE,
 	NFSD_NET_COUNTERS_NUM
 };
 
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 5c1a4a0aa60568..3d4a9d181c43e2 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -687,7 +687,7 @@ int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&nn->num_drc_entries));
 	seq_printf(m, "hash buckets:          %u\n", 1 << nn->maskbits);
 	seq_printf(m, "mem usage:             %lld\n",
-		   percpu_counter_sum_positive(&nn->counter[NFSD_NET_DRC_MEM_USAGE]));
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_DRC_MEM_USAGE]));
 	seq_printf(m, "cache hits:            %lld\n",
 		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS]));
 	seq_printf(m, "cache misses:          %lld\n",
@@ -695,7 +695,7 @@ int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
 	seq_printf(m, "not cached:            %lld\n",
 		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]));
 	seq_printf(m, "payload misses:        %lld\n",
-		   percpu_counter_sum_positive(&nn->counter[NFSD_NET_PAYLOAD_MISSES]));
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_PAYLOAD_MISSES]));
 	seq_printf(m, "longest chain len:     %u\n", nn->longest_chain);
 	seq_printf(m, "cachesize at longest:  %u\n", nn->longest_chain_cachesize);
 	return 0;

diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index 14f50c660b619e..7ed4325ac69123 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -81,17 +81,17 @@ static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount)
 
 static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn)
 {
-	percpu_counter_inc(&nn->counter[NFSD_NET_PAYLOAD_MISSES]);
+	percpu_counter_inc(&nn->counter[NFSD_STATS_PAYLOAD_MISSES]);
 }
 
 static inline void nfsd_stats_drc_mem_usage_add(struct nfsd_net *nn, s64 amount)
 {
-	percpu_counter_add(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount);
+	percpu_counter_add(&nn->counter[NFSD_STATS_DRC_MEM_USAGE], amount);
 }
 
 static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount)
 {
-	percpu_counter_sub(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount);
+	percpu_counter_sub(&nn->counter[NFSD_STATS_DRC_MEM_USAGE], amount);
 }
 
 #ifdef CONFIG_NFSD_V4

From 0ce5a735575185181b911278a00db6f4575b1dfb Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 26 Jan 2024 10:39:46 -0500
Subject: [PATCH 0625/1406] nfsd: expose /proc/net/sunrpc/nfsd in net
 namespaces

We are running nfsd servers inside of containers with their own network
namespace, and we want to monitor these services using the stats found
in /proc.  However these are not exposed in the proc inside of the
container, so we have to bind mount the host /proc into our containers
to get at this information.

Separate out the stat counters init and the proc registration, and move
the proc registration into the pernet operations entry and exit points
so that these stats can be exposed inside of network namespaces.

This is an intermediate step: it just exposes the global counters in
the network namespace.  Subsequent patches will move these counters
into the per-network namespace container.

Signed-off-by: Josef Bacik
Reviewed-by: Jeff Layton
Signed-off-by: Chuck Lever
---
 fs/nfsd/nfsctl.c |  8 +++++---
 fs/nfsd/stats.c  | 21 ++++++---------------
 fs/nfsd/stats.h  |  6 ++++--
 3 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index f206ca32e7f53c..b57480b50e350c 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1679,6 +1679,7 @@ static __net_init int nfsd_net_init(struct net *net)
 	nfsd4_init_leases_net(nn);
 	get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
 	seqlock_init(&nn->writeverf_lock);
+	nfsd_proc_stat_init(net);
 
 	return 0;
 
@@ -1699,6 +1700,7 @@ static __net_exit void nfsd_net_exit(struct net *net)
 {
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
+	nfsd_proc_stat_shutdown(net);
 	nfsd_net_reply_cache_destroy(nn);
 	nfsd_idmap_shutdown(net);
 	nfsd_export_shutdown(net);
@@ -1722,7 +1724,7 @@ static int __init init_nfsd(void)
 	retval = nfsd4_init_pnfs();
 	if (retval)
 		goto out_free_slabs;
-	retval = nfsd_stat_init();	/* Statistics */
+	retval = nfsd_stat_counters_init();	/* Statistics */
 	if (retval)
 		goto out_free_pnfs;
 	retval = nfsd_drc_slab_create();
@@ -1762,7 +1764,7 @@ static int __init init_nfsd(void)
 	nfsd_lockd_shutdown();
 	nfsd_drc_slab_free();
 out_free_stat:
-	nfsd_stat_shutdown();
+	nfsd_stat_counters_destroy();
 out_free_pnfs:
 	nfsd4_exit_pnfs();
 out_free_slabs:
@@ -1780,7 +1782,7 @@ static void __exit exit_nfsd(void)
 	nfsd_drc_slab_free();
 	remove_proc_entry("fs/nfs/exports", NULL);
 	remove_proc_entry("fs/nfs", NULL);
-	nfsd_stat_shutdown();
+	nfsd_stat_counters_destroy();
 	nfsd_lockd_shutdown();
 	nfsd4_free_slabs();
 	nfsd4_exit_pnfs();

diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 12d79f5d4eb1ac..394a65a33942d7 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -108,31 +108,22 @@ void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num)
 		percpu_counter_destroy(&counters[i]);
 }
 
-static int nfsd_stat_counters_init(void)
+int nfsd_stat_counters_init(void)
 {
 	return nfsd_percpu_counters_init(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM);
 }
 
-static void nfsd_stat_counters_destroy(void)
+void nfsd_stat_counters_destroy(void)
 {
 	nfsd_percpu_counters_destroy(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM);
 }
 
-int nfsd_stat_init(void)
+void nfsd_proc_stat_init(struct net *net)
 {
-	int err;
-
-	err = nfsd_stat_counters_init();
-	if (err)
-		return err;
-
-	svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_ops);
-
-	return 0;
+	svc_proc_register(net, &nfsd_svcstats, &nfsd_proc_ops);
 }
 
-void nfsd_stat_shutdown(void)
+void nfsd_proc_stat_shutdown(struct net *net)
 {
-	nfsd_stat_counters_destroy();
-	svc_proc_unregister(&init_net, "nfsd");
+	svc_proc_unregister(net, "nfsd");
 }

diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index 7ed4325ac69123..38811aa7d13e1e 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -40,8 +40,10 @@ extern struct svc_stat		nfsd_svcstats;
 int nfsd_percpu_counters_init(struct percpu_counter *counters, int num);
 void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num);
 void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num);
-int nfsd_stat_init(void);
-void nfsd_stat_shutdown(void);
+int nfsd_stat_counters_init(void);
+void nfsd_stat_counters_destroy(void);
+void nfsd_proc_stat_init(struct net *net);
+void nfsd_proc_stat_shutdown(struct net *net);
 
 static inline void nfsd_stats_rc_hits_inc(void)
 {

From 54df72b904d97902ba9d5d4f9bc91eef630f3204 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 26 Jan 2024 10:39:47 -0500
Subject: [PATCH 0626/1406] nfsd: make all of the nfsd stats per-network
 namespace

We have a global set of counters that we modify for all of the nfsd
operations, but now that we're exposing these stats across all network
namespaces we need to make the stats also be per-network namespace.
We already have some caching stats that are per-network namespace, so
move these definitions into the same counter and then adjust all the
helpers and users of these stats to provide the appropriate nfsd_net
struct so that the stats are maintained for the per-network namespace
objects.

Signed-off-by: Josef Bacik
Reviewed-by: Jeff Layton
Signed-off-by: Chuck Lever
---
 fs/nfsd/cache.h     |  2 --
 fs/nfsd/netns.h     | 17 ++++++++++++--
 fs/nfsd/nfs4proc.c  |  6 ++---
 fs/nfsd/nfs4state.c |  3 ++-
 fs/nfsd/nfscache.c  | 36 ++++++------------------------
 fs/nfsd/nfsctl.c    | 12 +++-------
 fs/nfsd/nfsfh.c     |  3 ++-
 fs/nfsd/stats.c     | 26 ++++++++++++----------
 fs/nfsd/stats.h     | 54 ++++++++++++++++-----------------------------
 fs/nfsd/vfs.c       |  6 +++--
 10 files changed, 69 insertions(+), 96 deletions(-)

diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index 4cbe0434cbb8ce..66a05fefae98ea 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -80,8 +80,6 @@ enum {
 
 int	nfsd_drc_slab_create(void);
 void	nfsd_drc_slab_free(void);
-int	nfsd_net_reply_cache_init(struct nfsd_net *nn);
-void	nfsd_net_reply_cache_destroy(struct nfsd_net *nn);
 int	nfsd_reply_cache_init(struct nfsd_net *);
 void	nfsd_reply_cache_shutdown(struct nfsd_net *);
 int	nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,

diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index e3605cb5f044d8..0cef4bb407a9c6 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -29,7 +30,19 @@ enum {
 	NFSD_STATS_PAYLOAD_MISSES,
 	/* amount of memory (in bytes) currently consumed by the DRC */
 	NFSD_STATS_DRC_MEM_USAGE,
-	NFSD_NET_COUNTERS_NUM
+	NFSD_STATS_RC_HITS,		/* repcache hits */
+	NFSD_STATS_RC_MISSES,		/* repcache misses */
+	NFSD_STATS_RC_NOCACHE,		/* uncached reqs */
+	NFSD_STATS_FH_STALE,		/* FH stale error */
+	NFSD_STATS_IO_READ,		/* bytes returned to read requests */
+	NFSD_STATS_IO_WRITE,		/* bytes passed in write requests */
+#ifdef CONFIG_NFSD_V4
+	NFSD_STATS_FIRST_NFS4_OP,	/* count of individual nfsv4 operations */
+	NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP,
+#define NFSD_STATS_NFS4_OP(op)	(NFSD_STATS_FIRST_NFS4_OP + (op))
+	NFSD_STATS_WDELEG_GETATTR,	/* count of getattr conflict with wdeleg */
+#endif
+	NFSD_STATS_COUNTERS_NUM
 };
 
 /*
@@ -164,7 +177,7 @@ struct nfsd_net {
 	atomic_t num_drc_entries;
 
 	/* Per-netns stats counters */
-	struct percpu_counter counter[NFSD_NET_COUNTERS_NUM];
+	struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM];
 
 	/* longest hash chain seen */
 	unsigned int longest_chain;

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 14712fa08f769e..648ff427005e6c 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -2490,10 +2490,10 @@ nfsd4_proc_null(struct svc_rqst *rqstp)
 	return rpc_success;
 }
 
-static inline void nfsd4_increment_op_stats(u32 opnum)
+static inline void nfsd4_increment_op_stats(struct nfsd_net *nn, u32 opnum)
 {
 	if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP)
-		percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_NFS4_OP(opnum)]);
+		percpu_counter_inc(&nn->counter[NFSD_STATS_NFS4_OP(opnum)]);
 }
 
 static const struct nfsd4_operation nfsd4_ops[];
@@ -2768,7 +2768,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
 			   status, nfsd4_op_name(op->opnum));
 
 		nfsd4_cstate_clear_replay(cstate);
-		nfsd4_increment_op_stats(op->opnum);
+		nfsd4_increment_op_stats(nn, op->opnum);
 	}
 
 	fh_put(current_fh);

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b4c2a5f4e10b79..a16039968335e3 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -8460,6 +8460,7 @@ __be32
 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode)
 {
 	__be32 status;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 	struct file_lock_context *ctx;
 	struct file_lock *fl;
 	struct nfs4_delegation *dp;
@@ -8489,7 +8490,7 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode)
 			}
 break_lease:
 			spin_unlock(&ctx->flc_lock);
-			nfsd_stats_wdeleg_getattr_inc();
+			nfsd_stats_wdeleg_getattr_inc(nn);
 			status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
 			if (status != nfserr_jukebox ||
 			    !nfsd_wait_for_delegreturn(rqstp, inode))

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 3d4a9d181c43e2..cfcc6ac8f255a8 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -176,27 +176,6 @@ void nfsd_drc_slab_free(void)
 	kmem_cache_destroy(drc_slab);
 }
 
-/**
- * nfsd_net_reply_cache_init - per net namespace reply cache set-up
- * @nn: nfsd_net being initialized
- *
- * Returns zero on succes; otherwise a negative errno is returned.
- */ -int nfsd_net_reply_cache_init(struct nfsd_net *nn) -{ - return nfsd_percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM); -} - -/** - * nfsd_net_reply_cache_destroy - per net namespace reply cache tear-down - * @nn: nfsd_net being freed - * - */ -void nfsd_net_reply_cache_destroy(struct nfsd_net *nn) -{ - nfsd_percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM); -} - int nfsd_reply_cache_init(struct nfsd_net *nn) { unsigned int hashsize; @@ -501,7 +480,7 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct nfsd_cacherep *key, int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, unsigned int len, struct nfsd_cacherep **cacherep) { - struct nfsd_net *nn; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct nfsd_cacherep *rp, *found; __wsum csum; struct nfsd_drc_bucket *b; @@ -510,7 +489,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, int rtn = RC_DOIT; if (type == RC_NOCACHE) { - nfsd_stats_rc_nocache_inc(); + nfsd_stats_rc_nocache_inc(nn); goto out; } @@ -520,7 +499,6 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, * Since the common case is a cache miss followed by an insert, * preallocate an entry. */ - nn = net_generic(SVC_NET(rqstp), nfsd_net_id); rp = nfsd_cacherep_alloc(rqstp, csum, nn); if (!rp) goto out; @@ -537,7 +515,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, nfsd_cacherep_dispose(&dispose); - nfsd_stats_rc_misses_inc(); + nfsd_stats_rc_misses_inc(nn); atomic_inc(&nn->num_drc_entries); nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp)); goto out; @@ -545,7 +523,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, found_entry: /* We found a matching entry which is either in progress or done. */ nfsd_reply_cache_free_locked(NULL, rp, nn); - nfsd_stats_rc_hits_inc(); + nfsd_stats_rc_hits_inc(nn); rtn = RC_DROPIT; rp = found; @@ -689,11 +667,11 @@ int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "mem usage: %lld\n", percpu_counter_sum_positive(&nn->counter[NFSD_STATS_DRC_MEM_USAGE])); seq_printf(m, "cache hits: %lld\n", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_HITS])); seq_printf(m, "cache misses: %lld\n", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_MISSES])); seq_printf(m, "not cached: %lld\n", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_NOCACHE])); seq_printf(m, "payload misses: %lld\n", percpu_counter_sum_positive(&nn->counter[NFSD_STATS_PAYLOAD_MISSES])); seq_printf(m, "longest chain len: %u\n", nn->longest_chain); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index b57480b50e350c..ea3c8114245c28 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1671,7 +1671,7 @@ static __net_init int nfsd_net_init(struct net *net) retval = nfsd_idmap_init(net); if (retval) goto out_idmap_error; - retval = nfsd_net_reply_cache_init(nn); + retval = nfsd_stat_counters_init(nn); if (retval) goto out_repcache_error; nn->nfsd_versions = NULL; @@ -1701,7 +1701,7 @@ static __net_exit void nfsd_net_exit(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfsd_proc_stat_shutdown(net); - nfsd_net_reply_cache_destroy(nn); + nfsd_stat_counters_destroy(nn); nfsd_idmap_shutdown(net); nfsd_export_shutdown(net); nfsd_netns_free_versions(nn); @@ -1724,12 +1724,9 @@ 
static int __init init_nfsd(void) retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; - retval = nfsd_stat_counters_init(); /* Statistics */ - if (retval) - goto out_free_pnfs; retval = nfsd_drc_slab_create(); if (retval) - goto out_free_stat; + goto out_free_pnfs; nfsd_lockd_init(); /* lockd->nfsd callbacks */ retval = create_proc_exports_entry(); if (retval) @@ -1763,8 +1760,6 @@ static int __init init_nfsd(void) out_free_lockd: nfsd_lockd_shutdown(); nfsd_drc_slab_free(); -out_free_stat: - nfsd_stat_counters_destroy(); out_free_pnfs: nfsd4_exit_pnfs(); out_free_slabs: @@ -1782,7 +1777,6 @@ static void __exit exit_nfsd(void) nfsd_drc_slab_free(); remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); - nfsd_stat_counters_destroy(); nfsd_lockd_shutdown(); nfsd4_free_slabs(); nfsd4_exit_pnfs(); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index dbfa0ac13564ac..40fecf7b224f2f 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -327,6 +327,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) __be32 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) { + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct svc_export *exp = NULL; struct dentry *dentry; __be32 error; @@ -395,7 +396,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) out: trace_nfsd_fh_verify_err(rqstp, fhp, type, access, error); if (error == nfserr_stale) - nfsd_stats_fh_stale_inc(exp); + nfsd_stats_fh_stale_inc(nn, exp); return error; } diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 394a65a33942d7..44e275324b06e5 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -34,15 +34,17 @@ struct svc_stat nfsd_svcstats = { static int nfsd_show(struct seq_file *seq, void *v) { + struct net *net = pde_data(file_inode(seq->file)); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); int i; seq_printf(seq, "rc %lld %lld %lld\nfh %lld 0 0 0 0\nio %lld %lld\n", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_FH_STALE]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_READ]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_WRITE])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_HITS]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_MISSES]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_NOCACHE]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_FH_STALE]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_READ]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_WRITE])); /* thread usage: */ seq_printf(seq, "th %u 0", atomic_read(&nfsdstats.th_cnt)); @@ -63,10 +65,10 @@ static int nfsd_show(struct seq_file *seq, void *v) seq_printf(seq, "proc4ops %u", LAST_NFS4_OP + 1); for (i = 0; i <= LAST_NFS4_OP; i++) { seq_printf(seq, " %lld", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_NFS4_OP(i)])); } seq_printf(seq, "\nwdeleg_getattr %lld", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_WDELEG_GETATTR])); seq_putc(seq, '\n'); #endif @@ -108,14 +110,14 @@ void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int 
num) percpu_counter_destroy(&counters[i]); } -int nfsd_stat_counters_init(void) +int nfsd_stat_counters_init(struct nfsd_net *nn) { - return nfsd_percpu_counters_init(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM); + return nfsd_percpu_counters_init(nn->counter, NFSD_STATS_COUNTERS_NUM); } -void nfsd_stat_counters_destroy(void) +void nfsd_stat_counters_destroy(struct nfsd_net *nn) { - nfsd_percpu_counters_destroy(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM); + nfsd_percpu_counters_destroy(nn->counter, NFSD_STATS_COUNTERS_NUM); } void nfsd_proc_stat_init(struct net *net) diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 38811aa7d13e1e..c24be4ddbe7d70 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -10,26 +10,7 @@ #include #include - -enum { - NFSD_STATS_RC_HITS, /* repcache hits */ - NFSD_STATS_RC_MISSES, /* repcache misses */ - NFSD_STATS_RC_NOCACHE, /* uncached reqs */ - NFSD_STATS_FH_STALE, /* FH stale error */ - NFSD_STATS_IO_READ, /* bytes returned to read requests */ - NFSD_STATS_IO_WRITE, /* bytes passed in write requests */ -#ifdef CONFIG_NFSD_V4 - NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */ - NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP, -#define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op)) - NFSD_STATS_WDELEG_GETATTR, /* count of getattr conflict with wdeleg */ -#endif - NFSD_STATS_COUNTERS_NUM -}; - struct nfsd_stats { - struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM]; - atomic_t th_cnt; /* number of available threads */ }; @@ -40,43 +21,46 @@ extern struct svc_stat nfsd_svcstats; int nfsd_percpu_counters_init(struct percpu_counter *counters, int num); void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num); void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num); -int nfsd_stat_counters_init(void); -void nfsd_stat_counters_destroy(void); +int nfsd_stat_counters_init(struct nfsd_net *nn); +void nfsd_stat_counters_destroy(struct nfsd_net *nn); void nfsd_proc_stat_init(struct net *net); void nfsd_proc_stat_shutdown(struct net *net); -static inline void nfsd_stats_rc_hits_inc(void) +static inline void nfsd_stats_rc_hits_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_HITS]); + percpu_counter_inc(&nn->counter[NFSD_STATS_RC_HITS]); } -static inline void nfsd_stats_rc_misses_inc(void) +static inline void nfsd_stats_rc_misses_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_MISSES]); + percpu_counter_inc(&nn->counter[NFSD_STATS_RC_MISSES]); } -static inline void nfsd_stats_rc_nocache_inc(void) +static inline void nfsd_stats_rc_nocache_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]); + percpu_counter_inc(&nn->counter[NFSD_STATS_RC_NOCACHE]); } -static inline void nfsd_stats_fh_stale_inc(struct svc_export *exp) +static inline void nfsd_stats_fh_stale_inc(struct nfsd_net *nn, + struct svc_export *exp) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_FH_STALE]); + percpu_counter_inc(&nn->counter[NFSD_STATS_FH_STALE]); if (exp && exp->ex_stats) percpu_counter_inc(&exp->ex_stats->counter[EXP_STATS_FH_STALE]); } -static inline void nfsd_stats_io_read_add(struct svc_export *exp, s64 amount) +static inline void nfsd_stats_io_read_add(struct nfsd_net *nn, + struct svc_export *exp, s64 amount) { - percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_READ], amount); + percpu_counter_add(&nn->counter[NFSD_STATS_IO_READ], amount); if (exp && exp->ex_stats) 
 		percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_READ], amount);
 }
 
-static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount)
+static inline void nfsd_stats_io_write_add(struct nfsd_net *nn,
+					   struct svc_export *exp, s64 amount)
 {
-	percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_WRITE], amount);
+	percpu_counter_add(&nn->counter[NFSD_STATS_IO_WRITE], amount);
 	if (exp && exp->ex_stats)
 		percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_WRITE], amount);
 }
@@ -97,9 +81,9 @@ static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount)
 }
 
 #ifdef CONFIG_NFSD_V4
-static inline void nfsd_stats_wdeleg_getattr_inc(void)
+static inline void nfsd_stats_wdeleg_getattr_inc(struct nfsd_net *nn)
 {
-	percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR]);
+	percpu_counter_inc(&nn->counter[NFSD_STATS_WDELEG_GETATTR]);
 }
 #endif
 #endif /* _NFSD_STATS_H */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index f57749cd6f0b1a..38952105ed7fd4 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1002,7 +1002,9 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 			       unsigned long *count, u32 *eof, ssize_t host_err)
 {
 	if (host_err >= 0) {
-		nfsd_stats_io_read_add(fhp->fh_export, host_err);
+		struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+		nfsd_stats_io_read_add(nn, fhp->fh_export, host_err);
 		*eof = nfsd_eof_on_read(file, offset, host_err, *count);
 		*count = host_err;
 		fsnotify_access(file);
@@ -1185,7 +1187,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
 		goto out_nfserr;
 	}
 	*cnt = host_err;
-	nfsd_stats_io_write_add(exp, *cnt);
+	nfsd_stats_io_write_add(nn, exp, *cnt);
 	fsnotify_modify(file);
 	host_err = filemap_check_wb_err(file->f_mapping, since);
 	if (host_err < 0)

From b143a7a4c8fdf514de65292a12d338d28a18e45b Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 26 Jan 2024 10:39:48 -0500
Subject: [PATCH 0627/1406] nfsd: remove nfsd_stats, make th_cnt a global
 counter

This is the last global stat. Take it out of the nfsd_stats struct,
make it a global counter in nfsd, and report it the same as always.
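For illustration, with eight nfsd threads running, the "th" line of
/proc/net/rpc/nfsd would still read something like the following (the
ten trailing columns are the deprecated histogram slots, which are
always reported as zero):

	th 8 0 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000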
Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfsd.h | 1 + fs/nfsd/nfssvc.c | 5 +++-- fs/nfsd/stats.c | 3 +-- fs/nfsd/stats.h | 6 ------ 4 files changed, 5 insertions(+), 10 deletions(-) diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 304e9728b929a0..be2ea3d6d2a289 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -86,6 +86,7 @@ extern struct mutex nfsd_mutex; extern spinlock_t nfsd_drc_lock; extern unsigned long nfsd_drc_max_mem; extern unsigned long nfsd_drc_mem_used; +extern atomic_t nfsd_th_cnt; /* number of available threads */ extern const struct seq_operations nfs_exports_op; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index d98a6abad99010..fdb59189643044 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -34,6 +34,7 @@ #define NFSDDBG_FACILITY NFSDDBG_SVC +atomic_t nfsd_th_cnt = ATOMIC_INIT(0); extern struct svc_program nfsd_program; static int nfsd(void *vrqstp); #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) @@ -924,7 +925,7 @@ nfsd(void *vrqstp) current->fs->umask = 0; - atomic_inc(&nfsdstats.th_cnt); + atomic_inc(&nfsd_th_cnt); set_freezable(); @@ -940,7 +941,7 @@ nfsd(void *vrqstp) nfsd_file_net_dispose(nn); } - atomic_dec(&nfsdstats.th_cnt); + atomic_dec(&nfsd_th_cnt); out: /* Release the thread */ diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 44e275324b06e5..3a7f791c30528d 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -27,7 +27,6 @@ #include "nfsd.h" -struct nfsd_stats nfsdstats; struct svc_stat nfsd_svcstats = { .program = &nfsd_program, }; @@ -47,7 +46,7 @@ static int nfsd_show(struct seq_file *seq, void *v) percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_WRITE])); /* thread usage: */ - seq_printf(seq, "th %u 0", atomic_read(&nfsdstats.th_cnt)); + seq_printf(seq, "th %u 0", atomic_read(&nfsd_th_cnt)); /* deprecated thread usage histogram stats */ for (i = 0; i < 10; i++) diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index c24be4ddbe7d70..5675d283a53730 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -10,12 +10,6 @@ #include #include -struct nfsd_stats { - atomic_t th_cnt; /* number of available threads */ -}; - -extern struct nfsd_stats nfsdstats; - extern struct svc_stat nfsd_svcstats; int nfsd_percpu_counters_init(struct percpu_counter *counters, int num); From 094bbe0b0a2ea5a7ccbe82d87bb2795b52e4abf6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 26 Jan 2024 10:39:49 -0500 Subject: [PATCH 0628/1406] nfsd: make svc_stat per-network namespace instead of global The final bit of stats that is global is the rpc svc_stat. Move this into the nfsd_net struct and use that everywhere instead of the global struct. Remove the unused global struct. 
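The stats file itself is registered under each namespace's
net->proc_net_rpc, so a monitoring job can now read it from inside the
container's network namespace instead of bind-mounting the host /proc.
An illustrative invocation, assuming a namespace named "container":

	# ip netns exec container cat /proc/net/rpc/nfsd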
Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/netns.h | 4 ++++ fs/nfsd/nfsctl.c | 2 ++ fs/nfsd/nfssvc.c | 2 +- fs/nfsd/stats.c | 10 ++++------ fs/nfsd/stats.h | 2 -- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 0cef4bb407a9c6..afc16ee4da7428 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -14,6 +14,7 @@ #include #include #include +#include /* Hash tables for nfs4_clientid state */ #define CLIENT_HASH_BITS 4 @@ -179,6 +180,9 @@ struct nfsd_net { /* Per-netns stats counters */ struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM]; + /* sunrpc svc stats */ + struct svc_stat nfsd_svcstats; + /* longest hash chain seen */ unsigned int longest_chain; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index ea3c8114245c28..5a5547bd6ecf7e 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1674,6 +1674,8 @@ static __net_init int nfsd_net_init(struct net *net) retval = nfsd_stat_counters_init(nn); if (retval) goto out_repcache_error; + memset(&nn->nfsd_svcstats, 0, sizeof(nn->nfsd_svcstats)); + nn->nfsd_svcstats.program = &nfsd_program; nn->nfsd_versions = NULL; nn->nfsd4_minorversions = NULL; nfsd4_init_leases_net(nn); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index fdb59189643044..c0d17b92b249f7 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -661,7 +661,7 @@ int nfsd_create_serv(struct net *net) if (nfsd_max_blksize == 0) nfsd_max_blksize = nfsd_get_default_max_blksize(); nfsd_reset_versions(nn); - serv = svc_create_pooled(&nfsd_program, &nfsd_svcstats, + serv = svc_create_pooled(&nfsd_program, &nn->nfsd_svcstats, nfsd_max_blksize, nfsd); if (serv == NULL) return -ENOMEM; diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 3a7f791c30528d..be52fb1e928ed6 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -27,10 +27,6 @@ #include "nfsd.h" -struct svc_stat nfsd_svcstats = { - .program = &nfsd_program, -}; - static int nfsd_show(struct seq_file *seq, void *v) { struct net *net = pde_data(file_inode(seq->file)); @@ -56,7 +52,7 @@ static int nfsd_show(struct seq_file *seq, void *v) seq_puts(seq, "\nra 0 0 0 0 0 0 0 0 0 0 0 0\n"); /* show my rpc info */ - svc_seq_show(seq, &nfsd_svcstats); + svc_seq_show(seq, &nn->nfsd_svcstats); #ifdef CONFIG_NFSD_V4 /* Show count for individual nfsv4 operations */ @@ -121,7 +117,9 @@ void nfsd_stat_counters_destroy(struct nfsd_net *nn) void nfsd_proc_stat_init(struct net *net) { - svc_proc_register(net, &nfsd_svcstats, &nfsd_proc_ops); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + svc_proc_register(net, &nn->nfsd_svcstats, &nfsd_proc_ops); } void nfsd_proc_stat_shutdown(struct net *net) diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 5675d283a53730..d2753e975dfd34 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -10,8 +10,6 @@ #include #include -extern struct svc_stat nfsd_svcstats; - int nfsd_percpu_counters_init(struct percpu_counter *counters, int num); void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num); void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num); From 0bc2f896a394db1400c9fb7cab7c9995771b84f2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:45:17 -0500 Subject: [PATCH 0629/1406] NFSD: Reset cb_seq_status after NFS4ERR_DELAY I noticed that once an NFSv4.1 callback operation gets a NFS4ERR_DELAY status on CB_SEQUENCE and then the connection is lost, the callback client loops, resending it indefinitely. 
The switch arm in nfsd4_cb_sequence_done() that handles NFS4ERR_DELAY
uses rpc_restart_call() to rearm the RPC state machine for the
retransmit, but that path does not invoke the ->rpc_call_prepare
callback again. Thus cb_seq_status is set to -10008 by the first
NFS4ERR_DELAY result, but is never set back to 1 for the retransmits.
nfsd4_cb_sequence_done() thinks it's getting nothing but a long series
of CB_SEQUENCE NFS4ERR_DELAY replies.

Fixes: 7ba6cad6c88f ("nfsd: New helper nfsd4_cb_sequence_done() for processing more cb errors")
Reviewed-by: Jeff Layton
Reviewed-by: Benjamin Coddington
Signed-off-by: Chuck Lever
---
 fs/nfsd/nfs4callback.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 926c29879c6ab8..43b0a34a5d5b8a 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1178,6 +1178,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
 		ret = false;
 		break;
 	case -NFS4ERR_DELAY:
+		cb->cb_seq_status = 1;
 		if (!rpc_restart_call(task))
 			goto out;

From b4ac203ba054b3c491ac15879690b3308056ae8f Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Fri, 26 Jan 2024 12:45:23 -0500
Subject: [PATCH 0630/1406] NFSD: Convert the callback workqueue to use
 delayed_work

Normally, NFSv4 callback operations are supposed to be sent to the
client as soon as they are queued up.

In a moment, I will introduce a recovery path where the server has to
wait for the client to reconnect. We don't want a hard busy wait here
-- the callback should be requeued to try again in several
milliseconds.

For now, convert nfsd4_callback from struct work_struct to struct
delayed_work, and queue with a zero delay argument. This should avoid
behavior changes for current operation.

Reviewed-by: Jeff Layton
Reviewed-by: Benjamin Coddington
Signed-off-by: Chuck Lever
---
 fs/nfsd/nfs4callback.c | 6 +++---
 fs/nfsd/state.h        | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 43b0a34a5d5b8a..1ed2512b364846 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -887,7 +887,7 @@ static struct workqueue_struct *callback_wq;
 
 static bool nfsd4_queue_cb(struct nfsd4_callback *cb)
 {
-	return queue_work(callback_wq, &cb->cb_work);
+	return queue_delayed_work(callback_wq, &cb->cb_work, 0);
 }
 
 static void nfsd41_cb_inflight_begin(struct nfs4_client *clp)
@@ -1370,7 +1370,7 @@ static void
 nfsd4_run_cb_work(struct work_struct *work)
 {
 	struct nfsd4_callback *cb =
-		container_of(work, struct nfsd4_callback, cb_work);
+		container_of(work, struct nfsd4_callback, cb_work.work);
 	struct nfs4_client *clp = cb->cb_clp;
 	struct rpc_clnt *clnt;
 	int flags;
@@ -1415,7 +1415,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
 	cb->cb_msg.rpc_argp = cb;
 	cb->cb_msg.rpc_resp = cb;
 	cb->cb_ops = ops;
-	INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
+	INIT_DELAYED_WORK(&cb->cb_work, nfsd4_run_cb_work);
 	cb->cb_seq_status = 1;
 	cb->cb_status = 0;
 	cb->cb_need_restart = false;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 41bdc913fa715b..87c4372ba36a8d 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -68,7 +68,7 @@ struct nfsd4_callback {
 	struct nfs4_client *cb_clp;
 	struct rpc_message cb_msg;
 	const struct nfsd4_callback_ops *cb_ops;
-	struct work_struct cb_work;
+	struct delayed_work cb_work;
 	int cb_seq_status;
 	int cb_status;
 	bool cb_need_restart;

From bd92a6d95e76e79d4845d5f249afd39eeefc4694 Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Fri, 26 Jan 2024 12:45:29 -0500
Subject: [PATCH 0631/1406] NFSD: Reschedule CB operations when backchannel
 rpc_clnt is shut down

As part of managing a client disconnect, NFSD closes down and replaces
the backchannel rpc_clnt. If a callback operation is pending when the
backchannel rpc_clnt is shut down, currently nfsd4_run_cb_work() just
discards that callback. But there are multiple cases to deal with here:

 o The client's lease is getting destroyed. Throw the CB away.

 o The client disconnected. It might be forcing a retransmit of CB
   operations, or it could have disconnected for other reasons.
   Reschedule the CB so it is retransmitted when the client reconnects.

Since callback operations can now be rescheduled, ensure that
cb_ops->prepare can be called only once by moving the cb_ops->prepare
paragraph down to just before the rpc_call_async() call.

Fixes: 2bbfed98a4d8 ("nfsd: Fix races between nfsd4_cb_release() and nfsd4_shutdown_callback()")
Reviewed-by: Jeff Layton
Reviewed-by: Benjamin Coddington
Signed-off-by: Chuck Lever
---
 fs/nfsd/nfs4callback.c | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 1ed2512b364846..389d05985c5230 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -890,6 +890,13 @@ static bool nfsd4_queue_cb(struct nfsd4_callback *cb)
 	return queue_delayed_work(callback_wq, &cb->cb_work, 0);
 }
 
+static void nfsd4_queue_cb_delayed(struct nfsd4_callback *cb,
+				   unsigned long msecs)
+{
+	queue_delayed_work(callback_wq, &cb->cb_work,
+			   msecs_to_jiffies(msecs));
+}
+
 static void nfsd41_cb_inflight_begin(struct nfs4_client *clp)
 {
 	atomic_inc(&clp->cl_cb_inflight);
@@ -1375,20 +1382,21 @@ nfsd4_run_cb_work(struct work_struct *work)
 	struct rpc_clnt *clnt;
 	int flags;
 
-	if (cb->cb_need_restart) {
-		cb->cb_need_restart = false;
-	} else {
-		if (cb->cb_ops && cb->cb_ops->prepare)
-			cb->cb_ops->prepare(cb);
-	}
-
 	if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
 		nfsd4_process_cb_update(cb);
 
 	clnt = clp->cl_cb_client;
 	if (!clnt) {
-		/* Callback channel broken, or client killed; give up: */
-		nfsd41_destroy_cb(cb);
+		if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
+			nfsd41_destroy_cb(cb);
+		else {
+			/*
+			 * XXX: Ideally, we could wait for the client to
+			 *	reconnect, but I haven't figured out how
+			 *	to do that yet.
+			 */
+			nfsd4_queue_cb_delayed(cb, 25);
+		}
 		return;
 	}
@@ -1401,6 +1409,12 @@ nfsd4_run_cb_work(struct work_struct *work)
 		return;
 	}
 
+	if (cb->cb_need_restart) {
+		cb->cb_need_restart = false;
+	} else {
+		if (cb->cb_ops && cb->cb_ops->prepare)
+			cb->cb_ops->prepare(cb);
+	}
 	cb->cb_msg.rpc_cred = clp->cl_cb_cred;
 	flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN;
 	rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags,

From 0134ae80abdd544cb0c8be637233f4c8262e5359 Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Fri, 26 Jan 2024 12:45:36 -0500
Subject: [PATCH 0632/1406] NFSD: Retransmit callbacks after client reconnects

NFSv4.1 clients assume that if they disconnect, that will force the
server to resend pending callback operations once a fresh connection
has been established. Turns out NFSD has not been resending after
reconnect.
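The resend is driven by the existing cb_need_restart machinery: once
nfsd4_cb_sequence_done() bails out via its need_restart path, the RPC
release callback queues the operation again instead of destroying it.
A sketch of that release step, assuming nfsd4_cb_release() keeps the
shape it has at this point in the series (helper names as used
elsewhere in these patches):

	static void nfsd4_cb_release(void *calldata)
	{
		struct nfsd4_callback *cb = calldata;

		if (cb->cb_need_restart)
			nfsd4_queue_cb(cb);	/* retransmit via callback_wq */
		else
			nfsd41_destroy_cb(cb);
	}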
Fixes: 7ba6cad6c88f ("nfsd: New helper nfsd4_cb_sequence_done() for processing more cb errors") Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 389d05985c5230..3bff14241b3cc5 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1178,12 +1178,21 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback break; case -ESERVERFAULT: ++session->se_cb_seq_nr; - fallthrough; + nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); + ret = false; + break; case 1: + /* + * cb_seq_status remains 1 if an RPC Reply was never + * received. NFSD can't know if the client processed + * the CB_SEQUENCE operation. Ask the client to send a + * DESTROY_SESSION to recover. + */ + fallthrough; case -NFS4ERR_BADSESSION: nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); ret = false; - break; + goto need_restart; case -NFS4ERR_DELAY: cb->cb_seq_status = 1; if (!rpc_restart_call(task)) From e468538e2ae13ac52d5559a531737632d76ef834 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:45:42 -0500 Subject: [PATCH 0633/1406] NFSD: Add nfsd_seq4_status trace event Add a trace point that records SEQ4_STATUS flags returned in an NFSv4.1 SEQUENCE response. SEQ4_STATUS flags report backchannel issues and changes to lease state to clients. Knowing what the server is reporting to clients is useful for debugging both configuration and operational issues in real time. For example, upcoming patches will enable server administrators to revoke parts of a client's lease; that revocation is indicated to the client when a subsequent SEQUENCE operation has one or more SEQ4_STATUS flags that are set. 
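The new trace point uses the standard tracefs interface; for example:

	# echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_seq4_status/enable
	# cat /sys/kernel/tracing/trace_pipe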
Sample trace records: nfsd-927 [006] 615.581821: nfsd_seq4_status: xid=0x095ded07 sessionid=65a032c3:b7845faf:00000001:00000000 status_flags=BACKCHANNEL_FAULT nfsd-927 [006] 615.588043: nfsd_seq4_status: xid=0x0a5ded07 sessionid=65a032c3:b7845faf:00000001:00000000 status_flags=BACKCHANNEL_FAULT nfsd-928 [003] 615.588448: nfsd_seq4_status: xid=0x0b5ded07 sessionid=65a032c3:b7845faf:00000001:00000000 status_flags=BACKCHANNEL_FAULT Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 1 + fs/nfsd/trace.h | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a16039968335e3..a5bfa8da2cb497 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4058,6 +4058,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } if (!list_empty(&clp->cl_revoked)) seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; + trace_nfsd_seq4_status(rqstp, seq); out_no_session: if (conn) free_conn(conn); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index d1e8cf079b0f4b..38d11b43779c77 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -696,6 +696,41 @@ DEFINE_EVENT(nfsd_stid_class, nfsd_stid_##name, \ DEFINE_STID_EVENT(revoke); +TRACE_EVENT_CONDITION(nfsd_seq4_status, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct nfsd4_sequence *sequence + ), + TP_ARGS(rqstp, sequence), + TP_CONDITION(sequence->status_flags), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(u32, xid) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, seqno) + __field(u32, reserved) + __field(unsigned long, status_flags) + ), + TP_fast_assign( + const struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)&sequence->sessionid; + + __entry->netns_ino = SVC_NET(rqstp)->ns.inum; + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->cl_boot = sid->clientid.cl_boot; + __entry->cl_id = sid->clientid.cl_id; + __entry->seqno = sid->sequence; + __entry->reserved = sid->reserved; + __entry->status_flags = sequence->status_flags; + ), + TP_printk("xid=0x%08x sessionid=%08x:%08x:%08x:%08x status_flags=%s", + __entry->xid, __entry->cl_boot, __entry->cl_id, + __entry->seqno, __entry->reserved, + show_nfs4_seq4_status(__entry->status_flags) + ) +); + DECLARE_EVENT_CLASS(nfsd_clientid_class, TP_PROTO(const clientid_t *clid), TP_ARGS(clid), From 99ad53994f3c1b83398ae1372aaaa73acef537f1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:45:48 -0500 Subject: [PATCH 0634/1406] NFSD: Replace dprintks in nfsd4_cb_sequence_done() Improve observability of backchannel session operation. 
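The dprintk() calls are replaced by the nfsd_cb_seq_status and
nfsd_cb_free_slot trace points added below. Both can be enabled
together; one possible invocation:

	# echo 'nfsd:nfsd_cb_*' > /sys/kernel/tracing/set_event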
Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 9 +++-- fs/nfsd/trace.h | 82 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 3bff14241b3cc5..78d9939cf4b093 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1165,6 +1165,8 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback if (!cb->cb_holds_slot) goto need_restart; + /* This is the operation status code for CB_SEQUENCE */ + trace_nfsd_cb_seq_status(task, cb); switch (cb->cb_seq_status) { case 0: /* @@ -1210,13 +1212,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback break; default: nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); - dprintk("%s: unprocessed error %d\n", __func__, - cb->cb_seq_status); } - nfsd41_cb_release_slot(cb); - dprintk("%s: freed slot, new seqid=%d\n", __func__, - clp->cl_cb_session->se_cb_seq_nr); + + trace_nfsd_cb_free_slot(task, cb); if (RPC_SIGNALLED(task)) goto need_restart; diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 38d11b43779c77..c134c755ae5d1e 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -9,8 +9,10 @@ #define _NFSD_TRACE_H #include +#include #include #include +#include #include "export.h" #include "nfsfh.h" @@ -1440,6 +1442,86 @@ TRACE_EVENT(nfsd_cb_setup_err, __entry->error) ); +TRACE_EVENT(nfsd_cb_seq_status, + TP_PROTO( + const struct rpc_task *task, + const struct nfsd4_callback *cb + ), + TP_ARGS(task, cb), + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, seqno) + __field(u32, reserved) + __field(int, tk_status) + __field(int, seq_status) + ), + TP_fast_assign( + const struct nfs4_client *clp = cb->cb_clp; + const struct nfsd4_session *session = clp->cl_cb_session; + const struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)&session->se_sessionid; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client ? + task->tk_client->cl_clid : -1; + __entry->cl_boot = sid->clientid.cl_boot; + __entry->cl_id = sid->clientid.cl_id; + __entry->seqno = sid->sequence; + __entry->reserved = sid->reserved; + __entry->tk_status = task->tk_status; + __entry->seq_status = cb->cb_seq_status; + ), + TP_printk(SUNRPC_TRACE_TASK_SPECIFIER + " sessionid=%08x:%08x:%08x:%08x tk_status=%d seq_status=%d\n", + __entry->task_id, __entry->client_id, + __entry->cl_boot, __entry->cl_id, + __entry->seqno, __entry->reserved, + __entry->tk_status, __entry->seq_status + ) +); + +TRACE_EVENT(nfsd_cb_free_slot, + TP_PROTO( + const struct rpc_task *task, + const struct nfsd4_callback *cb + ), + TP_ARGS(task, cb), + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, seqno) + __field(u32, reserved) + __field(u32, slot_seqno) + ), + TP_fast_assign( + const struct nfs4_client *clp = cb->cb_clp; + const struct nfsd4_session *session = clp->cl_cb_session; + const struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)&session->se_sessionid; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client ? 
+ task->tk_client->cl_clid : -1; + __entry->cl_boot = sid->clientid.cl_boot; + __entry->cl_id = sid->clientid.cl_id; + __entry->seqno = sid->sequence; + __entry->reserved = sid->reserved; + __entry->slot_seqno = session->se_cb_seq_nr; + ), + TP_printk(SUNRPC_TRACE_TASK_SPECIFIER + " sessionid=%08x:%08x:%08x:%08x new slot seqno=%u\n", + __entry->task_id, __entry->client_id, + __entry->cl_boot, __entry->cl_id, + __entry->seqno, __entry->reserved, + __entry->slot_seqno + ) +); + TRACE_EVENT_CONDITION(nfsd_cb_recall, TP_PROTO( const struct nfs4_stid *stid From 1696bfc945be3f018c92f599f250c8dfd8c4182d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:45:54 -0500 Subject: [PATCH 0635/1406] NFSD: Rename nfsd_cb_state trace point Make it clear where backchannel state is updated. Example trace point output: kworker/u16:0-10 [006] 2800.080404: nfsd_cb_new_state: addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=UP nfsd-940 [003] 2800.478368: nfsd_cb_new_state: addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=UNKNOWN kworker/u16:0-10 [003] 2800.478828: nfsd_cb_new_state: addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=DOWN kworker/u16:0-10 [005] 2802.039724: nfsd_cb_start: addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=UP kworker/u16:0-10 [005] 2810.611452: nfsd_cb_start: addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=FAULT kworker/u16:0-10 [005] 2810.616832: nfsd_cb_start: addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=UNKNOWN kworker/u16:0-10 [005] 2810.616931: nfsd_cb_start: addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=DOWN Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 4 +++- fs/nfsd/trace.h | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 78d9939cf4b093..a63171ccfc2b88 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1006,7 +1006,7 @@ static void nfsd4_mark_cb_state(struct nfs4_client *clp, int newstate) { if (clp->cl_cb_state != newstate) { clp->cl_cb_state = newstate; - trace_nfsd_cb_state(clp); + trace_nfsd_cb_new_state(clp); } } @@ -1390,6 +1390,8 @@ nfsd4_run_cb_work(struct work_struct *work) struct rpc_clnt *clnt; int flags; + trace_nfsd_cb_start(clp); + if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK) nfsd4_process_cb_update(cb); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index c134c755ae5d1e..6003af2bee33cb 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1371,7 +1371,8 @@ DEFINE_EVENT(nfsd_cb_class, nfsd_cb_##name, \ TP_PROTO(const struct nfs4_client *clp), \ TP_ARGS(clp)) -DEFINE_NFSD_CB_EVENT(state); +DEFINE_NFSD_CB_EVENT(start); +DEFINE_NFSD_CB_EVENT(new_state); DEFINE_NFSD_CB_EVENT(probe); DEFINE_NFSD_CB_EVENT(lost); DEFINE_NFSD_CB_EVENT(shutdown); From 58b465c60ce5b9261b5ccb5d0e20b0cfec14b08f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:46:01 -0500 Subject: [PATCH 0636/1406] NFSD: Add callback operation lifetime trace points Help observe the flow of callback operations. bc_shutdown() records exactly when the backchannel RPC client is destroyed and cl_cb_client is replaced with NULL. 
Examples include: nfsd-955 [004] 650.013997: nfsd_cb_queue: addr=192.168.122.6:0 client 65b3c5b8:f541f749 cb=0xffff8881134b02f8 (first try) kworker/u21:4-497 [004] 650.014050: nfsd_cb_seq_status: task:00000001@00000001 sessionid=65b3c5b8:f541f749:00000001:00000000 tk_status=-107 seq_status=1 kworker/u21:4-497 [004] 650.014051: nfsd_cb_restart: addr=192.168.122.6:0 client 65b3c5b8:f541f749 cb=0xffff88810e39f400 (first try) kworker/u21:4-497 [004] 650.014066: nfsd_cb_queue: addr=192.168.122.6:0 client 65b3c5b8:f541f749 cb=0xffff88810e39f400 (need restart) kworker/u16:0-10 [006] 650.065750: nfsd_cb_start: addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=UNKNOWN kworker/u16:0-10 [006] 650.065752: nfsd_cb_bc_update: addr=192.168.122.6:0 client 65b3c5b8:f541f749 cb=0xffff8881134b02f8 (first try) kworker/u16:0-10 [006] 650.065754: nfsd_cb_bc_shutdown: addr=192.168.122.6:0 client 65b3c5b8:f541f749 cb=0xffff8881134b02f8 (first try) kworker/u16:0-10 [006] 650.065810: nfsd_cb_new_state: addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=DOWN Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 8 ++++++++ fs/nfsd/trace.h | 42 ++++++++++++++++++++++++++++++++++++++++ include/trace/misc/nfs.h | 34 ++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index a63171ccfc2b88..b50ce54aa1bfab 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -887,12 +887,14 @@ static struct workqueue_struct *callback_wq; static bool nfsd4_queue_cb(struct nfsd4_callback *cb) { + trace_nfsd_cb_queue(cb->cb_clp, cb); return queue_delayed_work(callback_wq, &cb->cb_work, 0); } static void nfsd4_queue_cb_delayed(struct nfsd4_callback *cb, unsigned long msecs) { + trace_nfsd_cb_queue(cb->cb_clp, cb); queue_delayed_work(callback_wq, &cb->cb_work, msecs_to_jiffies(msecs)); } @@ -1113,6 +1115,7 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb) { struct nfs4_client *clp = cb->cb_clp; + trace_nfsd_cb_destroy(clp, cb); nfsd41_cb_release_slot(cb); if (cb->cb_ops && cb->cb_ops->release) cb->cb_ops->release(cb); @@ -1227,6 +1230,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback goto out; need_restart: if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) { + trace_nfsd_cb_restart(clp, cb); task->tk_status = 0; cb->cb_need_restart = true; } @@ -1340,11 +1344,14 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) struct nfsd4_conn *c; int err; + trace_nfsd_cb_bc_update(clp, cb); + /* * This is either an update, or the client dying; in either case, * kill the old client: */ if (clp->cl_cb_client) { + trace_nfsd_cb_bc_shutdown(clp, cb); rpc_shutdown_client(clp->cl_cb_client); clp->cl_cb_client = NULL; put_cred(clp->cl_cb_cred); @@ -1356,6 +1363,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) } if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) return; + spin_lock(&clp->cl_lock); /* * Only serialized callback code is allowed to clear these diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 6003af2bee33cb..9f9e58debc2611 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1443,6 +1443,48 @@ TRACE_EVENT(nfsd_cb_setup_err, __entry->error) ); +DECLARE_EVENT_CLASS(nfsd_cb_lifetime_class, + TP_PROTO( + const struct nfs4_client *clp, + const struct nfsd4_callback *cb + ), + TP_ARGS(clp, cb), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(const void *, cb) + __field(bool, need_restart) + 
__sockaddr(addr, clp->cl_cb_conn.cb_addrlen) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __entry->cb = cb; + __entry->need_restart = cb->cb_need_restart; + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen) + ), + TP_printk("addr=%pISpc client %08x:%08x cb=%p%s", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->cb, __entry->need_restart ? + " (need restart)" : " (first try)" + ) +); + +#define DEFINE_NFSD_CB_LIFETIME_EVENT(name) \ +DEFINE_EVENT(nfsd_cb_lifetime_class, nfsd_cb_##name, \ + TP_PROTO( \ + const struct nfs4_client *clp, \ + const struct nfsd4_callback *cb \ + ), \ + TP_ARGS(clp, cb)) + +DEFINE_NFSD_CB_LIFETIME_EVENT(queue); +DEFINE_NFSD_CB_LIFETIME_EVENT(destroy); +DEFINE_NFSD_CB_LIFETIME_EVENT(restart); +DEFINE_NFSD_CB_LIFETIME_EVENT(bc_update); +DEFINE_NFSD_CB_LIFETIME_EVENT(bc_shutdown); + TRACE_EVENT(nfsd_cb_seq_status, TP_PROTO( const struct rpc_task *task, diff --git a/include/trace/misc/nfs.h b/include/trace/misc/nfs.h index 0d9d48dca38a89..64ab5dac59ce0c 100644 --- a/include/trace/misc/nfs.h +++ b/include/trace/misc/nfs.h @@ -385,3 +385,37 @@ TRACE_DEFINE_ENUM(IOMODE_ANY); { SEQ4_STATUS_RESTART_RECLAIM_NEEDED, "RESTART_RECLAIM_NEEDED" }, \ { SEQ4_STATUS_CB_PATH_DOWN_SESSION, "CB_PATH_DOWN_SESSION" }, \ { SEQ4_STATUS_BACKCHANNEL_FAULT, "BACKCHANNEL_FAULT" }) + +TRACE_DEFINE_ENUM(OP_CB_GETATTR); +TRACE_DEFINE_ENUM(OP_CB_RECALL); +TRACE_DEFINE_ENUM(OP_CB_LAYOUTRECALL); +TRACE_DEFINE_ENUM(OP_CB_NOTIFY); +TRACE_DEFINE_ENUM(OP_CB_PUSH_DELEG); +TRACE_DEFINE_ENUM(OP_CB_RECALL_ANY); +TRACE_DEFINE_ENUM(OP_CB_RECALLABLE_OBJ_AVAIL); +TRACE_DEFINE_ENUM(OP_CB_RECALL_SLOT); +TRACE_DEFINE_ENUM(OP_CB_SEQUENCE); +TRACE_DEFINE_ENUM(OP_CB_WANTS_CANCELLED); +TRACE_DEFINE_ENUM(OP_CB_NOTIFY_LOCK); +TRACE_DEFINE_ENUM(OP_CB_NOTIFY_DEVICEID); +TRACE_DEFINE_ENUM(OP_CB_OFFLOAD); +TRACE_DEFINE_ENUM(OP_CB_ILLEGAL); + +#define show_nfs4_cb_op(x) \ + __print_symbolic(x, \ + { 0, "CB_NULL" }, \ + { 1, "CB_COMPOUND" }, \ + { OP_CB_GETATTR, "CB_GETATTR" }, \ + { OP_CB_RECALL, "CB_RECALL" }, \ + { OP_CB_LAYOUTRECALL, "CB_LAYOUTRECALL" }, \ + { OP_CB_NOTIFY, "CB_NOTIFY" }, \ + { OP_CB_PUSH_DELEG, "CB_PUSH_DELEG" }, \ + { OP_CB_RECALL_ANY, "CB_RECALL_ANY" }, \ + { OP_CB_RECALLABLE_OBJ_AVAIL, "CB_RECALLABLE_OBJ_AVAIL" }, \ + { OP_CB_RECALL_SLOT, "CB_RECALL_SLOT" }, \ + { OP_CB_SEQUENCE, "CB_SEQUENCE" }, \ + { OP_CB_WANTS_CANCELLED, "CB_WANTS_CANCELLED" }, \ + { OP_CB_NOTIFY_LOCK, "CB_NOTIFY_LOCK" }, \ + { OP_CB_NOTIFY_DEVICEID, "CB_NOTIFY_DEVICEID" }, \ + { OP_CB_OFFLOAD, "CB_OFFLOAD" }, \ + { OP_CB_ILLEGAL, "CB_ILLEGAL" }) From dbfb8e2237f633ac38f06736b364da299abdf390 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:46:07 -0500 Subject: [PATCH 0637/1406] SUNRPC: Remove EXPORT_SYMBOL_GPL for svc_process_bc() svc_process_bc(), previously known as bc_svc_process(), was added in commit 4d6bbb6233c9 ("nfs41: Backchannel bc_svc_process()") but there has never been a call site outside of the sunrpc.ko module. 
Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- net/sunrpc/svc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 1ce6a3b7175caf..b33e429336fb7f 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1623,7 +1623,6 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp) WARN_ON_ONCE(atomic_read(&task->tk_count) != 1); rpc_put_task(task); } -EXPORT_SYMBOL_GPL(svc_process_bc); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ /** From b28b9f7824e8557fd48d6365c9f071ab3d125fcc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:46:13 -0500 Subject: [PATCH 0638/1406] NFSD: Remove unused @reason argument Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index b50ce54aa1bfab..45a31f05159598 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -45,7 +45,7 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC -static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason); +static void nfsd4_mark_cb_fault(struct nfs4_client *clp); #define NFSPROC4_CB_NULL 0 #define NFSPROC4_CB_COMPOUND 1 @@ -1012,14 +1012,14 @@ static void nfsd4_mark_cb_state(struct nfs4_client *clp, int newstate) } } -static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) +static void nfsd4_mark_cb_down(struct nfs4_client *clp) { if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) return; nfsd4_mark_cb_state(clp, NFSD4_CB_DOWN); } -static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason) +static void nfsd4_mark_cb_fault(struct nfs4_client *clp) { if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) return; @@ -1031,7 +1031,7 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); if (task->tk_status) - nfsd4_mark_cb_down(clp, task->tk_status); + nfsd4_mark_cb_down(clp); else nfsd4_mark_cb_state(clp, NFSD4_CB_UP); } @@ -1183,7 +1183,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback break; case -ESERVERFAULT: ++session->se_cb_seq_nr; - nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); + nfsd4_mark_cb_fault(cb->cb_clp); ret = false; break; case 1: @@ -1195,7 +1195,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback */ fallthrough; case -NFS4ERR_BADSESSION: - nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); + nfsd4_mark_cb_fault(cb->cb_clp); ret = false; goto need_restart; case -NFS4ERR_DELAY: @@ -1214,7 +1214,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback } break; default: - nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); + nfsd4_mark_cb_fault(cb->cb_clp); } nfsd41_cb_release_slot(cb); @@ -1260,7 +1260,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) case -EIO: case -ETIMEDOUT: case -EACCES: - nfsd4_mark_cb_down(clp, task->tk_status); + nfsd4_mark_cb_down(clp); } break; default: @@ -1382,7 +1382,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) err = setup_callback_client(clp, &conn, ses); if (err) { - nfsd4_mark_cb_down(clp, err); + nfsd4_mark_cb_down(clp); if (c) svc_xprt_put(c->cn_xprt); return; From fdb724b8034ab6d986984458f2fdc9b30160ac8f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:46:20 -0500 Subject: [PATCH 0639/1406] NFSD: 
Replace comment with lockdep assertion Convert a code comment into a real assertion. Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 45a31f05159598..d73c66fa131df7 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1315,12 +1315,13 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp) nfsd41_cb_inflight_wait_complete(clp); } -/* requires cl_lock: */ static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) { struct nfsd4_session *s; struct nfsd4_conn *c; + lockdep_assert_held(&clp->cl_lock); + list_for_each_entry(s, &clp->cl_sessions, se_perclnt) { list_for_each_entry(c, &s->se_conns, cn_persession) { if (c->cn_flags & NFS4_CDFC4_BACK) From 35f414b8afeb97f5d4654cf9fe193d10ddbd0775 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:46:26 -0500 Subject: [PATCH 0640/1406] NFSD: Remove BUG_ON in nfsd4_process_cb_update() Don't kill the kworker thread, and don't panic while cl_lock is held. There's no need for scorching the earth here. Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index d73c66fa131df7..fd6a27e20f65ba 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1370,8 +1370,9 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) * Only serialized callback code is allowed to clear these * flags; main nfsd code can only set them: */ - BUG_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)); + WARN_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)); clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); + memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn)); c = __nfsd4_find_backchannel(clp); if (c) { From 3dfaf45c38a29a9bab4a96f62722dd2c7a394f0b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:46:32 -0500 Subject: [PATCH 0641/1406] SUNRPC: Remove stale comments bc_close() and bc_destroy now do something, so the comments are no longer correct. Commit 6221f1d9b63f ("SUNRPC: Fix backchannel RPC soft lockups") should have removed these. Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- net/sunrpc/xprtsock.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 58f3dc8d0d71c3..d92c13e78a56cf 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2987,20 +2987,11 @@ static int bc_send_request(struct rpc_rqst *req) return len; } -/* - * The close routine. Since this is client initiated, we do nothing - */ - static void bc_close(struct rpc_xprt *xprt) { xprt_disconnect_done(xprt); } -/* - * The xprt destroy routine. Again, because this connection is client - * initiated, we do nothing - */ - static void bc_destroy(struct rpc_xprt *xprt) { dprintk("RPC: bc_destroy xprt %p\n", xprt); From 87d2aa49da9e1a1ed86989d948e586127388aea6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:46:38 -0500 Subject: [PATCH 0642/1406] NFSD: Remove redundant cb_seq_status initialization As far as I can see, setting cb_seq_status in nfsd4_init_cb() is superfluous because it is set again in nfsd4_cb_prepare(). 
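That is, the RPC prepare callback re-initializes the field before every
transmission. A sketch of the relevant part of nfsd4_cb_prepare(), with
the slot handling elided (the exact body may differ):

	static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
	{
		struct nfsd4_callback *cb = calldata;

		/* remains 1 until a CB_SEQUENCE reply updates it */
		cb->cb_seq_status = 1;
		/* ... slot acquisition elided ... */
	}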
Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index fd6a27e20f65ba..32dd2fbb1f301b 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1450,7 +1450,6 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_msg.rpc_resp = cb; cb->cb_ops = ops; INIT_DELAYED_WORK(&cb->cb_work, nfsd4_run_cb_work); - cb->cb_seq_status = 1; cb->cb_status = 0; cb->cb_need_restart = false; cb->cb_holds_slot = false; From 10c6c97be2b9826df5b747c615c250ff85551924 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:21 +1100 Subject: [PATCH 0643/1406] nfsd: remove stale comment in nfs4_show_deleg() As we do now support write delegations, this comment is unhelpful and misleading. Reported-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a5bfa8da2cb497..1cfebf2a77b6d1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2711,7 +2711,6 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) nfs4_show_stateid(s, &st->sc_stateid); seq_printf(s, ": { type: deleg, "); - /* Kinda dead code as long as we only support read delegs: */ seq_printf(s, "access: %s, ", ds->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w"); From 300c0523440932c74da6cbbc3b52aab63f0ecff0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:22 +1100 Subject: [PATCH 0644/1406] nfsd: hold ->cl_lock for hash_delegation_locked() The protocol for creating a new state in nfsd is to allocate the state leaving it largely uninitialised, add that state to the ->cl_stateids idr so as to reserve a state-id, then complete initialisation of the state and only set ->sc_type to non-zero once the state is fully initialised. If a state is found in the idr with ->sc_type == 0, it is ignored. The ->cl_lock lock is used to avoid races - it is held while checking sc_type during lookup, and held when a non-zero value is stored in ->sc_type. ... except... hash_delegation_locked() finalises the initialisation of a delegation state, but does NOT hold ->cl_lock. So this patch takes ->cl_lock at the appropriate time w.r.t other locks, and so ensures there are no races (which are extremely unlikely in any case). As ->fi_lock is often taken when ->cl_lock is held, we need to take ->cl_lock first of those two. Currently ->cl_lock and state_lock are never both taken at the same time. We need both for this patch so an arbitrary choice is needed concerning which to take first. As state_lock is more global, it might be more contended, so take it first. 
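The resulting nesting in nfs4_set_delegation() is therefore, outermost
lock first (this matches the hunk below):

	spin_lock(&state_lock);
	spin_lock(&clp->cl_lock);
	spin_lock(&fp->fi_lock);
	status = hash_delegation_locked(dp, fp);
	spin_unlock(&fp->fi_lock);
	spin_unlock(&clp->cl_lock);
	spin_unlock(&state_lock);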
Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1cfebf2a77b6d1..327876da4b96b4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1312,6 +1312,7 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) lockdep_assert_held(&state_lock); lockdep_assert_held(&fp->fi_lock); + lockdep_assert_held(&clp->cl_lock); if (nfs4_delegation_exists(clp, fp)) return -EAGAIN; @@ -5560,9 +5561,11 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, goto out_unlock; spin_lock(&state_lock); + spin_lock(&clp->cl_lock); spin_lock(&fp->fi_lock); status = hash_delegation_locked(dp, fp); spin_unlock(&fp->fi_lock); + spin_unlock(&clp->cl_lock); spin_unlock(&state_lock); if (status) From df17f4c58411c385fd889cecccd0d417fd036c9a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:23 +1100 Subject: [PATCH 0645/1406] nfsd: don't call functions with side effects inside WARN_ON() Code like: WARN_ON(foo()) looks like an assertion and might not be expected to have any side effects. When testing if a function with side-effects fails, a construct like if (foo()) WARN_ON(1); makes the intent more obvious. nfsd has several WARN_ON calls where the test has side effects, so it would be good to change them. These cases don't really need the WARN_ON. They have never failed in 8 years of usage, so let's just remove the WARN_ON wrapper. Suggested-by: Chuck Lever Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 327876da4b96b4..37408212cbb210 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1600,7 +1600,7 @@ static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp, while (!list_empty(&open_stp->st_locks)) { stp = list_entry(open_stp->st_locks.next, struct nfs4_ol_stateid, st_locks); - WARN_ON(!unhash_lock_stateid(stp)); + unhash_lock_stateid(stp); put_ol_stateid_locked(stp, reaplist); } } @@ -2229,7 +2229,7 @@ __destroy_client(struct nfs4_client *clp) spin_lock(&state_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); - WARN_ON(!unhash_delegation_locked(dp)); + unhash_delegation_locked(dp); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -6169,7 +6169,7 @@ nfs4_laundromat(struct nfsd_net *nn) dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); if (!state_expired(&lt, dp->dl_time)) break; - WARN_ON(!unhash_delegation_locked(dp)); + unhash_delegation_locked(dp); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -8009,7 +8009,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, stp = list_first_entry(&lo->lo_owner.so_stateids, struct nfs4_ol_stateid, st_perstateowner); - WARN_ON(!unhash_lock_stateid(stp)); + unhash_lock_stateid(stp); put_ol_stateid_locked(stp, &reaplist); } spin_unlock(&clp->cl_lock); @@ -8302,7 +8302,7 @@ nfs4_state_shutdown_net(struct net *net) spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - WARN_ON(!unhash_delegation_locked(dp)); + unhash_delegation_locked(dp); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); From
f1790e41f8cb62952549254c6be2ba2bf6cf73ca Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:24 +1100 Subject: [PATCH 0646/1406] nfsd: avoid race after unhash_delegation_locked() NFS4_CLOSED_DELEG_STID and NFS4_REVOKED_DELEG_STID are similar in purpose. REVOKED is used for NFSv4.1 states which have been revoked because the lease has expired. CLOSED is used in other cases. The difference has two practical effects. 1/ REVOKED states are on the ->cl_revoked list 2/ REVOKED states result in nfserr_deleg_revoked from nfsd4_verify_open_stid() and nfsd4_validate_stateid() while CLOSED states result in nfserr_bad_stateid. Currently a state that is being revoked is first set to "CLOSED" in unhash_delegation_locked(), then possibly to "REVOKED" in revoke_delegation(), at which point it is added to the cl_revoked list. It is possible that a stateid test could see the CLOSED state which really should be REVOKED, and so return the wrong error code. So it is safest to remove this window of inconsistency. With this patch, unhash_delegation_locked() always sets the state correctly, and revoke_delegation() no longer changes the state. Also remove a redundant test on minorversion when NFS4_REVOKED_DELEG_STID is seen - it can only be seen when minorversion is non-zero. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 37408212cbb210..fbdf159c3854b5 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1329,7 +1329,7 @@ static bool delegation_hashed(struct nfs4_delegation *dp) } static bool -unhash_delegation_locked(struct nfs4_delegation *dp) +unhash_delegation_locked(struct nfs4_delegation *dp, unsigned char type) { struct nfs4_file *fp = dp->dl_stid.sc_file; @@ -1338,7 +1338,9 @@ unhash_delegation_locked(struct nfs4_delegation *dp) if (!delegation_hashed(dp)) return false; - dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID; + if (dp->dl_stid.sc_client->cl_minorversion == 0) + type = NFS4_CLOSED_DELEG_STID; + dp->dl_stid.sc_type = type; /* Ensure that deleg break won't try to requeue it */ ++dp->dl_time; spin_lock(&fp->fi_lock); @@ -1354,7 +1356,7 @@ static void destroy_delegation(struct nfs4_delegation *dp) bool unhashed; spin_lock(&state_lock); - unhashed = unhash_delegation_locked(dp); + unhashed = unhash_delegation_locked(dp, NFS4_CLOSED_DELEG_STID); spin_unlock(&state_lock); if (unhashed) destroy_unhashed_deleg(dp); @@ -1368,9 +1370,8 @@ static void revoke_delegation(struct nfs4_delegation *dp) trace_nfsd_stid_revoke(&dp->dl_stid); - if (clp->cl_minorversion) { + if (dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) { spin_lock(&clp->cl_lock); - dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID; refcount_inc(&dp->dl_stid.sc_count); list_add(&dp->dl_recall_lru, &clp->cl_revoked); spin_unlock(&clp->cl_lock); @@ -2229,7 +2230,7 @@ __destroy_client(struct nfs4_client *clp) spin_lock(&state_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); - unhash_delegation_locked(dp); + unhash_delegation_locked(dp, NFS4_CLOSED_DELEG_STID); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -5144,8 +5145,7 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, goto out; if (deleg->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) { nfs4_put_stid(&deleg->dl_stid); - if (cl->cl_minorversion) - status =
nfserr_deleg_revoked; + status = nfserr_deleg_revoked; goto out; } flags = share_access_to_flags(open->op_share_access); @@ -6169,7 +6169,7 @@ nfs4_laundromat(struct nfsd_net *nn) dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); if (!state_expired(&lt, dp->dl_time)) break; - unhash_delegation_locked(dp); + unhash_delegation_locked(dp, NFS4_REVOKED_DELEG_STID); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -8302,7 +8302,7 @@ nfs4_state_shutdown_net(struct net *net) spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - unhash_delegation_locked(dp); + unhash_delegation_locked(dp, NFS4_CLOSED_DELEG_STID); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); From 4146702b2dca55ced60faf8e37180d40fb20eae5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:25 +1100 Subject: [PATCH 0647/1406] nfsd: split sc_status out of sc_type sc_type identifies the type of a state - open, lock, deleg, layout - and also the status of a state - closed or revoked. This is a bit untidy and could get worse when "admin-revoked" states are added. So clean it up. With this patch, the type is now all that is stored in sc_type. This is zero when the state is first added to ->cl_stateids (causing it to be ignored), and is then set appropriately once it is fully initialised. It is set under ->cl_lock to ensure atomicity w.r.t. lookup. It is now never cleared. sc_type is still a bit-set even though at most one bit is set. This allows lookup functions to be given a bitmap of acceptable types. sc_type is now an unsigned short rather than char. There is no value in restricting to just 8 bits. All the constants now start SC_TYPE_ matching the field in which they are stored. Keeping the existing names and ensuring clear separation from non-type flags would have required something like NFS4_STID_TYPE_CLOSED which is cumbersome. The "NFS4" prefix is redundant as they only appear in NFS4 code, so remove that and change STID to SC to match the field. The status is stored in a separate unsigned short named "sc_status". It has two flags: SC_STATUS_CLOSED and SC_STATUS_REVOKED. CLOSED combines NFS4_CLOSED_STID and NFS4_CLOSED_DELEG_STID, and is used for SC_TYPE_LOCK and SC_TYPE_LAYOUT instead of setting the sc_type to zero. These flags are only ever set, never cleared. For deleg stateids they are set under the global state_lock. For open and lock stateids they are set under ->cl_lock. For layout stateids they are set under ->ls_lock. nfs4_unhash_stid() has been removed, and we never set sc_type = 0. This was only used for LOCK and LAYOUT stids and they now use SC_STATUS_CLOSED. Also TRACE_DEFINE_ENUM() calls for the various STID #defines have been removed because these things are not enums, and so that call is incorrect.
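The effect on lookup can be condensed to a few lines; this sketch mirrors the new find_stateid_by_type() in the diff below:

    /* Callers pass a bitmap of acceptable types plus a bitmap of
     * status bits they will tolerate; any other status bit makes
     * the stateid invisible to this caller.
     */
    if ((s->sc_status & ~ok_states) == 0 &&
        (typemask & s->sc_type))
        refcount_inc(&s->sc_count);
    else
        s = NULL;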
Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4layouts.c | 14 +-- fs/nfsd/nfs4state.c | 207 +++++++++++++++++++++--------------------- fs/nfsd/state.h | 40 +++++--- fs/nfsd/trace.h | 31 +++---- 4 files changed, 151 insertions(+), 141 deletions(-) diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 5e8096bc5eaa45..857b822450b4fe 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -236,7 +236,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops, NFSPROC4_CLNT_CB_LAYOUT); - if (parent->sc_type == NFS4_DELEG_STID) + if (parent->sc_type == SC_TYPE_DELEG) ls->ls_file = nfsd_file_get(fp->fi_deleg_file); else ls->ls_file = find_any_file(fp); @@ -250,7 +250,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, } spin_lock(&clp->cl_lock); - stp->sc_type = NFS4_LAYOUT_STID; + stp->sc_type = SC_TYPE_LAYOUT; list_add(&ls->ls_perclnt, &clp->cl_lo_states); spin_unlock(&clp->cl_lock); @@ -269,13 +269,13 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, { struct nfs4_layout_stateid *ls; struct nfs4_stid *stid; - unsigned char typemask = NFS4_LAYOUT_STID; + unsigned short typemask = SC_TYPE_LAYOUT; __be32 status; if (create) - typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID); + typemask |= (SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG); - status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid, + status = nfsd4_lookup_stateid(cstate, stateid, typemask, 0, &stid, net_generic(SVC_NET(rqstp), nfsd_net_id)); if (status) goto out; @@ -286,7 +286,7 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, goto out_put_stid; } - if (stid->sc_type != NFS4_LAYOUT_STID) { + if (stid->sc_type != SC_TYPE_LAYOUT) { ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type); nfs4_put_stid(stid); @@ -518,7 +518,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp, lrp->lrs_present = true; } else { trace_nfsd_layoutstate_unhash(&ls->ls_stid.sc_stateid); - nfs4_unhash_stid(&ls->ls_stid); + ls->ls_stid.sc_status |= SC_STATUS_CLOSED; lrp->lrs_present = false; } spin_unlock(&ls->ls_lock); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fbdf159c3854b5..84b5ee743002ec 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1260,11 +1260,6 @@ static void destroy_unhashed_deleg(struct nfs4_delegation *dp) nfs4_put_stid(&dp->dl_stid); } -void nfs4_unhash_stid(struct nfs4_stid *s) -{ - s->sc_type = 0; -} - /** * nfs4_delegation_exists - Discover if this delegation already exists * @clp: a pointer to the nfs4_client we're granting a delegation to @@ -1317,7 +1312,7 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) if (nfs4_delegation_exists(clp, fp)) return -EAGAIN; refcount_inc(&dp->dl_stid.sc_count); - dp->dl_stid.sc_type = NFS4_DELEG_STID; + dp->dl_stid.sc_type = SC_TYPE_DELEG; list_add(&dp->dl_perfile, &fp->fi_delegations); list_add(&dp->dl_perclnt, &clp->cl_delegations); return 0; @@ -1329,7 +1324,7 @@ static bool delegation_hashed(struct nfs4_delegation *dp) } static bool -unhash_delegation_locked(struct nfs4_delegation *dp, unsigned char type) +unhash_delegation_locked(struct nfs4_delegation *dp, unsigned short statusmask) { struct nfs4_file *fp = dp->dl_stid.sc_file; @@ -1339,8 +1334,9 @@ unhash_delegation_locked(struct nfs4_delegation *dp, unsigned char type) return false; if (dp->dl_stid.sc_client->cl_minorversion == 0) - type = NFS4_CLOSED_DELEG_STID; - dp->dl_stid.sc_type = type; + 
statusmask = SC_STATUS_CLOSED; + dp->dl_stid.sc_status |= statusmask; + /* Ensure that deleg break won't try to requeue it */ ++dp->dl_time; spin_lock(&fp->fi_lock); @@ -1356,7 +1352,7 @@ static void destroy_delegation(struct nfs4_delegation *dp) bool unhashed; spin_lock(&state_lock); - unhashed = unhash_delegation_locked(dp, NFS4_CLOSED_DELEG_STID); + unhashed = unhash_delegation_locked(dp, SC_STATUS_CLOSED); spin_unlock(&state_lock); if (unhashed) destroy_unhashed_deleg(dp); @@ -1370,7 +1366,7 @@ static void revoke_delegation(struct nfs4_delegation *dp) trace_nfsd_stid_revoke(&dp->dl_stid); - if (dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) { + if (dp->dl_stid.sc_status & SC_STATUS_REVOKED) { spin_lock(&clp->cl_lock); refcount_inc(&dp->dl_stid.sc_count); list_add(&dp->dl_recall_lru, &clp->cl_revoked); @@ -1379,8 +1375,8 @@ static void revoke_delegation(struct nfs4_delegation *dp) destroy_unhashed_deleg(dp); } -/* - * SETCLIENTID state +/* + * SETCLIENTID state */ static unsigned int clientid_hashval(u32 id) @@ -1543,7 +1539,7 @@ static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp) if (!unhash_ol_stateid(stp)) return false; list_del_init(&stp->st_locks); - nfs4_unhash_stid(&stp->st_stid); + stp->st_stid.sc_status |= SC_STATUS_CLOSED; return true; } @@ -1622,6 +1618,7 @@ static void release_open_stateid(struct nfs4_ol_stateid *stp) LIST_HEAD(reaplist); spin_lock(&stp->st_stid.sc_client->cl_lock); + stp->st_stid.sc_status |= SC_STATUS_CLOSED; if (unhash_open_stateid(stp, &reaplist)) put_ol_stateid_locked(stp, &reaplist); spin_unlock(&stp->st_stid.sc_client->cl_lock); @@ -2230,7 +2227,7 @@ __destroy_client(struct nfs4_client *clp) spin_lock(&state_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); - unhash_delegation_locked(dp, NFS4_CLOSED_DELEG_STID); + unhash_delegation_locked(dp, SC_STATUS_CLOSED); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -2462,14 +2459,16 @@ find_stateid_locked(struct nfs4_client *cl, stateid_t *t) } static struct nfs4_stid * -find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) +find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, + unsigned short typemask, unsigned short ok_states) { struct nfs4_stid *s; spin_lock(&cl->cl_lock); s = find_stateid_locked(cl, t); if (s != NULL) { - if (typemask & s->sc_type) + if ((s->sc_status & ~ok_states) == 0 && + (typemask & s->sc_type)) refcount_inc(&s->sc_count); else s = NULL; @@ -2622,7 +2621,7 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) struct nfs4_stateowner *oo; unsigned int access, deny; - if (st->sc_type != NFS4_OPEN_STID && st->sc_type != NFS4_LOCK_STID) + if (st->sc_type != SC_TYPE_OPEN && st->sc_type != SC_TYPE_LOCK) return 0; /* XXX: or SEQ_SKIP? */ ols = openlockstateid(st); oo = ols->st_stateowner; @@ -2754,13 +2753,13 @@ static int states_show(struct seq_file *s, void *v) struct nfs4_stid *st = v; switch (st->sc_type) { - case NFS4_OPEN_STID: + case SC_TYPE_OPEN: return nfs4_show_open(s, st); - case NFS4_LOCK_STID: + case SC_TYPE_LOCK: return nfs4_show_lock(s, st); - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: return nfs4_show_deleg(s, st); - case NFS4_LAYOUT_STID: + case SC_TYPE_LAYOUT: return nfs4_show_layout(s, st); default: return 0; /* XXX: or SEQ_SKIP? 
*/ @@ -4533,7 +4532,8 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) continue; if (local->st_stateowner != &oo->oo_owner) continue; - if (local->st_stid.sc_type == NFS4_OPEN_STID) { + if (local->st_stid.sc_type == SC_TYPE_OPEN && + !local->st_stid.sc_status) { ret = local; refcount_inc(&ret->st_stid.sc_count); break; @@ -4547,17 +4547,10 @@ nfsd4_verify_open_stid(struct nfs4_stid *s) { __be32 ret = nfs_ok; - switch (s->sc_type) { - default: - break; - case 0: - case NFS4_CLOSED_STID: - case NFS4_CLOSED_DELEG_STID: - ret = nfserr_bad_stateid; - break; - case NFS4_REVOKED_DELEG_STID: + if (s->sc_status & SC_STATUS_REVOKED) ret = nfserr_deleg_revoked; - } + else if (s->sc_status & SC_STATUS_CLOSED) + ret = nfserr_bad_stateid; return ret; } @@ -4643,7 +4636,7 @@ init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open) open->op_stp = NULL; refcount_inc(&stp->st_stid.sc_count); - stp->st_stid.sc_type = NFS4_OPEN_STID; + stp->st_stid.sc_type = SC_TYPE_OPEN; INIT_LIST_HEAD(&stp->st_locks); stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner); get_nfs4_file(fp); @@ -4870,9 +4863,9 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb, trace_nfsd_cb_recall_done(&dp->dl_stid.sc_stateid, task); - if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID || - dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) - return 1; + if (dp->dl_stid.sc_status) + /* CLOSED or REVOKED */ + return 1; switch (task->tk_status) { case 0: @@ -5115,12 +5108,12 @@ static int share_access_to_flags(u32 share_access) return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; } -static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s) +static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, + stateid_t *s) { struct nfs4_stid *ret; - ret = find_stateid_by_type(cl, s, - NFS4_DELEG_STID|NFS4_REVOKED_DELEG_STID); + ret = find_stateid_by_type(cl, s, SC_TYPE_DELEG, SC_STATUS_REVOKED); if (!ret) return NULL; return delegstateid(ret); @@ -5143,7 +5136,7 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, deleg = find_deleg_stateid(cl, &open->op_delegate_stateid); if (deleg == NULL) goto out; - if (deleg->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) { + if (deleg->dl_stid.sc_status & SC_STATUS_REVOKED) { nfs4_put_stid(&deleg->dl_stid); status = nfserr_deleg_revoked; goto out; @@ -5777,7 +5770,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf } else { status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open, true); if (status) { - stp->st_stid.sc_type = NFS4_CLOSED_STID; release_open_stateid(stp); mutex_unlock(&stp->st_mutex); goto out; @@ -6169,7 +6161,7 @@ nfs4_laundromat(struct nfsd_net *nn) dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); if (!state_expired(&lt, dp->dl_time)) break; - unhash_delegation_locked(dp, NFS4_REVOKED_DELEG_STID); + unhash_delegation_locked(dp, SC_STATUS_REVOKED); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -6408,22 +6400,20 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) status = nfsd4_stid_check_stateid_generation(stateid, s, 1); if (status) goto out_unlock; + status = nfsd4_verify_open_stid(s); + if (status) + goto out_unlock; + switch (s->sc_type) { - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: status = nfs_ok; break; - case NFS4_REVOKED_DELEG_STID: - status = nfserr_deleg_revoked; - break; - case NFS4_OPEN_STID: - case NFS4_LOCK_STID: + case SC_TYPE_OPEN: + case SC_TYPE_LOCK: status
= nfsd4_check_openowner_confirmed(openlockstateid(s)); break; default: printk("unknown stateid type %x\n", s->sc_type); - fallthrough; - case NFS4_CLOSED_STID: - case NFS4_CLOSED_DELEG_STID: status = nfserr_bad_stateid; } out_unlock: @@ -6433,7 +6423,8 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, - stateid_t *stateid, unsigned char typemask, + stateid_t *stateid, + unsigned short typemask, unsigned short statusmask, struct nfs4_stid **s, struct nfsd_net *nn) { __be32 status; @@ -6444,10 +6435,13 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, * only return revoked delegations if explicitly asked. * otherwise we report revoked or bad_stateid status. */ - if (typemask & NFS4_REVOKED_DELEG_STID) + if (statusmask & SC_STATUS_REVOKED) return_revoked = true; - else if (typemask & NFS4_DELEG_STID) - typemask |= NFS4_REVOKED_DELEG_STID; + if (typemask & SC_TYPE_DELEG) + /* Always allow REVOKED for DELEG so we can + * retturn the appropriate error. + */ + statusmask |= SC_STATUS_REVOKED; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) @@ -6460,14 +6454,12 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, } if (status) return status; - stid = find_stateid_by_type(cstate->clp, stateid, typemask); + stid = find_stateid_by_type(cstate->clp, stateid, typemask, statusmask); if (!stid) return nfserr_bad_stateid; - if ((stid->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) { + if ((stid->sc_status & SC_STATUS_REVOKED) && !return_revoked) { nfs4_put_stid(stid); - if (cstate->minorversion) - return nfserr_deleg_revoked; - return nfserr_bad_stateid; + return nfserr_deleg_revoked; } *s = stid; return nfs_ok; @@ -6478,17 +6470,17 @@ nfs4_find_file(struct nfs4_stid *s, int flags) { struct nfsd_file *ret = NULL; - if (!s) + if (!s || s->sc_status) return NULL; switch (s->sc_type) { - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: spin_lock(&s->sc_file->fi_lock); ret = nfsd_file_get(s->sc_file->fi_deleg_file); spin_unlock(&s->sc_file->fi_lock); break; - case NFS4_OPEN_STID: - case NFS4_LOCK_STID: + case SC_TYPE_OPEN: + case SC_TYPE_LOCK: if (flags & RD_STATE) ret = find_readable_file(s->sc_file); else @@ -6601,7 +6593,8 @@ static __be32 find_cpntf_state(struct nfsd_net *nn, stateid_t *st, goto out; *stid = find_stateid_by_type(found, &cps->cp_p_stateid, - NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID); + SC_TYPE_DELEG|SC_TYPE_OPEN|SC_TYPE_LOCK, + 0); if (*stid) status = nfs_ok; else @@ -6658,8 +6651,8 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, } status = nfsd4_lookup_stateid(cstate, stateid, - NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, - &s, nn); + SC_TYPE_DELEG|SC_TYPE_OPEN|SC_TYPE_LOCK, + 0, &s, nn); if (status == nfserr_bad_stateid) status = find_cpntf_state(nn, stateid, &s); if (status) @@ -6670,16 +6663,13 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, goto out; switch (s->sc_type) { - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: status = nfs4_check_delegmode(delegstateid(s), flags); break; - case NFS4_OPEN_STID: - case NFS4_LOCK_STID: + case SC_TYPE_OPEN: + case SC_TYPE_LOCK: status = nfs4_check_olstateid(openlockstateid(s), flags); break; - default: - status = nfserr_bad_stateid; - break; } if (status) goto out; @@ -6758,33 +6748,34 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, spin_lock(&cl->cl_lock); s = find_stateid_locked(cl, stateid); - if (!s) + if (!s || s->sc_status & 
SC_STATUS_CLOSED) goto out_unlock; spin_lock(&s->sc_lock); switch (s->sc_type) { - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: + if (s->sc_status & SC_STATUS_REVOKED) { + spin_unlock(&s->sc_lock); + dp = delegstateid(s); + list_del_init(&dp->dl_recall_lru); + spin_unlock(&cl->cl_lock); + nfs4_put_stid(s); + ret = nfs_ok; + goto out; + } ret = nfserr_locks_held; break; - case NFS4_OPEN_STID: + case SC_TYPE_OPEN: ret = check_stateid_generation(stateid, &s->sc_stateid, 1); if (ret) break; ret = nfserr_locks_held; break; - case NFS4_LOCK_STID: + case SC_TYPE_LOCK: spin_unlock(&s->sc_lock); refcount_inc(&s->sc_count); spin_unlock(&cl->cl_lock); ret = nfsd4_free_lock_stateid(stateid, s); goto out; - case NFS4_REVOKED_DELEG_STID: - spin_unlock(&s->sc_lock); - dp = delegstateid(s); - list_del_init(&dp->dl_recall_lru); - spin_unlock(&cl->cl_lock); - nfs4_put_stid(s); - ret = nfs_ok; - goto out; /* Default falls through and returns nfserr_bad_stateid */ } spin_unlock(&s->sc_lock); @@ -6827,6 +6818,7 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ * @seqid: seqid (provided by client) * @stateid: stateid (provided by client) * @typemask: mask of allowable types for this operation + * @statusmask: mask of allowed states: 0 or STID_CLOSED * @stpp: return pointer for the stateid found * @nn: net namespace for request * @@ -6836,7 +6828,8 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ */ static __be32 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, - stateid_t *stateid, char typemask, + stateid_t *stateid, + unsigned short typemask, unsigned short statusmask, struct nfs4_ol_stateid **stpp, struct nfsd_net *nn) { @@ -6847,7 +6840,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, trace_nfsd_preprocess(seqid, stateid); *stpp = NULL; - status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn); + status = nfsd4_lookup_stateid(cstate, stateid, + typemask, statusmask, &s, nn); if (status) return status; stp = openlockstateid(s); @@ -6869,7 +6863,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs struct nfs4_ol_stateid *stp; status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, - NFS4_OPEN_STID, &stp, nn); + SC_TYPE_OPEN, 0, &stp, nn); if (status) return status; oo = openowner(stp->st_stateowner); @@ -6900,8 +6894,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; status = nfs4_preprocess_seqid_op(cstate, - oc->oc_seqid, &oc->oc_req_stateid, - NFS4_OPEN_STID, &stp, nn); + oc->oc_seqid, &oc->oc_req_stateid, + SC_TYPE_OPEN, 0, &stp, nn); if (status) goto out; oo = openowner(stp->st_stateowner); @@ -7032,18 +7026,20 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd_net *nn = net_generic(net, nfsd_net_id); bool need_move_to_close_list; - dprintk("NFSD: nfsd4_close on file %pd\n", + dprintk("NFSD: nfsd4_close on file %pd\n", cstate->current_fh.fh_dentry); status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, - &close->cl_stateid, - NFS4_OPEN_STID|NFS4_CLOSED_STID, - &stp, nn); + &close->cl_stateid, + SC_TYPE_OPEN, SC_STATUS_CLOSED, + &stp, nn); nfsd4_bump_seqid(cstate, status); if (status) - goto out; + goto out; - stp->st_stid.sc_type = NFS4_CLOSED_STID; + spin_lock(&stp->st_stid.sc_client->cl_lock); + stp->st_stid.sc_status |= SC_STATUS_CLOSED; + spin_unlock(&stp->st_stid.sc_client->cl_lock); /* * Technically we don't _really_ have to increment or copy it, 
since @@ -7094,7 +7090,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) return status; - status = nfsd4_lookup_stateid(cstate, stateid, NFS4_DELEG_STID, &s, nn); + status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, 0, &s, nn); if (status) goto out; dp = delegstateid(s); @@ -7361,7 +7357,7 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, if (retstp) goto out_found; refcount_inc(&stp->st_stid.sc_count); - stp->st_stid.sc_type = NFS4_LOCK_STID; + stp->st_stid.sc_type = SC_TYPE_LOCK; stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner); get_nfs4_file(fp); stp->st_stid.sc_file = fp; @@ -7548,9 +7544,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &lock_stp, &new); } else { status = nfs4_preprocess_seqid_op(cstate, - lock->lk_old_lock_seqid, - &lock->lk_old_lock_stateid, - NFS4_LOCK_STID, &lock_stp, nn); + lock->lk_old_lock_seqid, + &lock->lk_old_lock_stateid, + SC_TYPE_LOCK, 0, &lock_stp, + nn); } if (status) goto out; @@ -7863,8 +7860,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_inval; status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, - &locku->lu_stateid, NFS4_LOCK_STID, - &stp, nn); + &locku->lu_stateid, SC_TYPE_LOCK, 0, + &stp, nn); if (status) goto out; nf = find_any_file(stp->st_stid.sc_file); @@ -8302,7 +8299,7 @@ nfs4_state_shutdown_net(struct net *net) spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - unhash_delegation_locked(dp, NFS4_CLOSED_DELEG_STID); + unhash_delegation_locked(dp, SC_STATUS_CLOSED); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 87c4372ba36a8d..1d4bf1a7d229c5 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -88,17 +88,33 @@ struct nfsd4_callback_ops { */ struct nfs4_stid { refcount_t sc_count; -#define NFS4_OPEN_STID 1 -#define NFS4_LOCK_STID 2 -#define NFS4_DELEG_STID 4 -/* For an open stateid kept around *only* to process close replays: */ -#define NFS4_CLOSED_STID 8 + + /* A new stateid is added to the cl_stateids idr early before it + * is fully initialised. Its sc_type is then zero. After + * initialisation the sc_type it set under cl_lock, and then + * never changes. + */ +#define SC_TYPE_OPEN BIT(0) +#define SC_TYPE_LOCK BIT(1) +#define SC_TYPE_DELEG BIT(2) +#define SC_TYPE_LAYOUT BIT(3) + unsigned short sc_type; + +/* state_lock protects sc_status for delegation stateids. + * ->cl_lock protects sc_status for open and lock stateids. + * ->st_mutex also protect sc_status for open stateids. + * ->ls_lock protects sc_status for layout stateids. + */ +/* + * For an open stateid kept around *only* to process close replays. + * For deleg stateid, kept in idr until last reference is dropped. 
+ */ +#define SC_STATUS_CLOSED BIT(0) /* For a deleg stateid kept around only to process free_stateid's: */ -#define NFS4_REVOKED_DELEG_STID 16 -#define NFS4_CLOSED_DELEG_STID 32 -#define NFS4_LAYOUT_STID 64 +#define SC_STATUS_REVOKED BIT(1) + unsigned short sc_status; + struct list_head sc_cp_list; - unsigned char sc_type; stateid_t sc_stateid; spinlock_t sc_lock; struct nfs4_client *sc_client; @@ -672,15 +688,15 @@ extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, stateid_t *stateid, int flags, struct nfsd_file **filp, struct nfs4_stid **cstid); __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, - stateid_t *stateid, unsigned char typemask, - struct nfs4_stid **s, struct nfsd_net *nn); + stateid_t *stateid, unsigned short typemask, + unsigned short statusmask, + struct nfs4_stid **s, struct nfsd_net *nn); struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab, void (*sc_free)(struct nfs4_stid *)); int nfs4_init_copy_state(struct nfsd_net *nn, struct nfsd4_copy *copy); void nfs4_free_copy_state(struct nfsd4_copy *copy); struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn, struct nfs4_stid *p_stid); -void nfs4_unhash_stid(struct nfs4_stid *s); void nfs4_put_stid(struct nfs4_stid *s); void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid); void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 9f9e58debc2611..f87dad1fa1d66d 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -643,23 +643,17 @@ DEFINE_EVENT(nfsd_stateseqid_class, nfsd_##name, \ DEFINE_STATESEQID_EVENT(preprocess); DEFINE_STATESEQID_EVENT(open_confirm); -TRACE_DEFINE_ENUM(NFS4_OPEN_STID); -TRACE_DEFINE_ENUM(NFS4_LOCK_STID); -TRACE_DEFINE_ENUM(NFS4_DELEG_STID); -TRACE_DEFINE_ENUM(NFS4_CLOSED_STID); -TRACE_DEFINE_ENUM(NFS4_REVOKED_DELEG_STID); -TRACE_DEFINE_ENUM(NFS4_CLOSED_DELEG_STID); -TRACE_DEFINE_ENUM(NFS4_LAYOUT_STID); - #define show_stid_type(x) \ __print_flags(x, "|", \ - { NFS4_OPEN_STID, "OPEN" }, \ - { NFS4_LOCK_STID, "LOCK" }, \ - { NFS4_DELEG_STID, "DELEG" }, \ - { NFS4_CLOSED_STID, "CLOSED" }, \ - { NFS4_REVOKED_DELEG_STID, "REVOKED" }, \ - { NFS4_CLOSED_DELEG_STID, "CLOSED_DELEG" }, \ - { NFS4_LAYOUT_STID, "LAYOUT" }) + { SC_TYPE_OPEN, "OPEN" }, \ + { SC_TYPE_LOCK, "LOCK" }, \ + { SC_TYPE_DELEG, "DELEG" }, \ + { SC_TYPE_LAYOUT, "LAYOUT" }) + +#define show_stid_status(x) \ + __print_flags(x, "|", \ + { SC_STATUS_CLOSED, "CLOSED" }, \ + { SC_STATUS_REVOKED, "REVOKED" }) \ DECLARE_EVENT_CLASS(nfsd_stid_class, TP_PROTO( @@ -668,6 +662,7 @@ DECLARE_EVENT_CLASS(nfsd_stid_class, TP_ARGS(stid), TP_STRUCT__entry( __field(unsigned long, sc_type) + __field(unsigned long, sc_status) __field(int, sc_count) __field(u32, cl_boot) __field(u32, cl_id) @@ -678,16 +673,18 @@ DECLARE_EVENT_CLASS(nfsd_stid_class, const stateid_t *stp = &stid->sc_stateid; __entry->sc_type = stid->sc_type; + __entry->sc_status = stid->sc_status; __entry->sc_count = refcount_read(&stid->sc_count); __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; __entry->cl_id = stp->si_opaque.so_clid.cl_id; __entry->si_id = stp->si_opaque.so_id; __entry->si_generation = stp->si_generation; ), - TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s", + TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s state=%s", __entry->cl_boot, __entry->cl_id, __entry->si_id, __entry->si_generation, - __entry->sc_count, show_stid_type(__entry->sc_type) + __entry->sc_count, 
show_stid_type(__entry->sc_type), + show_stid_status(__entry->sc_status) ) ); From 41585368964db1ef7e3eae85a5f4342909f4afda Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:26 +1100 Subject: [PATCH 0648/1406] nfsd: prepare for supporting admin-revocation of state The NFSv4 protocol allows state to be revoked by the admin and has error codes which allow this to be communicated to the client. This patch - introduces a new state-id status SC_STATUS_ADMIN_REVOKED which can be set on open, lock, or delegation state. - reports NFS4ERR_ADMIN_REVOKED when these are accessed - introduces a per-client counter of these states and returns SEQ4_STATUS_ADMIN_STATE_REVOKED when the counter is not zero. Decrements this when freeing any admin-revoked state. - introduces stub code to find all interesting states for a given superblock so they can be revoked via the 'unlock_filesystem' file in /proc/fs/nfsd/ No actual states are handled yet. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 85 ++++++++++++++++++++++++++++++++++++++++++++- fs/nfsd/nfsctl.c | 1 + fs/nfsd/nfsd.h | 1 + fs/nfsd/state.h | 10 ++++++ fs/nfsd/trace.h | 3 +- 5 files changed, 98 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 84b5ee743002ec..c7b66c9b7ba953 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1210,6 +1210,8 @@ nfs4_put_stid(struct nfs4_stid *s) return; } idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id); + if (s->sc_status & SC_STATUS_ADMIN_REVOKED) + atomic_dec(&s->sc_client->cl_admin_revoked); nfs4_free_cpntf_statelist(clp->net, s); spin_unlock(&clp->cl_lock); s->sc_free(s); @@ -1529,6 +1531,8 @@ static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp, } idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id); + if (s->sc_status & SC_STATUS_ADMIN_REVOKED) + atomic_dec(&s->sc_client->cl_admin_revoked); list_add(&stp->st_locks, reaplist); } @@ -1674,6 +1678,68 @@ static void release_openowner(struct nfs4_openowner *oo) nfs4_put_stateowner(&oo->oo_owner); } +static struct nfs4_stid *find_one_sb_stid(struct nfs4_client *clp, + struct super_block *sb, + unsigned int sc_types) +{ + unsigned long id, tmp; + struct nfs4_stid *stid; + + spin_lock(&clp->cl_lock); + idr_for_each_entry_ul(&clp->cl_stateids, stid, tmp, id) + if ((stid->sc_type & sc_types) && + stid->sc_status == 0 && + stid->sc_file->fi_inode->i_sb == sb) { + refcount_inc(&stid->sc_count); + break; + } + spin_unlock(&clp->cl_lock); + return stid; +} + +/** + * nfsd4_revoke_states - revoke all nfsv4 states associated with given filesystem + * @net: used to identify instance of nfsd (there is one per net namespace) + * @sb: super_block used to identify target filesystem + * + * All nfs4 states (open, lock, delegation, layout) held by the server instance + * and associated with a file on the given filesystem will be revoked resulting + * in any files being closed and so all references from nfsd to the filesystem + * being released. Thus nfsd will no longer prevent the filesystem from being + * unmounted. + * + * The clients which own the states will subsequently being notified that the + * states have been "admin-revoked". 
+ */ +void nfsd4_revoke_states(struct net *net, struct super_block *sb) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + unsigned int idhashval; + unsigned int sc_types; + + sc_types = 0; + + spin_lock(&nn->client_lock); + for (idhashval = 0; idhashval < CLIENT_HASH_MASK; idhashval++) { + struct list_head *head = &nn->conf_id_hashtbl[idhashval]; + struct nfs4_client *clp; + retry: + list_for_each_entry(clp, head, cl_idhash) { + struct nfs4_stid *stid = find_one_sb_stid(clp, sb, + sc_types); + if (stid) { + spin_unlock(&nn->client_lock); + switch (stid->sc_type) { + } + nfs4_put_stid(stid); + spin_lock(&nn->client_lock); + goto retry; + } + } + } + spin_unlock(&nn->client_lock); +} + static inline int hash_sessionid(struct nfs4_sessionid *sessionid) { @@ -2545,6 +2611,8 @@ static int client_info_show(struct seq_file *m, void *v) } seq_printf(m, "callback state: %s\n", cb_state2str(clp->cl_cb_state)); seq_printf(m, "callback address: %pISpc\n", &clp->cl_cb_conn.cb_addr); + seq_printf(m, "admin-revoked states: %d\n", + atomic_read(&clp->cl_admin_revoked)); drop_client(clp); return 0; @@ -4058,6 +4126,8 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } if (!list_empty(&clp->cl_revoked)) seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; + if (atomic_read(&clp->cl_admin_revoked)) + seq->status_flags |= SEQ4_STATUS_ADMIN_STATE_REVOKED; trace_nfsd_seq4_status(rqstp, seq); out_no_session: if (conn) @@ -4547,7 +4617,9 @@ nfsd4_verify_open_stid(struct nfs4_stid *s) { __be32 ret = nfs_ok; - if (s->sc_status & SC_STATUS_REVOKED) + if (s->sc_status & SC_STATUS_ADMIN_REVOKED) + ret = nfserr_admin_revoked; + else if (s->sc_status & SC_STATUS_REVOKED) ret = nfserr_deleg_revoked; else if (s->sc_status & SC_STATUS_CLOSED) ret = nfserr_bad_stateid; @@ -5136,6 +5208,11 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, deleg = find_deleg_stateid(cl, &open->op_delegate_stateid); if (deleg == NULL) goto out; + if (deleg->dl_stid.sc_status & SC_STATUS_ADMIN_REVOKED) { + nfs4_put_stid(&deleg->dl_stid); + status = nfserr_admin_revoked; + goto out; + } if (deleg->dl_stid.sc_status & SC_STATUS_REVOKED) { nfs4_put_stid(&deleg->dl_stid); status = nfserr_deleg_revoked; @@ -6443,6 +6520,8 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, */ statusmask |= SC_STATUS_REVOKED; + statusmask |= SC_STATUS_ADMIN_REVOKED; + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) return nfserr_bad_stateid; @@ -6461,6 +6540,10 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, nfs4_put_stid(stid); return nfserr_deleg_revoked; } + if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) { + nfs4_put_stid(stid); + return nfserr_admin_revoked; + } *s = stid; return nfs_ok; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 5a5547bd6ecf7e..ecd18bffeebc75 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -281,6 +281,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) * 3. Is that directory the root of an exported file system? 
*/ error = nlmsvc_unlock_all_by_sb(path.dentry->d_sb); + nfsd4_revoke_states(netns(file), path.dentry->d_sb); path_put(&path); return error; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index be2ea3d6d2a289..8daf22d766c60a 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -275,6 +275,7 @@ void nfsd_lockd_shutdown(void); #define nfserr_no_grace cpu_to_be32(NFSERR_NO_GRACE) #define nfserr_reclaim_bad cpu_to_be32(NFSERR_RECLAIM_BAD) #define nfserr_badname cpu_to_be32(NFSERR_BADNAME) +#define nfserr_admin_revoked cpu_to_be32(NFS4ERR_ADMIN_REVOKED) #define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN) #define nfserr_locked cpu_to_be32(NFSERR_LOCKED) #define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC) diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 1d4bf1a7d229c5..be02bf1a16bdd9 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -112,6 +112,7 @@ struct nfs4_stid { #define SC_STATUS_CLOSED BIT(0) /* For a deleg stateid kept around only to process free_stateid's: */ #define SC_STATUS_REVOKED BIT(1) +#define SC_STATUS_ADMIN_REVOKED BIT(2) unsigned short sc_status; struct list_head sc_cp_list; @@ -367,6 +368,7 @@ struct nfs4_client { clientid_t cl_clientid; /* generated by server */ nfs4_verifier cl_confirm; /* generated by server */ u32 cl_minorversion; + atomic_t cl_admin_revoked; /* count of admin-revoked states */ /* NFSv4.1 client implementation id: */ struct xdr_netobj cl_nii_domain; struct xdr_netobj cl_nii_name; @@ -730,6 +732,14 @@ static inline void get_nfs4_file(struct nfs4_file *fi) } struct nfsd_file *find_any_file(struct nfs4_file *f); +#ifdef CONFIG_NFSD_V4 +void nfsd4_revoke_states(struct net *net, struct super_block *sb); +#else +static inline void nfsd4_revoke_states(struct net *net, struct super_block *sb) +{ +} +#endif + /* grace period management */ void nfsd4_end_grace(struct nfsd_net *nn); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index f87dad1fa1d66d..d8e56268a250ba 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -653,7 +653,8 @@ DEFINE_STATESEQID_EVENT(open_confirm); #define show_stid_status(x) \ __print_flags(x, "|", \ { SC_STATUS_CLOSED, "CLOSED" }, \ - { SC_STATUS_REVOKED, "REVOKED" }) \ + { SC_STATUS_REVOKED, "REVOKED" }, \ + { SC_STATUS_ADMIN_REVOKED, "ADMIN_REVOKED" }) DECLARE_EVENT_CLASS(nfsd_stid_class, TP_PROTO( From b9e22f4015208e67690d89d214e6e5b8ed5ef4e8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:27 +1100 Subject: [PATCH 0649/1406] nfsd: allow state with no file to appear in /proc/fs/nfsd/clients/*/states Change the "show" functions to show some content even if a file cannot be found. This is the case for admin-revoked state. This is primarily useful for debugging - to ensure states are being removed eventually. So change several seq_printf() to seq_puts(). Some of these are needed to keep checkpatch happy. Others were done for consistency. 
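The reworked show functions all follow roughly this shape (illustrative sketch; the per-type functions in the diff below differ in detail):

    seq_puts(s, "- ");
    nfs4_show_stateid(s, &st->sc_stateid);
    seq_puts(s, ": { type: open, ");
    spin_lock(&nf->fi_lock);
    file = find_any_file_locked(nf);
    if (file) {
        /* only the file-dependent fields are conditional now */
        nfs4_show_superblock(s, file);
        seq_puts(s, ", ");
        nfs4_show_fname(s, file);
        seq_puts(s, ", ");
    }
    spin_unlock(&nf->fi_lock);
    nfs4_show_owner(s, oo);
    seq_puts(s, " }\n");

Previously these functions returned early when no file could be found, so such states never appeared in the states file at all.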
Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 118 ++++++++++++++++++++++---------------------- 1 file changed, 58 insertions(+), 60 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c7b66c9b7ba953..f8e74a0a86d2ce 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2554,9 +2554,9 @@ static struct nfs4_client *get_nfsdfs_clp(struct inode *inode) static void seq_quote_mem(struct seq_file *m, char *data, int len) { - seq_printf(m, "\""); + seq_puts(m, "\""); seq_escape_mem(m, data, len, ESCAPE_HEX | ESCAPE_NAP | ESCAPE_APPEND, "\"\\"); - seq_printf(m, "\""); + seq_puts(m, "\""); } static const char *cb_state2str(int state) @@ -2597,14 +2597,14 @@ static int client_info_show(struct seq_file *m, void *v) seq_puts(m, "status: unconfirmed\n"); seq_printf(m, "seconds from last renew: %lld\n", ktime_get_boottime_seconds() - clp->cl_time); - seq_printf(m, "name: "); + seq_puts(m, "name: "); seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len); seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion); if (clp->cl_nii_domain.data) { - seq_printf(m, "Implementation domain: "); + seq_puts(m, "Implementation domain: "); seq_quote_mem(m, clp->cl_nii_domain.data, clp->cl_nii_domain.len); - seq_printf(m, "\nImplementation name: "); + seq_puts(m, "\nImplementation name: "); seq_quote_mem(m, clp->cl_nii_name.data, clp->cl_nii_name.len); seq_printf(m, "\nImplementation time: [%lld, %ld]\n", clp->cl_nii_time.tv_sec, clp->cl_nii_time.tv_nsec); @@ -2671,7 +2671,7 @@ static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f) static void nfs4_show_owner(struct seq_file *s, struct nfs4_stateowner *oo) { - seq_printf(s, "owner: "); + seq_puts(s, "owner: "); seq_quote_mem(s, oo->so_owner.data, oo->so_owner.len); } @@ -2689,20 +2689,13 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) struct nfs4_stateowner *oo; unsigned int access, deny; - if (st->sc_type != SC_TYPE_OPEN && st->sc_type != SC_TYPE_LOCK) - return 0; /* XXX: or SEQ_SKIP? */ ols = openlockstateid(st); oo = ols->st_stateowner; nf = st->sc_file; - spin_lock(&nf->fi_lock); - file = find_any_file_locked(nf); - if (!file) - goto out; - - seq_printf(s, "- "); + seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); - seq_printf(s, ": { type: open, "); + seq_puts(s, ": { type: open, "); access = bmap_to_share_mode(ols->st_access_bmap); deny = bmap_to_share_mode(ols->st_deny_bmap); @@ -2714,14 +2707,17 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) deny & NFS4_SHARE_ACCESS_READ ? "r" : "-", deny & NFS4_SHARE_ACCESS_WRITE ? 
"w" : "-"); - nfs4_show_superblock(s, file); - seq_printf(s, ", "); - nfs4_show_fname(s, file); - seq_printf(s, ", "); - nfs4_show_owner(s, oo); - seq_printf(s, " }\n"); -out: + spin_lock(&nf->fi_lock); + file = find_any_file_locked(nf); + if (file) { + nfs4_show_superblock(s, file); + seq_puts(s, ", "); + nfs4_show_fname(s, file); + seq_puts(s, ", "); + } spin_unlock(&nf->fi_lock); + nfs4_show_owner(s, oo); + seq_puts(s, " }\n"); return 0; } @@ -2735,30 +2731,29 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) ols = openlockstateid(st); oo = ols->st_stateowner; nf = st->sc_file; - spin_lock(&nf->fi_lock); - file = find_any_file_locked(nf); - if (!file) - goto out; - seq_printf(s, "- "); + seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); - seq_printf(s, ": { type: lock, "); + seq_puts(s, ": { type: lock, "); - /* - * Note: a lock stateid isn't really the same thing as a lock, - * it's the locking state held by one owner on a file, and there - * may be multiple (or no) lock ranges associated with it. - * (Same for the matter is true of open stateids.) - */ + spin_lock(&nf->fi_lock); + file = find_any_file_locked(nf); + if (file) { + /* + * Note: a lock stateid isn't really the same thing as a lock, + * it's the locking state held by one owner on a file, and there + * may be multiple (or no) lock ranges associated with it. + * (Same for the matter is true of open stateids.) + */ - nfs4_show_superblock(s, file); - /* XXX: open stateid? */ - seq_printf(s, ", "); - nfs4_show_fname(s, file); - seq_printf(s, ", "); + nfs4_show_superblock(s, file); + /* XXX: open stateid? */ + seq_puts(s, ", "); + nfs4_show_fname(s, file); + seq_puts(s, ", "); + } nfs4_show_owner(s, oo); - seq_printf(s, " }\n"); -out: + seq_puts(s, " }\n"); spin_unlock(&nf->fi_lock); return 0; } @@ -2771,25 +2766,25 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) ds = delegstateid(st); nf = st->sc_file; - spin_lock(&nf->fi_lock); - file = nf->fi_deleg_file; - if (!file) - goto out; - seq_printf(s, "- "); + seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); - seq_printf(s, ": { type: deleg, "); + seq_puts(s, ": { type: deleg, "); - seq_printf(s, "access: %s, ", - ds->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w"); + seq_printf(s, "access: %s", + ds->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w"); /* XXX: lease time, whether it's being recalled. */ - nfs4_show_superblock(s, file); - seq_printf(s, ", "); - nfs4_show_fname(s, file); - seq_printf(s, " }\n"); -out: + spin_lock(&nf->fi_lock); + file = nf->fi_deleg_file; + if (file) { + seq_puts(s, ", "); + nfs4_show_superblock(s, file); + seq_puts(s, ", "); + nfs4_show_fname(s, file); + } + seq_puts(s, " }\n"); spin_unlock(&nf->fi_lock); return 0; } @@ -2802,16 +2797,19 @@ static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st) ls = container_of(st, struct nfs4_layout_stateid, ls_stid); file = ls->ls_file; - seq_printf(s, "- "); + seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); - seq_printf(s, ": { type: layout, "); + seq_puts(s, ": { type: layout"); /* XXX: What else would be useful? 
*/ - nfs4_show_superblock(s, file); - seq_printf(s, ", "); - nfs4_show_fname(s, file); - seq_printf(s, " }\n"); + if (file) { + seq_puts(s, ", "); + nfs4_show_superblock(s, file); + seq_puts(s, ", "); + nfs4_show_fname(s, file); + } + seq_puts(s, " }\n"); return 0; } From cf93b52a74a90182fed84b838aff859413bd2f2b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:28 +1100 Subject: [PATCH 0650/1406] nfsd: report in /proc/fs/nfsd/clients/*/states when state is admin-revoked Add "admin-revoked" to the status information for any states that have been admin-revoked. This can be useful for confirming correct behaviour. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f8e74a0a86d2ce..c91a7deb26a65b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2717,6 +2717,8 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) } spin_unlock(&nf->fi_lock); nfs4_show_owner(s, oo); + if (st->sc_status & SC_STATUS_ADMIN_REVOKED) + seq_puts(s, ", admin-revoked"); seq_puts(s, " }\n"); return 0; } @@ -2753,6 +2755,8 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) seq_puts(s, ", "); } nfs4_show_owner(s, oo); + if (st->sc_status & SC_STATUS_ADMIN_REVOKED) + seq_puts(s, ", admin-revoked"); seq_puts(s, " }\n"); spin_unlock(&nf->fi_lock); return 0; @@ -2784,8 +2788,10 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) seq_puts(s, ", "); nfs4_show_fname(s, file); } - seq_puts(s, " }\n"); spin_unlock(&nf->fi_lock); + if (st->sc_status & SC_STATUS_ADMIN_REVOKED) + seq_puts(s, ", admin-revoked"); + seq_puts(s, " }\n"); return 0; } @@ -2809,6 +2815,8 @@ static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st) seq_puts(s, ", "); nfs4_show_fname(s, file); } + if (st->sc_status & SC_STATUS_ADMIN_REVOKED) + seq_puts(s, ", admin-revoked"); seq_puts(s, " }\n"); return 0; From 2bb75f5ba7e56ce1c1d0d9236db57defd7a8a028 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:29 +1100 Subject: [PATCH 0651/1406] nfsd: allow admin-revoked NFSv4.0 state to be freed. For NFSv4.1 and later the client easily discovers if there is any admin-revoked state and will then find and explicitly free it. For NFSv4.0 there is no such mechanism. The client can only find that state is admin-revoked if it tries to use that state, and there is no way for it to explicitly free the state. So the server must hold on to the stateid (at least) for an indefinite amount of time. A RELEASE_LOCKOWNER request might justify forgetting some of these stateids, as would the whole client's lease lapsing, but these are not reliable. This patch takes two approaches. Whenever a client uses a revoked stateid, that stateid is then discarded and will not be recognised again. This might confuse a client which expects to get NFS4ERR_ADMIN_REVOKED consistently once it gets it at all, but should mostly work. Hopefully one error will lead to other resources being closed (e.g. process exits), which will result in more stateids being freed when a CLOSE attempt gets NFS4ERR_ADMIN_REVOKED. Also, any admin-revoked stateids that have been that way for more than one lease time are periodically freed. No actual freeing of state happens in this patch. That will come in future patches which handle the different sorts of revoked state.
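The drop-on-first-use rule only applies to v4.0 clients; a condensed sketch of nfsd40_drop_revoked_stid() from the diff below:

    if (cl->cl_minorversion == 0) {
        /* A v4.0 client cannot send FREE_STATEID, so forget the
         * stateid the first time NFS4ERR_ADMIN_REVOKED is returned
         * for it.
         */
        spin_lock(&cl->cl_lock);
        st = find_stateid_locked(cl, stid);
        if (st)
            nfsd4_drop_revoked_stid(st);  /* drops ->cl_lock */
        else
            spin_unlock(&cl->cl_lock);
    }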
Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/netns.h | 4 ++ fs/nfsd/nfs4state.c | 98 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 101 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index afc16ee4da7428..d4be519b5734e3 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -209,6 +209,10 @@ struct nfsd_net { atomic_t nfsd_courtesy_clients; struct shrinker *nfsd_client_shrinker; struct work_struct nfsd_shrinker_work; + + /* last time an admin-revoke happened for NFSv4.0 */ + time64_t nfs40_last_revoke; + }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c91a7deb26a65b..ec2db53ccae7d4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1733,6 +1733,14 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) } nfs4_put_stid(stid); spin_lock(&nn->client_lock); + if (clp->cl_minorversion == 0) + /* Allow cleanup after a lease period. + * store_release ensures cleanup will + * see any newly revoked states if it + * sees the time updated. + */ + nn->nfs40_last_revoke = + ktime_get_boottime_seconds(); goto retry; } } @@ -4618,6 +4626,40 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) return ret; } +static void nfsd4_drop_revoked_stid(struct nfs4_stid *s) + __releases(&s->sc_client->cl_lock) +{ + struct nfs4_client *cl = s->sc_client; + + switch (s->sc_type) { + default: + spin_unlock(&cl->cl_lock); + } +} + +static void nfsd40_drop_revoked_stid(struct nfs4_client *cl, + stateid_t *stid) +{ + /* NFSv4.0 has no way for the client to tell the server + * that it can forget an admin-revoked stateid. + * So we keep it around until the first time that the + * client uses it, and drop it the first time + * nfserr_admin_revoked is returned. + * For v4.1 and later we wait until explicitly told + * to free the stateid. 
+ */ + if (cl->cl_minorversion == 0) { + struct nfs4_stid *st; + + spin_lock(&cl->cl_lock); + st = find_stateid_locked(cl, stid); + if (st) + nfsd4_drop_revoked_stid(st); + else + spin_unlock(&cl->cl_lock); + } +} + static __be32 nfsd4_verify_open_stid(struct nfs4_stid *s) { @@ -4640,6 +4682,10 @@ nfsd4_lock_ol_stateid(struct nfs4_ol_stateid *stp) mutex_lock_nested(&stp->st_mutex, LOCK_STATEID_MUTEX); ret = nfsd4_verify_open_stid(&stp->st_stid); + if (ret == nfserr_admin_revoked) + nfsd40_drop_revoked_stid(stp->st_stid.sc_client, + &stp->st_stid.sc_stateid); + if (ret != nfs_ok) mutex_unlock(&stp->st_mutex); return ret; @@ -5221,6 +5267,7 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, } if (deleg->dl_stid.sc_status & SC_STATUS_REVOKED) { nfs4_put_stid(&deleg->dl_stid); + nfsd40_drop_revoked_stid(cl, &open->op_delegate_stateid); status = nfserr_deleg_revoked; goto out; } @@ -6206,6 +6253,43 @@ nfs4_process_client_reaplist(struct list_head *reaplist) } } +static void nfs40_clean_admin_revoked(struct nfsd_net *nn, + struct laundry_time *lt) +{ + struct nfs4_client *clp; + + spin_lock(&nn->client_lock); + if (nn->nfs40_last_revoke == 0 || + nn->nfs40_last_revoke > lt->cutoff) { + spin_unlock(&nn->client_lock); + return; + } + nn->nfs40_last_revoke = 0; + +retry: + list_for_each_entry(clp, &nn->client_lru, cl_lru) { + unsigned long id, tmp; + struct nfs4_stid *stid; + + if (atomic_read(&clp->cl_admin_revoked) == 0) + continue; + + spin_lock(&clp->cl_lock); + idr_for_each_entry_ul(&clp->cl_stateids, stid, tmp, id) + if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) { + refcount_inc(&stid->sc_count); + spin_unlock(&nn->client_lock); + /* this function drops ->cl_lock */ + nfsd4_drop_revoked_stid(stid); + nfs4_put_stid(stid); + spin_lock(&nn->client_lock); + goto retry; + } + spin_unlock(&clp->cl_lock); + } + spin_unlock(&nn->client_lock); +} + static time64_t nfs4_laundromat(struct nfsd_net *nn) { @@ -6239,6 +6323,8 @@ nfs4_laundromat(struct nfsd_net *nn) nfs4_get_client_reaplist(nn, &reaplist, <); nfs4_process_client_reaplist(&reaplist); + nfs40_clean_admin_revoked(nn, <); + spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); @@ -6457,6 +6543,9 @@ static __be32 nfsd4_stid_check_stateid_generation(stateid_t *in, struct nfs4_sti if (ret == nfs_ok) ret = check_stateid_generation(in, &s->sc_stateid, has_session); spin_unlock(&s->sc_lock); + if (ret == nfserr_admin_revoked) + nfsd40_drop_revoked_stid(s->sc_client, + &s->sc_stateid); return ret; } @@ -6501,6 +6590,8 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) } out_unlock: spin_unlock(&cl->cl_lock); + if (status == nfserr_admin_revoked) + nfsd40_drop_revoked_stid(cl, stateid); return status; } @@ -6547,6 +6638,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, return nfserr_deleg_revoked; } if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) { + nfsd40_drop_revoked_stid(cstate->clp, stateid); nfs4_put_stid(stid); return nfserr_admin_revoked; } @@ -6839,6 +6931,11 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, s = find_stateid_locked(cl, stateid); if (!s || s->sc_status & SC_STATUS_CLOSED) goto out_unlock; + if (s->sc_status & SC_STATUS_ADMIN_REVOKED) { + nfsd4_drop_revoked_stid(s); + ret = nfs_ok; + goto out; + } spin_lock(&s->sc_lock); switch (s->sc_type) { case SC_TYPE_DELEG: @@ -6865,7 +6962,6 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct 
nfsd4_compound_state *cstate, spin_unlock(&cl->cl_lock); ret = nfsd4_free_lock_stateid(stateid, s); goto out; - /* Default falls through and returns nfserr_bad_stateid */ } spin_unlock(&s->sc_lock); out_unlock: From ef567f67b94db1912d1e5a52196b35c7d0d08c3c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:30 +1100 Subject: [PATCH 0652/1406] nfsd: allow lock state ids to be revoked and then freed Revoking state through 'unlock_filesystem' now revokes any lock states found. When the stateids are then freed by the client, the revoked stateids will be cleaned up correctly. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ec2db53ccae7d4..c04eabc734db5f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1717,7 +1717,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) unsigned int idhashval; unsigned int sc_types; - sc_types = 0; + sc_types = SC_TYPE_LOCK; spin_lock(&nn->client_lock); for (idhashval = 0; idhashval < CLIENT_HASH_MASK; idhashval++) { @@ -1728,8 +1728,36 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) struct nfs4_stid *stid = find_one_sb_stid(clp, sb, sc_types); if (stid) { + struct nfs4_ol_stateid *stp; + spin_unlock(&nn->client_lock); switch (stid->sc_type) { + case SC_TYPE_LOCK: + stp = openlockstateid(stid); + mutex_lock_nested(&stp->st_mutex, + LOCK_STATEID_MUTEX); + spin_lock(&clp->cl_lock); + if (stid->sc_status == 0) { + struct nfs4_lockowner *lo = + lockowner(stp->st_stateowner); + struct nfsd_file *nf; + + stid->sc_status |= + SC_STATUS_ADMIN_REVOKED; + atomic_inc(&clp->cl_admin_revoked); + spin_unlock(&clp->cl_lock); + nf = find_any_file(stp->st_stid.sc_file); + if (nf) { + get_file(nf->nf_file); + filp_close(nf->nf_file, + (fl_owner_t)lo); + nfsd_file_put(nf); + } + release_all_access(stp); + } else + spin_unlock(&clp->cl_lock); + mutex_unlock(&stp->st_mutex); + break; } nfs4_put_stid(stid); spin_lock(&nn->client_lock); @@ -4630,8 +4658,18 @@ static void nfsd4_drop_revoked_stid(struct nfs4_stid *s) __releases(&s->sc_client->cl_lock) { struct nfs4_client *cl = s->sc_client; + LIST_HEAD(reaplist); + struct nfs4_ol_stateid *stp; + bool unhashed; switch (s->sc_type) { + case SC_TYPE_LOCK: + stp = openlockstateid(s); + unhashed = unhash_lock_stateid(stp); + spin_unlock(&cl->cl_lock); + if (unhashed) + nfs4_put_stid(s); + break; default: spin_unlock(&cl->cl_lock); } From 49a4155a07b4bd7a4b2fc3239e23409b428df677 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:31 +1100 Subject: [PATCH 0653/1406] nfsd: allow open state ids to be revoked and then freed Revoking state through 'unlock_filesystem' now revokes any open states found. When the stateids are then freed by the client, the revoked stateids will be cleaned up correctly. Possibly the related lock states should be revoked too, but a subsequent patch will do that for all lock state on the superblock. 
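In outline, the admin-revoke path for an open stateid looks like the sketch below (a simplified distillation of the code added in the diff that follows; all identifiers are from that diff, and error paths are elided):

	/* stid is the nfs4_stid; stp = openlockstateid(stid); clp is its client */
	mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX);
	spin_lock(&clp->cl_lock);
	if (stid->sc_status == 0) {		/* not already closed or revoked */
		stid->sc_status |= SC_STATUS_ADMIN_REVOKED;
		atomic_inc(&clp->cl_admin_revoked);
		spin_unlock(&clp->cl_lock);
		release_all_access(stp);	/* drop the open's file access */
	} else {
		spin_unlock(&clp->cl_lock);
	}
	mutex_unlock(&stp->st_mutex);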
Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c04eabc734db5f..c33c618f09a818 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1717,7 +1717,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) unsigned int idhashval; unsigned int sc_types; - sc_types = SC_TYPE_LOCK; + sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK; spin_lock(&nn->client_lock); for (idhashval = 0; idhashval < CLIENT_HASH_MASK; idhashval++) { @@ -1732,6 +1732,22 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) spin_unlock(&nn->client_lock); switch (stid->sc_type) { + case SC_TYPE_OPEN: + stp = openlockstateid(stid); + mutex_lock_nested(&stp->st_mutex, + OPEN_STATEID_MUTEX); + + spin_lock(&clp->cl_lock); + if (stid->sc_status == 0) { + stid->sc_status |= + SC_STATUS_ADMIN_REVOKED; + atomic_inc(&clp->cl_admin_revoked); + spin_unlock(&clp->cl_lock); + release_all_access(stp); + } else + spin_unlock(&clp->cl_lock); + mutex_unlock(&stp->st_mutex); + break; case SC_TYPE_LOCK: stp = openlockstateid(stid); mutex_lock_nested(&stp->st_mutex, @@ -4663,6 +4679,13 @@ static void nfsd4_drop_revoked_stid(struct nfs4_stid *s) bool unhashed; switch (s->sc_type) { + case SC_TYPE_OPEN: + stp = openlockstateid(s); + if (unhash_open_stateid(stp, &reaplist)) + put_ol_stateid_locked(stp, &reaplist); + spin_unlock(&cl->cl_lock); + free_ol_stateid_reaplist(&reaplist); + break; case SC_TYPE_LOCK: stp = openlockstateid(s); unhashed = unhash_lock_stateid(stp); From 203ce91eddbc5884a95ceec0b1e6f59c65aa28ed Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:32 +1100 Subject: [PATCH 0654/1406] nfsd: allow delegation state ids to be revoked and then freed Revoking state through 'unlock_filesystem' now revokes any delegation states found. When the stateids are then freed by the client, the revoked stateids will be cleaned up correctly. As there is already support for revoking delegations, we build on that for admin-revoking. 
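In outline (a sketch using the identifiers from the diff below; the existing revoke_delegation() machinery does the rest of the work):

	/* Sketch: admin-revoke one delegation */
	dp = delegstateid(stid);
	spin_lock(&state_lock);
	if (!unhash_delegation_locked(dp, SC_STATUS_ADMIN_REVOKED))
		dp = NULL;			/* already being torn down */
	spin_unlock(&state_lock);
	if (dp)
		revoke_delegation(dp);		/* same cleanup as lease-break revokes */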
Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c33c618f09a818..a551c6ddec84d4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1335,9 +1335,12 @@ unhash_delegation_locked(struct nfs4_delegation *dp, unsigned short statusmask) if (!delegation_hashed(dp)) return false; - if (dp->dl_stid.sc_client->cl_minorversion == 0) + if (statusmask == SC_STATUS_REVOKED && + dp->dl_stid.sc_client->cl_minorversion == 0) statusmask = SC_STATUS_CLOSED; dp->dl_stid.sc_status |= statusmask; + if (statusmask & SC_STATUS_ADMIN_REVOKED) + atomic_inc(&dp->dl_stid.sc_client->cl_admin_revoked); /* Ensure that deleg break won't try to requeue it */ ++dp->dl_time; @@ -1368,7 +1371,8 @@ static void revoke_delegation(struct nfs4_delegation *dp) trace_nfsd_stid_revoke(&dp->dl_stid); - if (dp->dl_stid.sc_status & SC_STATUS_REVOKED) { + if (dp->dl_stid.sc_status & + (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED)) { spin_lock(&clp->cl_lock); refcount_inc(&dp->dl_stid.sc_count); list_add(&dp->dl_recall_lru, &clp->cl_revoked); @@ -1717,7 +1721,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) unsigned int idhashval; unsigned int sc_types; - sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK; + sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG; spin_lock(&nn->client_lock); for (idhashval = 0; idhashval < CLIENT_HASH_MASK; idhashval++) { @@ -1729,6 +1733,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) sc_types); if (stid) { struct nfs4_ol_stateid *stp; + struct nfs4_delegation *dp; spin_unlock(&nn->client_lock); switch (stid->sc_type) { @@ -1774,6 +1779,16 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) spin_unlock(&clp->cl_lock); mutex_unlock(&stp->st_mutex); break; + case SC_TYPE_DELEG: + dp = delegstateid(stid); + spin_lock(&state_lock); + if (!unhash_delegation_locked( + dp, SC_STATUS_ADMIN_REVOKED)) + dp = NULL; + spin_unlock(&state_lock); + if (dp) + revoke_delegation(dp); + break; } nfs4_put_stid(stid); spin_lock(&nn->client_lock); @@ -4676,6 +4691,7 @@ static void nfsd4_drop_revoked_stid(struct nfs4_stid *s) struct nfs4_client *cl = s->sc_client; LIST_HEAD(reaplist); struct nfs4_ol_stateid *stp; + struct nfs4_delegation *dp; bool unhashed; switch (s->sc_type) { @@ -4693,6 +4709,12 @@ static void nfsd4_drop_revoked_stid(struct nfs4_stid *s) if (unhashed) nfs4_put_stid(s); break; + case SC_TYPE_DELEG: + dp = delegstateid(s); + list_del_init(&dp->dl_recall_lru); + spin_unlock(&cl->cl_lock); + nfs4_put_stid(s); + break; default: spin_unlock(&cl->cl_lock); } From b16a6bf27ce3fd8cedcc90ae497c992095f2d23a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:33 +1100 Subject: [PATCH 0655/1406] nfsd: allow layout state to be admin-revoked. Layout state on a filesystem that is being "unlocked" is now revoked, which involves closing the nfsd_file and releasing the vfs lease. To avoid races, ->ls_file can now be accessed either: - under ->fi_lock for the state's sc_file or - under rcu_read_lock() if nfsd_file_get() is used. To support this, ->fence_client and nfsd4_cb_layout_fail() now take a second argument being the nfsd_file.
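The two safe access patterns can be sketched as follows (use_layout_file() is a hypothetical placeholder for whatever the caller does with the file; every other identifier is from the diff below):

	struct nfsd_file *file, *fl;

	/* Pattern 1: access under the owning file's fi_lock */
	spin_lock(&ls->ls_stid.sc_file->fi_lock);
	file = ls->ls_file;			/* NULL once the layout is closed */
	if (file)
		use_layout_file(file);		/* valid while fi_lock is held */
	spin_unlock(&ls->ls_stid.sc_file->fi_lock);

	/* Pattern 2: take a reference under RCU for use outside the lock */
	rcu_read_lock();
	fl = nfsd_file_get(ls->ls_file);	/* NULL if the file is going away */
	rcu_read_unlock();
	if (fl) {
		use_layout_file(fl);
		nfsd_file_put(fl);
	}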
Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/blocklayout.c | 4 ++-- fs/nfsd/nfs4layouts.c | 43 ++++++++++++++++++++++++++++++++----------- fs/nfsd/nfs4state.c | 11 +++++++++-- fs/nfsd/pnfs.h | 8 +++++++- 4 files changed, 50 insertions(+), 16 deletions(-) diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 46fd74d91ea929..3c040c81c77d01 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -328,10 +328,10 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, } static void -nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls) +nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) { struct nfs4_client *clp = ls->ls_stid.sc_client; - struct block_device *bdev = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_bdev; + struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, nfsd4_scsi_pr_key(clp), 0, true); diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 857b822450b4fe..1cfd61db247297 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -152,6 +152,23 @@ void nfsd4_setup_layout_type(struct svc_export *exp) #endif } +void nfsd4_close_layout(struct nfs4_layout_stateid *ls) +{ + struct nfsd_file *fl; + + spin_lock(&ls->ls_stid.sc_file->fi_lock); + fl = ls->ls_file; + ls->ls_file = NULL; + spin_unlock(&ls->ls_stid.sc_file->fi_lock); + + if (fl) { + if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) + vfs_setlease(fl->nf_file, F_UNLCK, NULL, + (void **)&ls); + nfsd_file_put(fl); + } +} + static void nfsd4_free_layout_stateid(struct nfs4_stid *stid) { @@ -169,9 +186,7 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) list_del_init(&ls->ls_perfile); spin_unlock(&fp->fi_lock); - if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) - vfs_setlease(ls->ls_file->nf_file, F_UNLCK, NULL, (void **)&ls); - nfsd_file_put(ls->ls_file); + nfsd4_close_layout(ls); if (ls->ls_recalled) atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls); @@ -605,7 +620,7 @@ nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp) } static void -nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) +nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls, struct nfsd_file *file) { struct nfs4_client *clp = ls->ls_stid.sc_client; char addr_str[INET6_ADDRSTRLEN]; @@ -627,7 +642,7 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) argv[0] = (char *)nfsd_recall_failed; argv[1] = addr_str; - argv[2] = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_id; + argv[2] = file->nf_file->f_path.mnt->mnt_sb->s_id; argv[3] = NULL; error = call_usermodehelper(nfsd_recall_failed, argv, envp, @@ -657,6 +672,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) struct nfsd_net *nn; ktime_t now, cutoff; const struct nfsd4_layout_ops *ops; + struct nfsd_file *fl; trace_nfsd_cb_layout_done(&ls->ls_stid.sc_stateid, task); switch (task->tk_status) { @@ -688,12 +704,17 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) * Unknown error or non-responding client, we'll need to fence. 
*/ trace_nfsd_layout_recall_fail(&ls->ls_stid.sc_stateid); - - ops = nfsd4_layout_ops[ls->ls_layout_type]; - if (ops->fence_client) - ops->fence_client(ls); - else - nfsd4_cb_layout_fail(ls); + rcu_read_lock(); + fl = nfsd_file_get(ls->ls_file); + rcu_read_unlock(); + if (fl) { + ops = nfsd4_layout_ops[ls->ls_layout_type]; + if (ops->fence_client) + ops->fence_client(ls, fl); + else + nfsd4_cb_layout_fail(ls, fl); + nfsd_file_put(fl); + } return 1; case -NFS4ERR_NOMATCHING_LAYOUT: trace_nfsd_layout_recall_done(&ls->ls_stid.sc_stateid); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a551c6ddec84d4..b8c00cbbce8288 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1721,7 +1721,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) unsigned int idhashval; unsigned int sc_types; - sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG; + sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG | SC_TYPE_LAYOUT; spin_lock(&nn->client_lock); for (idhashval = 0; idhashval < CLIENT_HASH_MASK; idhashval++) { @@ -1734,6 +1734,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) if (stid) { struct nfs4_ol_stateid *stp; struct nfs4_delegation *dp; + struct nfs4_layout_stateid *ls; spin_unlock(&nn->client_lock); switch (stid->sc_type) { @@ -1789,6 +1790,10 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) if (dp) revoke_delegation(dp); break; + case SC_TYPE_LAYOUT: + ls = layoutstateid(stid); + nfsd4_close_layout(ls); + break; } nfs4_put_stid(stid); spin_lock(&nn->client_lock); @@ -2868,7 +2873,6 @@ static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st) struct nfsd_file *file; ls = container_of(st, struct nfs4_layout_stateid, ls_stid); - file = ls->ls_file; seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); @@ -2876,12 +2880,15 @@ static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st) /* XXX: What else would be useful? 
*/ + spin_lock(&ls->ls_stid.sc_file->fi_lock); + file = ls->ls_file; if (file) { seq_puts(s, ", "); nfs4_show_superblock(s, file); seq_puts(s, ", "); nfs4_show_fname(s, file); } + spin_unlock(&ls->ls_stid.sc_file->fi_lock); if (st->sc_status & SC_STATUS_ADMIN_REVOKED) seq_puts(s, ", admin-revoked"); seq_puts(s, " }\n"); diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index de1e0dfed06a23..925817f669176c 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -37,7 +37,8 @@ struct nfsd4_layout_ops { __be32 (*proc_layoutcommit)(struct inode *inode, struct nfsd4_layoutcommit *lcp); - void (*fence_client)(struct nfs4_layout_stateid *ls); + void (*fence_client)(struct nfs4_layout_stateid *ls, + struct nfsd_file *file); }; extern const struct nfsd4_layout_ops *nfsd4_layout_ops[]; @@ -72,11 +73,13 @@ void nfsd4_setup_layout_type(struct svc_export *exp); void nfsd4_return_all_client_layouts(struct nfs4_client *); void nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp); +void nfsd4_close_layout(struct nfs4_layout_stateid *ls); int nfsd4_init_pnfs(void); void nfsd4_exit_pnfs(void); #else struct nfs4_client; struct nfs4_file; +struct nfs4_layout_stateid; static inline void nfsd4_setup_layout_type(struct svc_export *exp) { @@ -89,6 +92,9 @@ static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp) { } +static inline void nfsd4_close_layout(struct nfs4_layout_stateid *ls) +{ +} static inline void nfsd4_exit_pnfs(void) { } From a3b7ccac2865d1399a02bce0340cad9e368e6636 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 31 Jan 2024 11:17:40 +1100 Subject: [PATCH 0656/1406] nfsd: don't call locks_release_private() twice concurrently It is possible for free_blocked_lock() to be called twice concurrently, once from nfsd4_lock() and once from nfsd4_release_lockowner() calling remove_blocked_locks(). This is why a kref was added. It is perfectly safe for locks_delete_block() and kref_put() to be called in parallel as they use locking or atomicity respectively as protection. However locks_release_private() has no locking. It is safe for it to be called twice sequentially, but not concurrently. This patch moves that call from free_blocked_lock() where it could race with itself, to free_nbl() where it cannot. This will slightly delay the freeing of private info or release of the owner - but not by much. It is arguably more natural for this freeing to happen in free_nbl() where the structure itself is freed. This bug was found by code inspection - it has not been seen in practice. 
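With the patch applied, the teardown pair looks like this (taken from the resulting code shown in the diff below; the kref guarantees free_nbl() runs exactly once, after all concurrent users are done):

	static void
	free_nbl(struct kref *kref)
	{
		struct nfsd4_blocked_lock *nbl;

		nbl = container_of(kref, struct nfsd4_blocked_lock, nbl_kref);
		locks_release_private(&nbl->nbl_lock);	/* last ref: no racer */
		kfree(nbl);
	}

	static void
	free_blocked_lock(struct nfsd4_blocked_lock *nbl)
	{
		locks_delete_block(&nbl->nbl_lock);	/* safe to race */
		kref_put(&nbl->nbl_kref, free_nbl);	/* atomic; last put frees */
	}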
Fixes: 47446d74f170 ("nfsd4: add refcount for nfsd4_blocked_lock") Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b8c00cbbce8288..8b112673d389b7 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -318,6 +318,7 @@ free_nbl(struct kref *kref) struct nfsd4_blocked_lock *nbl; nbl = container_of(kref, struct nfsd4_blocked_lock, nbl_kref); + locks_release_private(&nbl->nbl_lock); kfree(nbl); } @@ -325,7 +326,6 @@ static void free_blocked_lock(struct nfsd4_blocked_lock *nbl) { locks_delete_block(&nbl->nbl_lock); - locks_release_private(&nbl->nbl_lock); kref_put(&nbl->nbl_kref, free_nbl); } From 2c64cbbe57053e1de4fdb10f4dbdd47f6a92dbd0 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Wed, 31 Jan 2024 14:22:27 +0800 Subject: [PATCH 0657/1406] nfsd: Simplify the allocation of slab caches in nfsd4_init_pnfs commit 0a31bd5f2bbb ("KMEM_CACHE(): simplify slab cache creation") introduces a new macro. Use the new KMEM_CACHE() macro instead of direct kmem_cache_create to simplify the creation of SLAB caches. Signed-off-by: Kunwu Chan Signed-off-by: Chuck Lever --- fs/nfsd/nfs4layouts.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 1cfd61db247297..b1e585c1d9a3aa 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -777,13 +777,11 @@ nfsd4_init_pnfs(void) for (i = 0; i < DEVID_HASH_SIZE; i++) INIT_LIST_HEAD(&nfsd_devid_hash[i]); - nfs4_layout_cache = kmem_cache_create("nfs4_layout", - sizeof(struct nfs4_layout), 0, 0, NULL); + nfs4_layout_cache = KMEM_CACHE(nfs4_layout, 0); if (!nfs4_layout_cache) return -ENOMEM; - nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid", - sizeof(struct nfs4_layout_stateid), 0, 0, NULL); + nfs4_layout_stateid_cache = KMEM_CACHE(nfs4_layout_stateid, 0); if (!nfs4_layout_stateid_cache) { kmem_cache_destroy(nfs4_layout_cache); return -ENOMEM; From 898bf4d0a384ee0b595b8177eff72ede29d09200 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Wed, 31 Jan 2024 14:56:53 +0800 Subject: [PATCH 0658/1406] nfsd: Simplify the allocation of slab caches in nfsd_file_cache_init Use the new KMEM_CACHE() macro instead of direct kmem_cache_create to simplify the creation of SLAB caches. 
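For reference, KMEM_CACHE() is defined in <linux/slab.h> roughly as follows, so the struct tag supplies the cache name, size, and alignment in one step:

	#define KMEM_CACHE(__struct, __flags)				\
		kmem_cache_create(#__struct, sizeof(struct __struct),	\
				  __alignof__(struct __struct),		\
				  (__flags), NULL)

One subtle, normally harmless, difference: the open-coded calls here passed 0 for the align argument, while KMEM_CACHE() uses __alignof__ of the struct.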
Signed-off-by: Kunwu Chan Acked-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 8d9f7b07e35b39..f3a642fd0ecaa8 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -722,15 +722,13 @@ nfsd_file_cache_init(void) return ret; ret = -ENOMEM; - nfsd_file_slab = kmem_cache_create("nfsd_file", - sizeof(struct nfsd_file), 0, 0, NULL); + nfsd_file_slab = KMEM_CACHE(nfsd_file, 0); if (!nfsd_file_slab) { pr_err("nfsd: unable to create nfsd_file_slab\n"); goto out_err; } - nfsd_file_mark_slab = kmem_cache_create("nfsd_file_mark", - sizeof(struct nfsd_file_mark), 0, 0, NULL); + nfsd_file_mark_slab = KMEM_CACHE(nfsd_file_mark, 0); if (!nfsd_file_mark_slab) { pr_err("nfsd: unable to create nfsd_file_mark_slab\n"); goto out_err; From 32ba8ecc3ddf5ef7668eac58bbdec9829e10f0fa Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Sun, 4 Feb 2024 11:28:21 +0800 Subject: [PATCH 0659/1406] nfsd: Simplify the allocation of slab caches in nfsd_drc_slab_create Use the new KMEM_CACHE() macro instead of direct kmem_cache_create to simplify the creation of SLAB caches. And change cache name from 'nfsd_drc' to 'nfsd_cacherep'. Signed-off-by: Kunwu Chan Signed-off-by: Chuck Lever --- fs/nfsd/nfscache.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index cfcc6ac8f255a8..ba9d326b3de647 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -166,8 +166,7 @@ nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp, int nfsd_drc_slab_create(void) { - drc_slab = kmem_cache_create("nfsd_drc", - sizeof(struct nfsd_cacherep), 0, 0, NULL); + drc_slab = KMEM_CACHE(nfsd_cacherep, 0); return drc_slab ? 0: -ENOMEM; } From df4117e5dfb76bf0f01abbc0b06f38238038dc6b Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Thu, 1 Feb 2024 16:37:52 +0800 Subject: [PATCH 0660/1406] nfsd: Simplify the allocation of slab caches in nfsd4_init_slabs Use the new KMEM_CACHE() macro instead of direct kmem_cache_create to simplify the creation of SLAB caches. Make the code cleaner and more readable. 
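Note that KMEM_CACHE() derives the cache name from the struct tag, so the names visible in /proc/slabinfo change with this conversion; for example:

	/* before: explicitly named "nfsd4_clients" */
	client_slab = kmem_cache_create("nfsd4_clients",
					sizeof(struct nfs4_client), 0, 0, NULL);
	/* after: named "nfs4_client", after the struct tag */
	client_slab = KMEM_CACHE(nfs4_client, 0);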
Signed-off-by: Kunwu Chan Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8b112673d389b7..0dde9eee75e74e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4503,32 +4503,25 @@ nfsd4_free_slabs(void) int nfsd4_init_slabs(void) { - client_slab = kmem_cache_create("nfsd4_clients", - sizeof(struct nfs4_client), 0, 0, NULL); + client_slab = KMEM_CACHE(nfs4_client, 0); if (client_slab == NULL) goto out; - openowner_slab = kmem_cache_create("nfsd4_openowners", - sizeof(struct nfs4_openowner), 0, 0, NULL); + openowner_slab = KMEM_CACHE(nfs4_openowner, 0); if (openowner_slab == NULL) goto out_free_client_slab; - lockowner_slab = kmem_cache_create("nfsd4_lockowners", - sizeof(struct nfs4_lockowner), 0, 0, NULL); + lockowner_slab = KMEM_CACHE(nfs4_lockowner, 0); if (lockowner_slab == NULL) goto out_free_openowner_slab; - file_slab = kmem_cache_create("nfsd4_files", - sizeof(struct nfs4_file), 0, 0, NULL); + file_slab = KMEM_CACHE(nfs4_file, 0); if (file_slab == NULL) goto out_free_lockowner_slab; - stateid_slab = kmem_cache_create("nfsd4_stateids", - sizeof(struct nfs4_ol_stateid), 0, 0, NULL); + stateid_slab = KMEM_CACHE(nfs4_ol_stateid, 0); if (stateid_slab == NULL) goto out_free_file_slab; - deleg_slab = kmem_cache_create("nfsd4_delegations", - sizeof(struct nfs4_delegation), 0, 0, NULL); + deleg_slab = KMEM_CACHE(nfs4_delegation, 0); if (deleg_slab == NULL) goto out_free_stateid_slab; - odstate_slab = kmem_cache_create("nfsd4_odstate", - sizeof(struct nfs4_clnt_odstate), 0, 0, NULL); + odstate_slab = KMEM_CACHE(nfs4_clnt_odstate, 0); if (odstate_slab == NULL) goto out_free_deleg_slab; return 0; From dd98f68b0083dc060e6b50df8f9706853869a35d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 6 Feb 2024 07:36:42 -0500 Subject: [PATCH 0661/1406] MAINTAINERS: add Alex Aring as Reviewer for file locking code Alex helps co-maintain the DLM code and did some recent work to fix up how lockd and GFS2 work together. Add him as a Reviewer for file locking changes. Acked-by: Alexander Aring Acked-by: Chuck Lever Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 9ed4d38685394d..aee83609d59ee7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8165,6 +8165,7 @@ F: include/uapi/scsi/fc/ FILE LOCKING (flock() and fcntl()/lockf()) M: Jeff Layton M: Chuck Lever +R: Alexander Aring L: linux-fsdevel@vger.kernel.org S: Maintained F: fs/fcntl.c From 0ed0efb0afd48ce014fd371ab6fe55cba3feca1d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 4 Feb 2024 18:16:37 -0500 Subject: [PATCH 0662/1406] svcrdma: Reserve an extra WQE for ib_drain_rq() Do as other ULPs already do: ensure there is an extra Receive WQE reserved for the tear-down drain WR. I haven't heard reports of problems but it can't hurt. Note that rq_depth is used to compute the Send Queue depth as well, so this fix should affect both the SQ and RQ. 
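The sizing rule, in sketch form (matching the diff below):

	/*
	 * ib_drain_rq() posts one final Receive WR and waits for its
	 * completion.  If every RQ slot can be consumed by normal
	 * Receives, that drain WR may not be postable, so hold one
	 * slot back when sizing the queue:
	 */
	rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests +
		   newxprt->sc_recv_batch + 1 /* drain */;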
Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 4f27325ace4a4f..4a038c7e86f988 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -415,7 +415,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge) newxprt->sc_max_send_sges = dev->attrs.max_send_sge; rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests + - newxprt->sc_recv_batch; + newxprt->sc_recv_batch + 1 /* drain */; if (rq_depth > dev->attrs.max_qp_wr) { rq_depth = dev->attrs.max_qp_wr; newxprt->sc_recv_batch = 1; From 723aecef65439e25710b9ab8448c35eb250fdb9c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 4 Feb 2024 18:16:44 -0500 Subject: [PATCH 0663/1406] svcrdma: Report CQ depths in debugging output Check that svc_rdma_accept() is allocating an appropriate number of CQEs. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 4a038c7e86f988..8be0493797cf55 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -460,7 +460,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr); dprintk(" cap.max_send_sge = %d, cap.max_recv_sge = %d\n", qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge); - + dprintk(" send CQ depth = %u, recv CQ depth = %u\n", + newxprt->sc_sq_depth, rq_depth); ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr); if (ret) { trace_svcrdma_qp_err(newxprt, ret); From 1af356341f2c3414d4d4dc3da05664157d396b63 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 4 Feb 2024 18:16:50 -0500 Subject: [PATCH 0664/1406] svcrdma: Update max_send_sges after QP is created rdma_create_qp() can modify cap.max_send_sges. Copy the new value to the svcrdma transport so it is bound by the new limit instead of the requested one. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 8be0493797cf55..839c0e80e5cd4a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -467,6 +467,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) trace_svcrdma_qp_err(newxprt, ret); goto errout; } + newxprt->sc_max_send_sges = qp_attr.cap.max_send_sge; newxprt->sc_qp = newxprt->sc_cm_id->qp; if (!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) From 4e501a032a5c531041eecb13f0f13984297afb09 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 4 Feb 2024 18:16:56 -0500 Subject: [PATCH 0665/1406] svcrdma: Increase the per-transport rw_ctx count rdma_rw_mr_factor() returns the smallest number of MRs needed to move a particular number of pages. svcrdma currently asks for the number of MRs needed to move RPCSVC_MAXPAGES (a little over one megabyte), as that is the number of pages in the largest r/wsize the server supports. This call assumes that the client's NIC can bundle a full one megabyte payload in a single rdma_segment. In fact, most NICs cannot handle a full megabyte with a single rkey / rdma_segment. 
Clients will typically split even a single Read chunk into many segments. The server needs one MR to read each rdma_segment in a Read chunk, and thus each one needs an rw_ctx. svcrdma has been vastly underestimating the number of rw_ctxs needed to handle 64 RPC requests with large Read chunks using small rdma_segments. Unfortunately there doesn't seem to be a good way to estimate this number without knowing the client NIC's capabilities. Even then, the client RPC/RDMA implementation is still free to split a chunk into smaller segments (for example, it might be using physical registration, which needs an rdma_segment per page). The best we can do for now is choose a number that will guarantee forward progress in the worst case (one page per segment). At some later point, we could add some mechanisms to make this much less of a problem: - Add a core API to add more rw_ctxs to an already-established QP - svcrdma could treat rw_ctx exhaustion as a temporary error and try again - Limit the number of Reads in flight Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 839c0e80e5cd4a..2b1c16b9547ddd 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -422,8 +422,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_requests = rq_depth - 2; newxprt->sc_max_bc_requests = 2; } - ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES); - ctxts *= newxprt->sc_max_requests; + + /* Arbitrarily estimate the number of rw_ctxs needed for + * this transport. This is enough rw_ctxs to make forward + * progress even if the client is using one rkey per page + * in each Read chunk. + */ + ctxts = 3 * RPCSVC_MAXPAGES; newxprt->sc_sq_depth = rq_depth + ctxts; if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) newxprt->sc_sq_depth = dev->attrs.max_qp_wr; From 94134ddcc9498fd3bace6d981bda66c01e8ab47e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 4 Feb 2024 18:17:03 -0500 Subject: [PATCH 0666/1406] svcrdma: Fix SQ wake-ups Ensure there is a wake-up when increasing sc_sq_avail. Likewise, if a wake-up is done, sc_sq_avail needs to be updated, otherwise the wait_event() conditional is never going to be met. 
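In sketch form, the corrected credit accounting looks like this (distilled from the patched loop in the diff below; close handling and the actual post are elided):

	/* Sketch: one SQ credit per WR, taken before posting */
	while (1) {
		if (atomic_dec_return(&rdma->sc_sq_avail) < 0) {
			/* Over-subscribed: give the credit back and wake,
			 * otherwise a sleeper may never see sc_sq_avail > 0.
			 */
			svc_rdma_wake_send_waiters(rdma, 1);
			wait_event(rdma->sc_send_wait,
				   atomic_read(&rdma->sc_sq_avail) > 0);
			continue;	/* retry the decrement */
		}
		break;			/* credit held: safe to ib_post_send() */
	}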
Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 1a49b7f0204120..f1f5c7b58fce52 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -335,11 +335,11 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) /* If the SQ is full, wait until an SQ entry is available */ while (1) { if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) { + svc_rdma_wake_send_waiters(rdma, 1); percpu_counter_inc(&svcrdma_stat_sq_starve); trace_svcrdma_sq_full(rdma, &ctxt->sc_cid); - atomic_inc(&rdma->sc_sq_avail); wait_event(rdma->sc_send_wait, - atomic_read(&rdma->sc_sq_avail) > 1); + atomic_read(&rdma->sc_sq_avail) > 0); if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) return -ENOTCONN; trace_svcrdma_sq_retry(rdma, &ctxt->sc_cid); @@ -355,7 +355,7 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) trace_svcrdma_sq_post_err(rdma, &ctxt->sc_cid, ret); svc_xprt_deferred_close(&rdma->sc_xprt); - wake_up(&rdma->sc_send_wait); + svc_rdma_wake_send_waiters(rdma, 1); return ret; } From 6c4636cd531c00256d0d670565f5a91627b75555 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 4 Feb 2024 18:17:09 -0500 Subject: [PATCH 0667/1406] svcrdma: Prevent a UAF in svc_rdma_send() In some error flow cases, svc_rdma_wc_send() releases @ctxt. Copy the sc_cid field in @ctxt to a stack variable in order to guarantee that the value is available after the ib_post_send() call. In case the new comment looks a little strange, this will be done with at least one more field in a subsequent patch. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index f1f5c7b58fce52..b6fc9299b472d6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -316,12 +316,17 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) * @rdma: transport on which to post the WR * @ctxt: send ctxt with a Send WR ready to post * + * Copy fields in @ctxt to stack variables in order to guarantee + * that these values remain available after the ib_post_send() call. + * In some error flow cases, svc_rdma_wc_send() releases @ctxt. + * * Returns zero if the Send WR was posted successfully. Otherwise, a * negative errno is returned. 
*/ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) { struct ib_send_wr *wr = &ctxt->sc_send_wr; + struct rpc_rdma_cid cid = ctxt->sc_cid; int ret; might_sleep(); @@ -337,12 +342,12 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) { svc_rdma_wake_send_waiters(rdma, 1); percpu_counter_inc(&svcrdma_stat_sq_starve); - trace_svcrdma_sq_full(rdma, &ctxt->sc_cid); + trace_svcrdma_sq_full(rdma, &cid); wait_event(rdma->sc_send_wait, atomic_read(&rdma->sc_sq_avail) > 0); if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) return -ENOTCONN; - trace_svcrdma_sq_retry(rdma, &ctxt->sc_cid); + trace_svcrdma_sq_retry(rdma, &cid); continue; } @@ -353,7 +358,7 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) return 0; } - trace_svcrdma_sq_post_err(rdma, &ctxt->sc_cid, ret); + trace_svcrdma_sq_post_err(rdma, &cid, ret); svc_xprt_deferred_close(&rdma->sc_xprt); svc_rdma_wake_send_waiters(rdma, 1); return ret; From 5e5d91081fd4e909c2a878bbe43fb10fb6d0c162 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 4 Feb 2024 18:17:15 -0500 Subject: [PATCH 0668/1406] svcrdma: Fix retry loop in svc_rdma_send() Don't call ib_post_send() at all if the transport is already shutting down. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 28 ++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index b6fc9299b472d6..0ee9185f5f3f4a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -320,8 +320,9 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) * that these values remain available after the ib_post_send() call. * In some error flow cases, svc_rdma_wc_send() releases @ctxt. * - * Returns zero if the Send WR was posted successfully. Otherwise, a - * negative errno is returned. + * Return values: + * %0: @ctxt's WR chain was posted successfully + * %-ENOTCONN: The connection was lost */ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) { @@ -338,30 +339,35 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) DMA_TO_DEVICE); /* If the SQ is full, wait until an SQ entry is available */ - while (1) { + while (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) { if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) { svc_rdma_wake_send_waiters(rdma, 1); + + /* When the transport is torn down, assume + * ib_drain_sq() will trigger enough Send + * completions to wake us. The XPT_CLOSE test + * above should then cause the while loop to + * exit. 
+ */ percpu_counter_inc(&svcrdma_stat_sq_starve); trace_svcrdma_sq_full(rdma, &cid); wait_event(rdma->sc_send_wait, atomic_read(&rdma->sc_sq_avail) > 0); - if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) - return -ENOTCONN; trace_svcrdma_sq_retry(rdma, &cid); continue; } trace_svcrdma_post_send(ctxt); ret = ib_post_send(rdma->sc_qp, wr, NULL); - if (ret) + if (ret) { + trace_svcrdma_sq_post_err(rdma, &cid, ret); + svc_xprt_deferred_close(&rdma->sc_xprt); + svc_rdma_wake_send_waiters(rdma, 1); break; + } return 0; } - - trace_svcrdma_sq_post_err(rdma, &cid, ret); - svc_xprt_deferred_close(&rdma->sc_xprt); - svc_rdma_wake_send_waiters(rdma, 1); - return ret; + return -ENOTCONN; } /** From d4b2563bfc6b6c9ea252f6494a9a9157d11a55bc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 4 Feb 2024 18:17:22 -0500 Subject: [PATCH 0669/1406] svcrdma: Post Send WR chain Eventually I'd like the server to post the reply's Send WR along with any Write WRs using only a single call to ib_post_send(), in order to reduce the NIC's doorbell rate. To do this, add an anchor for a WR chain to svc_rdma_send_ctxt, and refactor svc_rdma_send() to post this WR chain to the Send Queue. For the moment, the posted chain will continue to contain a single Send WR. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 6 ++- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 2 +- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 49 +++++++++++++++------- 3 files changed, 38 insertions(+), 19 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index e7595ae62fe294..ee05087d64994e 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -210,6 +210,8 @@ struct svc_rdma_send_ctxt { struct svcxprt_rdma *sc_rdma; struct ib_send_wr sc_send_wr; + struct ib_send_wr *sc_wr_chain; + int sc_sqecount; struct ib_cqe sc_cqe; struct xdr_buf sc_hdrbuf; struct xdr_stream sc_stream; @@ -258,8 +260,8 @@ extern struct svc_rdma_send_ctxt * svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma); extern void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt); -extern int svc_rdma_send(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt); +extern int svc_rdma_post_send(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt); extern int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, const struct svc_rdma_pcl *write_pcl, diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index c9be6778643bff..e5a78b76101258 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -90,7 +90,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, */ get_page(virt_to_page(rqst->rq_buffer)); sctxt->sc_send_wr.opcode = IB_WR_SEND; - return svc_rdma_send(rdma, sctxt); + return svc_rdma_post_send(rdma, sctxt); } /* Server-side transport endpoint wants a whole page for its send diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 0ee9185f5f3f4a..0f02fb09d5b05a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -208,6 +208,9 @@ struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma) ctxt->sc_send_wr.num_sge = 0; ctxt->sc_cur_sge_no = 0; ctxt->sc_page_count = 0; + ctxt->sc_wr_chain = &ctxt->sc_send_wr; + ctxt->sc_sqecount = 1; + return ctxt; out_empty: @@ -293,7 +296,7 @@ static void svc_rdma_wc_send(struct ib_cq 
*cq, struct ib_wc *wc) struct svc_rdma_send_ctxt *ctxt = container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe); - svc_rdma_wake_send_waiters(rdma, 1); + svc_rdma_wake_send_waiters(rdma, ctxt->sc_sqecount); if (unlikely(wc->status != IB_WC_SUCCESS)) goto flushed; @@ -312,36 +315,44 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) } /** - * svc_rdma_send - Post a single Send WR - * @rdma: transport on which to post the WR - * @ctxt: send ctxt with a Send WR ready to post + * svc_rdma_post_send - Post a WR chain to the Send Queue + * @rdma: transport context + * @ctxt: WR chain to post * * Copy fields in @ctxt to stack variables in order to guarantee * that these values remain available after the ib_post_send() call. * In some error flow cases, svc_rdma_wc_send() releases @ctxt. * + * Note there is potential for starvation when the Send Queue is + * full because there is no order to when waiting threads are + * awoken. The transport is typically provisioned with a deep + * enough Send Queue that SQ exhaustion should be a rare event. + * * Return values: * %0: @ctxt's WR chain was posted successfully * %-ENOTCONN: The connection was lost */ -int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) +int svc_rdma_post_send(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) { - struct ib_send_wr *wr = &ctxt->sc_send_wr; + struct ib_send_wr *first_wr = ctxt->sc_wr_chain; + struct ib_send_wr *send_wr = &ctxt->sc_send_wr; + const struct ib_send_wr *bad_wr = first_wr; struct rpc_rdma_cid cid = ctxt->sc_cid; - int ret; + int ret, sqecount = ctxt->sc_sqecount; might_sleep(); /* Sync the transport header buffer */ ib_dma_sync_single_for_device(rdma->sc_pd->device, - wr->sg_list[0].addr, - wr->sg_list[0].length, + send_wr->sg_list[0].addr, + send_wr->sg_list[0].length, DMA_TO_DEVICE); /* If the SQ is full, wait until an SQ entry is available */ while (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) { - if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) { - svc_rdma_wake_send_waiters(rdma, 1); + if (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) { + svc_rdma_wake_send_waiters(rdma, sqecount); /* When the transport is torn down, assume * ib_drain_sq() will trigger enough Send @@ -358,12 +369,18 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) } trace_svcrdma_post_send(ctxt); - ret = ib_post_send(rdma->sc_qp, wr, NULL); + ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); if (ret) { trace_svcrdma_sq_post_err(rdma, &cid, ret); svc_xprt_deferred_close(&rdma->sc_xprt); - svc_rdma_wake_send_waiters(rdma, 1); - break; + + /* If even one WR was posted, there will be a + * Send completion that bumps sc_sq_avail. 
+ */ + if (bad_wr == first_wr) { + svc_rdma_wake_send_waiters(rdma, sqecount); + break; + } } return 0; } @@ -884,7 +901,7 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, sctxt->sc_send_wr.opcode = IB_WR_SEND; } - return svc_rdma_send(rdma, sctxt); + return svc_rdma_post_send(rdma, sctxt); } /** @@ -948,7 +965,7 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, sctxt->sc_send_wr.num_sge = 1; sctxt->sc_send_wr.opcode = IB_WR_SEND; sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; - if (svc_rdma_send(rdma, sctxt)) + if (svc_rdma_post_send(rdma, sctxt)) goto put_ctxt; return; From 1e9f9516c42fd92e615ddeb6929d2411963796c6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 4 Feb 2024 18:17:28 -0500 Subject: [PATCH 0670/1406] svcrdma: Move write_info for Reply chunks into struct svc_rdma_send_ctxt Since the RPC transaction's svc_rdma_send_ctxt will stay around for the duration of the RDMA Write operation, the write_info structure for the Reply chunk can reside in the request's svc_rdma_send_ctxt instead of being allocated separately. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 25 ++++++++ include/trace/events/rpcrdma.h | 4 ++ net/sunrpc/xprtrdma/svc_rdma_rw.c | 91 +++++++++++++++------------ net/sunrpc/xprtrdma/svc_rdma_sendto.c | 2 +- 4 files changed, 82 insertions(+), 40 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index ee05087d64994e..918cf4fda72818 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -203,6 +203,29 @@ struct svc_rdma_recv_ctxt { struct page *rc_pages[RPCSVC_MAXPAGES]; }; +/* + * State for sending a Write chunk. + * - Tracks progress of writing one chunk over all its segments + * - Stores arguments for the SGL constructor functions + */ +struct svc_rdma_write_info { + struct svcxprt_rdma *wi_rdma; + + const struct svc_rdma_chunk *wi_chunk; + + /* write state of this chunk */ + unsigned int wi_seg_off; + unsigned int wi_seg_no; + + /* SGL constructor arguments */ + const struct xdr_buf *wi_xdr; + unsigned char *wi_base; + unsigned int wi_next_off; + + struct svc_rdma_chunk_ctxt wi_cc; + struct work_struct wi_work; +}; + struct svc_rdma_send_ctxt { struct llist_node sc_node; struct rpc_rdma_cid sc_cid; @@ -215,6 +238,7 @@ struct svc_rdma_send_ctxt { struct ib_cqe sc_cqe; struct xdr_buf sc_hdrbuf; struct xdr_stream sc_stream; + struct svc_rdma_write_info sc_reply_info; void *sc_xprt_buf; int sc_page_count; int sc_cur_sge_no; @@ -249,6 +273,7 @@ extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, const struct xdr_buf *xdr); extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_send_ctxt *sctxt, const struct xdr_buf *xdr); extern int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 110c1475c527b7..027ac3ab457de0 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -2118,6 +2118,10 @@ DEFINE_SIMPLE_CID_EVENT(svcrdma_wc_write); DEFINE_SEND_FLUSH_EVENT(svcrdma_wc_write_flush); DEFINE_SEND_FLUSH_EVENT(svcrdma_wc_write_err); +DEFINE_SIMPLE_CID_EVENT(svcrdma_wc_reply); +DEFINE_SEND_FLUSH_EVENT(svcrdma_wc_reply_flush); +DEFINE_SEND_FLUSH_EVENT(svcrdma_wc_reply_err); + TRACE_EVENT(svcrdma_qp_error, TP_PROTO( const struct ib_event *event, diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 
c00fcce61d1ec0..2ca3c6311c5e92 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -197,28 +197,6 @@ void svc_rdma_cc_release(struct svcxprt_rdma *rdma, llist_add_batch(first, last, &rdma->sc_rw_ctxts); } -/* State for sending a Write or Reply chunk. - * - Tracks progress of writing one chunk over all its segments - * - Stores arguments for the SGL constructor functions - */ -struct svc_rdma_write_info { - struct svcxprt_rdma *wi_rdma; - - const struct svc_rdma_chunk *wi_chunk; - - /* write state of this chunk */ - unsigned int wi_seg_off; - unsigned int wi_seg_no; - - /* SGL constructor arguments */ - const struct xdr_buf *wi_xdr; - unsigned char *wi_base; - unsigned int wi_next_off; - - struct svc_rdma_chunk_ctxt wi_cc; - struct work_struct wi_work; -}; - static struct svc_rdma_write_info * svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, const struct svc_rdma_chunk *chunk) @@ -252,6 +230,43 @@ static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) queue_work(svcrdma_wq, &info->wi_work); } +static void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc) +{ + svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); + svc_rdma_cc_release(rdma, cc, DMA_TO_DEVICE); +} + +/** + * svc_rdma_reply_done - Reply chunk Write completion handler + * @cq: controlling Completion Queue + * @wc: Work Completion report + * + * Pages under I/O are released by a subsequent Send completion. + */ +static void svc_rdma_reply_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_chunk_ctxt *cc = + container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); + struct svcxprt_rdma *rdma = cq->cq_context; + + switch (wc->status) { + case IB_WC_SUCCESS: + trace_svcrdma_wc_reply(&cc->cc_cid); + svc_rdma_reply_chunk_release(rdma, cc); + return; + case IB_WC_WR_FLUSH_ERR: + trace_svcrdma_wc_reply_flush(wc, &cc->cc_cid); + break; + default: + trace_svcrdma_wc_reply_err(wc, &cc->cc_cid); + } + + svc_rdma_reply_chunk_release(rdma, cc); + svc_xprt_deferred_close(&rdma->sc_xprt); +} + /** * svc_rdma_write_done - Write chunk completion * @cq: controlling Completion Queue @@ -624,7 +639,8 @@ int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, /** * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk * @rdma: controlling RDMA transport - * @rctxt: Write and Reply chunks from client + * @rctxt: Write and Reply chunks provisioned by the client + * @sctxt: Send WR resources * @xdr: xdr_buf containing an RPC Reply * * Returns a non-negative number of bytes the chunk consumed, or @@ -636,37 +652,34 @@ int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, */ int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_send_ctxt *sctxt, const struct xdr_buf *xdr) { - struct svc_rdma_write_info *info; - struct svc_rdma_chunk_ctxt *cc; - struct svc_rdma_chunk *chunk; + struct svc_rdma_write_info *info = &sctxt->sc_reply_info; + struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; int ret; - if (pcl_is_empty(&rctxt->rc_reply_pcl)) - return 0; + if (likely(pcl_is_empty(&rctxt->rc_reply_pcl))) + return 0; /* client provided no Reply chunk */ - chunk = pcl_first_chunk(&rctxt->rc_reply_pcl); - info = svc_rdma_write_info_alloc(rdma, chunk); - if (!info) - return -ENOMEM; - cc = &info->wi_cc; + info->wi_rdma = rdma; + info->wi_chunk = pcl_first_chunk(&rctxt->rc_reply_pcl); + info->wi_seg_off = 0; + info->wi_seg_no = 0; + svc_rdma_cc_init(rdma, 
&info->wi_cc); + info->wi_cc.cc_cqe.done = svc_rdma_reply_done; ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, svc_rdma_xb_write, info); if (ret < 0) - goto out_err; + return ret; trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount); ret = svc_rdma_post_chunk_ctxt(rdma, cc); if (ret < 0) - goto out_err; + return ret; return xdr->len; - -out_err: - svc_rdma_write_info_free(info); - return ret; } /** diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 0f02fb09d5b05a..d8e079be36e2a6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -1012,7 +1012,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (!p) goto put_ctxt; - ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res); + ret = svc_rdma_send_reply_chunk(rdma, rctxt, sctxt, &rqstp->rq_res); if (ret < 0) goto reply_chunk; rc_size = ret; From 3a89b0579f69ceeb404c74e08b3df073a42360ee Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 4 Feb 2024 18:17:34 -0500 Subject: [PATCH 0671/1406] svcrdma: Post the Reply chunk and Send WR together Reduce the doorbell and Send completion rates when sending RPC/RDMA replies that have Reply chunks. NFS READDIR procedures typically return their result in a Reply chunk, for example. Instead of calling ib_post_send() to post the Write WRs for the Reply chunk, and then calling it again to post the Send WR that conveys the transport header, chain the Write WRs to the Send WR and call ib_post_send() only once. Thanks to the Send Queue completion ordering rules, when the Send WR completes, that guarantees that Write WRs posted before it have also completed successfully. Thus all Write WRs for the Reply chunk can remain unsignaled. Instead of handling a Write completion and then a Send completion, only the Send completion is seen, and it handles clean up for both the Writes and the Send. 
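The chain construction, distilled from svc_rdma_prepare_reply_chunk() in the diff below (rwc iterates the chunk's rdma_rw contexts; the CQE is attached only once, so the remaining Writes stay unsignaled, and the Send WR remains at the tail so its completion implies the Writes completed):

	/* Sketch: link the Reply chunk's Write WRs ahead of the Send WR */
	first_wr = sctxt->sc_wr_chain;		/* currently just the Send WR */
	cqe = &cc->cc_cqe;
	list_for_each_entry(rwc, &cc->cc_rwctxts, rw_list) {
		first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp,
					   rdma->sc_port_num, cqe, first_wr);
		cqe = NULL;			/* only the first gets the CQE */
	}
	sctxt->sc_wr_chain = first_wr;		/* post this chain once */
	sctxt->sc_sqecount += cc->cc_sqecount;	/* budget SQEs for the Writes too */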
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 13 ++++-- net/sunrpc/xprtrdma/svc_rdma_rw.c | 58 +++++++++++++++++---------- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 34 +++++++++------- 3 files changed, 66 insertions(+), 39 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 918cf4fda72818..ac882bd23ca2a8 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -262,19 +262,24 @@ extern void svc_rdma_release_ctxt(struct svc_xprt *xprt, void *ctxt); extern int svc_rdma_recvfrom(struct svc_rqst *); /* svc_rdma_rw.c */ +extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc); extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma); extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc); extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc, enum dma_data_direction dir); +extern void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt); extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_chunk *chunk, const struct xdr_buf *xdr); -extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, - const struct svc_rdma_recv_ctxt *rctxt, - struct svc_rdma_send_ctxt *sctxt, - const struct xdr_buf *xdr); +extern int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_pcl *write_pcl, + const struct svc_rdma_pcl *reply_pcl, + struct svc_rdma_send_ctxt *sctxt, + const struct xdr_buf *xdr); extern int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, struct svc_rdma_recv_ctxt *head); diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 2ca3c6311c5e92..2b25edc6c73c52 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -230,10 +230,18 @@ static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) queue_work(svcrdma_wq, &info->wi_work); } -static void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, - struct svc_rdma_chunk_ctxt *cc) +/** + * svc_rdma_reply_chunk_release - Release Reply chunk I/O resources + * @rdma: controlling transport + * @ctxt: Send context that is being released + */ +void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) { - svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); + struct svc_rdma_chunk_ctxt *cc = &ctxt->sc_reply_info.wi_cc; + + if (!cc->cc_sqecount) + return; svc_rdma_cc_release(rdma, cc, DMA_TO_DEVICE); } @@ -254,7 +262,6 @@ static void svc_rdma_reply_done(struct ib_cq *cq, struct ib_wc *wc) switch (wc->status) { case IB_WC_SUCCESS: trace_svcrdma_wc_reply(&cc->cc_cid); - svc_rdma_reply_chunk_release(rdma, cc); return; case IB_WC_WR_FLUSH_ERR: trace_svcrdma_wc_reply_flush(wc, &cc->cc_cid); @@ -263,7 +270,6 @@ static void svc_rdma_reply_done(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_reply_err(wc, &cc->cc_cid); } - svc_rdma_reply_chunk_release(rdma, cc); svc_xprt_deferred_close(&rdma->sc_xprt); } @@ -637,9 +643,10 @@ int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, } /** - * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk + * svc_rdma_prepare_reply_chunk - Construct WR chain for writing the Reply chunk * @rdma: controlling RDMA transport - * @rctxt: Write and Reply chunks provisioned by the client + * @write_pcl: Write chunk list provided by client + * @reply_pcl: Reply chunk 
provided by client * @sctxt: Send WR resources * @xdr: xdr_buf containing an RPC Reply * @@ -650,35 +657,44 @@ int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, * %-ENOTCONN if posting failed (connection is lost), * %-EIO if rdma_rw initialization failed (DMA mapping, etc). */ -int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, - const struct svc_rdma_recv_ctxt *rctxt, - struct svc_rdma_send_ctxt *sctxt, - const struct xdr_buf *xdr) +int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_pcl *write_pcl, + const struct svc_rdma_pcl *reply_pcl, + struct svc_rdma_send_ctxt *sctxt, + const struct xdr_buf *xdr) { struct svc_rdma_write_info *info = &sctxt->sc_reply_info; struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; + struct ib_send_wr *first_wr; + struct list_head *pos; + struct ib_cqe *cqe; int ret; - if (likely(pcl_is_empty(&rctxt->rc_reply_pcl))) - return 0; /* client provided no Reply chunk */ - info->wi_rdma = rdma; - info->wi_chunk = pcl_first_chunk(&rctxt->rc_reply_pcl); + info->wi_chunk = pcl_first_chunk(reply_pcl); info->wi_seg_off = 0; info->wi_seg_no = 0; - svc_rdma_cc_init(rdma, &info->wi_cc); info->wi_cc.cc_cqe.done = svc_rdma_reply_done; - ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + ret = pcl_process_nonpayloads(write_pcl, xdr, svc_rdma_xb_write, info); if (ret < 0) return ret; - trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount); - ret = svc_rdma_post_chunk_ctxt(rdma, cc); - if (ret < 0) - return ret; + first_wr = sctxt->sc_wr_chain; + cqe = &cc->cc_cqe; + list_for_each(pos, &cc->cc_rwctxts) { + struct svc_rdma_rw_ctxt *rwc; + rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list); + first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, cqe, first_wr); + cqe = NULL; + } + sctxt->sc_wr_chain = first_wr; + sctxt->sc_sqecount += cc->cc_sqecount; + + trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount); return xdr->len; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index d8e079be36e2a6..6dfd2232ce5b41 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -205,6 +205,7 @@ struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma) xdr_init_encode(&ctxt->sc_stream, &ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, NULL); + svc_rdma_cc_init(rdma, &ctxt->sc_reply_info.wi_cc); ctxt->sc_send_wr.num_sge = 0; ctxt->sc_cur_sge_no = 0; ctxt->sc_page_count = 0; @@ -226,6 +227,8 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma, struct ib_device *device = rdma->sc_cm_id->device; unsigned int i; + svc_rdma_reply_chunk_release(rdma, ctxt); + if (ctxt->sc_page_count) release_pages(ctxt->sc_pages, ctxt->sc_page_count); @@ -867,16 +870,10 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, * in sc_sges[0], and the RPC xdr_buf is prepared in following sges. * * Depending on whether a Write list or Reply chunk is present, - * the server may send all, a portion of, or none of the xdr_buf. + * the server may Send all, a portion of, or none of the xdr_buf. * In the latter case, only the transport header (sc_sges[0]) is * transmitted. * - * RDMA Send is the last step of transmitting an RPC reply. Pages - * involved in the earlier RDMA Writes are here transferred out - * of the rqstp and into the sctxt's page array. These pages are - * DMA unmapped by each Write completion, but the subsequent Send - * completion finally releases these pages. 
- * * Assumptions: * - The Reply's transport header will never be larger than a page. */ @@ -885,6 +882,7 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, const struct svc_rdma_recv_ctxt *rctxt, struct svc_rqst *rqstp) { + struct ib_send_wr *send_wr = &sctxt->sc_send_wr; int ret; ret = svc_rdma_map_reply_msg(rdma, sctxt, &rctxt->rc_write_pcl, @@ -892,13 +890,16 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, if (ret < 0) return ret; + /* Transfer pages involved in RDMA Writes to the sctxt's + * page array. Completion handling releases these pages. + */ svc_rdma_save_io_pages(rqstp, sctxt); if (rctxt->rc_inv_rkey) { - sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV; - sctxt->sc_send_wr.ex.invalidate_rkey = rctxt->rc_inv_rkey; + send_wr->opcode = IB_WR_SEND_WITH_INV; + send_wr->ex.invalidate_rkey = rctxt->rc_inv_rkey; } else { - sctxt->sc_send_wr.opcode = IB_WR_SEND; + send_wr->opcode = IB_WR_SEND; } return svc_rdma_post_send(rdma, sctxt); @@ -1012,10 +1013,15 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (!p) goto put_ctxt; - ret = svc_rdma_send_reply_chunk(rdma, rctxt, sctxt, &rqstp->rq_res); - if (ret < 0) - goto reply_chunk; - rc_size = ret; + rc_size = 0; + if (!pcl_is_empty(&rctxt->rc_reply_pcl)) { + ret = svc_rdma_prepare_reply_chunk(rdma, &rctxt->rc_write_pcl, + &rctxt->rc_reply_pcl, sctxt, + &rqstp->rq_res); + if (ret < 0) + goto reply_chunk; + rc_size = ret; + } *p++ = *rdma_argp; *p++ = *(rdma_argp + 1); From c3212b8a59ff49488bc5fb2ec341a576b65d55a6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 4 Feb 2024 18:17:41 -0500 Subject: [PATCH 0672/1406] svcrdma: Post WRs for Write chunks in svc_rdma_sendto() Refactor to eventually enable svcrdma to post the Write WRs for each RPC response using the same ib_post_send() as the Send WR (i.e., as a single WR chain). svc_rdma_result_payload (originally svc_rdma_read_payload) was added so that the upper layer XDR encoder could identify a range of bytes to be possibly conveyed by RDMA (if a Write chunk was provided by the client). The purpose of commit f6ad77590a5d ("svcrdma: Post RDMA Writes while XDR encoding replies") was to post as much of the result payload outside of svc_rdma_sendto() as possible because svc_rdma_sendto() used to be called with the xpt_mutex held. However, since commit ca4faf543a33 ("SUNRPC: Move xpt_mutex into socket xpo_sendto methods"), the xpt_mutex is no longer held when calling svc_rdma_sendto(). Thus, that benefit no longer applies.
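The direction of the refactor can be sketched standalone: note the payload ranges while encoding, then issue every deferred write from the send path in one place. This is a hypothetical model for illustration only; none of these names are actual svcrdma symbols.

#include <stdio.h>

struct payload { unsigned int offset, length; };

static struct payload deferred[4];
static int n_deferred;

/* Called while encoding the reply: only record the range. */
static void note_result_payload(unsigned int offset, unsigned int length)
{
	deferred[n_deferred].offset = offset;
	deferred[n_deferred].length = length;
	n_deferred++;
}

/* Called once from the sendto path: post every deferred write,
 * then the Send that completes the reply.
 */
static void send_reply(void)
{
	for (int i = 0; i < n_deferred; i++)
		printf("post RDMA Write for bytes %u..%u\n",
		       deferred[i].offset,
		       deferred[i].offset + deferred[i].length - 1);
	printf("post Send WR\n");
}

int main(void)
{
	note_result_payload(0, 512);
	note_result_payload(512, 1024);
	send_reply();
	return 0;
}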
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 6 +-- net/sunrpc/xprtrdma/svc_rdma_rw.c | 56 ++++++++++++++++++--------- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 30 ++++++-------- 3 files changed, 51 insertions(+), 41 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index ac882bd23ca2a8..d33bab33099ab0 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -272,9 +272,9 @@ extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma, enum dma_data_direction dir); extern void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt); -extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, - const struct svc_rdma_chunk *chunk, - const struct xdr_buf *xdr); +extern int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, + const struct xdr_buf *xdr); extern int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_pcl *write_pcl, const struct svc_rdma_pcl *reply_pcl, diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 2b25edc6c73c52..40797114d50a49 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -601,47 +601,65 @@ static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data) return xdr->len; } -/** - * svc_rdma_send_write_chunk - Write all segments in a Write chunk - * @rdma: controlling RDMA transport - * @chunk: Write chunk provided by the client - * @xdr: xdr_buf containing the data payload - * - * Returns a non-negative number of bytes the chunk consumed, or - * %-E2BIG if the payload was larger than the Write chunk, - * %-EINVAL if client provided too many segments, - * %-ENOMEM if rdma_rw context pool was exhausted, - * %-ENOTCONN if posting failed (connection is lost), - * %-EIO if rdma_rw initialization failed (DMA mapping, etc). - */ -int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, - const struct svc_rdma_chunk *chunk, - const struct xdr_buf *xdr) +static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_chunk *chunk, + const struct xdr_buf *xdr) { struct svc_rdma_write_info *info; struct svc_rdma_chunk_ctxt *cc; + struct xdr_buf payload; int ret; + if (xdr_buf_subsegment(xdr, &payload, chunk->ch_position, + chunk->ch_payload_length)) + return -EMSGSIZE; + info = svc_rdma_write_info_alloc(rdma, chunk); if (!info) return -ENOMEM; cc = &info->wi_cc; - ret = svc_rdma_xb_write(xdr, info); - if (ret != xdr->len) + ret = svc_rdma_xb_write(&payload, info); + if (ret != payload.len) goto out_err; trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); ret = svc_rdma_post_chunk_ctxt(rdma, cc); if (ret < 0) goto out_err; - return xdr->len; + return 0; out_err: svc_rdma_write_info_free(info); return ret; } +/** + * svc_rdma_send_write_list - Send all chunks on the Write list + * @rdma: controlling RDMA transport + * @rctxt: Write list provisioned by the client + * @xdr: xdr_buf containing an RPC Reply message + * + * Returns zero on success, or a negative errno if one or more + * Write chunks could not be sent. 
+ */ +int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, + const struct xdr_buf *xdr) +{ + struct svc_rdma_chunk *chunk; + int ret; + + pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) { + if (!chunk->ch_payload_length) + break; + ret = svc_rdma_send_write_chunk(rdma, chunk, xdr); + if (ret < 0) + return ret; + } + return 0; +} + /** * svc_rdma_prepare_reply_chunk - Construct WR chain for writing the Reply chunk * @rdma: controlling RDMA transport diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 6dfd2232ce5b41..bb5436b719e051 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -1013,6 +1013,10 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (!p) goto put_ctxt; + ret = svc_rdma_send_write_list(rdma, rctxt, &rqstp->rq_res); + if (ret < 0) + goto put_ctxt; + rc_size = 0; if (!pcl_is_empty(&rctxt->rc_reply_pcl)) { ret = svc_rdma_prepare_reply_chunk(rdma, &rctxt->rc_write_pcl, @@ -1064,45 +1068,33 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) /** * svc_rdma_result_payload - special processing for a result payload - * @rqstp: svc_rqst to operate on - * @offset: payload's byte offset in @xdr + * @rqstp: RPC transaction context + * @offset: payload's byte offset in @rqstp->rq_res * @length: size of payload, in bytes * + * Assign the passed-in result payload to the current Write chunk, + * and advance to cur_result_payload to the next Write chunk, if + * there is one. + * * Return values: * %0 if successful or nothing needed to be done - * %-EMSGSIZE on XDR buffer overflow * %-E2BIG if the payload was larger than the Write chunk - * %-EINVAL if client provided too many segments - * %-ENOMEM if rdma_rw context pool was exhausted - * %-ENOTCONN if posting failed (connection is lost) - * %-EIO if rdma_rw initialization failed (DMA mapping, etc) */ int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length) { struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; struct svc_rdma_chunk *chunk; - struct svcxprt_rdma *rdma; - struct xdr_buf subbuf; - int ret; chunk = rctxt->rc_cur_result_payload; if (!length || !chunk) return 0; rctxt->rc_cur_result_payload = pcl_next_chunk(&rctxt->rc_write_pcl, chunk); + if (length > chunk->ch_length) return -E2BIG; - chunk->ch_position = offset; chunk->ch_payload_length = length; - - if (xdr_buf_subsegment(&rqstp->rq_res, &subbuf, offset, length)) - return -EMSGSIZE; - - rdma = container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt); - ret = svc_rdma_send_write_chunk(rdma, chunk, &subbuf); - if (ret < 0) - return ret; return 0; } From 1c8db11e03cc803c75aee229b9404ea568fd6049 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 4 Feb 2024 18:17:47 -0500 Subject: [PATCH 0673/1406] svcrdma: Add Write chunk WRs to the RPC's Send WR chain Chain RDMA Writes that convey Write chunks onto the local Send chain. This means all WRs for an RPC Reply are now posted with a single ib_post_send() call, and there is a single Send completion when all of these are done. That reduces both the per-transport doorbell rate and completion rate. 
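The mechanism relies on nothing more than the next pointer that every work request carries; the real chain is built from struct ib_send_wr by rdma_rw_ctx_wrs(), as the diff below shows. A standalone model of the idea, with illustrative types only:

#include <stdio.h>

struct fake_wr {
	const char *kind;
	struct fake_wr *next;
};

/* Post an entire chain at once: one doorbell covers N requests. */
static void post_chain(const struct fake_wr *first)
{
	for (const struct fake_wr *wr = first; wr; wr = wr->next)
		printf("HCA consumes %s WR\n", wr->kind);
}

int main(void)
{
	struct fake_wr send   = { "Send",  NULL };
	struct fake_wr write2 = { "Write", &send };
	struct fake_wr write1 = { "Write", &write2 };

	post_chain(&write1);	/* single post: Write, Write, Send */
	return 0;
}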
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 13 +++- net/sunrpc/xprtrdma/svc_rdma_rw.c | 86 ++++++++++++++++++++------- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 5 +- 3 files changed, 78 insertions(+), 26 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index d33bab33099ab0..24cd199dd6f3a9 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -210,6 +210,7 @@ struct svc_rdma_recv_ctxt { */ struct svc_rdma_write_info { struct svcxprt_rdma *wi_rdma; + struct list_head wi_list; const struct svc_rdma_chunk *wi_chunk; @@ -238,7 +239,10 @@ struct svc_rdma_send_ctxt { struct ib_cqe sc_cqe; struct xdr_buf sc_hdrbuf; struct xdr_stream sc_stream; + + struct list_head sc_write_info_list; struct svc_rdma_write_info sc_reply_info; + void *sc_xprt_buf; int sc_page_count; int sc_cur_sge_no; @@ -270,11 +274,14 @@ extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma, extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc, enum dma_data_direction dir); +extern void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt); extern void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt); -extern int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, - const struct svc_rdma_recv_ctxt *rctxt, - const struct xdr_buf *xdr); +extern int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_pcl *write_pcl, + struct svc_rdma_send_ctxt *sctxt, + const struct xdr_buf *xdr); extern int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_pcl *write_pcl, const struct svc_rdma_pcl *reply_pcl, diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 40797114d50a49..f2a100c4c81f12 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -230,6 +230,28 @@ static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) queue_work(svcrdma_wq, &info->wi_work); } +/** + * svc_rdma_write_chunk_release - Release Write chunk I/O resources + * @rdma: controlling transport + * @ctxt: Send context that is being released + */ +void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) +{ + struct svc_rdma_write_info *info; + struct svc_rdma_chunk_ctxt *cc; + + while (!list_empty(&ctxt->sc_write_info_list)) { + info = list_first_entry(&ctxt->sc_write_info_list, + struct svc_rdma_write_info, wi_list); + list_del(&info->wi_list); + + cc = &info->wi_cc; + svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); + svc_rdma_write_info_free(info); + } +} + /** * svc_rdma_reply_chunk_release - Release Reply chunk I/O resources * @rdma: controlling transport @@ -286,13 +308,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_chunk_ctxt *cc = container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); - struct svc_rdma_write_info *info = - container_of(cc, struct svc_rdma_write_info, wi_cc); switch (wc->status) { case IB_WC_SUCCESS: trace_svcrdma_wc_write(&cc->cc_cid); - break; + return; case IB_WC_WR_FLUSH_ERR: trace_svcrdma_wc_write_flush(wc, &cc->cc_cid); break; @@ -300,12 +320,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_write_err(wc, &cc->cc_cid); } - svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); - - if (unlikely(wc->status != IB_WC_SUCCESS)) - 
svc_xprt_deferred_close(&rdma->sc_xprt); - - svc_rdma_write_info_free(info); + /* The RDMA Write has flushed, so the client won't get + * some of the outgoing RPC message. Signal the loss + * to the client by closing the connection. + */ + svc_xprt_deferred_close(&rdma->sc_xprt); } /** @@ -601,13 +620,19 @@ static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data) return xdr->len; } -static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, - const struct svc_rdma_chunk *chunk, - const struct xdr_buf *xdr) +/* Link Write WRs for @chunk onto @sctxt's WR chain. + */ +static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_chunk *chunk, + const struct xdr_buf *xdr) { struct svc_rdma_write_info *info; struct svc_rdma_chunk_ctxt *cc; + struct ib_send_wr *first_wr; struct xdr_buf payload; + struct list_head *pos; + struct ib_cqe *cqe; int ret; if (xdr_buf_subsegment(xdr, &payload, chunk->ch_position, @@ -623,10 +648,25 @@ static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, if (ret != payload.len) goto out_err; - trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); - ret = svc_rdma_post_chunk_ctxt(rdma, cc); - if (ret < 0) + ret = -EINVAL; + if (unlikely(cc->cc_sqecount > rdma->sc_sq_depth)) goto out_err; + + first_wr = sctxt->sc_wr_chain; + cqe = &cc->cc_cqe; + list_for_each(pos, &cc->cc_rwctxts) { + struct svc_rdma_rw_ctxt *rwc; + + rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list); + first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, cqe, first_wr); + cqe = NULL; + } + sctxt->sc_wr_chain = first_wr; + sctxt->sc_sqecount += cc->cc_sqecount; + list_add(&info->wi_list, &sctxt->sc_write_info_list); + + trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); return 0; out_err: @@ -635,25 +675,27 @@ static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, } /** - * svc_rdma_send_write_list - Send all chunks on the Write list + * svc_rdma_prepare_write_list - Construct WR chain for sending Write list * @rdma: controlling RDMA transport - * @rctxt: Write list provisioned by the client + * @write_pcl: Write list provisioned by the client + * @sctxt: Send WR resources * @xdr: xdr_buf containing an RPC Reply message * * Returns zero on success, or a negative errno if one or more * Write chunks could not be sent. 
*/ -int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, - const struct svc_rdma_recv_ctxt *rctxt, - const struct xdr_buf *xdr) +int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_pcl *write_pcl, + struct svc_rdma_send_ctxt *sctxt, + const struct xdr_buf *xdr) { struct svc_rdma_chunk *chunk; int ret; - pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) { + pcl_for_each_chunk(chunk, write_pcl) { if (!chunk->ch_payload_length) break; - ret = svc_rdma_send_write_chunk(rdma, chunk, xdr); + ret = svc_rdma_prepare_write_chunk(rdma, sctxt, chunk, xdr); if (ret < 0) return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index bb5436b719e051..dfca39abd16c88 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -142,6 +142,7 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) ctxt->sc_send_wr.sg_list = ctxt->sc_sges; ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED; ctxt->sc_cqe.done = svc_rdma_wc_send; + INIT_LIST_HEAD(&ctxt->sc_write_info_list); ctxt->sc_xprt_buf = buffer; xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, rdma->sc_max_req_size); @@ -227,6 +228,7 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma, struct ib_device *device = rdma->sc_cm_id->device; unsigned int i; + svc_rdma_write_chunk_release(rdma, ctxt); svc_rdma_reply_chunk_release(rdma, ctxt); if (ctxt->sc_page_count) @@ -1013,7 +1015,8 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (!p) goto put_ctxt; - ret = svc_rdma_send_write_list(rdma, rctxt, &rqstp->rq_res); + ret = svc_rdma_prepare_write_list(rdma, &rctxt->rc_write_pcl, sctxt, + &rqstp->rq_res); if (ret < 0) goto put_ctxt; From dc90fe880c36e674cb2d7c62e7d4158bc4f89055 Mon Sep 17 00:00:00 2001 From: Chen Hanxiao Date: Thu, 8 Feb 2024 16:06:26 +0800 Subject: [PATCH 0674/1406] nfsd: clean up comments over nfs4_client definition nfsd fault injection has been deprecated since commit 9d60d93198c6 ("Deprecate nfsd fault injection") and removed by commit e56dc9e2949e ("nfsd: remove fault injection code") So remove the outdated parts about fault injection. Signed-off-by: Chen Hanxiao Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/state.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index be02bf1a16bdd9..0c8ec578ba7e1b 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -334,8 +334,9 @@ enum { * 0. If they are not renewed within a lease period, they become eligible for * destruction by the laundromat. * - * These objects can also be destroyed prematurely by the fault injection code, - * or if the client sends certain forms of SETCLIENTID or EXCHANGE_ID updates. + * These objects can also be destroyed if the client sends certain forms of + * SETCLIENTID or EXCHANGE_ID operations. + * * Care is taken *not* to do this however when the objects have an elevated * refcount. * @@ -343,7 +344,7 @@ enum { * * o Each nfs4_clients is also hashed by name (the opaque quantity initially * sent by the client to identify itself). - * + * * o cl_perclient list is used to ensure no dangling stateowner references * when we expire the nfs4_client */ From d5b4537037919f1029b232b547371cebf8269431 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 8 Feb 2024 11:00:29 -0500 Subject: [PATCH 0675/1406] NFSD: Fix the NFSv4.1 CREATE_SESSION operation RFC 8881 Section 18.36.4 discusses the implementation of the NFSv4.1 CREATE_SESSION operation. 
The section defines four phases of operation. Phase 2 processes the CREATE_SESSION sequence ID. As a separate step, Phase 3 evaluates the CREATE_SESSION arguments. The problem we are concerned with is when phase 2 is successful but phase 3 fails. The spec language in this case is "No changes are made to any client records on the server." RFC 8881 Section 18.35.4 defines a "client record", and it does /not/ contain any details related to the special CREATE_SESSION slot. Therefore NFSD is incorrect to skip incrementing the CREATE_SESSION sequence id when phase 3 (see Section 18.36.4) of CREATE_SESSION processing fails. In other words, even though NFSD happens to store the cs_slot in a client record, in terms of the protocol the slot is logically separate from the client record. Three complications: 1. The world has moved on since commit 86c3e16cc7aa ("nfsd4: confirm only on succesful create_session") broke this. So we can't simply revert that commit. 2. NFSD's CREATE_SESSION implementation does not cleanly delineate the logic of phases 2 and 3. So this won't be a surgical fix. 3. Because of the way it currently handles the CREATE_SESSION slot sequence number, nfsd4_create_session() isn't caching error responses in the CREATE_SESSION slot. Instead of replaying the response cache in those cases, it's executing the transaction again. Reorganize the CREATE_SESSION slot sequence number accounting. This requires that error responses are appropriately cached in the CREATE_SESSION slot (once it is found). Reported-by: Connor Smith Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218382 Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 57 ++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 0dde9eee75e74e..ddf5e50772ae8a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3562,6 +3562,9 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, new->cl_spo_must_allow.u.words[0] = exid->spo_must_allow[0]; new->cl_spo_must_allow.u.words[1] = exid->spo_must_allow[1]; + /* Contrived initial CREATE_SESSION response */ + new->cl_cs_slot.sl_status = nfserr_seq_misordered; + add_to_unconfirmed(new); swap(new, conf); out_copy: @@ -3732,10 +3735,10 @@ nfsd4_create_session(struct svc_rqst *rqstp, struct nfsd4_create_session *cr_ses = &u->create_session; struct sockaddr *sa = svc_addr(rqstp); struct nfs4_client *conf, *unconf; + struct nfsd4_clid_slot *cs_slot; struct nfs4_client *old = NULL; struct nfsd4_session *new; struct nfsd4_conn *conn; - struct nfsd4_clid_slot *cs_slot = NULL; __be32 status = 0; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); @@ -3761,50 +3764,51 @@ nfsd4_create_session(struct svc_rqst *rqstp, spin_lock(&nn->client_lock); unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn); conf = find_confirmed_client(&cr_ses->clientid, true, nn); - WARN_ON_ONCE(conf && unconf); + if (!conf && !unconf) { + status = nfserr_stale_clientid; + goto out_free_conn; + } - if (conf) { - status = nfserr_wrong_cred; - if (!nfsd4_mach_creds_match(conf, rqstp)) - goto out_free_conn; + if (conf) cs_slot = &conf->cl_cs_slot; - status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); - if (status) { - if (status == nfserr_replay_cache) - status = nfsd4_replay_create_session(cr_ses, cs_slot); + else + cs_slot = &unconf->cl_cs_slot; + status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); + if (status) { + if (status == 
nfserr_replay_cache) { + status = nfsd4_replay_create_session(cr_ses, cs_slot); goto out_free_conn; } - } else if (unconf) { + goto out_cache_error; + } + cs_slot->sl_seqid++; + cr_ses->seqid = cs_slot->sl_seqid; + + if (conf) { + status = nfserr_wrong_cred; + if (!nfsd4_mach_creds_match(conf, rqstp)) + goto out_cache_error; + } else { status = nfserr_clid_inuse; if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { trace_nfsd_clid_cred_mismatch(unconf, rqstp); - goto out_free_conn; + goto out_cache_error; } status = nfserr_wrong_cred; if (!nfsd4_mach_creds_match(unconf, rqstp)) - goto out_free_conn; - cs_slot = &unconf->cl_cs_slot; - status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); - if (status) { - /* an unconfirmed replay returns misordered */ - status = nfserr_seq_misordered; - goto out_free_conn; - } + goto out_cache_error; old = find_confirmed_client_by_name(&unconf->cl_name, nn); if (old) { status = mark_client_expired_locked(old); if (status) { old = NULL; - goto out_free_conn; + goto out_cache_error; } trace_nfsd_clid_replaced(&old->cl_clientid); } move_to_confirmed(unconf); conf = unconf; - } else { - status = nfserr_stale_clientid; - goto out_free_conn; } status = nfs_ok; /* Persistent sessions are not supported */ @@ -3817,8 +3821,6 @@ nfsd4_create_session(struct svc_rqst *rqstp, memcpy(cr_ses->sessionid.data, new->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); - cs_slot->sl_seqid++; - cr_ses->seqid = cs_slot->sl_seqid; /* cache solo and embedded create sessions under the client_lock */ nfsd4_cache_create_session(cr_ses, cs_slot, status); @@ -3831,6 +3833,9 @@ nfsd4_create_session(struct svc_rqst *rqstp, if (old) expire_client(old); return status; + +out_cache_error: + nfsd4_cache_create_session(cr_ses, cs_slot, status); out_free_conn: spin_unlock(&nn->client_lock); free_conn(conn); From 4d16cbda94245c3a21c9b9454c52f92fcaa3e743 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 8 Feb 2024 11:00:35 -0500 Subject: [PATCH 0676/1406] NFSD: Document the phases of CREATE_SESSION As described in RFC 8881 Section 18.36.4, CREATE_SESSION can be split into four phases. NFSD's implementation now does it like that description. Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ddf5e50772ae8a..d9260e77ef2d36 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3762,6 +3762,8 @@ nfsd4_create_session(struct svc_rqst *rqstp, goto out_free_session; spin_lock(&nn->client_lock); + + /* RFC 8881 Section 18.36.4 Phase 1: Client record look-up. */ unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn); conf = find_confirmed_client(&cr_ses->clientid, true, nn); if (!conf && !unconf) { @@ -3769,6 +3771,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, goto out_free_conn; } + /* RFC 8881 Section 18.36.4 Phase 2: Sequence ID processing. */ if (conf) cs_slot = &conf->cl_cs_slot; else @@ -3784,6 +3787,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, cs_slot->sl_seqid++; cr_ses->seqid = cs_slot->sl_seqid; + /* RFC 8881 Section 18.36.4 Phase 3: Client ID confirmation. */ if (conf) { status = nfserr_wrong_cred; if (!nfsd4_mach_creds_match(conf, rqstp)) @@ -3810,6 +3814,8 @@ nfsd4_create_session(struct svc_rqst *rqstp, move_to_confirmed(unconf); conf = unconf; } + + /* RFC 8881 Section 18.36.4 Phase 4: Session creation. 
*/ status = nfs_ok; /* Persistent sessions are not supported */ cr_ses->flags &= ~SESSION4_PERSIST; From 5184c05416a7eb8d9d253495670a6e4dfc3a238f Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Thu, 15 Feb 2024 14:05:21 -0800 Subject: [PATCH 0677/1406] NFSD: add support for CB_GETATTR callback Includes: . CB_GETATTR proc for nfs4_cb_procedures[] . XDR encoding and decoding function for CB_GETATTR request/reply . add nfs4_cb_fattr to nfs4_delegation for sending CB_GETATTR and store file attributes from client's reply. Signed-off-by: Dai Ngo Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 97 +++++++++++++++++++++++++++++++++++++++++- fs/nfsd/state.h | 14 ++++++ fs/nfsd/xdr4cb.h | 18 ++++++++ 3 files changed, 128 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 32dd2fbb1f301b..e440f72b9d4ead 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -85,7 +85,21 @@ static void encode_uint32(struct xdr_stream *xdr, u32 n) static void encode_bitmap4(struct xdr_stream *xdr, const __u32 *bitmap, size_t len) { - WARN_ON_ONCE(xdr_stream_encode_uint32_array(xdr, bitmap, len) < 0); + xdr_stream_encode_uint32_array(xdr, bitmap, len); +} + +static int decode_cb_fattr4(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs4_cb_fattr *fattr) +{ + fattr->ncf_cb_change = 0; + fattr->ncf_cb_fsize = 0; + if (bitmap[0] & FATTR4_WORD0_CHANGE) + if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_change) < 0) + return -NFSERR_BAD_XDR; + if (bitmap[0] & FATTR4_WORD0_SIZE) + if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_fsize) < 0) + return -NFSERR_BAD_XDR; + return 0; } static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op) @@ -333,6 +347,30 @@ encode_cb_recallany4args(struct xdr_stream *xdr, hdr->nops++; } +/* + * CB_GETATTR4args + * struct CB_GETATTR4args { + * nfs_fh4 fh; + * bitmap4 attr_request; + * }; + * + * The size and change attributes are the only one + * guaranteed to be serviced by the client. + */ +static void +encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr, + struct nfs4_cb_fattr *fattr) +{ + struct nfs4_delegation *dp = + container_of(fattr, struct nfs4_delegation, dl_cb_fattr); + struct knfsd_fh *fh = &dp->dl_stid.sc_file->fi_fhandle; + + encode_nfs_cb_opnum4(xdr, OP_CB_GETATTR); + encode_nfs_fh4(xdr, fh); + encode_bitmap4(xdr, fattr->ncf_cb_bmap, ARRAY_SIZE(fattr->ncf_cb_bmap)); + hdr->nops++; +} + /* * CB_SEQUENCE4args * @@ -468,6 +506,26 @@ static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, xdr_reserve_space(xdr, 0); } +/* + * 20.1. Operation 3: CB_GETATTR - Get Attributes + */ +static void nfs4_xdr_enc_cb_getattr(struct rpc_rqst *req, + struct xdr_stream *xdr, const void *data) +{ + const struct nfsd4_callback *cb = data; + struct nfs4_cb_fattr *ncf = + container_of(cb, struct nfs4_cb_fattr, ncf_getattr); + struct nfs4_cb_compound_hdr hdr = { + .ident = cb->cb_clp->cl_cb_ident, + .minorversion = cb->cb_clp->cl_minorversion, + }; + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_getattr4args(xdr, &hdr, ncf); + encode_cb_nops(&hdr); +} + /* * 20.2. Operation 4: CB_RECALL - Recall a Delegation */ @@ -523,6 +581,42 @@ static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, return 0; } +/* + * 20.1. 
Operation 3: CB_GETATTR - Get Attributes + */ +static int nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfsd4_callback *cb = data; + struct nfs4_cb_compound_hdr hdr; + int status; + u32 bitmap[3] = {0}; + u32 attrlen; + struct nfs4_cb_fattr *ncf = + container_of(cb, struct nfs4_cb_fattr, ncf_getattr); + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + + status = decode_cb_op_status(xdr, OP_CB_GETATTR, &cb->cb_status); + if (status) + return status; + if (xdr_stream_decode_uint32_array(xdr, bitmap, 3) < 0) + return -NFSERR_BAD_XDR; + if (xdr_stream_decode_u32(xdr, &attrlen) < 0) + return -NFSERR_BAD_XDR; + if (attrlen > (sizeof(ncf->ncf_cb_change) + sizeof(ncf->ncf_cb_fsize))) + return -NFSERR_BAD_XDR; + status = decode_cb_fattr4(xdr, bitmap, ncf); + return status; +} + /* * 20.2. Operation 4: CB_RECALL - Recall a Delegation */ @@ -831,6 +925,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = { PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock), PROC(CB_OFFLOAD, COMPOUND, cb_offload, cb_offload), PROC(CB_RECALL_ANY, COMPOUND, cb_recall_any, cb_recall_any), + PROC(CB_GETATTR, COMPOUND, cb_getattr, cb_getattr), }; static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)]; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 0c8ec578ba7e1b..3bf418ee6c9702 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -134,6 +134,16 @@ struct nfs4_cpntf_state { time64_t cpntf_time; /* last time stateid used */ }; +struct nfs4_cb_fattr { + struct nfsd4_callback ncf_getattr; + u32 ncf_cb_status; + u32 ncf_cb_bmap[1]; + + /* from CB_GETATTR reply */ + u64 ncf_cb_change; + u64 ncf_cb_fsize; +}; + /* * Represents a delegation stateid. 
The nfs4_client holds references to these * and they are put when it is being destroyed or when the delegation is @@ -167,6 +177,9 @@ struct nfs4_delegation { int dl_retries; struct nfsd4_callback dl_recall; bool dl_recalled; + + /* for CB_GETATTR */ + struct nfs4_cb_fattr dl_cb_fattr; }; #define cb_to_delegation(cb) \ @@ -659,6 +672,7 @@ enum nfsd4_cb_op { NFSPROC4_CLNT_CB_SEQUENCE, NFSPROC4_CLNT_CB_NOTIFY_LOCK, NFSPROC4_CLNT_CB_RECALL_ANY, + NFSPROC4_CLNT_CB_GETATTR, }; /* Returns true iff a is later than b: */ diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index 0d39af1b00a0f3..e8b00309c449fe 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -54,3 +54,21 @@ #define NFS4_dec_cb_recall_any_sz (cb_compound_dec_hdr_sz + \ cb_sequence_dec_sz + \ op_dec_sz) + +/* + * 1: CB_GETATTR opcode (32-bit) + * N: file_handle + * 1: number of entry in attribute array (32-bit) + * 1: entry 0 in attribute array (32-bit) + */ +#define NFS4_enc_cb_getattr_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + enc_nfs4_fh_sz + 1 + 1) +/* + * 4: fattr_bitmap_maxsz + * 1: attribute array len + * 2: change attr (64-bit) + * 2: size (64-bit) + */ +#define NFS4_dec_cb_getattr_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + 4 + 1 + 2 + 2 + op_dec_sz) From b9b89fb3e0b6586698ffe3c6ac6663c3e21de676 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Thu, 15 Feb 2024 14:05:22 -0800 Subject: [PATCH 0678/1406] NFSD: handle GETATTR conflict with write delegation If a GETATTR request arrives for a file with a write delegation in effect, and the requested attributes include the change info and size attributes, the request is handled as below: Server sends CB_GETATTR to client to get the latest change info and file size. If these values are the same as the server's cached values then the GETATTR proceeds as normal. If either the change info or file size is different from the server's cached values, or the file was already marked as modified, then: . update the file's time_modify and time_metadata with the current time . encode GETATTR as normal except the file size is encoded with the value returned from CB_GETATTR . mark the file as modified If the CB_GETATTR fails for any reason, the delegation is recalled and NFS4ERR_DELAY is returned for the GETATTR.
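The decision flow condenses into a few lines of standalone C. Field and function names here are illustrative only, not nfsd symbols, and a return of -1 stands in for the recall-and-NFS4ERR_DELAY path.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct cb_result {
	bool replied;		/* client answered CB_GETATTR */
	uint64_t change;	/* change attribute the client reported */
	uint64_t size;		/* file size the client reported */
};

static uint64_t cached_change = 1, cached_size = 4096;
static bool file_modified;

/* Returns the size to encode in the GETATTR reply, or -1 to stand in
 * for "recall the delegation and return NFS4ERR_DELAY".
 */
static int64_t getattr_size(const struct cb_result *cb)
{
	if (!cb->replied)
		return -1;			/* client did not respond */
	if (cb->change != cached_change || cb->size != cached_size)
		file_modified = true;		/* bump mtime/ctime here too */
	if (file_modified)
		return (int64_t)cb->size;	/* encode the client's size */
	return (int64_t)cached_size;		/* unchanged: proceed as normal */
}

int main(void)
{
	struct cb_result cb = { .replied = true, .change = 2, .size = 8192 };

	printf("size to encode: %lld\n", (long long)getattr_size(&cb));
	return 0;
}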
Signed-off-by: Dai Ngo Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 115 ++++++++++++++++++++++++++++++++++++++++---- fs/nfsd/nfs4xdr.c | 10 +++- fs/nfsd/nfsd.h | 1 + fs/nfsd/state.h | 10 +++- 4 files changed, 123 insertions(+), 13 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d9260e77ef2d36..948148182cc57b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -127,6 +127,7 @@ static void free_session(struct nfsd4_session *); static const struct nfsd4_callback_ops nfsd4_cb_recall_ops; static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops; +static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops; static struct workqueue_struct *laundry_wq; @@ -1189,6 +1190,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, dp->dl_recalled = false; nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL); + nfsd4_init_cb(&dp->dl_cb_fattr.ncf_getattr, dp->dl_stid.sc_client, + &nfsd4_cb_getattr_ops, NFSPROC4_CLNT_CB_GETATTR); + dp->dl_cb_fattr.ncf_file_modified = false; + dp->dl_cb_fattr.ncf_cb_bmap[0] = FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE; get_nfs4_file(fp); dp->dl_stid.sc_file = fp; return dp; @@ -3044,11 +3049,59 @@ nfsd4_cb_recall_any_release(struct nfsd4_callback *cb) spin_unlock(&nn->client_lock); } +static int +nfsd4_cb_getattr_done(struct nfsd4_callback *cb, struct rpc_task *task) +{ + struct nfs4_cb_fattr *ncf = + container_of(cb, struct nfs4_cb_fattr, ncf_getattr); + + ncf->ncf_cb_status = task->tk_status; + switch (task->tk_status) { + case -NFS4ERR_DELAY: + rpc_delay(task, 2 * HZ); + return 0; + default: + return 1; + } +} + +static void +nfsd4_cb_getattr_release(struct nfsd4_callback *cb) +{ + struct nfs4_cb_fattr *ncf = + container_of(cb, struct nfs4_cb_fattr, ncf_getattr); + struct nfs4_delegation *dp = + container_of(ncf, struct nfs4_delegation, dl_cb_fattr); + + nfs4_put_stid(&dp->dl_stid); + clear_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags); + wake_up_bit(&ncf->ncf_cb_flags, CB_GETATTR_BUSY); +} + static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = { .done = nfsd4_cb_recall_any_done, .release = nfsd4_cb_recall_any_release, }; +static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops = { + .done = nfsd4_cb_getattr_done, + .release = nfsd4_cb_getattr_release, +}; + +static void nfs4_cb_getattr(struct nfs4_cb_fattr *ncf) +{ + struct nfs4_delegation *dp = + container_of(ncf, struct nfs4_delegation, dl_cb_fattr); + + if (test_and_set_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags)) + return; + /* set to proper status when nfsd4_cb_getattr_done runs */ + ncf->ncf_cb_status = NFS4ERR_IO; + + refcount_inc(&dp->dl_stid.sc_count); + nfsd4_run_cb(&ncf->ncf_getattr); +} + static struct nfs4_client *create_client(struct xdr_netobj name, struct svc_rqst *rqstp, nfs4_verifier *verf) { @@ -5854,6 +5907,8 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct svc_fh *parent = NULL; int cb_up; int status = 0; + struct kstat stat; + struct path path; cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); open->op_recall = false; @@ -5891,6 +5946,18 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) { open->op_delegate_type = NFS4_OPEN_DELEGATE_WRITE; trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid); + path.mnt = currentfh->fh_export->ex_path.mnt; + path.dentry = currentfh->fh_dentry; + if (vfs_getattr(&path, &stat, + (STATX_SIZE | STATX_CTIME | 
STATX_CHANGE_COOKIE), + AT_STATX_SYNC_AS_STAT)) { + nfs4_put_stid(&dp->dl_stid); + destroy_delegation(dp); + goto out_no_deleg; + } + dp->dl_cb_fattr.ncf_cur_fsize = stat.size; + dp->dl_cb_fattr.ncf_initial_cinfo = + nfsd4_change_attribute(&stat, d_inode(currentfh->fh_dentry)); } else { open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); @@ -8720,6 +8787,8 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict * @rqstp: RPC transaction context * @inode: file to be checked for a conflict + * @modified: return true if file was modified + * @size: new size of file if modified is true * * This function is called when there is a conflict between a write * delegation and a change/size GETATTR from another client. The server @@ -8728,22 +8797,22 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, * delegation before replying to the GETATTR. See RFC 8881 section * 18.7.4. * - * The current implementation does not support CB_GETATTR yet. However - * this can avoid recalling the delegation could be added in follow up - * work. - * * Returns 0 if there is no conflict; otherwise an nfs_stat * code is returned. */ __be32 -nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode) +nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, + bool *modified, u64 *size) { __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct file_lock_context *ctx; struct file_lock *fl; struct nfs4_delegation *dp; + struct iattr attrs; + struct nfs4_cb_fattr *ncf; + *modified = false; ctx = locks_inode_context(inode); if (!ctx) return 0; @@ -8768,12 +8837,38 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode) return 0; } break_lease: - spin_unlock(&ctx->flc_lock); nfsd_stats_wdeleg_getattr_inc(nn); - status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); - if (status != nfserr_jukebox || - !nfsd_wait_for_delegreturn(rqstp, inode)) - return status; + dp = fl->fl_owner; + ncf = &dp->dl_cb_fattr; + nfs4_cb_getattr(&dp->dl_cb_fattr); + spin_unlock(&ctx->flc_lock); + wait_on_bit_timeout(&ncf->ncf_cb_flags, CB_GETATTR_BUSY, + TASK_INTERRUPTIBLE, NFSD_CB_GETATTR_TIMEOUT); + if (ncf->ncf_cb_status) { + /* Recall delegation only if client didn't respond */ + status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); + if (status != nfserr_jukebox || + !nfsd_wait_for_delegreturn(rqstp, inode)) + return status; + } + if (!ncf->ncf_file_modified && + (ncf->ncf_initial_cinfo != ncf->ncf_cb_change || + ncf->ncf_cur_fsize != ncf->ncf_cb_fsize)) + ncf->ncf_file_modified = true; + if (ncf->ncf_file_modified) { + /* + * Per section 10.4.3 of RFC 8881, the server would + * not update the file's metadata with the client's + * modified size + */ + attrs.ia_mtime = attrs.ia_ctime = current_time(inode); + attrs.ia_valid = ATTR_MTIME | ATTR_CTIME; + setattr_copy(&nop_mnt_idmap, inode, &attrs); + mark_inode_dirty(inode); + ncf->ncf_cur_fsize = ncf->ncf_cb_fsize; + *size = ncf->ncf_cur_fsize; + *modified = true; + } return 0; } break; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index e3f761cd5ee78d..9e8f230fc96e51 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3507,6 +3507,8 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, unsigned long mask[2]; } u; unsigned long bit; + bool file_modified = false; + u64 size = 0; WARN_ON_ONCE(bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1); 
WARN_ON_ONCE(!nfsd_attrs_supported(minorversion, bmval)); @@ -3533,7 +3535,8 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, } args.size = 0; if (u.attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { - status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry)); + status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry), + &file_modified, &size); if (status) goto out; } @@ -3543,7 +3546,10 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, AT_STATX_SYNC_AS_STAT); if (err) goto out_nfserr; - args.size = args.stat.size; + if (file_modified) + args.size = size; + else + args.size = args.stat.size; if (!(args.stat.result_mask & STATX_BTIME)) /* underlying FS does not offer btime so we can't share it */ diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 8daf22d766c60a..16c5a05f340e51 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -367,6 +367,7 @@ void nfsd_lockd_shutdown(void); #define NFSD_CLIENT_MAX_TRIM_PER_RUN 128 #define NFS4_CLIENTS_PER_GB 1024 #define NFSD_DELEGRETURN_TIMEOUT (HZ / 34) /* 30ms */ +#define NFSD_CB_GETATTR_TIMEOUT NFSD_DELEGRETURN_TIMEOUT /* * The following attributes are currently not supported by the NFSv4 server: diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 3bf418ee6c9702..01c6f344564693 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -142,8 +142,16 @@ struct nfs4_cb_fattr { /* from CB_GETATTR reply */ u64 ncf_cb_change; u64 ncf_cb_fsize; + + unsigned long ncf_cb_flags; + bool ncf_file_modified; + u64 ncf_initial_cinfo; + u64 ncf_cur_fsize; }; +/* bits for ncf_cb_flags */ +#define CB_GETATTR_BUSY 0 + /* * Represents a delegation stateid. The nfs4_client holds references to these * and they are put when it is being destroyed or when the delegation is @@ -773,5 +781,5 @@ static inline bool try_to_expire_client(struct nfs4_client *clp) } extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, - struct inode *inode); + struct inode *inode, bool *file_modified, u64 *size); #endif /* NFSD4_STATE_H */ From 31306a3be80edf6cc8839384f40ff4be8628b746 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Sat, 17 Feb 2024 10:00:22 -0800 Subject: [PATCH 0679/1406] NFSD: OP_CB_RECALL_ANY should recall both read and write delegations Add RCA4_TYPE_MASK_WDATA_DLG to ra_bmval bitmask of OP_CB_RECALL_ANY Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 948148182cc57b..fdc95bfbfbb621 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6611,6 +6611,8 @@ deleg_reaper(struct nfsd_net *nn) list_del_init(&clp->cl_ra_cblist); clp->cl_ra->ra_keep = 0; clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG); + clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG) | + BIT(RCA4_TYPE_MASK_WDATA_DLG); trace_nfsd_cb_recall_any(clp->cl_ra); nfsd4_run_cb(&clp->cl_ra->ra_cb); } From 432c373ea1ab4bc2c39460326c4b5ba89b36f101 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 15 Feb 2024 20:24:50 -0500 Subject: [PATCH 0680/1406] nfsd: Fix a regression in nfsd_setattr() Commit bb4d53d66e4b ("NFSD: use (un)lock_inode instead of fh_(un)lock for file operations") broke the NFSv3 pre/post op attributes behaviour when doing a SETATTR rpc call by stripping out the calls to fh_fill_pre_attrs() and fh_fill_post_attrs(). 
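For context, those two calls implement NFSv3 weak cache consistency (wcc) sampling: capture attributes before and after the operation so the client can detect interleaved changes by other users. A standalone sketch of that pattern, with illustrative names:

#include <stdio.h>

struct attrs { long size; long ctime; };

static struct attrs file = { 100, 1000 };

static void do_setattr(long new_size)
{
	struct attrs pre = file;	/* fh_fill_pre_attrs() analogue */

	file.size = new_size;		/* the operation itself */
	file.ctime++;

	struct attrs post = file;	/* fh_fill_post_attrs() analogue */

	printf("wcc: pre size=%ld ctime=%ld -> post size=%ld ctime=%ld\n",
	       pre.size, pre.ctime, post.size, post.ctime);
}

int main(void)
{
	do_setattr(42);
	return 0;
}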
Fixes: bb4d53d66e4b ("NFSD: use (un)lock_inode instead of fh_(un)lock for file operations") Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Message-ID: <20240216012451.22725-1-trondmy@kernel.org> Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 4 ++++ fs/nfsd/vfs.c | 9 +++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 648ff427005e6c..2f524f45350824 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1143,6 +1143,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, }; struct inode *inode; __be32 status = nfs_ok; + bool save_no_wcc; int err; if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { @@ -1168,8 +1169,11 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; + save_no_wcc = cstate->current_fh.fh_no_wcc; + cstate->current_fh.fh_no_wcc = true; status = nfsd_setattr(rqstp, &cstate->current_fh, &attrs, 0, (time64_t)0); + cstate->current_fh.fh_no_wcc = save_no_wcc; if (!status) status = nfserrno(attrs.na_labelerr); if (!status) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 38952105ed7fd4..76e89329b9f11d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -497,7 +497,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, int accmode = NFSD_MAY_SATTR; umode_t ftype = 0; __be32 err; - int host_err; + int host_err = 0; bool get_write_count; bool size_change = (iap->ia_valid & ATTR_SIZE); int retries; @@ -555,6 +555,9 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, } inode_lock(inode); + err = fh_fill_pre_attrs(fhp); + if (err) + goto out_unlock; for (retries = 1;;) { struct iattr attrs; @@ -582,13 +585,15 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, attr->na_aclerr = set_posix_acl(&nop_mnt_idmap, dentry, ACL_TYPE_DEFAULT, attr->na_dpacl); + fh_fill_post_attrs(fhp); +out_unlock: inode_unlock(inode); if (size_change) put_write_access(inode); out: if (!host_err) host_err = commit_metadata(fhp); - return nfserrno(host_err); + return err != 0 ? err : nfserrno(host_err); } #if defined(CONFIG_NFSD_V4) From 9aff478b7803faa137da5cd0605596845fd76f0a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 15 Feb 2024 20:24:51 -0500 Subject: [PATCH 0681/1406] nfsd: Fix NFSv3 atomicity bugs in nfsd_setattr() The main point of the guarded SETATTR is to prevent races with other WRITE and SETATTR calls. That requires that the check of the guard time against the inode ctime be done after taking the inode lock. Furthermore, we need to take into account the 32-bit nature of timestamps in NFSv3, and the possibility that files may change at a faster rate than once a second. 
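The resulting check can be modeled standalone. The sketch below assumes the truncated comparison described above: only the low 32 bits of the seconds are compared, since that is all an NFSv3 guard carries, together with the full nanoseconds. All names are illustrative.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ts { int64_t sec; long nsec; };

/* Compare only the low 32 bits of the seconds, but the full
 * nanoseconds; called with the inode lock already held.
 */
static bool guard_matches(const struct ts *guard, const struct ts *ctime)
{
	return (uint32_t)guard->sec == (uint32_t)ctime->sec &&
	       guard->nsec == ctime->nsec;
}

int main(void)
{
	struct ts ctime = { 4294967297LL, 7 };	/* 2^32 + 1 seconds */
	struct ts guard = { 1, 7 };		/* same low 32 bits */

	printf("guard %s\n", guard_matches(&guard, &ctime) ?
	       "matches: apply the SETATTR" :
	       "mismatch: return NFS3ERR_NOT_SYNC");
	return 0;
}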
Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs3proc.c | 6 ++++-- fs/nfsd/nfs3xdr.c | 5 +---- fs/nfsd/nfs4proc.c | 3 +-- fs/nfsd/nfs4state.c | 2 +- fs/nfsd/nfsproc.c | 6 +++--- fs/nfsd/vfs.c | 20 +++++++++++++------- fs/nfsd/vfs.h | 2 +- fs/nfsd/xdr3.h | 2 +- 8 files changed, 25 insertions(+), 21 deletions(-) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index b78eceebd945e3..dfcc957e460d64 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -71,13 +71,15 @@ nfsd3_proc_setattr(struct svc_rqst *rqstp) struct nfsd_attrs attrs = { .na_iattr = &argp->attrs, }; + const struct timespec64 *guardtime = NULL; dprintk("nfsd: SETATTR(3) %s\n", SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - resp->status = nfsd_setattr(rqstp, &resp->fh, &attrs, - argp->check_guard, argp->guardtime); + if (argp->check_guard) + guardtime = &argp->guardtime; + resp->status = nfsd_setattr(rqstp, &resp->fh, &attrs, guardtime); return rpc_success; } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index f32128955ec8d1..a7a07470c1f846 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -295,17 +295,14 @@ svcxdr_decode_sattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr, static bool svcxdr_decode_sattrguard3(struct xdr_stream *xdr, struct nfsd3_sattrargs *args) { - __be32 *p; u32 check; if (xdr_stream_decode_bool(xdr, &check) < 0) return false; if (check) { - p = xdr_inline_decode(xdr, XDR_UNIT * 2); - if (!p) + if (!svcxdr_decode_nfstime3(xdr, &args->guardtime)) return false; args->check_guard = 1; - args->guardtime = be32_to_cpup(p); } else args->check_guard = 0; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 2f524f45350824..2927b1263f086d 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1171,8 +1171,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; save_no_wcc = cstate->current_fh.fh_no_wcc; cstate->current_fh.fh_no_wcc = true; - status = nfsd_setattr(rqstp, &cstate->current_fh, &attrs, - 0, (time64_t)0); + status = nfsd_setattr(rqstp, &cstate->current_fh, &attrs, NULL); cstate->current_fh.fh_no_wcc = save_no_wcc; if (!status) status = nfserrno(attrs.na_labelerr); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fdc95bfbfbb621..2ecd71ec68a535 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5460,7 +5460,7 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, return 0; if (!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) return nfserr_inval; - return nfsd_setattr(rqstp, fh, &attrs, 0, (time64_t)0); + return nfsd_setattr(rqstp, fh, &attrs, NULL); } static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index a7315928a76079..36370b957b6337 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -103,7 +103,7 @@ nfsd_proc_setattr(struct svc_rqst *rqstp) } } - resp->status = nfsd_setattr(rqstp, fhp, &attrs, 0, (time64_t)0); + resp->status = nfsd_setattr(rqstp, fhp, &attrs, NULL); if (resp->status != nfs_ok) goto out; @@ -390,8 +390,8 @@ nfsd_proc_create(struct svc_rqst *rqstp) */ attr->ia_valid &= ATTR_SIZE; if (attr->ia_valid) - resp->status = nfsd_setattr(rqstp, newfhp, &attrs, 0, - (time64_t)0); + resp->status = nfsd_setattr(rqstp, newfhp, &attrs, + NULL); } out_unlock: diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 76e89329b9f11d..a3a4400e75be10 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -476,7 +476,6 @@ static int __nfsd_setattr(struct 
dentry *dentry, struct iattr *iap) * @rqstp: controlling RPC transaction * @fhp: filehandle of target * @attr: attributes to set - * @check_guard: set to 1 if guardtime is a valid timestamp * @guardtime: do not act if ctime.tv_sec does not match this timestamp * * This call may adjust the contents of @attr (in particular, this @@ -488,8 +487,7 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap) */ __be32 nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct nfsd_attrs *attr, - int check_guard, time64_t guardtime) + struct nfsd_attrs *attr, const struct timespec64 *guardtime) { struct dentry *dentry; struct inode *inode; @@ -538,9 +536,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, nfsd_sanitize_attrs(inode, iap); - if (check_guard && guardtime != inode_get_ctime_sec(inode)) - return nfserr_notsync; - /* * The size case is special, it changes the file in addition to the * attributes, and file systems don't expect it to be mixed with @@ -558,6 +553,16 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, err = fh_fill_pre_attrs(fhp); if (err) goto out_unlock; + + if (guardtime) { + struct timespec64 ctime = inode_get_ctime(inode); + if ((u32)guardtime->tv_sec != (u32)ctime.tv_sec || + guardtime->tv_nsec != ctime.tv_nsec) { + err = nfserr_notsync; + goto out_fill_attrs; + } + } + for (retries = 1;;) { struct iattr attrs; @@ -585,6 +590,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, attr->na_aclerr = set_posix_acl(&nop_mnt_idmap, dentry, ACL_TYPE_DEFAULT, attr->na_dpacl); +out_fill_attrs: fh_fill_post_attrs(fhp); out_unlock: inode_unlock(inode); @@ -1411,7 +1417,7 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, * if the attributes have not changed. */ if (iap->ia_valid) - status = nfsd_setattr(rqstp, resfhp, attrs, 0, (time64_t)0); + status = nfsd_setattr(rqstp, resfhp, attrs, NULL); else status = nfserrno(commit_metadata(resfhp)); diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 1efa4e8dfb0349..c60fdb6200fded 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -69,7 +69,7 @@ __be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *, const char *, unsigned int, struct svc_export **, struct dentry **); __be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, - struct nfsd_attrs *, int, time64_t); + struct nfsd_attrs *, const struct timespec64 *); int nfsd_mountpoint(struct dentry *, struct svc_export *); #ifdef CONFIG_NFSD_V4 __be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *, diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 03fe4e21306cba..522067b7fd7559 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -14,7 +14,7 @@ struct nfsd3_sattrargs { struct svc_fh fh; struct iattr attrs; int check_guard; - time64_t guardtime; + struct timespec64 guardtime; }; struct nfsd3_diropargs { From 26102396d4e0559e0cf147e1f36123d7ee6afaca Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 18 Feb 2024 11:48:10 -0500 Subject: [PATCH 0682/1406] NFSD: Document nfsd_setattr() fill-attributes behavior Add an explanation to prevent the future removal of the fill- attribute call sites in nfsd_setattr(). Some client implementations don't behave correctly if it is not present in an NFSv3 SETATTR reply. 
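A hypothetical client-side fragment shows why: with only the returned attributes to go on, the client can tell which half of a combined request actually took effect. This is illustrative logic, not code from any NFS client implementation.

#include <stdio.h>

struct attrs { long size; unsigned int mode; };

int main(void)
{
	struct attrs requested = { 0, 0644 };		/* truncate + chmod */
	struct attrs returned  = { 4096, 0644 };	/* post-op attributes */

	if (returned.size != requested.size)
		printf("size change was not applied\n");
	if (returned.mode == requested.mode)
		printf("mode change took effect\n");
	return 0;
}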
Signed-off-by: Chuck Lever --- fs/nfsd/vfs.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index a3a4400e75be10..6a4c506038e00d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -591,6 +591,13 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, dentry, ACL_TYPE_DEFAULT, attr->na_dpacl); out_fill_attrs: + /* + * RFC 1813 Section 3.3.2 does not mandate that an NFS server + * returns wcc_data for SETATTR. Some client implementations + * depend on receiving wcc_data, however, to sort out partial + * updates (eg., the client requested that size and mode be + * modified, but the server changed only the file mode). + */ fh_fill_post_attrs(fhp); out_unlock: inode_unlock(inode); From d08431235df0ea7aed79b9a7f8f963dd350c45ae Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 18 Feb 2024 14:28:57 -0800 Subject: [PATCH 0683/1406] acpi/ghes: Remove CXL CPER notifications Initial tests with the CXL CPER implementation identified that error reports were being duplicated in the log and the trace event [1]. Then it was discovered that the notification handler took sleeping locks while the GHES event handling runs in spin_lock_irqsave() context [2]. While the duplicate reporting was fixed in v6.8-rc4, the fix for the sleeping-lock-vs-atomic collision would enjoy more time to settle and gain some test cycles. Given how late it is in the development cycle, remove the CXL hookup for now and try again during the next merge window. Note that the end result is that v6.8 does not emit CXL CPER payloads to the kernel log, but this is in line with the CXL trend to move error reporting to trace events instead of the kernel log. Cc: Ard Biesheuvel Cc: Rafael J. Wysocki Cc: Jonathan Cameron Reviewed-by: Ira Weiny Link: http://lore.kernel.org/r/20240108165855.00002f5a@Huawei.com [1] Closes: http://lore.kernel.org/r/b963c490-2c13-4b79-bbe7-34c6568423c7@moroto.mountain [2] Signed-off-by: Dan Williams --- drivers/acpi/apei/ghes.c | 63 --------------------------------------- drivers/cxl/pci.c | 57 +---------------------------------- include/linux/cxl-event.h | 18 ----------- 3 files changed, 1 insertion(+), 137 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index fe825a432c5bfc..ab2a82cb1b0b48 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -674,52 +673,6 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata, schedule_work(&entry->work); } -/* - * Only a single callback can be registered for CXL CPER events.
- */ -static DECLARE_RWSEM(cxl_cper_rw_sem); -static cxl_cper_callback cper_callback; - -static void cxl_cper_post_event(enum cxl_event_type event_type, - struct cxl_cper_event_rec *rec) -{ - if (rec->hdr.length <= sizeof(rec->hdr) || - rec->hdr.length > sizeof(*rec)) { - pr_err(FW_WARN "CXL CPER Invalid section length (%u)\n", - rec->hdr.length); - return; - } - - if (!(rec->hdr.validation_bits & CPER_CXL_COMP_EVENT_LOG_VALID)) { - pr_err(FW_WARN "CXL CPER invalid event\n"); - return; - } - - guard(rwsem_read)(&cxl_cper_rw_sem); - if (cper_callback) - cper_callback(event_type, rec); -} - -int cxl_cper_register_callback(cxl_cper_callback callback) -{ - guard(rwsem_write)(&cxl_cper_rw_sem); - if (cper_callback) - return -EINVAL; - cper_callback = callback; - return 0; -} -EXPORT_SYMBOL_NS_GPL(cxl_cper_register_callback, CXL); - -int cxl_cper_unregister_callback(cxl_cper_callback callback) -{ - guard(rwsem_write)(&cxl_cper_rw_sem); - if (callback != cper_callback) - return -EINVAL; - cper_callback = NULL; - return 0; -} -EXPORT_SYMBOL_NS_GPL(cxl_cper_unregister_callback, CXL); - static bool ghes_do_proc(struct ghes *ghes, const struct acpi_hest_generic_status *estatus) { @@ -754,22 +707,6 @@ static bool ghes_do_proc(struct ghes *ghes, } else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) { queued = ghes_handle_arm_hw_error(gdata, sev, sync); - } else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) { - struct cxl_cper_event_rec *rec = - acpi_hest_get_payload(gdata); - - cxl_cper_post_event(CXL_CPER_EVENT_GEN_MEDIA, rec); - } else if (guid_equal(sec_type, &CPER_SEC_CXL_DRAM_GUID)) { - struct cxl_cper_event_rec *rec = - acpi_hest_get_payload(gdata); - - cxl_cper_post_event(CXL_CPER_EVENT_DRAM, rec); - } else if (guid_equal(sec_type, - &CPER_SEC_CXL_MEM_MODULE_GUID)) { - struct cxl_cper_event_rec *rec = - acpi_hest_get_payload(gdata); - - cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec); } else { void *err = acpi_hest_get_payload(gdata); diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 233e7c42c161d8..2ff361e756d661 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -974,61 +974,6 @@ static struct pci_driver cxl_pci_driver = { }, }; -#define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0) -static void cxl_cper_event_call(enum cxl_event_type ev_type, - struct cxl_cper_event_rec *rec) -{ - struct cper_cxl_event_devid *device_id = &rec->hdr.device_id; - struct pci_dev *pdev __free(pci_dev_put) = NULL; - enum cxl_event_log_type log_type; - struct cxl_dev_state *cxlds; - unsigned int devfn; - u32 hdr_flags; - - devfn = PCI_DEVFN(device_id->device_num, device_id->func_num); - pdev = pci_get_domain_bus_and_slot(device_id->segment_num, - device_id->bus_num, devfn); - if (!pdev) - return; - - guard(pci_dev)(pdev); - if (pdev->driver != &cxl_pci_driver) - return; - - cxlds = pci_get_drvdata(pdev); - if (!cxlds) - return; - - /* Fabricate a log type */ - hdr_flags = get_unaligned_le24(rec->event.generic.hdr.flags); - log_type = FIELD_GET(CXL_EVENT_HDR_FLAGS_REC_SEVERITY, hdr_flags); - - cxl_event_trace_record(cxlds->cxlmd, log_type, ev_type, - &uuid_null, &rec->event); -} - -static int __init cxl_pci_driver_init(void) -{ - int rc; - - rc = cxl_cper_register_callback(cxl_cper_event_call); - if (rc) - return rc; - - rc = pci_register_driver(&cxl_pci_driver); - if (rc) - cxl_cper_unregister_callback(cxl_cper_event_call); - - return rc; -} - -static void __exit cxl_pci_driver_exit(void) -{ - pci_unregister_driver(&cxl_pci_driver); - cxl_cper_unregister_callback(cxl_cper_event_call); -} 
- -module_init(cxl_pci_driver_init); -module_exit(cxl_pci_driver_exit); +module_pci_driver(cxl_pci_driver); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(CXL); diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h index 91125eca4c8ab8..03fa6d50d46fe5 100644 --- a/include/linux/cxl-event.h +++ b/include/linux/cxl-event.h @@ -140,22 +140,4 @@ struct cxl_cper_event_rec { union cxl_event event; } __packed; -typedef void (*cxl_cper_callback)(enum cxl_event_type type, - struct cxl_cper_event_rec *rec); - -#ifdef CONFIG_ACPI_APEI_GHES -int cxl_cper_register_callback(cxl_cper_callback callback); -int cxl_cper_unregister_callback(cxl_cper_callback callback); -#else -static inline int cxl_cper_register_callback(cxl_cper_callback callback) -{ - return 0; -} - -static inline int cxl_cper_unregister_callback(cxl_cper_callback callback) -{ - return 0; -} -#endif - #endif /* _LINUX_CXL_EVENT_H */ From 0ab0a5fed47634cef2a49b80a35197880aa96658 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 13 Feb 2024 00:40:01 -0600 Subject: [PATCH 0684/1406] cifs: allow changing password during remount There are cases where a session is disconnected and the password has changed on the server (or expired) for this user, and this currently cannot be fixed without unmounting and mounting again. This patch allows remount to change the password when the session is disconnected and the user cannot reconnect because it is still using the old password. Future patches should also allow us to set up the keyring (cifscreds) to have an "alternate password" so we would be able to change the password before the session drops (without the risk of races between when the password changes and the disconnect occurs - i.e. cases where the old password is still needed because the new password has not fully rolled out to all servers yet). Cc: stable@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 2 ++ fs/smb/client/cifsglob.h | 1 + fs/smb/client/fs_context.c | 23 ++++++++++++++++++----- fs/smb/client/smb2pdu.c | 5 +++++ 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 3e4209f41c18f8..23d2622b969f09 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -488,6 +488,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) ses->ses_count, ses->serverOS, ses->serverNOS, ses->capabilities, ses->ses_status); } + if (ses->expired_pwd) + seq_puts(m, "password no longer valid "); spin_unlock(&ses->ses_lock); seq_printf(m, "\n\tSecurity type: %s ", diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 53c75cfb33ab94..ec9a26bd05a122 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1066,6 +1066,7 @@ struct cifs_ses { enum securityEnum sectype; /* what security flavor was specified? */ bool sign; /* is signing required?
*/ bool domainAuto:1; + bool expired_pwd; /* track if access denied or expired pwd so can know if need to update */ unsigned int flags; __u16 session_flags; __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 4b2f5aa2ea0e1d..99702ab05f8d6e 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -772,7 +772,7 @@ static void smb3_fs_context_free(struct fs_context *fc) */ static int smb3_verify_reconfigure_ctx(struct fs_context *fc, struct smb3_fs_context *new_ctx, - struct smb3_fs_context *old_ctx) + struct smb3_fs_context *old_ctx, bool need_recon) { if (new_ctx->posix_paths != old_ctx->posix_paths) { cifs_errorf(fc, "can not change posixpaths during remount\n"); @@ -798,8 +798,11 @@ static int smb3_verify_reconfigure_ctx(struct fs_context *fc, } if (new_ctx->password && (!old_ctx->password || strcmp(new_ctx->password, old_ctx->password))) { - cifs_errorf(fc, "can not change password during remount\n"); - return -EINVAL; + if (need_recon == false) { + cifs_errorf(fc, + "can not change password of active session during remount\n"); + return -EINVAL; + } } if (new_ctx->domainname && (!old_ctx->domainname || strcmp(new_ctx->domainname, old_ctx->domainname))) { @@ -843,9 +846,14 @@ static int smb3_reconfigure(struct fs_context *fc) struct smb3_fs_context *ctx = smb3_fc2context(fc); struct dentry *root = fc->root; struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb); + struct cifs_ses *ses = cifs_sb_master_tcon(cifs_sb)->ses; + bool need_recon = false; int rc; - rc = smb3_verify_reconfigure_ctx(fc, ctx, cifs_sb->ctx); + if (ses->expired_pwd) + need_recon = true; + + rc = smb3_verify_reconfigure_ctx(fc, ctx, cifs_sb->ctx, need_recon); if (rc) return rc; @@ -858,7 +866,12 @@ static int smb3_reconfigure(struct fs_context *fc) STEAL_STRING(cifs_sb, ctx, UNC); STEAL_STRING(cifs_sb, ctx, source); STEAL_STRING(cifs_sb, ctx, username); - STEAL_STRING_SENSITIVE(cifs_sb, ctx, password); + if (need_recon == false) + STEAL_STRING_SENSITIVE(cifs_sb, ctx, password); + else { + kfree_sensitive(ses->password); + ses->password = kstrdup(ctx->password, GFP_KERNEL); + } STEAL_STRING(cifs_sb, ctx, domainname); STEAL_STRING(cifs_sb, ctx, nodename); STEAL_STRING(cifs_sb, ctx, iocharset); diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 608ee05491e262..a500380d1b2e9d 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -1536,6 +1536,11 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) &sess_data->buf0_type, CIFS_LOG_ERROR | CIFS_SESS_OP, &rsp_iov); cifs_small_buf_release(sess_data->iov[0].iov_base); + if (rc == 0) + sess_data->ses->expired_pwd = false; + else if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) + sess_data->ses->expired_pwd = true; + memcpy(&sess_data->iov[0], &rsp_iov, sizeof(struct kvec)); return rc; From 06e6424703cee7cf66d7c634f48390bffceff211 Mon Sep 17 00:00:00 2001 From: Thomas Bertschinger Date: Thu, 15 Feb 2024 19:44:21 -0700 Subject: [PATCH 0685/1406] bcachefs: omit alignment attribute on big endian struct bkey This is needed for building Rust bindings on big endian architectures like s390x. Currently this is only done in userspace, but it might happen in-kernel in the future. When creating a Rust binding for struct bkey, the "packed" attribute is needed to get a type with the correct member offsets in the big endian case. However, rustc does not allow types to have both a "packed" and "align" attribute. 
Thus, in order to get a Rust type compatible with the C type, we must omit the "aligned" attribute in C. This does not affect the struct's size or member offsets, only its top-level alignment, which should be an acceptable impact. The little endian version can have the "align" attribute because the "packed" attr is redundant, and rust-bindgen will omit the "packed" attr when an "align" attr is present and it can do so without changing a type's layout. Signed-off-by: Thomas Bertschinger Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 37 +++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 1bb24aa7352800..bff8750ac0d743 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -189,7 +189,11 @@ struct bversion { __u32 hi; __u64 lo; #endif -} __packed __aligned(4); +} __packed +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +__aligned(4) +#endif +; struct bkey { /* Size of combined key and value, in u64s */ @@ -222,7 +226,36 @@ struct bkey { __u8 pad[1]; #endif -} __packed __aligned(8); +} __packed +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +/* + * The big-endian version of bkey can't be compiled by rustc with the "aligned" + * attr since it doesn't allow types to have both "packed" and "aligned" attrs. + * So for Rust compatibility, don't include this. It can be included in the LE + * version because the "packed" attr is redundant in that case. + * + * History: (quoting Kent) + * + * Specifically, when I was designing bkey, I wanted the header to be no + * bigger than necessary so that bkey_packed could use the rest. That means that + * decently often extent keys will fit into only 8 bytes, instead of spilling over + * to 16. + * + * But packed_bkey treats the part after the header - the packed section - + * as a single multi word, variable length integer. And bkey, the unpacked + * version, is just a special case version of a bkey_packed; all the packed + * bkey code will work on keys in any packed format, the in-memory + * representation of an unpacked key also is just one type of packed key... + * + * So that constrains the key part of a big endian bkey to start right + * after the header. + * + * If we ever do a bkey_v2 and need to expand the header by another byte for + * some reason - that will clean up this wart.
+ */ +__aligned(8) +#endif +; struct bkey_packed { __u64 _data[0]; From c7eefd04b807932338dea94b7ba6a9d85bb14f84 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 Feb 2024 22:50:42 -0500 Subject: [PATCH 0686/1406] bcachefs: bch2_check_subvolume_structure() Now that we've got bch_subvolume.fs_path_parent, it's easy to write subvolume Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 160 ++++++++++++++++++++++++++++------ fs/bcachefs/fsck.h | 1 + fs/bcachefs/recovery_types.h | 1 + fs/bcachefs/sb-errors_types.h | 4 +- 4 files changed, 138 insertions(+), 28 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 3f74b6769a3838..ffb1bafac2a063 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -342,6 +342,27 @@ static int remove_backpointer(struct btree_trans *trans, return ret; } +static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s) +{ + struct bch_fs *c = trans->c; + + struct bch_inode_unpacked inode; + int ret = bch2_inode_find_by_inum_trans(trans, + (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, + &inode); + if (ret) + return ret; + + ret = remove_backpointer(trans, &inode); + bch_err_msg(c, ret, "removing dirent"); + if (ret) + return ret; + + ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot)); + bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); + return ret; +} + struct snapshots_seen_entry { u32 id; u32 equiv; @@ -2111,6 +2132,107 @@ int bch2_check_root(struct bch_fs *c) return ret; } +typedef DARRAY(u32) darray_u32; + +static bool darray_u32_has(darray_u32 *d, u32 v) +{ + darray_for_each(*d, i) + if (*i == v) + return true; + return false; +} + +/* + * We've checked that inode backpointers point to valid dirents; here, it's + * sufficient to check that the subvolume root has a dirent: + */ +static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s) +{ + struct bch_inode_unpacked inode; + int ret = bch2_inode_find_by_inum_trans(trans, + (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, + &inode); + if (ret) + return ret; + + return inode.bi_dir != 0; +} + +static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct btree_iter parent_iter = {}; + darray_u32 subvol_path = {}; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (k.k->type != KEY_TYPE_subvolume) + return 0; + + while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) { + ret = darray_push(&subvol_path, k.k->p.offset); + if (ret) + goto err; + + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + + ret = subvol_has_dirent(trans, s); + if (ret < 0) + break; + + if (fsck_err_on(!ret, + c, subvol_unreachable, + "unreachable subvolume %s", + (bch2_bkey_val_to_text(&buf, c, s.s_c), + buf.buf))) { + ret = reattach_subvol(trans, s); + break; + } + + u32 parent = le32_to_cpu(s.v->fs_path_parent); + + if (darray_u32_has(&subvol_path, parent)) { + if (fsck_err(c, subvol_loop, "subvolume loop")) + ret = reattach_subvol(trans, s); + break; + } + + bch2_trans_iter_exit(trans, &parent_iter); + bch2_trans_iter_init(trans, &parent_iter, + BTREE_ID_subvolumes, POS(0, parent), 0); + k = bch2_btree_iter_peek_slot(&parent_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, + c, subvol_unreachable, + "unreachable subvolume %s", + (bch2_bkey_val_to_text(&buf, c, s.s_c), + buf.buf))) { + ret = reattach_subvol(trans, s); + break; + } + } +fsck_err: +err: + printbuf_exit(&buf); + 
darray_exit(&subvol_path); + bch2_trans_iter_exit(trans, &parent_iter); + return ret; +} + +int bch2_check_subvolume_structure(struct bch_fs *c) +{ + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_subvol_path(trans, &iter, k))); + bch_err_fn(c, ret); + return ret; +} + struct pathbuf_entry { u64 inum; u32 snapshot; @@ -2127,20 +2249,6 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) return false; } -static int path_down(struct bch_fs *c, pathbuf *p, - u64 inum, u32 snapshot) -{ - int ret = darray_push(p, ((struct pathbuf_entry) { - .inum = inum, - .snapshot = snapshot, - })); - - if (ret) - bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", - p->size); - return ret; -} - /* * Check that a given inode is reachable from the root: * @@ -2191,11 +2299,12 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino if (!S_ISDIR(inode.bi_mode)) break; - ret = path_down(c, p, inode.bi_inum, snapshot); - if (ret) { - bch_err(c, "memory allocation failure"); + ret = darray_push(p, ((struct pathbuf_entry) { + .inum = inode.bi_inum, + .snapshot = snapshot, + })); + if (ret) return ret; - } snapshot = parent_snapshot; @@ -2222,18 +2331,15 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino pr_err("%llu:%u", i->inum, i->snapshot); pr_err("%llu:%u", inode.bi_inum, snapshot); - if (!fsck_err(c, dir_loop, "directory structure loop")) - return 0; - - ret = remove_backpointer(trans, &inode); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + if (fsck_err(c, dir_loop, "directory structure loop")) { + ret = remove_backpointer(trans, &inode); bch_err_msg(c, ret, "removing dirent"); - if (ret) - break; + if (ret) + break; - ret = reattach_inode(trans, &inode, snapshot); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = reattach_inode(trans, &inode, snapshot); bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); + } break; } } diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index da991e8cf27eb4..a4ef9427178433 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -8,6 +8,7 @@ int bch2_check_indirect_extents(struct bch_fs *); int bch2_check_dirents(struct bch_fs *); int bch2_check_xattrs(struct bch_fs *); int bch2_check_root(struct bch_fs *); +int bch2_check_subvolume_structure(struct bch_fs *); int bch2_check_directory_structure(struct bch_fs *); int bch2_check_nlinks(struct bch_fs *); int bch2_fix_reflink_p(struct bch_fs *); diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h index f0fc1dbb723929..1361e34d4e64c2 100644 --- a/fs/bcachefs/recovery_types.h +++ b/fs/bcachefs/recovery_types.h @@ -44,6 +44,7 @@ x(check_dirents, 27, PASS_FSCK) \ x(check_xattrs, 28, PASS_FSCK) \ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ + x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ x(check_nlinks, 31, PASS_FSCK) \ x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \ diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 1530bd35b94572..0df4b0e7071ae4 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -262,7 +262,9 @@ x(subvol_fs_path_parent_wrong, 254) \ x(subvol_root_fs_path_parent_nonzero, 255) \ x(subvol_children_not_set, 256) \ - x(subvol_children_bad, 257) + x(subvol_children_bad, 257) \ + x(subvol_loop, 258) \ + 
x(subvol_unreachable, 259) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, From 6ec1c937b71a0f9d809be9c5de108fd7ebb004a2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 Feb 2024 23:59:05 -0500 Subject: [PATCH 0687/1406] bcachefs: check_path() now only needs to walk up to subvolume root Now that checking subvolume structure is a separate pass, the main check_directory_connectivity() pass only needs to walk up to a given inode's subvolume root. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index ffb1bafac2a063..144f074bbc6ce7 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2250,7 +2250,8 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) } /* - * Check that a given inode is reachable from the root: + * Check that a given inode is reachable from its subvolume root - we already + * verified subvolume connectivity: * * XXX: we should also be verifying that inodes are in the right subvolumes */ @@ -2267,8 +2268,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino BUG_ON(bch2_inode_unpack(inode_k, &inode)); - while (!(inode.bi_inum == BCACHEFS_ROOT_INO && - inode.bi_subvol == BCACHEFS_ROOT_SUBVOL)) { + while (!inode.bi_subvol) { struct btree_iter dirent_iter; struct bkey_s_c_dirent d; u32 parent_snapshot = snapshot; From d0cbb79037bb95296a2265df932f170859b9cbad Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 16 Feb 2024 09:19:17 +0100 Subject: [PATCH 0688/1406] MAINTAINERS: repair file entries in THREAD WITH FILE Commit ead021e0fe5b ("thread_with_file: Lift from bcachefs") adds the section THREAD WITH FILE with file entries to the relevant header files thread_with_file.h and thread_with_file_types.h in include/linux/. The commit however unintentionally refers to files with a .c extension, but the header files are of course with .h extension. Fortunately, the script './scripts/get_maintainer.pl --self-test=patterns' notices that. Adjust the file entries to use the right extension. Signed-off-by: Lukas Bulwahn Signed-off-by: Kent Overstreet --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 97d85d9480835a..3fdcc44c82d59a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21884,8 +21884,8 @@ M: Kent Overstreet M: Darrick J. Wong L: linux-bcachefs@vger.kernel.org S: Maintained -F: include/linux/thread_with_file.c -F: include/linux/thread_with_file_types.c +F: include/linux/thread_with_file.h +F: include/linux/thread_with_file_types.h F: lib/thread_with_file.c THUNDERBOLT DMA TRAFFIC TEST DRIVER From 2fcb344fe7314a89b0e420705244d820de7cf885 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Feb 2024 20:03:12 -0500 Subject: [PATCH 0689/1406] bcachefs: more informative write path error message Signed-off-by: Kent Overstreet --- fs/bcachefs/io_write.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 3fa2cb1d5b13aa..f7c4a428c17b06 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -530,7 +530,8 @@ static void __bch2_write_index(struct bch_write_op *op) bch_err_inum_offset_ratelimited(c, insert->k.p.inode, insert->k.p.offset << 9, - "write error while doing btree update: %s", + "%s write error while doing btree update: %s", + op->flags & BCH_WRITE_MOVE ? 
"move" : "user", bch2_err_str(ret)); } @@ -1067,7 +1068,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, *_dst = dst; return more; csum_err: - bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)"); + bch_err(c, "%s writ error: error verifying existing checksum while rewriting existing data (memory corruption?)", + op->flags & BCH_WRITE_MOVE ? "move" : "user"); ret = -EIO; err: if (to_wbio(dst)->bounce) @@ -1169,7 +1171,8 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) bch_err_inum_offset_ratelimited(c, insert->k.p.inode, insert->k.p.offset << 9, - "write error while doing btree update: %s", + "%s write error while doing btree update: %s", + op->flags & BCH_WRITE_MOVE ? "move" : "user", bch2_err_str(ret)); } @@ -1449,7 +1452,9 @@ static void __bch2_write(struct bch_write_op *op) bch_err_inum_offset_ratelimited(c, op->pos.inode, op->pos.offset << 9, - "%s(): error: %s", __func__, bch2_err_str(ret)); + "%s(): %s error: %s", __func__, + op->flags & BCH_WRITE_MOVE ? "move" : "user", + bch2_err_str(ret)); op->error = ret; break; } @@ -1573,7 +1578,8 @@ CLOSURE_CALLBACK(bch2_write) bch_err_inum_offset_ratelimited(c, op->pos.inode, op->pos.offset << 9, - "misaligned write"); + "%s write error: misaligned write", + op->flags & BCH_WRITE_MOVE ? "move" : "user"); op->error = -EIO; goto err; } From cbe85240650ebb2e7d1f184b4923b3767f2f2d6a Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Fri, 19 Jan 2024 00:27:44 +1300 Subject: [PATCH 0690/1406] bcachefs: rebalance_status now shows correct units Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/rebalance.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 22d1017aa49b97..56336f3dd1d077 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -412,11 +412,11 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) u64 now = atomic64_read(&c->io_clock[WRITE].now); prt_str(out, "io wait duration: "); - bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start); + bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9); prt_newline(out); prt_str(out, "io wait remaining: "); - bch2_prt_human_readable_s64(out, r->wait_iotime_end - now); + bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9); prt_newline(out); prt_str(out, "duration waited: "); From 1dab26a5b933b98d80fa61535b6f97eea3605595 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Feb 2024 23:50:05 -0500 Subject: [PATCH 0691/1406] bcachefs: Drop redundant btree_path_downgrade()s If a path doesn't have any active references, we shouldn't downgrade it; it'll either be reused, possibly with intent refs again, or dropped at bch2_trans_begin() time. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 68439744233810..b9b151e693ed60 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -747,7 +747,8 @@ void bch2_trans_downgrade(struct btree_trans *trans) return; trans_for_each_path(trans, path, i) - bch2_btree_path_downgrade(trans, path); + if (path->ref) + bch2_btree_path_downgrade(trans, path); } int bch2_trans_relock(struct btree_trans *trans) From 01582bd0dd763584b6ed5af7b841e21ca82aac04 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Feb 2024 03:26:19 -0500 Subject: [PATCH 0692/1406] bcachefs: improve bch2_journal_buf_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 214c8030048292..0a4b28734a2501 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -53,33 +53,48 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 unsigned i = seq & JOURNAL_BUF_MASK; struct journal_buf *buf = j->buf + i; - prt_printf(out, "seq:"); + prt_str(out, "seq:"); prt_tab(out); prt_printf(out, "%llu", seq); prt_newline(out); printbuf_indent_add(out, 2); - prt_printf(out, "refcount:"); + prt_str(out, "refcount:"); prt_tab(out); prt_printf(out, "%u", journal_state_count(s, i)); prt_newline(out); - prt_printf(out, "size:"); + prt_str(out, "size:"); prt_tab(out); prt_human_readable_u64(out, vstruct_bytes(buf->data)); prt_newline(out); - prt_printf(out, "expires"); + prt_str(out, "expires:"); prt_tab(out); prt_printf(out, "%li jiffies", buf->expires - jiffies); prt_newline(out); + prt_str(out, "flags:"); + prt_tab(out); + if (buf->noflush) + prt_str(out, "noflush "); + if (buf->must_flush) + prt_str(out, "must_flush "); + if (buf->separate_flush) + prt_str(out, "separate_flush "); + if (buf->need_flush_to_write_buffer) + prt_str(out, "need_flush_to_write_buffer "); + if (buf->need_flush_to_write_buffer) + prt_str(out, "need_flush_to_write_buffer "); + if (buf->write_done) + prt_str(out, "write done "); + if (buf->write_started) + prt_str(out, "write started "); + if (buf->write_allocated) + prt_str(out, "write allocated "); if (buf->write_done) - prt_printf(out, "write done\n"); - else if (buf->write_allocated) - prt_printf(out, "write allocated\n"); - else if (buf->write_started) - prt_printf(out, "write started\n"); + prt_str(out, "write done"); + prt_newline(out); printbuf_indent_sub(out, 2); } From 83f3d45a447f213cc76eb530ef8ba7a3a2c09910 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Feb 2024 01:08:25 -0500 Subject: [PATCH 0693/1406] bcachefs: Split out discard fastpath Buckets usually can't be discarded until the transaction that made them empty has been committed in the journal. Tracing has indicated that we're queuing the discard worker excessively, only for it to skip over many buckets that are still waiting on a journal commit, discarding only one or two buckets per iteration. 
We want to switch to only queuing the discard worker after a journal flush write, but there's an important optimization we need to preserve: if a bucket becomes empty and it was never committed in the journal while it was in use, we want to discard it and reuse it right away - since overwriting it before the previous writes are flushed from the device cache means those writes only cost bus bandwidth. So, this patch implements a fast path for buckets that can be discarded right away. We need new locking between the two discard workers; the new list of buckets being discarded provides that locking. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 145 +++++++++++++++++++++++++++++++-- fs/bcachefs/alloc_background.h | 1 + fs/bcachefs/bcachefs.h | 6 +- 3 files changed, 145 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index c7be6afe89553b..4de4036ded069c 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -29,6 +29,8 @@ #include #include +static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket); + /* Persistent alloc info: */ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { @@ -866,14 +868,14 @@ int bch2_trigger_alloc(struct btree_trans *trans, #define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) #define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk) - if (statechange(a->data_type == BCH_DATA_free && - bucket_flushed(a))) + if (statechange(a->data_type == BCH_DATA_free) && + bucket_flushed(new_a)) closure_wake_up(&c->freelist_wait); - if (statechange(a->data_type == BCH_DATA_need_discard && - bucket_flushed(a)) && - !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset)) - bch2_do_discards(c); + if (statechange(a->data_type == BCH_DATA_need_discard) && + !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && + bucket_flushed(new_a)) + bch2_discard_one_bucket_fast(c, new.k->p); if (statechange(a->data_type == BCH_DATA_cached) && !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && @@ -1609,6 +1611,36 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) return ret; } +static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket) +{ + int ret; + + mutex_lock(&c->discard_buckets_in_flight_lock); + darray_for_each(c->discard_buckets_in_flight, i) + if (bkey_eq(*i, bucket)) { + ret = -EEXIST; + goto out; + } + + ret = darray_push(&c->discard_buckets_in_flight, bucket); +out: + mutex_unlock(&c->discard_buckets_in_flight_lock); + return ret; +} + +static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket) +{ + mutex_lock(&c->discard_buckets_in_flight_lock); + darray_for_each(c->discard_buckets_in_flight, i) + if (bkey_eq(*i, bucket)) { + darray_remove_item(&c->discard_buckets_in_flight, i); + goto found; + } + BUG(); +found: + mutex_unlock(&c->discard_buckets_in_flight_lock); +} + struct discard_buckets_state { u64 seen; u64 open; @@ -1647,6 +1679,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, struct bch_dev *ca; struct bkey_i_alloc_v4 *a; struct printbuf buf = PRINTBUF; + bool discard_locked = false; int ret = 0; ca = bch_dev_bkey_exists(c, pos.inode); @@ -1714,6 +1747,11 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } + if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true))) + goto out; + + discard_locked = true; + if (!bkey_eq(*discard_pos_done, iter.pos) && ca->mi.discard && !c->opts.nochanges)
{ /* @@ -1745,6 +1783,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, count_event(c, bucket_discard); s->discarded++; out: + if (discard_locked) + discard_in_flight_remove(c, iter.pos); s->seen++; bch2_trans_iter_exit(trans, &iter); percpu_ref_put(&ca->io_ref); @@ -1784,6 +1824,92 @@ void bch2_do_discards(struct bch_fs *c) bch2_write_ref_put(c, BCH_WRITE_REF_discard); } +static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) +{ + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_INTENT); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); + if (ret) + goto err; + + struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto err; + + SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); + a->v.data_type = alloc_data_type(a->v, a->v.data_type); + + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static void bch2_do_discards_fast_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work); + + while (1) { + bool got_bucket = false; + struct bpos bucket; + struct bch_dev *ca; + + mutex_lock(&c->discard_buckets_in_flight_lock); + darray_for_each(c->discard_buckets_in_flight, i) { + if (i->snapshot) + continue; + + ca = bch_dev_bkey_exists(c, i->inode); + + if (!percpu_ref_tryget(&ca->io_ref)) { + darray_remove_item(&c->discard_buckets_in_flight, i); + continue; + } + + got_bucket = true; + bucket = *i; + i->snapshot = true; + break; + } + mutex_unlock(&c->discard_buckets_in_flight_lock); + + if (!got_bucket) + break; + + blkdev_issue_discard(ca->disk_sb.bdev, + bucket.offset * ca->mi.bucket_size, + ca->mi.bucket_size, + GFP_KERNEL); + + int ret = bch2_trans_do(c, NULL, NULL, + BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_enospc, + bch2_clear_bucket_needs_discard(trans, bucket)); + bch_err_fn(c, ret); + + percpu_ref_put(&ca->io_ref); + discard_in_flight_remove(c, bucket); + + if (ret) + break; + } + + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); +} + +static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); + + if (!percpu_ref_is_dying(&ca->io_ref) && + !discard_in_flight_add(c, bucket) && + bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) && + !queue_work(c->write_ref_wq, &c->discard_fast_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); +} + static int invalidate_one_bucket(struct btree_trans *trans, struct btree_iter *lru_iter, struct bkey_s_c lru_k, @@ -2215,9 +2341,16 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) set_bit(ca->dev_idx, c->rw_devs[i].d); } +void bch2_fs_allocator_background_exit(struct bch_fs *c) +{ + darray_exit(&c->discard_buckets_in_flight); +} + void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); + mutex_init(&c->discard_buckets_in_flight_lock); INIT_WORK(&c->discard_work, bch2_do_discards_work); + INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work); INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index e7f7e842ee1b72..052b2fac25d693 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -269,6 +269,7 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *); void bch2_dev_allocator_remove(struct bch_fs *, struct 
bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); +void bch2_fs_allocator_background_exit(struct bch_fs *); void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 4d04e9c04dc348..13b8a16c18dca7 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -708,6 +708,7 @@ struct btree_trans_buf { x(reflink) \ x(fallocate) \ x(discard) \ + x(discard_fast) \ x(invalidate) \ x(delete_dead_snapshots) \ x(snapshot_delete_pagecache) \ @@ -943,8 +944,11 @@ struct bch_fs { unsigned write_points_nr; struct buckets_waiting_for_journal buckets_waiting_for_journal; - struct work_struct discard_work; struct work_struct invalidate_work; + struct work_struct discard_work; + struct mutex discard_buckets_in_flight_lock; + DARRAY(struct bpos) discard_buckets_in_flight; + struct work_struct discard_fast_work; /* GARBAGE COLLECTION */ struct task_struct *gc_thread; From f991daa5e13fd95a74c88ea65de44a3a16dab843 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Feb 2024 17:54:39 -0500 Subject: [PATCH 0694/1406] bcachefs: Fix journal_buf bitfield accesses All journal_buf bitfield updates must happen under the journal lock - perhaps we should just switch these to atomic bit flags. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_write_buffer.c | 2 ++ fs/bcachefs/journal_io.c | 17 +++++++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index ac784486196636..a7d86252690a72 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -590,7 +590,9 @@ static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_bu entry->type = BCH_JSET_ENTRY_btree_keys; } + spin_lock(&c->journal.lock); buf->need_flush_to_write_buffer = false; + spin_unlock(&c->journal.lock); out: bch2_journal_keys_to_write_buffer_end(c, &dst); return ret; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 16c1249c84e09e..f9e5b100a9da5c 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1830,7 +1830,10 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) if (wb.wb) bch2_journal_keys_to_write_buffer_end(c, &wb); + + spin_lock(&c->journal.lock); w->need_flush_to_write_buffer = false; + spin_unlock(&c->journal.lock); start = end = vstruct_last(jset); @@ -1948,12 +1951,20 @@ CLOSURE_CALLBACK(bch2_journal_write) unsigned nr_rw_members = 0; int ret; + for_each_rw_member(c, ca) + nr_rw_members++; + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + BUG_ON(!w->write_started); BUG_ON(w->write_allocated); + BUG_ON(w->write_done); j->write_start_time = local_clock(); spin_lock(&j->lock); + if (nr_rw_members > 1) + w->separate_flush = true; + ret = bch2_journal_write_pick_flush(j, w); spin_unlock(&j->lock); if (ret) @@ -2008,12 +2019,6 @@ CLOSURE_CALLBACK(bch2_journal_write) if (c->opts.nochanges) goto no_io; - for_each_rw_member(c, ca) - nr_rw_members++; - - if (nr_rw_members > 1) - w->separate_flush = true; - /* * Mark journal replicas before we submit the write to guarantee * recovery will find the journal entries after a crash.
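To see why the lock is needed, here is a minimal sketch (illustrative C, not taken from the patch) of the race on adjacent bitfields: the flags share one machine word, so an unlocked store to either one is a read-modify-write of the whole word.

struct jbuf_flags {
	unsigned	noflush:1;
	unsigned	need_flush_to_write_buffer:1;
};

/*
 * CPU0: b->noflush = true;
 * CPU1: b->need_flush_to_write_buffer = false;
 *
 * Each CPU loads the shared word, modifies its own bit, and stores the
 * word back; whichever store lands last silently undoes the other
 * update. Hence every journal_buf bitfield write must hold the journal
 * lock - or, as the commit message suggests, the fields would have to
 * become atomic set_bit()/clear_bit() flags.
 */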
From 21e3af8db1dc753c42df6adcd8b6458473421327 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Feb 2024 19:56:19 -0500 Subject: [PATCH 0695/1406] bcachefs: Add journal.blocked to journal_debug_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 0a4b28734a2501..46dc25ad95e753 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1435,6 +1435,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0); + prt_printf(out, "blocked:\t\t%u\n", j->blocked); prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); prt_printf(out, "current entry:\t\t"); From 9eb1987bc78499129662a436f2e54ca1a98b714e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Feb 2024 20:38:47 -0500 Subject: [PATCH 0696/1406] bcachefs: Fix bch2_journal_flush_device_pins() If a journal write errored, the list of devices it was written to could be empty - we're not supposed to mark an empty replicas list. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index a71550816c3040..ab811c0dad26ac 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -887,9 +887,11 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) journal_seq_pin(j, seq)->devs); seq++; - spin_unlock(&j->lock); - ret = bch2_mark_replicas(c, &replicas.e); - spin_lock(&j->lock); + if (replicas.e.nr_devs) { + spin_unlock(&j->lock); + ret = bch2_mark_replicas(c, &replicas.e); + spin_lock(&j->lock); + } } spin_unlock(&j->lock); err: From c887148ebf9989ce8bdf6f814d4342ba5bf465fa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Feb 2024 20:49:11 -0500 Subject: [PATCH 0697/1406] thread_with_file: add f_ops.flush Add a flush op, to return the exit code via close(). Also update bcachefs usage to use this to return fsck exit codes. 
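For illustration, a hedged userspace sketch of how the exit code surfaces (how 'fd' is obtained - e.g. from the bcachefs fsck ioctls - is assumed and not shown): drain the thread's output until EOF, then take the return value of close(), which invokes f_ops.flush and hands back the thread's return code.

#include <stdio.h>
#include <unistd.h>

static int drain_and_close(int fd)
{
	char buf[4096];
	ssize_t n;

	/* Read the kernel thread's stdout until it exits. */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);

	/*
	 * close() runs f_ops.flush, which returns the stored exit code;
	 * for offline fsck these follow fsck(8) conventions
	 * (1 = errors fixed, 4 = errors remain).
	 */
	return close(fd);
}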
Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 25 ++++++++++++++++++++----- include/linux/thread_with_file.h | 2 +- lib/thread_with_file.c | 12 +++++++++++- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index a2f30f45f93f75..992939152f0190 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -155,14 +155,28 @@ static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) kfree(thr); } -static void bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) +static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) { struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts); - thr->thr.thr.ret = PTR_ERR_OR_ZERO(c); - if (!thr->thr.thr.ret) - bch2_fs_stop(c); + if (IS_ERR(c)) + return PTR_ERR(c); + + int ret = 0; + if (test_bit(BCH_FS_errors_fixed, &c->flags)) + ret |= 1; + if (test_bit(BCH_FS_error, &c->flags)) + ret |= 4; + + bch2_fs_stop(c); + + if (ret & 1) + stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); + if (ret & 4) + stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); + + return ret; } static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { @@ -763,7 +777,7 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, return ret; } -static void bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) +static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) { struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); struct bch_fs *c = thr->c; @@ -795,6 +809,7 @@ static void bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) up(&c->online_fsck_mutex); bch2_ro_ref_put(c); + return ret; } static const struct thread_with_stdio_ops bch2_online_fsck_ops = { diff --git a/include/linux/thread_with_file.h b/include/linux/thread_with_file.h index 33770938d5d9a8..cf44337af3e973 100644 --- a/include/linux/thread_with_file.h +++ b/include/linux/thread_with_file.h @@ -56,7 +56,7 @@ struct thread_with_stdio; struct thread_with_stdio_ops { void (*exit)(struct thread_with_stdio *); - void (*fn)(struct thread_with_stdio *); + int (*fn)(struct thread_with_stdio *); long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long); }; diff --git a/lib/thread_with_file.c b/lib/thread_with_file.c index 37a1ea22823cae..4f60ce7287ccf7 100644 --- a/lib/thread_with_file.c +++ b/lib/thread_with_file.c @@ -381,6 +381,14 @@ static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_str return mask; } +static int thread_with_stdio_flush(struct file *file, fl_owner_t id) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + return thr->thr.ret; +} + static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p) { struct thread_with_stdio *thr = @@ -396,6 +404,7 @@ static const struct file_operations thread_with_stdio_fops = { .read = thread_with_stdio_read, .write = thread_with_stdio_write, .poll = thread_with_stdio_poll, + .flush = thread_with_stdio_flush, .release = thread_with_stdio_release, .unlocked_ioctl = thread_with_stdio_ioctl, }; @@ -404,6 +413,7 @@ static const struct file_operations thread_with_stdout_fops = { .llseek = no_llseek, .read = thread_with_stdio_read, .poll = thread_with_stdout_poll, + .flush = thread_with_stdio_flush, .release = thread_with_stdio_release, .unlocked_ioctl = 
thread_with_stdio_ioctl, }; @@ -412,7 +422,7 @@ static int thread_with_stdio_fn(void *arg) { struct thread_with_stdio *thr = arg; - thr->ops->fn(thr); + thr->thr.ret = thr->ops->fn(thr); thread_with_stdio_done(thr); return 0; From a4406eb87b22e7f03a21ad8e99abbf95b01966f4 Mon Sep 17 00:00:00 2001 From: Artur Weber Date: Sat, 17 Feb 2024 20:02:47 +0100 Subject: [PATCH 0698/1406] ARM: dts: exynos4212-tab3: limit usable memory range The stock bootloader on the Samsung Galaxy Tab 3 8.0 provides an incorrect available memory range over ATAG_MEM. Limit the usable memory in the DTS to prevent it from doing so, without having to disable ATAG support. Signed-off-by: Artur Weber Reviewed-by: Henrik Grimler Link: https://lore.kernel.org/r/20240217-tab3-limit-usable-memory-range-v1-1-49cc9c86a5cc@gmail.com Signed-off-by: Krzysztof Kozlowski --- arch/arm/boot/dts/samsung/exynos4212-tab3.dtsi | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm/boot/dts/samsung/exynos4212-tab3.dtsi b/arch/arm/boot/dts/samsung/exynos4212-tab3.dtsi index d7954ff466b491..8b2a7eba8de2da 100644 --- a/arch/arm/boot/dts/samsung/exynos4212-tab3.dtsi +++ b/arch/arm/boot/dts/samsung/exynos4212-tab3.dtsi @@ -45,6 +45,12 @@ /* Default S-BOOT bootloader loads initramfs here */ linux,initrd-start = <0x42000000>; linux,initrd-end = <0x42800000>; + + /* + * Stock bootloader provides incorrect memory size in ATAG_MEM; + * override it here + */ + linux,usable-memory-range = <0x40000000 0x3fc00000>; }; firmware@204f000 { From 5fb1252944fc3bba4a1026adaa9d150cfe8a3e16 Mon Sep 17 00:00:00 2001 From: Artur Weber Date: Sat, 17 Feb 2024 19:22:40 +0100 Subject: [PATCH 0699/1406] ARM: dts: samsung: exynos4412: decrease memory to account for unusable region The last 4 MiB of RAM on those devices is likely used by trustzone firmware, and is unusable under Linux. Change the device tree memory node accordingly. The proprietary bootloader (S-BOOT) passes these memory ranges through ATAG_MEM; this change allows us to have the correct memory ranges without relying on ATAG_MEM. 
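For reference, the reg size arithmetic (a worked note based only on the values in the diffs below): each new size is the old size minus the reserved 4 MiB (0x400000), so 0x80000000 (2 GiB) becomes 0x80000000 - 0x400000 = 0x7fc00000, and 0x40000000 (1 GiB) becomes 0x40000000 - 0x400000 = 0x3fc00000.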
Tested-by: Henrik Grimler # i9300, i9305 Signed-off-by: Artur Weber Link: https://lore.kernel.org/r/20240217-exynos4-memsize-fix-v1-1-7858e9c5f844@gmail.com Signed-off-by: Krzysztof Kozlowski --- arch/arm/boot/dts/samsung/exynos4412-i9300.dts | 2 +- arch/arm/boot/dts/samsung/exynos4412-i9305.dts | 2 +- arch/arm/boot/dts/samsung/exynos4412-n710x.dts | 2 +- arch/arm/boot/dts/samsung/exynos4412-p4note.dtsi | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/boot/dts/samsung/exynos4412-i9300.dts b/arch/arm/boot/dts/samsung/exynos4412-i9300.dts index 61aca5798f388d..b79d456e976dfd 100644 --- a/arch/arm/boot/dts/samsung/exynos4412-i9300.dts +++ b/arch/arm/boot/dts/samsung/exynos4412-i9300.dts @@ -18,7 +18,7 @@ memory@40000000 { device_type = "memory"; - reg = <0x40000000 0x40000000>; + reg = <0x40000000 0x3fc00000>; }; }; diff --git a/arch/arm/boot/dts/samsung/exynos4412-i9305.dts b/arch/arm/boot/dts/samsung/exynos4412-i9305.dts index 77083f1a827314..1048ef5d9bc3ba 100644 --- a/arch/arm/boot/dts/samsung/exynos4412-i9305.dts +++ b/arch/arm/boot/dts/samsung/exynos4412-i9305.dts @@ -11,7 +11,7 @@ memory@40000000 { device_type = "memory"; - reg = <0x40000000 0x80000000>; + reg = <0x40000000 0x7fc00000>; }; }; diff --git a/arch/arm/boot/dts/samsung/exynos4412-n710x.dts b/arch/arm/boot/dts/samsung/exynos4412-n710x.dts index 0a151437fc7349..eee1000dea9227 100644 --- a/arch/arm/boot/dts/samsung/exynos4412-n710x.dts +++ b/arch/arm/boot/dts/samsung/exynos4412-n710x.dts @@ -9,7 +9,7 @@ memory@40000000 { device_type = "memory"; - reg = <0x40000000 0x80000000>; + reg = <0x40000000 0x7fc00000>; }; /* bootargs are passed in by bootloader */ diff --git a/arch/arm/boot/dts/samsung/exynos4412-p4note.dtsi b/arch/arm/boot/dts/samsung/exynos4412-p4note.dtsi index 39a3d1cbe4c3b5..28a6058027335e 100644 --- a/arch/arm/boot/dts/samsung/exynos4412-p4note.dtsi +++ b/arch/arm/boot/dts/samsung/exynos4412-p4note.dtsi @@ -23,7 +23,7 @@ memory@40000000 { device_type = "memory"; - reg = <0x40000000 0x80000000>; + reg = <0x40000000 0x7fc00000>; }; aliases { From 27ce5347f4ef5e76c6cd98dfc57bd21828224087 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 19 Feb 2024 08:49:10 +0100 Subject: [PATCH 0700/1406] Revert "ARM: dts: exynos4212-tab3: limit usable memory range" This reverts commit a4406eb87b22e7f03a21ad8e99abbf95b01966f4. Temporary fix for the linux-next branch. This won't be sent to upstream maintainers. Signed-off-by: Krzysztof Kozlowski --- arch/arm/boot/dts/samsung/exynos4212-tab3.dtsi | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/arm/boot/dts/samsung/exynos4212-tab3.dtsi b/arch/arm/boot/dts/samsung/exynos4212-tab3.dtsi index 9bc05961577dca..e5254e32aa8fc3 100644 --- a/arch/arm/boot/dts/samsung/exynos4212-tab3.dtsi +++ b/arch/arm/boot/dts/samsung/exynos4212-tab3.dtsi @@ -45,12 +45,6 @@ /* Default S-BOOT bootloader loads initramfs here */ linux,initrd-start = <0x42000000>; linux,initrd-end = <0x42800000>; - - /* - * Stock bootloader provides incorrect memory size in ATAG_MEM; - * override it here - */ - linux,usable-memory-range = <0x40000000 0x3fc00000>; }; firmware@204f000 { From b43b68935124f2e7558835f3e2f13d28afce1a31 Mon Sep 17 00:00:00 2001 From: Artur Weber Date: Sat, 17 Feb 2024 19:22:40 +0100 Subject: [PATCH 0701/1406] ARM: dts: samsung: exynos4412: decrease memory to account for unusable region The last 4 MiB of RAM on those devices is likely used by trustzone firmware, and is unusable under Linux. Change the device tree memory node accordingly.
The proprietary bootloader (S-BOOT) passes these memory ranges through ATAG_MEM; this change allows us to have the correct memory ranges without relying on ATAG_MEM. Tested-by: Henrik Grimler # i9300, i9305 Signed-off-by: Artur Weber Link: https://lore.kernel.org/r/20240217-exynos4-memsize-fix-v1-1-7858e9c5f844@gmail.com Signed-off-by: Krzysztof Kozlowski --- arch/arm/boot/dts/samsung/exynos4412-i9300.dts | 2 +- arch/arm/boot/dts/samsung/exynos4412-i9305.dts | 2 +- arch/arm/boot/dts/samsung/exynos4412-n710x.dts | 2 +- arch/arm/boot/dts/samsung/exynos4412-p4note.dtsi | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/boot/dts/samsung/exynos4412-i9300.dts b/arch/arm/boot/dts/samsung/exynos4412-i9300.dts index 61aca5798f388d..b79d456e976dfd 100644 --- a/arch/arm/boot/dts/samsung/exynos4412-i9300.dts +++ b/arch/arm/boot/dts/samsung/exynos4412-i9300.dts @@ -18,7 +18,7 @@ memory@40000000 { device_type = "memory"; - reg = <0x40000000 0x40000000>; + reg = <0x40000000 0x3fc00000>; }; }; diff --git a/arch/arm/boot/dts/samsung/exynos4412-i9305.dts b/arch/arm/boot/dts/samsung/exynos4412-i9305.dts index 77083f1a827314..1048ef5d9bc3ba 100644 --- a/arch/arm/boot/dts/samsung/exynos4412-i9305.dts +++ b/arch/arm/boot/dts/samsung/exynos4412-i9305.dts @@ -11,7 +11,7 @@ memory@40000000 { device_type = "memory"; - reg = <0x40000000 0x80000000>; + reg = <0x40000000 0x7fc00000>; }; }; diff --git a/arch/arm/boot/dts/samsung/exynos4412-n710x.dts b/arch/arm/boot/dts/samsung/exynos4412-n710x.dts index 0a151437fc7349..eee1000dea9227 100644 --- a/arch/arm/boot/dts/samsung/exynos4412-n710x.dts +++ b/arch/arm/boot/dts/samsung/exynos4412-n710x.dts @@ -9,7 +9,7 @@ memory@40000000 { device_type = "memory"; - reg = <0x40000000 0x80000000>; + reg = <0x40000000 0x7fc00000>; }; /* bootargs are passed in by bootloader */ diff --git a/arch/arm/boot/dts/samsung/exynos4412-p4note.dtsi b/arch/arm/boot/dts/samsung/exynos4412-p4note.dtsi index 39a3d1cbe4c3b5..28a6058027335e 100644 --- a/arch/arm/boot/dts/samsung/exynos4412-p4note.dtsi +++ b/arch/arm/boot/dts/samsung/exynos4412-p4note.dtsi @@ -23,7 +23,7 @@ memory@40000000 { device_type = "memory"; - reg = <0x40000000 0x80000000>; + reg = <0x40000000 0x7fc00000>; }; aliases { From 13636d5502204e671398470962babbfb46bc2721 Mon Sep 17 00:00:00 2001 From: Pierre-Hugues Husson Date: Tue, 13 Feb 2024 18:14:55 +0100 Subject: [PATCH 0702/1406] arm64: dts: amlogic: add fbx8am DT overlays Add support for two variants of the fbx8am board. 
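The Makefile hunk below uses the kernel's <name>-dtbs mechanism: each listed .dtb is built by applying the variant's .dtbo overlay on top of the shared meson-g12a-fbx8am.dtb base. As an illustration (not part of the patch), the same merge can be reproduced by hand with dtc's fdtoverlay tool: fdtoverlay -i meson-g12a-fbx8am.dtb -o meson-g12a-fbx8am-brcm.dtb meson-g12a-fbx8am-brcm.dtbo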
Signed-off-by: Pierre-Hugues Husson Signed-off-by: Marc Gonzalez Link: https://lore.kernel.org/r/79ba726d-d02c-44b9-b6f6-59b17ba9755c@freebox.fr Signed-off-by: Neil Armstrong --- arch/arm64/boot/dts/amlogic/Makefile | 6 ++++ .../dts/amlogic/meson-g12a-fbx8am-brcm.dtso | 35 +++++++++++++++++++ .../amlogic/meson-g12a-fbx8am-realtek.dtso | 25 +++++++++++++ 3 files changed, 66 insertions(+) create mode 100644 arch/arm64/boot/dts/amlogic/meson-g12a-fbx8am-brcm.dtso create mode 100644 arch/arm64/boot/dts/amlogic/meson-g12a-fbx8am-realtek.dtso diff --git a/arch/arm64/boot/dts/amlogic/Makefile b/arch/arm64/boot/dts/amlogic/Makefile index cc8b34bd583d84..1ab160bf928ae4 100644 --- a/arch/arm64/boot/dts/amlogic/Makefile +++ b/arch/arm64/boot/dts/amlogic/Makefile @@ -8,6 +8,8 @@ dtb-$(CONFIG_ARCH_MESON) += meson-axg-jethome-jethub-j100.dtb dtb-$(CONFIG_ARCH_MESON) += meson-axg-jethome-jethub-j110-rev-2.dtb dtb-$(CONFIG_ARCH_MESON) += meson-axg-jethome-jethub-j110-rev-3.dtb dtb-$(CONFIG_ARCH_MESON) += meson-axg-s400.dtb +dtb-$(CONFIG_ARCH_MESON) += meson-g12a-fbx8am-brcm.dtb +dtb-$(CONFIG_ARCH_MESON) += meson-g12a-fbx8am-realtek.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12a-radxa-zero.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12a-sei510.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12a-u200.dtb @@ -80,3 +82,7 @@ dtb-$(CONFIG_ARCH_MESON) += meson-sm1-odroid-hc4.dtb dtb-$(CONFIG_ARCH_MESON) += meson-sm1-sei610.dtb dtb-$(CONFIG_ARCH_MESON) += meson-sm1-x96-air-gbit.dtb dtb-$(CONFIG_ARCH_MESON) += meson-sm1-x96-air.dtb + +# Overlays +meson-g12a-fbx8am-brcm-dtbs := meson-g12a-fbx8am.dtb meson-g12a-fbx8am-brcm.dtbo +meson-g12a-fbx8am-realtek-dtbs := meson-g12a-fbx8am.dtb meson-g12a-fbx8am-realtek.dtbo diff --git a/arch/arm64/boot/dts/amlogic/meson-g12a-fbx8am-brcm.dtso b/arch/arm64/boot/dts/amlogic/meson-g12a-fbx8am-brcm.dtso new file mode 100644 index 00000000000000..ed79809b15859b --- /dev/null +++ b/arch/arm64/boot/dts/amlogic/meson-g12a-fbx8am-brcm.dtso @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +// Copyright (c) 2024 Freebox SAS + +/dts-v1/; +/plugin/; + +#include +#include + +/ { + compatible = "freebox,fbx8am-brcm", "freebox,fbx8am", "amlogic,g12a"; +}; + +&uart_A { + bluetooth { + compatible = "brcm,bcm43438-bt"; + shutdown-gpios = <&gpio GPIOX_17 GPIO_ACTIVE_HIGH>; + max-speed = <2000000>; + clocks = <&wifi32k>; + clock-names = "lpo"; + vbat-supply = <&vddao_3v3>; + vddio-supply = <&vddio_ao1v8>; + }; +}; + +&sd_emmc_a { + /* Per mmc-controller.yaml */ + #address-cells = <1>; + #size-cells = <0>; + /* NB: may be either AP6398S or AP6398SR3 wifi module */ + brcmf: wifi@1 { + reg = <1>; + compatible = "brcm,bcm4329-fmac"; + }; +}; diff --git a/arch/arm64/boot/dts/amlogic/meson-g12a-fbx8am-realtek.dtso b/arch/arm64/boot/dts/amlogic/meson-g12a-fbx8am-realtek.dtso new file mode 100644 index 00000000000000..5da88fb94fb981 --- /dev/null +++ b/arch/arm64/boot/dts/amlogic/meson-g12a-fbx8am-realtek.dtso @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +// Copyright (c) 2024 Freebox SAS + +/dts-v1/; +/plugin/; + +#include +#include + +/ { + compatible = "freebox,fbx8am-realtek", "freebox,fbx8am", "amlogic,g12a"; +}; + +&uart_A { + bluetooth { + compatible = "realtek,rtl8822cs-bt"; + enable-gpios = <&gpio GPIOX_17 GPIO_ACTIVE_HIGH>; + host-wake-gpios = <&gpio GPIOX_19 GPIO_ACTIVE_HIGH>; + device-wake-gpios = <&gpio GPIOX_18 GPIO_ACTIVE_HIGH>; + }; +}; + +&sd_emmc_a { + /* No explicit compatible for rtl8822cs sdio */ +}; From c44c3dda4e40b26542c795eab6eb78c674278b0d Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:36 +0100 Subject: [PATCH 0703/1406] pwm: sprd: Drop duplicated tracking of the parent device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pwmchip stores a pointer to the parent device, so there is no need to store another copy in driver private data. Drop struct sprd_pwm_chip::dev and use the pwm_chip's parent pointer instead. Link: https://lore.kernel.org/r/f85771f4bf659c0fdee30cf117b87fd877bad5e4.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-sprd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/pwm/pwm-sprd.c b/drivers/pwm/pwm-sprd.c index 2e87666ad2b99f..a38f50c3dc0057 100644 --- a/drivers/pwm/pwm-sprd.c +++ b/drivers/pwm/pwm-sprd.c @@ -34,7 +34,6 @@ struct sprd_pwm_chn { struct sprd_pwm_chip { void __iomem *base; - struct device *dev; struct pwm_chip chip; struct sprd_pwm_chn chn[SPRD_PWM_CHN_NUM]; }; @@ -85,7 +84,7 @@ static int sprd_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm, */ ret = clk_bulk_prepare_enable(SPRD_PWM_CHN_CLKS_NUM, chn->clks); if (ret) { - dev_err(spc->dev, "failed to enable pwm%u clocks\n", + dev_err(pwmchip_parent(chip), "failed to enable pwm%u clocks\n", pwm->hwpwm); return ret; } @@ -182,7 +181,7 @@ static int sprd_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, ret = clk_bulk_prepare_enable(SPRD_PWM_CHN_CLKS_NUM, chn->clks); if (ret) { - dev_err(spc->dev, + dev_err(pwmchip_parent(chip), "failed to enable pwm%u clocks\n", pwm->hwpwm); return ret; @@ -265,7 +264,6 @@ static int sprd_pwm_probe(struct platform_device *pdev) if (IS_ERR(spc->base)) return PTR_ERR(spc->base); - spc->dev = &pdev->dev; memcpy(spc->chn, chn, sizeof(chn)); spc->chip.dev = &pdev->dev; From 216f66084a11579eb970abcf6857f69ab6c8d553 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:37 +0100 Subject: [PATCH 0704/1406] pwm: sprd: Make use of devm_pwmchip_alloc() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the pwm-sprd driver to further changes of the pwm core outlined in the commit introducing devm_pwmchip_alloc(). There is no intended semantical change and the driver should behave as before. 
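The conversion follows the allocation pattern sketched below; the my_* names are illustrative placeholders, not code from this driver. devm_pwmchip_alloc() allocates the pwm_chip and the driver's private data in one step, and pwmchip_get_drvdata() recovers the private data from the chip.

#include <linux/platform_device.h>
#include <linux/pwm.h>

struct my_pwm {
	void __iomem *base;
};

static const struct pwm_ops my_pwm_ops;	/* .apply etc. assumed defined elsewhere */

static int my_pwm_probe(struct platform_device *pdev)
{
	struct pwm_chip *chip;
	struct my_pwm *priv;

	/* One allocation covers the chip plus private data; one channel here. */
	chip = devm_pwmchip_alloc(&pdev->dev, 1, sizeof(*priv));
	if (IS_ERR(chip))
		return PTR_ERR(chip);
	priv = pwmchip_get_drvdata(chip);

	priv->base = devm_platform_ioremap_resource(pdev, 0);
	if (IS_ERR(priv->base))
		return PTR_ERR(priv->base);

	chip->ops = &my_pwm_ops;
	return devm_pwmchip_add(&pdev->dev, chip);
}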
Link: https://lore.kernel.org/r/543213f44686ee72d8f88897bf2ca616e837ae44.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-sprd.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/pwm/pwm-sprd.c b/drivers/pwm/pwm-sprd.c index a38f50c3dc0057..4c76ca5e4cdd66 100644 --- a/drivers/pwm/pwm-sprd.c +++ b/drivers/pwm/pwm-sprd.c @@ -34,13 +34,12 @@ struct sprd_pwm_chn { struct sprd_pwm_chip { void __iomem *base; - struct pwm_chip chip; struct sprd_pwm_chn chn[SPRD_PWM_CHN_NUM]; }; static inline struct sprd_pwm_chip* sprd_pwm_from_chip(struct pwm_chip *chip) { - return container_of(chip, struct sprd_pwm_chip, chip); + return pwmchip_get_drvdata(chip); } /* @@ -248,6 +247,7 @@ static int sprd_pwm_clk_init(struct device *dev, static int sprd_pwm_probe(struct platform_device *pdev) { + struct pwm_chip *chip; struct sprd_pwm_chip *spc; struct sprd_pwm_chn chn[SPRD_PWM_CHN_NUM]; int ret, npwm; @@ -256,9 +256,10 @@ static int sprd_pwm_probe(struct platform_device *pdev) if (npwm < 0) return npwm; - spc = devm_kzalloc(&pdev->dev, sizeof(*spc), GFP_KERNEL); - if (!spc) - return -ENOMEM; + chip = devm_pwmchip_alloc(&pdev->dev, npwm, sizeof(*spc)); + if (IS_ERR(chip)) + return PTR_ERR(chip); + spc = sprd_pwm_from_chip(chip); spc->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(spc->base)) @@ -266,11 +267,9 @@ static int sprd_pwm_probe(struct platform_device *pdev) memcpy(spc->chn, chn, sizeof(chn)); - spc->chip.dev = &pdev->dev; - spc->chip.ops = &sprd_pwm_ops; - spc->chip.npwm = npwm; + chip->ops = &sprd_pwm_ops; - ret = devm_pwmchip_add(&pdev->dev, &spc->chip); + ret = devm_pwmchip_add(&pdev->dev, chip); if (ret) dev_err(&pdev->dev, "failed to add PWM chip\n"); From ed0d9698ffa024cccb08b99eaeddaad53664da84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:38 +0100 Subject: [PATCH 0705/1406] pwm: sti: Prepare removing pwm_chip from driver data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the driver for further changes that will drop struct pwm_chip chip from struct sti_pwm_chip. Use the pwm_chip as driver data instead of the sti_pwm_chip to get access to the pwm_chip in sti_pwm_remove() without using pc->chip. 
Link: https://lore.kernel.org/r/56d53372aacff6871df4d6c6779c9dac94592696.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-sti.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/pwm/pwm-sti.c b/drivers/pwm/pwm-sti.c index 69b1113c6b8217..826eb547cc96f4 100644 --- a/drivers/pwm/pwm-sti.c +++ b/drivers/pwm/pwm-sti.c @@ -570,6 +570,7 @@ static int sti_pwm_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct sti_pwm_compat_data *cdata; + struct pwm_chip *chip; struct sti_pwm_chip *pc; unsigned int i; int irq, ret; @@ -577,6 +578,7 @@ static int sti_pwm_probe(struct platform_device *pdev) pc = devm_kzalloc(dev, sizeof(*pc), GFP_KERNEL); if (!pc) return -ENOMEM; + chip = &pc->chip; cdata = devm_kzalloc(dev, sizeof(*cdata), GFP_KERNEL); if (!cdata) @@ -653,9 +655,9 @@ static int sti_pwm_probe(struct platform_device *pdev) return -ENOMEM; } - pc->chip.dev = dev; - pc->chip.ops = &sti_pwm_ops; - pc->chip.npwm = max(cdata->pwm_num_devs, cdata->cpt_num_devs); + chip->dev = dev; + chip->ops = &sti_pwm_ops; + chip->npwm = max(cdata->pwm_num_devs, cdata->cpt_num_devs); for (i = 0; i < cdata->cpt_num_devs; i++) { struct sti_cpt_ddata *ddata = &cdata->ddata[i]; @@ -664,23 +666,24 @@ static int sti_pwm_probe(struct platform_device *pdev) mutex_init(&ddata->lock); } - ret = pwmchip_add(&pc->chip); + ret = pwmchip_add(chip); if (ret < 0) { clk_unprepare(pc->pwm_clk); clk_unprepare(pc->cpt_clk); return ret; } - platform_set_drvdata(pdev, pc); + platform_set_drvdata(pdev, chip); return 0; } static void sti_pwm_remove(struct platform_device *pdev) { - struct sti_pwm_chip *pc = platform_get_drvdata(pdev); + struct pwm_chip *chip = platform_get_drvdata(pdev); + struct sti_pwm_chip *pc = to_sti_pwmchip(chip); - pwmchip_remove(&pc->chip); + pwmchip_remove(chip); clk_unprepare(pc->pwm_clk); clk_unprepare(pc->cpt_clk); From 9fec4eedfa043c017caf77cdb21c50b050c85f16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:39 +0100 Subject: [PATCH 0706/1406] pwm: sti: Make use of devm_pwmchip_alloc() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the pwm-sti driver to further changes of the pwm core outlined in the commit introducing devm_pwmchip_alloc(). There is no intended semantical change and the driver should behave as before. 
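The reordering is forced by the API: devm_pwmchip_alloc() takes npwm as an argument, so the channel counts must be known before anything is allocated. Condensed from the hunks below (a sketch, not the full probe):

    u32 num_devs;
    unsigned int pwm_num_devs = 0, cpt_num_devs = 0;
    struct pwm_chip *chip;

    /* parse the channel counts first ... */
    if (!of_property_read_u32(np, "st,pwm-num-chan", &num_devs))
            pwm_num_devs = num_devs;
    if (!of_property_read_u32(np, "st,capture-num-chan", &num_devs))
            cpt_num_devs = num_devs;

    /* ... because the allocation needs npwm up front */
    chip = devm_pwmchip_alloc(dev, max(pwm_num_devs, cpt_num_devs), sizeof(*pc));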
Link: https://lore.kernel.org/r/da6fbb5e98e988400e037b0d2ac0c1749822d702.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-sti.c | 50 +++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/drivers/pwm/pwm-sti.c b/drivers/pwm/pwm-sti.c index 826eb547cc96f4..39d80da0e14aff 100644 --- a/drivers/pwm/pwm-sti.c +++ b/drivers/pwm/pwm-sti.c @@ -94,7 +94,6 @@ struct sti_pwm_chip { struct regmap_field *pwm_cpt_en; struct regmap_field *pwm_cpt_int_en; struct regmap_field *pwm_cpt_int_stat; - struct pwm_chip chip; struct pwm_device *cur; unsigned long configured; unsigned int en_count; @@ -114,7 +113,7 @@ static const struct reg_field sti_pwm_regfields[MAX_REGFIELDS] = { static inline struct sti_pwm_chip *to_sti_pwmchip(struct pwm_chip *chip) { - return container_of(chip, struct sti_pwm_chip, chip); + return pwmchip_get_drvdata(chip); } /* @@ -507,23 +506,7 @@ static int sti_pwm_probe_dt(struct sti_pwm_chip *pc) { struct device *dev = pc->dev; const struct reg_field *reg_fields; - struct device_node *np = dev->of_node; struct sti_pwm_compat_data *cdata = pc->cdata; - u32 num_devs; - int ret; - - ret = of_property_read_u32(np, "st,pwm-num-chan", &num_devs); - if (!ret) - cdata->pwm_num_devs = num_devs; - - ret = of_property_read_u32(np, "st,capture-num-chan", &num_devs); - if (!ret) - cdata->cpt_num_devs = num_devs; - - if (!cdata->pwm_num_devs && !cdata->cpt_num_devs) { - dev_err(dev, "No channels configured\n"); - return -EINVAL; - } reg_fields = cdata->reg_fields; @@ -569,16 +552,33 @@ static const struct regmap_config sti_pwm_regmap_config = { static int sti_pwm_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; + u32 num_devs; + unsigned int pwm_num_devs = 0; + unsigned int cpt_num_devs = 0; struct sti_pwm_compat_data *cdata; struct pwm_chip *chip; struct sti_pwm_chip *pc; unsigned int i; int irq, ret; - pc = devm_kzalloc(dev, sizeof(*pc), GFP_KERNEL); - if (!pc) - return -ENOMEM; - chip = &pc->chip; + ret = of_property_read_u32(np, "st,pwm-num-chan", &num_devs); + if (!ret) + pwm_num_devs = num_devs; + + ret = of_property_read_u32(np, "st,capture-num-chan", &num_devs); + if (!ret) + cpt_num_devs = num_devs; + + if (!pwm_num_devs && !cpt_num_devs) { + dev_err(dev, "No channels configured\n"); + return -EINVAL; + } + + chip = devm_pwmchip_alloc(dev, max(pwm_num_devs, cpt_num_devs), sizeof(*pc)); + if (IS_ERR(chip)) + return PTR_ERR(chip); + pc = to_sti_pwmchip(chip); cdata = devm_kzalloc(dev, sizeof(*cdata), GFP_KERNEL); if (!cdata) @@ -611,8 +611,8 @@ static int sti_pwm_probe(struct platform_device *pdev) cdata->reg_fields = sti_pwm_regfields; cdata->max_prescale = 0xff; cdata->max_pwm_cnt = 255; - cdata->pwm_num_devs = 0; - cdata->cpt_num_devs = 0; + cdata->pwm_num_devs = pwm_num_devs; + cdata->cpt_num_devs = cpt_num_devs; pc->cdata = cdata; pc->dev = dev; @@ -655,9 +655,7 @@ static int sti_pwm_probe(struct platform_device *pdev) return -ENOMEM; } - chip->dev = dev; chip->ops = &sti_pwm_ops; - chip->npwm = max(cdata->pwm_num_devs, cdata->cpt_num_devs); for (i = 0; i < cdata->cpt_num_devs; i++) { struct sti_cpt_ddata *ddata = &cdata->ddata[i]; From cfe715977b9814857d47d1689d1a846a0a9981fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:40 +0100 Subject: [PATCH 0707/1406] pwm: stm32: Simplify code to determine the pwmchip's parent device MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit There is already a pointer to the pwmchip, make use of it directly instead of using the struct stm32_pwm *priv just obtained from it. This also has the advantage of not using struct stm32_pwm::chip any more which will be dropped soon. Link: https://lore.kernel.org/r/54ace92a3c02d22f15a79c7ecf00c29f28386a33.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-stm32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c index 5f10cba492ecba..202114c00ad39a 100644 --- a/drivers/pwm/pwm-stm32.c +++ b/drivers/pwm/pwm-stm32.c @@ -170,7 +170,7 @@ static int stm32_pwm_capture(struct pwm_chip *chip, struct pwm_device *pwm, ret = clk_enable(priv->clk); if (ret) { - dev_err(priv->chip.dev, "failed to enable counter clock\n"); + dev_err(chip->dev, "failed to enable counter clock\n"); goto unlock; } From 8f56af8b06766cdd3984177b68153796cebe205d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:41 +0100 Subject: [PATCH 0708/1406] pwm: stm32: Change prototype of a helper to prepare further changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the driver for further changes that will make it harder to determine the pwm_chip from a given stm32_pwm. To just not have to do that, rework stm32_pwm_raw_capture() to take a pwm_chip. Link: https://lore.kernel.org/r/33790c64563cb0434d7156d96f189c6037b3eb0b.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-stm32.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c index 202114c00ad39a..ec15bc51d930c1 100644 --- a/drivers/pwm/pwm-stm32.c +++ b/drivers/pwm/pwm-stm32.c @@ -90,11 +90,12 @@ static u32 active_channels(struct stm32_pwm *dev) * - Period = t2 - t0 * - Duty cycle = t1 - t0 */ -static int stm32_pwm_raw_capture(struct stm32_pwm *priv, struct pwm_device *pwm, +static int stm32_pwm_raw_capture(struct pwm_chip *chip, struct pwm_device *pwm, unsigned long tmo_ms, u32 *raw_prd, u32 *raw_dty) { - struct device *parent = priv->chip.dev->parent; + struct stm32_pwm *priv = to_stm32_pwm_dev(chip); + struct device *parent = pwmchip_parent(chip)->parent; enum stm32_timers_dmas dma_id; u32 ccen, ccr; int ret; @@ -170,7 +171,7 @@ static int stm32_pwm_capture(struct pwm_chip *chip, struct pwm_device *pwm, ret = clk_enable(priv->clk); if (ret) { - dev_err(chip->dev, "failed to enable counter clock\n"); + dev_err(pwmchip_parent(chip), "failed to enable counter clock\n"); goto unlock; } @@ -208,7 +209,7 @@ static int stm32_pwm_capture(struct pwm_chip *chip, struct pwm_device *pwm, TIM_CCER_CC12P : TIM_CCER_CC34P, pwm->hwpwm < 2 ? 
TIM_CCER_CC2P : TIM_CCER_CC4P); - ret = stm32_pwm_raw_capture(priv, pwm, tmo_ms, &raw_prd, &raw_dty); + ret = stm32_pwm_raw_capture(chip, pwm, tmo_ms, &raw_prd, &raw_dty); if (ret) goto stop; @@ -229,7 +230,7 @@ static int stm32_pwm_capture(struct pwm_chip *chip, struct pwm_device *pwm, /* 2nd measure with new scale */ psc /= scale; regmap_write(priv->regmap, TIM_PSC, psc); - ret = stm32_pwm_raw_capture(priv, pwm, tmo_ms, &raw_prd, + ret = stm32_pwm_raw_capture(chip, pwm, tmo_ms, &raw_prd, &raw_dty); if (ret) goto stop; @@ -257,7 +258,7 @@ static int stm32_pwm_capture(struct pwm_chip *chip, struct pwm_device *pwm, FIELD_PREP(TIM_CCMR_IC1PSC, icpsc) | FIELD_PREP(TIM_CCMR_IC2PSC, icpsc)); - ret = stm32_pwm_raw_capture(priv, pwm, tmo_ms, &raw_prd, &raw_dty); + ret = stm32_pwm_raw_capture(chip, pwm, tmo_ms, &raw_prd, &raw_dty); if (ret) goto stop; From ee08daa053f58cb0e161390d665f403906788e42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:42 +0100 Subject: [PATCH 0709/1406] pwm: stm32: Prepare removing pwm_chip from driver data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the driver for further changes that will drop struct pwm_chip chip from struct stm32_pwm. Use the pwm_chip as driver data instead of the stm32_pwm to get access to the pwm_chip in stm32_pwm_suspend() without using priv->chip. Link: https://lore.kernel.org/r/3db96cd915d9d8fc350a7193c0d55dd109b1f035.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-stm32.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c index ec15bc51d930c1..4d12f3d849cd28 100644 --- a/drivers/pwm/pwm-stm32.c +++ b/drivers/pwm/pwm-stm32.c @@ -630,6 +630,7 @@ static int stm32_pwm_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct device_node *np = dev->of_node; struct stm32_timers *ddata = dev_get_drvdata(pdev->dev.parent); + struct pwm_chip *chip; struct stm32_pwm *priv; unsigned int num_enabled; unsigned int i; @@ -638,6 +639,7 @@ static int stm32_pwm_probe(struct platform_device *pdev) priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; + chip = &priv->chip; mutex_init(&priv->lock); priv->regmap = ddata->regmap; @@ -653,37 +655,38 @@ static int stm32_pwm_probe(struct platform_device *pdev) stm32_pwm_detect_complementary(priv); - priv->chip.dev = dev; - priv->chip.ops = &stm32pwm_ops; - priv->chip.npwm = stm32_pwm_detect_channels(priv, &num_enabled); + chip->dev = dev; + chip->ops = &stm32pwm_ops; + chip->npwm = stm32_pwm_detect_channels(priv, &num_enabled); /* Initialize clock refcount to number of enabled PWM channels. 
*/ for (i = 0; i < num_enabled; i++) clk_enable(priv->clk); - ret = devm_pwmchip_add(dev, &priv->chip); + ret = devm_pwmchip_add(dev, chip); if (ret < 0) return ret; - platform_set_drvdata(pdev, priv); + platform_set_drvdata(pdev, chip); return 0; } static int stm32_pwm_suspend(struct device *dev) { - struct stm32_pwm *priv = dev_get_drvdata(dev); + struct pwm_chip *chip = dev_get_drvdata(dev); + struct stm32_pwm *priv = to_stm32_pwm_dev(chip); unsigned int i; u32 ccer, mask; /* Look for active channels */ ccer = active_channels(priv); - for (i = 0; i < priv->chip.npwm; i++) { + for (i = 0; i < chip->npwm; i++) { mask = TIM_CCER_CC1E << (i * 4); if (ccer & mask) { dev_err(dev, "PWM %u still in use by consumer %s\n", - i, priv->chip.pwms[i].label); + i, chip->pwms[i].label); return -EBUSY; } } @@ -693,7 +696,8 @@ static int stm32_pwm_suspend(struct device *dev) static int stm32_pwm_resume(struct device *dev) { - struct stm32_pwm *priv = dev_get_drvdata(dev); + struct pwm_chip *chip = dev_get_drvdata(dev); + struct stm32_pwm *priv = to_stm32_pwm_dev(chip); int ret; ret = pinctrl_pm_select_default_state(dev); From d4f5c06a7d3f5fb8d5833d240f2462ce43bf20a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:43 +0100 Subject: [PATCH 0710/1406] pwm: stm32: Change prototype of helper that detects npwm to prepare further changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the stm32 pwm driver is converted to pwmchip_alloc(), the number of available PWM lines must be known before the driver private data can be allocated. So rework the helper function that determines this number to not take the driver private data struct as input parameter. Link: https://lore.kernel.org/r/13d4d3e90a9ee1bcd04674dfdc16f242615b8320.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-stm32.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c index 4d12f3d849cd28..1440b706ee57b3 100644 --- a/drivers/pwm/pwm-stm32.c +++ b/drivers/pwm/pwm-stm32.c @@ -606,7 +606,7 @@ static void stm32_pwm_detect_complementary(struct stm32_pwm *priv) priv->have_complementary_output = (ccer != 0); } -static unsigned int stm32_pwm_detect_channels(struct stm32_pwm *priv, +static unsigned int stm32_pwm_detect_channels(struct regmap *regmap, unsigned int *num_enabled) { u32 ccer, ccer_backup; @@ -615,10 +615,10 @@ static unsigned int stm32_pwm_detect_channels(struct stm32_pwm *priv, * If channels enable bits don't exist writing 1 will have no * effect so we can detect and count them. */ - regmap_read(priv->regmap, TIM_CCER, &ccer_backup); - regmap_set_bits(priv->regmap, TIM_CCER, TIM_CCER_CCXE); - regmap_read(priv->regmap, TIM_CCER, &ccer); - regmap_write(priv->regmap, TIM_CCER, ccer_backup); + regmap_read(regmap, TIM_CCER, &ccer_backup); + regmap_set_bits(regmap, TIM_CCER, TIM_CCER_CCXE); + regmap_read(regmap, TIM_CCER, &ccer); + regmap_write(regmap, TIM_CCER, ccer_backup); *num_enabled = hweight32(ccer_backup & TIM_CCER_CCXE); @@ -657,7 +657,7 @@ static int stm32_pwm_probe(struct platform_device *pdev) chip->dev = dev; chip->ops = &stm32pwm_ops; - chip->npwm = stm32_pwm_detect_channels(priv, &num_enabled); + chip->npwm = stm32_pwm_detect_channels(ddata->regmap, &num_enabled); /* Initialize clock refcount to number of enabled PWM channels. 
*/ for (i = 0; i < num_enabled; i++) From d559edff15f065adf9bf31e5bfd61a2e2f72ffd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:44 +0100 Subject: [PATCH 0711/1406] pwm: stm32: Make use of devm_pwmchip_alloc() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the pwm-stm32 driver to further changes of the pwm core outlined in the commit introducing devm_pwmchip_alloc(). There is no intended semantical change and the driver should behave as before. Link: https://lore.kernel.org/r/59e5dfff2b878cc8590e286572672e4f10e35380.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-stm32.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c index 1440b706ee57b3..0c028d17c07523 100644 --- a/drivers/pwm/pwm-stm32.c +++ b/drivers/pwm/pwm-stm32.c @@ -27,7 +27,6 @@ struct stm32_breakinput { }; struct stm32_pwm { - struct pwm_chip chip; struct mutex lock; /* protect pwm config/enable */ struct clk *clk; struct regmap *regmap; @@ -40,7 +39,7 @@ struct stm32_pwm { static inline struct stm32_pwm *to_stm32_pwm_dev(struct pwm_chip *chip) { - return container_of(chip, struct stm32_pwm, chip); + return pwmchip_get_drvdata(chip); } static u32 active_channels(struct stm32_pwm *dev) @@ -632,14 +631,16 @@ static int stm32_pwm_probe(struct platform_device *pdev) struct stm32_timers *ddata = dev_get_drvdata(pdev->dev.parent); struct pwm_chip *chip; struct stm32_pwm *priv; - unsigned int num_enabled; + unsigned int npwm, num_enabled; unsigned int i; int ret; - priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); - if (!priv) - return -ENOMEM; - chip = &priv->chip; + npwm = stm32_pwm_detect_channels(ddata->regmap, &num_enabled); + + chip = devm_pwmchip_alloc(dev, npwm, sizeof(*priv)); + if (IS_ERR(chip)) + return PTR_ERR(chip); + priv = to_stm32_pwm_dev(chip); mutex_init(&priv->lock); priv->regmap = ddata->regmap; @@ -655,9 +656,7 @@ static int stm32_pwm_probe(struct platform_device *pdev) stm32_pwm_detect_complementary(priv); - chip->dev = dev; chip->ops = &stm32pwm_ops; - chip->npwm = stm32_pwm_detect_channels(ddata->regmap, &num_enabled); /* Initialize clock refcount to number of enabled PWM channels. */ for (i = 0; i < num_enabled; i++) From cf9d260cb43812bbe21cf4483730fc3126897eda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:45 +0100 Subject: [PATCH 0712/1406] pwm: stm32-lp: Simplify code to determine the pwmchip's parent device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is already a pointer to the pwmchip, make use of it directly instead of using the struct stm32_pwm_lp *priv just obtained from it. This also has the advantage of not using struct stm32_pwm_lp::chip any more which will be dropped soon. 
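In other words: every pwm_ops callback is handed the chip, so deriving the private struct only to dereference its embedded chip again is a pointless round trip. A condensed illustration (hypothetical driver, message text invented):

    static int foo_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
                             const struct pwm_state *state)
    {
            /* priv->chip.dev and chip->dev are the same pointer here,
             * and priv->chip is scheduled for removal anyway */
            dev_dbg(chip->dev, "period %llu ns requested\n", state->period);

            return 0;
    }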
Link: https://lore.kernel.org/r/9ad2399e1a683a6344b12d7f70498393b8f8b9de.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-stm32-lp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pwm/pwm-stm32-lp.c b/drivers/pwm/pwm-stm32-lp.c index 439068f3eca18f..bbab6be314a823 100644 --- a/drivers/pwm/pwm-stm32-lp.c +++ b/drivers/pwm/pwm-stm32-lp.c @@ -61,7 +61,7 @@ static int stm32_pwm_lp_apply(struct pwm_chip *chip, struct pwm_device *pwm, do_div(div, NSEC_PER_SEC); if (!div) { /* Clock is too slow to achieve requested period. */ - dev_dbg(priv->chip.dev, "Can't reach %llu ns\n", state->period); + dev_dbg(chip->dev, "Can't reach %llu ns\n", state->period); return -EINVAL; } @@ -69,7 +69,7 @@ static int stm32_pwm_lp_apply(struct pwm_chip *chip, struct pwm_device *pwm, while (div > STM32_LPTIM_MAX_ARR) { presc++; if ((1 << presc) > STM32_LPTIM_MAX_PRESCALER) { - dev_err(priv->chip.dev, "max prescaler exceeded\n"); + dev_err(chip->dev, "max prescaler exceeded\n"); return -EINVAL; } div = prd >> presc; @@ -130,7 +130,7 @@ static int stm32_pwm_lp_apply(struct pwm_chip *chip, struct pwm_device *pwm, (val & STM32_LPTIM_CMPOK_ARROK) == STM32_LPTIM_CMPOK_ARROK, 100, 1000); if (ret) { - dev_err(priv->chip.dev, "ARR/CMP registers write issue\n"); + dev_err(chip->dev, "ARR/CMP registers write issue\n"); goto err; } ret = regmap_write(priv->regmap, STM32_LPTIM_ICR, From 315821b150b130fed547040ee2fc428d0025195c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:46 +0100 Subject: [PATCH 0713/1406] pwm: stm32-lp: Prepare removing pwm_chip from driver data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the driver for further changes that will drop struct pwm_chip chip from struct stm32_pwm_lp. Use the pwm_chip as driver data instead of the stm32_pwm_lp to get access to the pwm_chip in stm32_pwm_lp_suspend() without using priv->chip. 
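The system-sleep hook then recovers everything from the struct device, along the lines of this sketch (single-channel driver assumed, names made up):

    static int foo_pwm_suspend(struct device *dev)
    {
            struct pwm_chip *chip = dev_get_drvdata(dev);
            struct pwm_state state;

            pwm_get_state(&chip->pwms[0], &state);

            /* refuse to suspend while a consumer keeps the PWM running */
            if (state.enabled)
                    return -EBUSY;

            return 0;
    }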
Link: https://lore.kernel.org/r/df47d1aff9b529c9a4762b6ba339a18cecba1497.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-stm32-lp.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/pwm/pwm-stm32-lp.c b/drivers/pwm/pwm-stm32-lp.c index bbab6be314a823..b46d8193dd0f47 100644 --- a/drivers/pwm/pwm-stm32-lp.c +++ b/drivers/pwm/pwm-stm32-lp.c @@ -197,6 +197,7 @@ static int stm32_pwm_lp_probe(struct platform_device *pdev) { struct stm32_lptimer *ddata = dev_get_drvdata(pdev->dev.parent); struct stm32_pwm_lp *priv; + struct pwm_chip *chip; int ret; priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); @@ -205,28 +206,29 @@ static int stm32_pwm_lp_probe(struct platform_device *pdev) priv->regmap = ddata->regmap; priv->clk = ddata->clk; - priv->chip.dev = &pdev->dev; - priv->chip.ops = &stm32_pwm_lp_ops; - priv->chip.npwm = 1; + chip = &priv->chip; + chip->dev = &pdev->dev; + chip->ops = &stm32_pwm_lp_ops; + chip->npwm = 1; - ret = devm_pwmchip_add(&pdev->dev, &priv->chip); + ret = devm_pwmchip_add(&pdev->dev, chip); if (ret < 0) return ret; - platform_set_drvdata(pdev, priv); + platform_set_drvdata(pdev, chip); return 0; } static int stm32_pwm_lp_suspend(struct device *dev) { - struct stm32_pwm_lp *priv = dev_get_drvdata(dev); + struct pwm_chip *chip = dev_get_drvdata(dev); struct pwm_state state; - pwm_get_state(&priv->chip.pwms[0], &state); + pwm_get_state(&chip->pwms[0], &state); if (state.enabled) { dev_err(dev, "The consumer didn't stop us (%s)\n", - priv->chip.pwms[0].label); + chip->pwms[0].label); return -EBUSY; } From 0d82523be1efbad25f135509d4fbe89aca1d0542 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:47 +0100 Subject: [PATCH 0714/1406] pwm: stm32-lp: Make use of pwmchip_parent() accessor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit struct pwm_chip::dev is about to change. To not have to touch this driver in the same commit as struct pwm_chip::dev, use the accessor function provided for exactly this purpose. Link: https://lore.kernel.org/r/d79148ed49389c657b72df05758032be2b516ceb.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-stm32-lp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pwm/pwm-stm32-lp.c b/drivers/pwm/pwm-stm32-lp.c index b46d8193dd0f47..958043bc5c9200 100644 --- a/drivers/pwm/pwm-stm32-lp.c +++ b/drivers/pwm/pwm-stm32-lp.c @@ -61,7 +61,7 @@ static int stm32_pwm_lp_apply(struct pwm_chip *chip, struct pwm_device *pwm, do_div(div, NSEC_PER_SEC); if (!div) { /* Clock is too slow to achieve requested period. 
*/ - dev_dbg(chip->dev, "Can't reach %llu ns\n", state->period); + dev_dbg(pwmchip_parent(chip), "Can't reach %llu ns\n", state->period); return -EINVAL; } @@ -69,7 +69,7 @@ static int stm32_pwm_lp_apply(struct pwm_chip *chip, struct pwm_device *pwm, while (div > STM32_LPTIM_MAX_ARR) { presc++; if ((1 << presc) > STM32_LPTIM_MAX_PRESCALER) { - dev_err(chip->dev, "max prescaler exceeded\n"); + dev_err(pwmchip_parent(chip), "max prescaler exceeded\n"); return -EINVAL; } div = prd >> presc; @@ -130,7 +130,7 @@ static int stm32_pwm_lp_apply(struct pwm_chip *chip, struct pwm_device *pwm, (val & STM32_LPTIM_CMPOK_ARROK) == STM32_LPTIM_CMPOK_ARROK, 100, 1000); if (ret) { - dev_err(chip->dev, "ARR/CMP registers write issue\n"); + dev_err(pwmchip_parent(chip), "ARR/CMP registers write issue\n"); goto err; } ret = regmap_write(priv->regmap, STM32_LPTIM_ICR, From 091734a6f1753ed3e31e06f61a5003cd79c2a3bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:48 +0100 Subject: [PATCH 0715/1406] pwm: stm32-lp: Make use of devm_pwmchip_alloc() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the pwm-stm32-lp driver to further changes of the pwm core outlined in the commit introducing devm_pwmchip_alloc(). There is no intended semantical change and the driver should behave as before. Link: https://lore.kernel.org/r/04af7b3d00bc932dd025200a3bf74527c29ca47a.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-stm32-lp.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/pwm/pwm-stm32-lp.c b/drivers/pwm/pwm-stm32-lp.c index 958043bc5c9200..989731256f5030 100644 --- a/drivers/pwm/pwm-stm32-lp.c +++ b/drivers/pwm/pwm-stm32-lp.c @@ -18,14 +18,13 @@ #include struct stm32_pwm_lp { - struct pwm_chip chip; struct clk *clk; struct regmap *regmap; }; static inline struct stm32_pwm_lp *to_stm32_pwm_lp(struct pwm_chip *chip) { - return container_of(chip, struct stm32_pwm_lp, chip); + return pwmchip_get_drvdata(chip); } /* STM32 Low-Power Timer is preceded by a configurable power-of-2 prescaler */ @@ -200,16 +199,14 @@ static int stm32_pwm_lp_probe(struct platform_device *pdev) struct pwm_chip *chip; int ret; - priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); - if (!priv) - return -ENOMEM; + chip = devm_pwmchip_alloc(&pdev->dev, 1, sizeof(*priv)); + if (IS_ERR(chip)) + return PTR_ERR(chip); + priv = to_stm32_pwm_lp(chip); priv->regmap = ddata->regmap; priv->clk = ddata->clk; - chip = &priv->chip; - chip->dev = &pdev->dev; chip->ops = &stm32_pwm_lp_ops; - chip->npwm = 1; ret = devm_pwmchip_add(&pdev->dev, chip); if (ret < 0) From cb2d6de52bd6157bdd46789f4b84e0edc4040292 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:49 +0100 Subject: [PATCH 0716/1406] pwm: stmpe: Make use of pwmchip_parent() accessor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit struct pwm_chip::dev is about to change. To not have to touch this driver in the same commit as struct pwm_chip::dev, use the accessor function provided for exactly this purpose. 
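For readers following along: at this point in the series the accessor is essentially just (roughly, from <linux/pwm.h>):

    static inline struct device *pwmchip_parent(const struct pwm_chip *chip)
    {
            return chip->dev;
    }

Once struct pwm_chip::dev is repurposed to be the chip's own device, only this helper needs to be adapted, and drivers already converted to it keep working unchanged.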
Link: https://lore.kernel.org/r/2136fbdf9b1e6bac479b935b439e2be73a003b97.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-stmpe.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/pwm/pwm-stmpe.c b/drivers/pwm/pwm-stmpe.c index 19c0c0f39675d3..05f3f38031eeb2 100644 --- a/drivers/pwm/pwm-stmpe.c +++ b/drivers/pwm/pwm-stmpe.c @@ -44,7 +44,7 @@ static int stmpe_24xx_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) ret = stmpe_reg_read(stmpe_pwm->stmpe, STMPE24XX_PWMCS); if (ret < 0) { - dev_dbg(chip->dev, "error reading PWM#%u control\n", + dev_dbg(pwmchip_parent(chip), "error reading PWM#%u control\n", pwm->hwpwm); return ret; } @@ -53,7 +53,7 @@ static int stmpe_24xx_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) ret = stmpe_reg_write(stmpe_pwm->stmpe, STMPE24XX_PWMCS, value); if (ret) { - dev_dbg(chip->dev, "error writing PWM#%u control\n", + dev_dbg(pwmchip_parent(chip), "error writing PWM#%u control\n", pwm->hwpwm); return ret; } @@ -70,7 +70,7 @@ static int stmpe_24xx_pwm_disable(struct pwm_chip *chip, ret = stmpe_reg_read(stmpe_pwm->stmpe, STMPE24XX_PWMCS); if (ret < 0) { - dev_dbg(chip->dev, "error reading PWM#%u control\n", + dev_dbg(pwmchip_parent(chip), "error reading PWM#%u control\n", pwm->hwpwm); return ret; } @@ -79,7 +79,7 @@ static int stmpe_24xx_pwm_disable(struct pwm_chip *chip, ret = stmpe_reg_write(stmpe_pwm->stmpe, STMPE24XX_PWMCS, value); if (ret) - dev_dbg(chip->dev, "error writing PWM#%u control\n", + dev_dbg(pwmchip_parent(chip), "error writing PWM#%u control\n", pwm->hwpwm); return ret; } @@ -125,7 +125,7 @@ static int stmpe_24xx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, ret = stmpe_set_altfunc(stmpe_pwm->stmpe, BIT(pin), STMPE_BLOCK_PWM); if (ret) { - dev_err(chip->dev, "unable to connect PWM#%u to pin\n", + dev_err(pwmchip_parent(chip), "unable to connect PWM#%u to pin\n", pwm->hwpwm); return ret; } @@ -150,7 +150,7 @@ static int stmpe_24xx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, return -ENODEV; } - dev_dbg(chip->dev, "PWM#%u: config duty %d ns, period %d ns\n", + dev_dbg(pwmchip_parent(chip), "PWM#%u: config duty %d ns, period %d ns\n", pwm->hwpwm, duty_ns, period_ns); if (duty_ns == 0) { @@ -216,7 +216,7 @@ static int stmpe_24xx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, program[1] = BRANCH; } - dev_dbg(chip->dev, + dev_dbg(pwmchip_parent(chip), "PWM#%u: value = %02x, last_duty = %02x, program=%04x,%04x,%04x\n", pwm->hwpwm, value, last, program[0], program[1], program[2]); @@ -233,7 +233,7 @@ static int stmpe_24xx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, ret = stmpe_reg_write(stmpe_pwm->stmpe, offset, value); if (ret) { - dev_dbg(chip->dev, "error writing register %02x: %d\n", + dev_dbg(pwmchip_parent(chip), "error writing register %02x: %d\n", offset, ret); return ret; } @@ -242,7 +242,7 @@ static int stmpe_24xx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, ret = stmpe_reg_write(stmpe_pwm->stmpe, offset, value); if (ret) { - dev_dbg(chip->dev, "error writing register %02x: %d\n", + dev_dbg(pwmchip_parent(chip), "error writing register %02x: %d\n", offset, ret); return ret; } @@ -255,7 +255,7 @@ static int stmpe_24xx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, /* Sleep for 200ms so we're sure it will take effect */ msleep(200); - dev_dbg(chip->dev, "programmed PWM#%u, %u bytes\n", pwm->hwpwm, i); + dev_dbg(pwmchip_parent(chip), "programmed PWM#%u, %u 
bytes\n", pwm->hwpwm, i); return 0; } From 570a7d90e21b1aadedb69a695967e6ba18643384 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:50 +0100 Subject: [PATCH 0717/1406] pwm: stmpe: Make use of devm_pwmchip_alloc() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the pwm-stmpe driver to further changes of the pwm core outlined in the commit introducing devm_pwmchip_alloc(). There is no intended semantical change and the driver should behave as before. Link: https://lore.kernel.org/r/7e3dbf3b70126038c0ba16331ca8c07cab575bd3.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-stmpe.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/drivers/pwm/pwm-stmpe.c b/drivers/pwm/pwm-stmpe.c index 05f3f38031eeb2..bb91062d5f1da1 100644 --- a/drivers/pwm/pwm-stmpe.c +++ b/drivers/pwm/pwm-stmpe.c @@ -27,13 +27,12 @@ struct stmpe_pwm { struct stmpe *stmpe; - struct pwm_chip chip; u8 last_duty; }; static inline struct stmpe_pwm *to_stmpe_pwm(struct pwm_chip *chip) { - return container_of(chip, struct stmpe_pwm, chip); + return pwmchip_get_drvdata(chip); } static int stmpe_24xx_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) @@ -292,33 +291,36 @@ static const struct pwm_ops stmpe_24xx_pwm_ops = { static int __init stmpe_pwm_probe(struct platform_device *pdev) { struct stmpe *stmpe = dev_get_drvdata(pdev->dev.parent); + struct pwm_chip *chip; struct stmpe_pwm *stmpe_pwm; int ret; - stmpe_pwm = devm_kzalloc(&pdev->dev, sizeof(*stmpe_pwm), GFP_KERNEL); - if (!stmpe_pwm) - return -ENOMEM; + switch (stmpe->partnum) { + case STMPE2401: + case STMPE2403: + break; + case STMPE1601: + return dev_err_probe(&pdev->dev, -ENODEV, + "STMPE1601 not yet supported\n"); + default: + return dev_err_probe(&pdev->dev, -ENODEV, + "Unknown STMPE PWM\n"); + } - stmpe_pwm->stmpe = stmpe; - stmpe_pwm->chip.dev = &pdev->dev; + chip = devm_pwmchip_alloc(&pdev->dev, 3, sizeof(*stmpe_pwm)); + if (IS_ERR(chip)) + return PTR_ERR(chip); + stmpe_pwm = to_stmpe_pwm(chip); - if (stmpe->partnum == STMPE2401 || stmpe->partnum == STMPE2403) { - stmpe_pwm->chip.ops = &stmpe_24xx_pwm_ops; - stmpe_pwm->chip.npwm = 3; - } else { - if (stmpe->partnum == STMPE1601) - dev_err(&pdev->dev, "STMPE1601 not yet supported\n"); - else - dev_err(&pdev->dev, "Unknown STMPE PWM\n"); + stmpe_pwm->stmpe = stmpe; - return -ENODEV; - } + chip->ops = &stmpe_24xx_pwm_ops; ret = stmpe_enable(stmpe, STMPE_BLOCK_PWM); if (ret) return ret; - ret = pwmchip_add(&stmpe_pwm->chip); + ret = pwmchip_add(chip); if (ret) { stmpe_disable(stmpe, STMPE_BLOCK_PWM); return ret; From 8523b212b0f148a437c2a77fbafb52d2b2237ed1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:51 +0100 Subject: [PATCH 0718/1406] pwm: sun4i: Make use of pwmchip_parent() accessor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit struct pwm_chip::dev is about to change. To not have to touch this driver in the same commit as struct pwm_chip::dev, use the accessor function provided for exactly this purpose. 
Reviewed-by: Jernej Skrabec Link: https://lore.kernel.org/r/3ddaec73f3abefb45625d0a469026fa8d13da8c0.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-sun4i.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pwm/pwm-sun4i.c b/drivers/pwm/pwm-sun4i.c index 1a439025540d45..44edf1ce573915 100644 --- a/drivers/pwm/pwm-sun4i.c +++ b/drivers/pwm/pwm-sun4i.c @@ -245,7 +245,7 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, if (!cstate.enabled) { ret = clk_prepare_enable(sun4i_pwm->clk); if (ret) { - dev_err(chip->dev, "failed to enable PWM clock\n"); + dev_err(pwmchip_parent(chip), "failed to enable PWM clock\n"); return ret; } } @@ -253,7 +253,7 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, ret = sun4i_pwm_calculate(sun4i_pwm, state, &duty, &period, &prescaler, &bypass); if (ret) { - dev_err(chip->dev, "period exceeds the maximum value\n"); + dev_err(pwmchip_parent(chip), "period exceeds the maximum value\n"); if (!cstate.enabled) clk_disable_unprepare(sun4i_pwm->clk); return ret; From dbcc0e63323ee5c0f9b220b2feca2281a46dea98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:52 +0100 Subject: [PATCH 0719/1406] pwm: sun4i: Prepare removing pwm_chip from driver data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the driver for further changes that will drop struct pwm_chip chip from struct sun4i_pwm_chip. Use the pwm_chip as driver data instead of the sun4i_pwm_chip to get access to the pwm_chip in sun4i_pwm_remove() without using sun4ichip->chip. Reviewed-by: Jernej Skrabec Link: https://lore.kernel.org/r/f8e3fb96fe341ba0a4bed982aa731c5c7c355b83.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-sun4i.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/pwm/pwm-sun4i.c b/drivers/pwm/pwm-sun4i.c index 44edf1ce573915..c2f579043915b1 100644 --- a/drivers/pwm/pwm-sun4i.c +++ b/drivers/pwm/pwm-sun4i.c @@ -384,12 +384,14 @@ MODULE_DEVICE_TABLE(of, sun4i_pwm_dt_ids); static int sun4i_pwm_probe(struct platform_device *pdev) { + struct pwm_chip *chip; struct sun4i_pwm_chip *sun4ichip; int ret; sun4ichip = devm_kzalloc(&pdev->dev, sizeof(*sun4ichip), GFP_KERNEL); if (!sun4ichip) return -ENOMEM; + chip = &sun4ichip->chip; sun4ichip->data = of_device_get_match_data(&pdev->dev); if (!sun4ichip->data) @@ -451,19 +453,19 @@ static int sun4i_pwm_probe(struct platform_device *pdev) goto err_bus; } - sun4ichip->chip.dev = &pdev->dev; - sun4ichip->chip.ops = &sun4i_pwm_ops; - sun4ichip->chip.npwm = sun4ichip->data->npwm; + chip->dev = &pdev->dev; + chip->ops = &sun4i_pwm_ops; + chip->npwm = sun4ichip->data->npwm; spin_lock_init(&sun4ichip->ctrl_lock); - ret = pwmchip_add(&sun4ichip->chip); + ret = pwmchip_add(chip); if (ret < 0) { dev_err(&pdev->dev, "failed to add PWM chip: %d\n", ret); goto err_pwm_add; } - platform_set_drvdata(pdev, sun4ichip); + platform_set_drvdata(pdev, chip); return 0; @@ -477,9 +479,10 @@ static int sun4i_pwm_probe(struct platform_device *pdev) static void sun4i_pwm_remove(struct platform_device *pdev) { - struct sun4i_pwm_chip *sun4ichip = platform_get_drvdata(pdev); + struct pwm_chip *chip = platform_get_drvdata(pdev); + struct sun4i_pwm_chip *sun4ichip = to_sun4i_pwm_chip(chip); - pwmchip_remove(&sun4ichip->chip); + pwmchip_remove(chip); 
clk_disable_unprepare(sun4ichip->bus_clk); reset_control_assert(sun4ichip->rst); From bc80331261a60a8b22de74a273bbd3f64aff3f5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:53 +0100 Subject: [PATCH 0720/1406] pwm: sun4i: Consistently name driver data sun4ichip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The driver uses three different names for variables of type sun4i_pwm_chip: $ git grep 'struct sun4i_pwm_chip \*' v6.8-rc1 -- drivers/pwm/pwm-sun4i.c v6.8-rc1:drivers/pwm/pwm-sun4i.c:static inline struct sun4i_pwm_chip *to_sun4i_pwm_chip(struct pwm_chip *chip) v6.8-rc1:drivers/pwm/pwm-sun4i.c:static inline u32 sun4i_pwm_readl(struct sun4i_pwm_chip *chip, v6.8-rc1:drivers/pwm/pwm-sun4i.c:static inline void sun4i_pwm_writel(struct sun4i_pwm_chip *chip, v6.8-rc1:drivers/pwm/pwm-sun4i.c: struct sun4i_pwm_chip *sun4i_pwm = to_sun4i_pwm_chip(chip); v6.8-rc1:drivers/pwm/pwm-sun4i.c:static int sun4i_pwm_calculate(struct sun4i_pwm_chip *sun4i_pwm, v6.8-rc1:drivers/pwm/pwm-sun4i.c: struct sun4i_pwm_chip *sun4i_pwm = to_sun4i_pwm_chip(chip); v6.8-rc1:drivers/pwm/pwm-sun4i.c: struct sun4i_pwm_chip *sun4ichip; v6.8-rc1:drivers/pwm/pwm-sun4i.c: struct sun4i_pwm_chip *sun4ichip = platform_get_drvdata(pdev); "chip" is usually reserved for variables of type struct pwm_chip. So pick sun4ichip as common name which better matches the type name than sun4i_pwm. Acked-by: Jernej Skrabec Link: https://lore.kernel.org/r/705f54a13b59fff50eaa345d8b1e0c691345b996.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-sun4i.c | 64 ++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/drivers/pwm/pwm-sun4i.c b/drivers/pwm/pwm-sun4i.c index c2f579043915b1..2437e5961f5ee1 100644 --- a/drivers/pwm/pwm-sun4i.c +++ b/drivers/pwm/pwm-sun4i.c @@ -95,32 +95,32 @@ static inline struct sun4i_pwm_chip *to_sun4i_pwm_chip(struct pwm_chip *chip) return container_of(chip, struct sun4i_pwm_chip, chip); } -static inline u32 sun4i_pwm_readl(struct sun4i_pwm_chip *chip, +static inline u32 sun4i_pwm_readl(struct sun4i_pwm_chip *sun4ichip, unsigned long offset) { - return readl(chip->base + offset); + return readl(sun4ichip->base + offset); } -static inline void sun4i_pwm_writel(struct sun4i_pwm_chip *chip, +static inline void sun4i_pwm_writel(struct sun4i_pwm_chip *sun4ichip, u32 val, unsigned long offset) { - writel(val, chip->base + offset); + writel(val, sun4ichip->base + offset); } static int sun4i_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm, struct pwm_state *state) { - struct sun4i_pwm_chip *sun4i_pwm = to_sun4i_pwm_chip(chip); + struct sun4i_pwm_chip *sun4ichip = to_sun4i_pwm_chip(chip); u64 clk_rate, tmp; u32 val; unsigned int prescaler; - clk_rate = clk_get_rate(sun4i_pwm->clk); + clk_rate = clk_get_rate(sun4ichip->clk); if (!clk_rate) return -EINVAL; - val = sun4i_pwm_readl(sun4i_pwm, PWM_CTRL_REG); + val = sun4i_pwm_readl(sun4ichip, PWM_CTRL_REG); /* * PWM chapter in H6 manual has a diagram which explains that if bypass @@ -128,7 +128,7 @@ static int sun4i_pwm_get_state(struct pwm_chip *chip, * proved that also enable bit is ignored in this case. 
*/ if ((val & BIT_CH(PWM_BYPASS, pwm->hwpwm)) && - sun4i_pwm->data->has_direct_mod_clk_output) { + sun4ichip->data->has_direct_mod_clk_output) { state->period = DIV_ROUND_UP_ULL(NSEC_PER_SEC, clk_rate); state->duty_cycle = DIV_ROUND_UP_ULL(state->period, 2); state->polarity = PWM_POLARITY_NORMAL; @@ -137,7 +137,7 @@ static int sun4i_pwm_get_state(struct pwm_chip *chip, } if ((PWM_REG_PRESCAL(val, pwm->hwpwm) == PWM_PRESCAL_MASK) && - sun4i_pwm->data->has_prescaler_bypass) + sun4ichip->data->has_prescaler_bypass) prescaler = 1; else prescaler = prescaler_table[PWM_REG_PRESCAL(val, pwm->hwpwm)]; @@ -156,7 +156,7 @@ static int sun4i_pwm_get_state(struct pwm_chip *chip, else state->enabled = false; - val = sun4i_pwm_readl(sun4i_pwm, PWM_CH_PRD(pwm->hwpwm)); + val = sun4i_pwm_readl(sun4ichip, PWM_CH_PRD(pwm->hwpwm)); tmp = (u64)prescaler * NSEC_PER_SEC * PWM_REG_DTY(val); state->duty_cycle = DIV_ROUND_CLOSEST_ULL(tmp, clk_rate); @@ -167,7 +167,7 @@ static int sun4i_pwm_get_state(struct pwm_chip *chip, return 0; } -static int sun4i_pwm_calculate(struct sun4i_pwm_chip *sun4i_pwm, +static int sun4i_pwm_calculate(struct sun4i_pwm_chip *sun4ichip, const struct pwm_state *state, u32 *dty, u32 *prd, unsigned int *prsclr, bool *bypass) @@ -175,9 +175,9 @@ static int sun4i_pwm_calculate(struct sun4i_pwm_chip *sun4i_pwm, u64 clk_rate, div = 0; unsigned int prescaler = 0; - clk_rate = clk_get_rate(sun4i_pwm->clk); + clk_rate = clk_get_rate(sun4ichip->clk); - *bypass = sun4i_pwm->data->has_direct_mod_clk_output && + *bypass = sun4ichip->data->has_direct_mod_clk_output && state->enabled && (state->period * clk_rate >= NSEC_PER_SEC) && (state->period * clk_rate < 2 * NSEC_PER_SEC) && @@ -187,7 +187,7 @@ static int sun4i_pwm_calculate(struct sun4i_pwm_chip *sun4i_pwm, if (*bypass) return 0; - if (sun4i_pwm->data->has_prescaler_bypass) { + if (sun4ichip->data->has_prescaler_bypass) { /* First, test without any prescaler when available */ prescaler = PWM_PRESCAL_MASK; /* @@ -233,7 +233,7 @@ static int sun4i_pwm_calculate(struct sun4i_pwm_chip *sun4i_pwm, static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, const struct pwm_state *state) { - struct sun4i_pwm_chip *sun4i_pwm = to_sun4i_pwm_chip(chip); + struct sun4i_pwm_chip *sun4ichip = to_sun4i_pwm_chip(chip); struct pwm_state cstate; u32 ctrl, duty = 0, period = 0, val; int ret; @@ -243,31 +243,31 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, pwm_get_state(pwm, &cstate); if (!cstate.enabled) { - ret = clk_prepare_enable(sun4i_pwm->clk); + ret = clk_prepare_enable(sun4ichip->clk); if (ret) { dev_err(pwmchip_parent(chip), "failed to enable PWM clock\n"); return ret; } } - ret = sun4i_pwm_calculate(sun4i_pwm, state, &duty, &period, &prescaler, + ret = sun4i_pwm_calculate(sun4ichip, state, &duty, &period, &prescaler, &bypass); if (ret) { dev_err(pwmchip_parent(chip), "period exceeds the maximum value\n"); if (!cstate.enabled) - clk_disable_unprepare(sun4i_pwm->clk); + clk_disable_unprepare(sun4ichip->clk); return ret; } - spin_lock(&sun4i_pwm->ctrl_lock); - ctrl = sun4i_pwm_readl(sun4i_pwm, PWM_CTRL_REG); + spin_lock(&sun4ichip->ctrl_lock); + ctrl = sun4i_pwm_readl(sun4ichip, PWM_CTRL_REG); - if (sun4i_pwm->data->has_direct_mod_clk_output) { + if (sun4ichip->data->has_direct_mod_clk_output) { if (bypass) { ctrl |= BIT_CH(PWM_BYPASS, pwm->hwpwm); /* We can skip other parameter */ - sun4i_pwm_writel(sun4i_pwm, ctrl, PWM_CTRL_REG); - spin_unlock(&sun4i_pwm->ctrl_lock); + sun4i_pwm_writel(sun4ichip, ctrl, 
PWM_CTRL_REG); + spin_unlock(&sun4ichip->ctrl_lock); return 0; } @@ -277,14 +277,14 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, if (PWM_REG_PRESCAL(ctrl, pwm->hwpwm) != prescaler) { /* Prescaler changed, the clock has to be gated */ ctrl &= ~BIT_CH(PWM_CLK_GATING, pwm->hwpwm); - sun4i_pwm_writel(sun4i_pwm, ctrl, PWM_CTRL_REG); + sun4i_pwm_writel(sun4ichip, ctrl, PWM_CTRL_REG); ctrl &= ~BIT_CH(PWM_PRESCAL_MASK, pwm->hwpwm); ctrl |= BIT_CH(prescaler, pwm->hwpwm); } val = (duty & PWM_DTY_MASK) | PWM_PRD(period); - sun4i_pwm_writel(sun4i_pwm, val, PWM_CH_PRD(pwm->hwpwm)); + sun4i_pwm_writel(sun4ichip, val, PWM_CH_PRD(pwm->hwpwm)); if (state->polarity != PWM_POLARITY_NORMAL) ctrl &= ~BIT_CH(PWM_ACT_STATE, pwm->hwpwm); @@ -296,9 +296,9 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, if (state->enabled) ctrl |= BIT_CH(PWM_EN, pwm->hwpwm); - sun4i_pwm_writel(sun4i_pwm, ctrl, PWM_CTRL_REG); + sun4i_pwm_writel(sun4ichip, ctrl, PWM_CTRL_REG); - spin_unlock(&sun4i_pwm->ctrl_lock); + spin_unlock(&sun4ichip->ctrl_lock); if (state->enabled) return 0; @@ -310,14 +310,14 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, else usleep_range(delay_us, delay_us * 2); - spin_lock(&sun4i_pwm->ctrl_lock); - ctrl = sun4i_pwm_readl(sun4i_pwm, PWM_CTRL_REG); + spin_lock(&sun4ichip->ctrl_lock); + ctrl = sun4i_pwm_readl(sun4ichip, PWM_CTRL_REG); ctrl &= ~BIT_CH(PWM_CLK_GATING, pwm->hwpwm); ctrl &= ~BIT_CH(PWM_EN, pwm->hwpwm); - sun4i_pwm_writel(sun4i_pwm, ctrl, PWM_CTRL_REG); - spin_unlock(&sun4i_pwm->ctrl_lock); + sun4i_pwm_writel(sun4ichip, ctrl, PWM_CTRL_REG); + spin_unlock(&sun4ichip->ctrl_lock); - clk_disable_unprepare(sun4i_pwm->clk); + clk_disable_unprepare(sun4ichip->clk); return 0; } From d8342a88c9f1f9f626c50a69275b898373529ac9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:54 +0100 Subject: [PATCH 0721/1406] pwm: sun4i: Make use of devm_pwmchip_alloc() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the pwm-sun4i driver to further changes of the pwm core outlined in the commit introducing devm_pwmchip_alloc(). There is no intended semantical change and the driver should behave as before. 
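The chip-to-private-data helper flips accordingly; before and after, with a placeholder driver name:

    /* before: the private struct embeds the chip */
    static inline struct foo_pwm *to_foo_pwm(struct pwm_chip *chip)
    {
            return container_of(chip, struct foo_pwm, chip);
    }

    /* after: the private data lives behind the chip allocation */
    static inline struct foo_pwm *to_foo_pwm(struct pwm_chip *chip)
    {
            return pwmchip_get_drvdata(chip);
    }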
Reviewed-by: Jernej Skrabec Link: https://lore.kernel.org/r/9d175b4e27878618cef2e75b6ecbf01ad5d18164.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-sun4i.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/pwm/pwm-sun4i.c b/drivers/pwm/pwm-sun4i.c index 2437e5961f5ee1..5c29590d1821e2 100644 --- a/drivers/pwm/pwm-sun4i.c +++ b/drivers/pwm/pwm-sun4i.c @@ -81,7 +81,6 @@ struct sun4i_pwm_data { }; struct sun4i_pwm_chip { - struct pwm_chip chip; struct clk *bus_clk; struct clk *clk; struct reset_control *rst; @@ -92,7 +91,7 @@ struct sun4i_pwm_chip { static inline struct sun4i_pwm_chip *to_sun4i_pwm_chip(struct pwm_chip *chip) { - return container_of(chip, struct sun4i_pwm_chip, chip); + return pwmchip_get_drvdata(chip); } static inline u32 sun4i_pwm_readl(struct sun4i_pwm_chip *sun4ichip, @@ -385,18 +384,20 @@ MODULE_DEVICE_TABLE(of, sun4i_pwm_dt_ids); static int sun4i_pwm_probe(struct platform_device *pdev) { struct pwm_chip *chip; + const struct sun4i_pwm_data *data; struct sun4i_pwm_chip *sun4ichip; int ret; - sun4ichip = devm_kzalloc(&pdev->dev, sizeof(*sun4ichip), GFP_KERNEL); - if (!sun4ichip) - return -ENOMEM; - chip = &sun4ichip->chip; - - sun4ichip->data = of_device_get_match_data(&pdev->dev); - if (!sun4ichip->data) + data = of_device_get_match_data(&pdev->dev); + if (!data) return -ENODEV; + chip = devm_pwmchip_alloc(&pdev->dev, data->npwm, sizeof(*sun4ichip)); + if (IS_ERR(chip)) + return PTR_ERR(chip); + sun4ichip = to_sun4i_pwm_chip(chip); + + sun4ichip->data = data; sun4ichip->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(sun4ichip->base)) return PTR_ERR(sun4ichip->base); @@ -453,9 +454,7 @@ static int sun4i_pwm_probe(struct platform_device *pdev) goto err_bus; } - chip->dev = &pdev->dev; chip->ops = &sun4i_pwm_ops; - chip->npwm = sun4ichip->data->npwm; spin_lock_init(&sun4ichip->ctrl_lock); From 9b656e62eedcd59566a301fce5f0ffc97d15647a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:55 +0100 Subject: [PATCH 0722/1406] pwm: sunplus: Make use of devm_pwmchip_alloc() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the pwm-sunplus driver to further changes of the pwm core outlined in the commit introducing devm_pwmchip_alloc(). There is no intended semantical change and the driver should behave as before. 
Link: https://lore.kernel.org/r/192be7e428eff17dd922c9c0d0d168225b89bb34.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-sunplus.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/pwm/pwm-sunplus.c b/drivers/pwm/pwm-sunplus.c index 773e2f80526e89..b342b843247b7c 100644 --- a/drivers/pwm/pwm-sunplus.c +++ b/drivers/pwm/pwm-sunplus.c @@ -43,14 +43,13 @@ #define SP7021_PWM_NUM 4 struct sunplus_pwm { - struct pwm_chip chip; void __iomem *base; struct clk *clk; }; static inline struct sunplus_pwm *to_sunplus_pwm(struct pwm_chip *chip) { - return container_of(chip, struct sunplus_pwm, chip); + return pwmchip_get_drvdata(chip); } static int sunplus_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, @@ -175,12 +174,14 @@ static void sunplus_pwm_clk_release(void *data) static int sunplus_pwm_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; + struct pwm_chip *chip; struct sunplus_pwm *priv; int ret; - priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); - if (!priv) - return -ENOMEM; + chip = devm_pwmchip_alloc(dev, SP7021_PWM_NUM, sizeof(*priv)); + if (IS_ERR(chip)) + return PTR_ERR(chip); + priv = to_sunplus_pwm(chip); priv->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(priv->base)) @@ -203,11 +204,9 @@ static int sunplus_pwm_probe(struct platform_device *pdev) return ret; } - priv->chip.dev = dev; - priv->chip.ops = &sunplus_pwm_ops; - priv->chip.npwm = SP7021_PWM_NUM; + chip->ops = &sunplus_pwm_ops; - ret = devm_pwmchip_add(dev, &priv->chip); + ret = devm_pwmchip_add(dev, chip); if (ret < 0) return dev_err_probe(dev, ret, "Cannot register sunplus PWM\n"); From 36e199783d7b21aa5fe7190e687a6a610e2209e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:56 +0100 Subject: [PATCH 0723/1406] pwm: tegra: Drop duplicated tracking of the parent device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pwmchip stores a pointer to the parent device, so there is no need to store another copy in driver private data. Drop struct tegra_pwm_chip::dev and use the pwm_chip's parent pointer instead. Link: https://lore.kernel.org/r/225f4bfcb15fb69eb818ddb71d623157c447180a.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-tegra.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/pwm/pwm-tegra.c b/drivers/pwm/pwm-tegra.c index 82ee2f0754f965..0d5f57c9ee2642 100644 --- a/drivers/pwm/pwm-tegra.c +++ b/drivers/pwm/pwm-tegra.c @@ -66,7 +66,6 @@ struct tegra_pwm_soc { struct tegra_pwm_chip { struct pwm_chip chip; - struct device *dev; struct clk *clk; struct reset_control*rst; @@ -158,7 +157,7 @@ static int tegra_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, */ required_clk_rate *= 2; - err = dev_pm_opp_set_rate(pc->dev, required_clk_rate); + err = dev_pm_opp_set_rate(pwmchip_parent(chip), required_clk_rate); if (err < 0) return -EINVAL; @@ -194,7 +193,7 @@ static int tegra_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, * before writing the register. Otherwise, keep it enabled. */ if (!pwm_is_enabled(pwm)) { - err = pm_runtime_resume_and_get(pc->dev); + err = pm_runtime_resume_and_get(pwmchip_parent(chip)); if (err) return err; } else @@ -206,7 +205,7 @@ static int tegra_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, * If the PWM is not enabled, turn the clock off again to save power. 
*/ if (!pwm_is_enabled(pwm)) - pm_runtime_put(pc->dev); + pm_runtime_put(pwmchip_parent(chip)); return 0; } @@ -217,7 +216,7 @@ static int tegra_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) int rc = 0; u32 val; - rc = pm_runtime_resume_and_get(pc->dev); + rc = pm_runtime_resume_and_get(pwmchip_parent(chip)); if (rc) return rc; @@ -237,7 +236,7 @@ static void tegra_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) val &= ~PWM_ENABLE; pwm_writel(pc, pwm->hwpwm, val); - pm_runtime_put_sync(pc->dev); + pm_runtime_put_sync(pwmchip_parent(chip)); } static int tegra_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, @@ -280,7 +279,6 @@ static int tegra_pwm_probe(struct platform_device *pdev) return -ENOMEM; pc->soc = of_device_get_match_data(&pdev->dev); - pc->dev = &pdev->dev; pc->regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(pc->regs)) @@ -302,7 +300,7 @@ static int tegra_pwm_probe(struct platform_device *pdev) return ret; /* Set maximum frequency of the IP */ - ret = dev_pm_opp_set_rate(pc->dev, pc->soc->max_frequency); + ret = dev_pm_opp_set_rate(&pdev->dev, pc->soc->max_frequency); if (ret < 0) { dev_err(&pdev->dev, "Failed to set max frequency: %d\n", ret); goto put_pm; From 28dddc0f86d56d881d2de107185fe6e268475440 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:57 +0100 Subject: [PATCH 0724/1406] pwm: tegra: Prepare removing pwm_chip from driver data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the driver for further changes that will drop struct pwm_chip chip from struct tegra_pwm_chip. Use the pwm_chip as driver data instead of the tegra_pwm_chip to get access to the pwm_chip in tegra_pwm_remove() without using pc->chip. 
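As the tegra hunks above show, runtime PM now targets the chip's parent through the accessor instead of a struct device cached in the private data. The generic shape, as a sketch (foo_pwm, FOO_ENABLE and the register layout are invented for the example):

    static int foo_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm)
    {
            struct foo_pwm *pc = pwmchip_get_drvdata(chip);
            int ret;

            /* resume the parent device, reached via the accessor
             * rather than a pointer stored in pc */
            ret = pm_runtime_resume_and_get(pwmchip_parent(chip));
            if (ret)
                    return ret;

            writel(readl(pc->regs) | FOO_ENABLE, pc->regs);

            return 0;
    }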
Link: https://lore.kernel.org/r/2813c63bf1317dee808f4c5c4a9411999f2d5746.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-tegra.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/drivers/pwm/pwm-tegra.c b/drivers/pwm/pwm-tegra.c index 0d5f57c9ee2642..f61c24376523a4 100644 --- a/drivers/pwm/pwm-tegra.c +++ b/drivers/pwm/pwm-tegra.c @@ -271,12 +271,14 @@ static const struct pwm_ops tegra_pwm_ops = { static int tegra_pwm_probe(struct platform_device *pdev) { + struct pwm_chip *chip; struct tegra_pwm_chip *pc; int ret; pc = devm_kzalloc(&pdev->dev, sizeof(*pc), GFP_KERNEL); if (!pc) return -ENOMEM; + chip = &pc->chip; pc->soc = of_device_get_match_data(&pdev->dev); @@ -284,7 +286,7 @@ static int tegra_pwm_probe(struct platform_device *pdev) if (IS_ERR(pc->regs)) return PTR_ERR(pc->regs); - platform_set_drvdata(pdev, pc); + platform_set_drvdata(pdev, chip); pc->clk = devm_clk_get(&pdev->dev, NULL); if (IS_ERR(pc->clk)) @@ -326,11 +328,11 @@ static int tegra_pwm_probe(struct platform_device *pdev) reset_control_deassert(pc->rst); - pc->chip.dev = &pdev->dev; - pc->chip.ops = &tegra_pwm_ops; - pc->chip.npwm = pc->soc->num_channels; + chip->dev = &pdev->dev; + chip->ops = &tegra_pwm_ops; + chip->npwm = pc->soc->num_channels; - ret = pwmchip_add(&pc->chip); + ret = pwmchip_add(chip); if (ret < 0) { dev_err(&pdev->dev, "pwmchip_add() failed: %d\n", ret); reset_control_assert(pc->rst); @@ -348,9 +350,10 @@ static int tegra_pwm_probe(struct platform_device *pdev) static void tegra_pwm_remove(struct platform_device *pdev) { - struct tegra_pwm_chip *pc = platform_get_drvdata(pdev); + struct pwm_chip *chip = platform_get_drvdata(pdev); + struct tegra_pwm_chip *pc = to_tegra_pwm_chip(chip); - pwmchip_remove(&pc->chip); + pwmchip_remove(chip); reset_control_assert(pc->rst); @@ -359,7 +362,8 @@ static void tegra_pwm_remove(struct platform_device *pdev) static int __maybe_unused tegra_pwm_runtime_suspend(struct device *dev) { - struct tegra_pwm_chip *pc = dev_get_drvdata(dev); + struct pwm_chip *chip = dev_get_drvdata(dev); + struct tegra_pwm_chip *pc = to_tegra_pwm_chip(chip); int err; clk_disable_unprepare(pc->clk); @@ -375,7 +379,8 @@ static int __maybe_unused tegra_pwm_runtime_suspend(struct device *dev) static int __maybe_unused tegra_pwm_runtime_resume(struct device *dev) { - struct tegra_pwm_chip *pc = dev_get_drvdata(dev); + struct pwm_chip *chip = dev_get_drvdata(dev); + struct tegra_pwm_chip *pc = to_tegra_pwm_chip(chip); int err; err = pinctrl_pm_select_default_state(dev); From 0579fa65ba673020d6a810b87c51917a75ba4ca4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:58 +0100 Subject: [PATCH 0725/1406] pwm: tegra: Make use of devm_pwmchip_alloc() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the pwm-tegra driver to further changes of the pwm core outlined in the commit introducing devm_pwmchip_alloc(). There is no intended semantical change and the driver should behave as before. 
Link: https://lore.kernel.org/r/8719be3d57b0b5cf575b312e5ff41fe0717e3a43.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-tegra.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/pwm/pwm-tegra.c b/drivers/pwm/pwm-tegra.c index f61c24376523a4..a3d69976148f58 100644 --- a/drivers/pwm/pwm-tegra.c +++ b/drivers/pwm/pwm-tegra.c @@ -65,8 +65,6 @@ struct tegra_pwm_soc { }; struct tegra_pwm_chip { - struct pwm_chip chip; - struct clk *clk; struct reset_control*rst; @@ -80,7 +78,7 @@ struct tegra_pwm_chip { static inline struct tegra_pwm_chip *to_tegra_pwm_chip(struct pwm_chip *chip) { - return container_of(chip, struct tegra_pwm_chip, chip); + return pwmchip_get_drvdata(chip); } static inline u32 pwm_readl(struct tegra_pwm_chip *pc, unsigned int offset) @@ -273,14 +271,17 @@ static int tegra_pwm_probe(struct platform_device *pdev) { struct pwm_chip *chip; struct tegra_pwm_chip *pc; + const struct tegra_pwm_soc *soc; int ret; - pc = devm_kzalloc(&pdev->dev, sizeof(*pc), GFP_KERNEL); - if (!pc) - return -ENOMEM; - chip = &pc->chip; + soc = of_device_get_match_data(&pdev->dev); + + chip = devm_pwmchip_alloc(&pdev->dev, soc->num_channels, sizeof(*pc)); + if (IS_ERR(chip)) + return PTR_ERR(chip); + pc = to_tegra_pwm_chip(chip); - pc->soc = of_device_get_match_data(&pdev->dev); + pc->soc = soc; pc->regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(pc->regs)) @@ -328,9 +329,7 @@ static int tegra_pwm_probe(struct platform_device *pdev) reset_control_deassert(pc->rst); - chip->dev = &pdev->dev; chip->ops = &tegra_pwm_ops; - chip->npwm = pc->soc->num_channels; ret = pwmchip_add(chip); if (ret < 0) { From 71ecf0a6e9bc01fabbab6a7bb7f36d87482442c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:32:59 +0100 Subject: [PATCH 0726/1406] pwm: tiecap: Simplify code to determine the pwmchip's parent device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is already a pointer to the pwmchip, make use of it directly instead of using the struct ecap_pwm_chip *pc just obtained from it. This also has the advantage of not using struct ecap_pwm_chip::chip any more which will be dropped soon. 
Link: https://lore.kernel.org/r/628f4b8c4ba1321075fc1dff70453a1c79ffb814.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-tiecap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/pwm/pwm-tiecap.c b/drivers/pwm/pwm-tiecap.c index d974f4414ac9ae..b93e3be318d5e7 100644 --- a/drivers/pwm/pwm-tiecap.c +++ b/drivers/pwm/pwm-tiecap.c @@ -70,7 +70,7 @@ static int ecap_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, duty_cycles = (u32)c; } - pm_runtime_get_sync(pc->chip.dev); + pm_runtime_get_sync(chip->dev); value = readw(pc->mmio_base + ECCTL2); @@ -100,7 +100,7 @@ static int ecap_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, writew(value, pc->mmio_base + ECCTL2); } - pm_runtime_put_sync(pc->chip.dev); + pm_runtime_put_sync(chip->dev); return 0; } @@ -111,7 +111,7 @@ static int ecap_pwm_set_polarity(struct pwm_chip *chip, struct pwm_device *pwm, struct ecap_pwm_chip *pc = to_ecap_pwm_chip(chip); u16 value; - pm_runtime_get_sync(pc->chip.dev); + pm_runtime_get_sync(chip->dev); value = readw(pc->mmio_base + ECCTL2); @@ -124,7 +124,7 @@ static int ecap_pwm_set_polarity(struct pwm_chip *chip, struct pwm_device *pwm, writew(value, pc->mmio_base + ECCTL2); - pm_runtime_put_sync(pc->chip.dev); + pm_runtime_put_sync(chip->dev); return 0; } @@ -135,7 +135,7 @@ static int ecap_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) u16 value; /* Leave clock enabled on enabling PWM */ - pm_runtime_get_sync(pc->chip.dev); + pm_runtime_get_sync(chip->dev); /* * Enable 'Free run Time stamp counter mode' to start counter @@ -162,7 +162,7 @@ static void ecap_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) writew(value, pc->mmio_base + ECCTL2); /* Disable clock on PWM disable */ - pm_runtime_put_sync(pc->chip.dev); + pm_runtime_put_sync(chip->dev); } static int ecap_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, From 9798f2c2431382f8670f644d89dfb708b63f4c65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:00 +0100 Subject: [PATCH 0727/1406] pwm: tiecap: Change prototype of helpers to prepare further changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the driver for further changes that will make it harder to determine the pwm_chip from a given ecap_pwm_chip. To just not have to do that, rework ecap_pwm_save_context() and ecap_pwm_restore_context() to take a pwm_chip. Also use the pwm_chip as driver data instead of the ecap_pwm_chip. 
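The rework just described boils down to moving the chip-to-private-data lookup inside the helpers, so callers that naturally hold a pwm_chip (or a struct device whose drvdata is one) never need the private struct themselves. A minimal sketch, again with invented foo_* names and a hypothetical FOO_CTRL register:

static void foo_pwm_save_context(struct pwm_chip *chip)
{
        struct foo_pwm_chip *fc = to_foo_pwm_chip(chip);

        pm_runtime_get_sync(chip->dev);
        fc->saved_ctrl = readl(fc->regs + FOO_CTRL);    /* FOO_CTRL is made up */
        pm_runtime_put_sync(chip->dev);
}

static int foo_pwm_suspend(struct device *dev)
{
        struct pwm_chip *chip = dev_get_drvdata(dev);

        foo_pwm_save_context(chip);     /* no foo_pwm_chip needed at this level */
        return 0;
}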
Link: https://lore.kernel.org/r/ed031f201ff52c6b298de2dc81b06aad3a0207f8.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-tiecap.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/drivers/pwm/pwm-tiecap.c b/drivers/pwm/pwm-tiecap.c index b93e3be318d5e7..0d10e83577318c 100644 --- a/drivers/pwm/pwm-tiecap.c +++ b/drivers/pwm/pwm-tiecap.c @@ -218,6 +218,7 @@ static int ecap_pwm_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; struct ecap_pwm_chip *pc; + struct pwm_chip *chip; struct clk *clk; int ret; @@ -244,21 +245,22 @@ static int ecap_pwm_probe(struct platform_device *pdev) return -EINVAL; } - pc->chip.dev = &pdev->dev; - pc->chip.ops = &ecap_pwm_ops; - pc->chip.npwm = 1; + chip = &pc->chip; + chip->dev = &pdev->dev; + chip->ops = &ecap_pwm_ops; + chip->npwm = 1; pc->mmio_base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(pc->mmio_base)) return PTR_ERR(pc->mmio_base); - ret = devm_pwmchip_add(&pdev->dev, &pc->chip); + ret = devm_pwmchip_add(&pdev->dev, chip); if (ret < 0) { dev_err(&pdev->dev, "pwmchip_add() failed: %d\n", ret); return ret; } - platform_set_drvdata(pdev, pc); + platform_set_drvdata(pdev, chip); pm_runtime_enable(&pdev->dev); return 0; @@ -269,17 +271,21 @@ static void ecap_pwm_remove(struct platform_device *pdev) pm_runtime_disable(&pdev->dev); } -static void ecap_pwm_save_context(struct ecap_pwm_chip *pc) +static void ecap_pwm_save_context(struct pwm_chip *chip) { - pm_runtime_get_sync(pc->chip.dev); + struct ecap_pwm_chip *pc = to_ecap_pwm_chip(chip); + + pm_runtime_get_sync(chip->dev); pc->ctx.ecctl2 = readw(pc->mmio_base + ECCTL2); pc->ctx.cap4 = readl(pc->mmio_base + CAP4); pc->ctx.cap3 = readl(pc->mmio_base + CAP3); - pm_runtime_put_sync(pc->chip.dev); + pm_runtime_put_sync(chip->dev); } -static void ecap_pwm_restore_context(struct ecap_pwm_chip *pc) +static void ecap_pwm_restore_context(struct pwm_chip *chip) { + struct ecap_pwm_chip *pc = to_ecap_pwm_chip(chip); + writel(pc->ctx.cap3, pc->mmio_base + CAP3); writel(pc->ctx.cap4, pc->mmio_base + CAP4); writew(pc->ctx.ecctl2, pc->mmio_base + ECCTL2); @@ -287,10 +293,10 @@ static void ecap_pwm_restore_context(struct ecap_pwm_chip *pc) static int ecap_pwm_suspend(struct device *dev) { - struct ecap_pwm_chip *pc = dev_get_drvdata(dev); - struct pwm_device *pwm = pc->chip.pwms; + struct pwm_chip *chip = dev_get_drvdata(dev); + struct pwm_device *pwm = chip->pwms; - ecap_pwm_save_context(pc); + ecap_pwm_save_context(chip); /* Disable explicitly if PWM is running */ if (pwm_is_enabled(pwm)) @@ -301,14 +307,14 @@ static int ecap_pwm_suspend(struct device *dev) static int ecap_pwm_resume(struct device *dev) { - struct ecap_pwm_chip *pc = dev_get_drvdata(dev); - struct pwm_device *pwm = pc->chip.pwms; + struct pwm_chip *chip = dev_get_drvdata(dev); + struct pwm_device *pwm = chip->pwms; /* Enable explicitly if PWM was running */ if (pwm_is_enabled(pwm)) pm_runtime_get_sync(dev); - ecap_pwm_restore_context(pc); + ecap_pwm_restore_context(chip); return 0; } From 2bb023b12bbb8b7ab2c6a350f192c4fa2e9c0f5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:01 +0100 Subject: [PATCH 0728/1406] pwm: tiecap: Make use of pwmchip_parent() accessor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit struct pwm_chip::dev is about to change. 
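The patch below builds on the pwmchip_parent() accessor. At this point in the series it can be thought of as a trivial wrapper; the definition here is a simplified model, not a quote of the real header:

/* Simplified model only; see <linux/pwm.h> for the actual definition. */
static inline struct device *pwmchip_parent(const struct pwm_chip *chip)
{
        return chip->dev;
}

Funnelling every access through one helper is what later lets the core change how the parent device is stored without touching each driver again.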
To not have to touch this driver in the same commit as struct pwm_chip::dev, use the accessor function provided for exactly this purpose. Link: https://lore.kernel.org/r/ae92e06b49437ca7e768b1f8b405170e33948a70.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-tiecap.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/pwm/pwm-tiecap.c b/drivers/pwm/pwm-tiecap.c index 0d10e83577318c..558b244f074ac4 100644 --- a/drivers/pwm/pwm-tiecap.c +++ b/drivers/pwm/pwm-tiecap.c @@ -70,7 +70,7 @@ static int ecap_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, duty_cycles = (u32)c; } - pm_runtime_get_sync(chip->dev); + pm_runtime_get_sync(pwmchip_parent(chip)); value = readw(pc->mmio_base + ECCTL2); @@ -100,7 +100,7 @@ static int ecap_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, writew(value, pc->mmio_base + ECCTL2); } - pm_runtime_put_sync(chip->dev); + pm_runtime_put_sync(pwmchip_parent(chip)); return 0; } @@ -111,7 +111,7 @@ static int ecap_pwm_set_polarity(struct pwm_chip *chip, struct pwm_device *pwm, struct ecap_pwm_chip *pc = to_ecap_pwm_chip(chip); u16 value; - pm_runtime_get_sync(chip->dev); + pm_runtime_get_sync(pwmchip_parent(chip)); value = readw(pc->mmio_base + ECCTL2); @@ -124,7 +124,7 @@ static int ecap_pwm_set_polarity(struct pwm_chip *chip, struct pwm_device *pwm, writew(value, pc->mmio_base + ECCTL2); - pm_runtime_put_sync(chip->dev); + pm_runtime_put_sync(pwmchip_parent(chip)); return 0; } @@ -135,7 +135,7 @@ static int ecap_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) u16 value; /* Leave clock enabled on enabling PWM */ - pm_runtime_get_sync(chip->dev); + pm_runtime_get_sync(pwmchip_parent(chip)); /* * Enable 'Free run Time stamp counter mode' to start counter @@ -162,7 +162,7 @@ static void ecap_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) writew(value, pc->mmio_base + ECCTL2); /* Disable clock on PWM disable */ - pm_runtime_put_sync(chip->dev); + pm_runtime_put_sync(pwmchip_parent(chip)); } static int ecap_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, @@ -275,11 +275,11 @@ static void ecap_pwm_save_context(struct pwm_chip *chip) { struct ecap_pwm_chip *pc = to_ecap_pwm_chip(chip); - pm_runtime_get_sync(chip->dev); + pm_runtime_get_sync(pwmchip_parent(chip)); pc->ctx.ecctl2 = readw(pc->mmio_base + ECCTL2); pc->ctx.cap4 = readl(pc->mmio_base + CAP4); pc->ctx.cap3 = readl(pc->mmio_base + CAP3); - pm_runtime_put_sync(chip->dev); + pm_runtime_put_sync(pwmchip_parent(chip)); } static void ecap_pwm_restore_context(struct pwm_chip *chip) From 92011df86c77fea99f7e4e33cc17268abad94ec7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:02 +0100 Subject: [PATCH 0729/1406] pwm: tiecap: Make use of devm_pwmchip_alloc() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the pwm-tiecap driver to further changes of the pwm core outlined in the commit introducing devm_pwmchip_alloc(). There is no intended semantical change and the driver should behave as before. 
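Taken together, the two accessor styles used across this series look as follows (illustrative foo_* driver; the two definitions are alternatives from before and after the conversion, not meant to coexist in one file):

/* Before: the private struct embeds the chip, container_of() inverts that. */
static inline struct foo_pwm_chip *to_foo_pwm_chip(struct pwm_chip *chip)
{
        return container_of(chip, struct foo_pwm_chip, chip);
}

/* After: the core allocates the chip, private data is its drvdata. */
static inline struct foo_pwm_chip *to_foo_pwm_chip(struct pwm_chip *chip)
{
        return pwmchip_get_drvdata(chip);
}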
Link: https://lore.kernel.org/r/af7846109c0df225126c8e5cd186b89ace70afc0.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-tiecap.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/pwm/pwm-tiecap.c b/drivers/pwm/pwm-tiecap.c index 558b244f074ac4..d6c2b1b1387e03 100644 --- a/drivers/pwm/pwm-tiecap.c +++ b/drivers/pwm/pwm-tiecap.c @@ -32,7 +32,6 @@ struct ecap_context { }; struct ecap_pwm_chip { - struct pwm_chip chip; unsigned int clk_rate; void __iomem *mmio_base; struct ecap_context ctx; @@ -40,7 +39,7 @@ struct ecap_pwm_chip { static inline struct ecap_pwm_chip *to_ecap_pwm_chip(struct pwm_chip *chip) { - return container_of(chip, struct ecap_pwm_chip, chip); + return pwmchip_get_drvdata(chip); } /* @@ -222,9 +221,10 @@ static int ecap_pwm_probe(struct platform_device *pdev) struct clk *clk; int ret; - pc = devm_kzalloc(&pdev->dev, sizeof(*pc), GFP_KERNEL); - if (!pc) - return -ENOMEM; + chip = devm_pwmchip_alloc(&pdev->dev, 1, sizeof(*pc)); + if (IS_ERR(chip)) + return PTR_ERR(chip); + pc = to_ecap_pwm_chip(chip); clk = devm_clk_get(&pdev->dev, "fck"); if (IS_ERR(clk)) { @@ -245,10 +245,7 @@ static int ecap_pwm_probe(struct platform_device *pdev) return -EINVAL; } - chip = &pc->chip; - chip->dev = &pdev->dev; chip->ops = &ecap_pwm_ops; - chip->npwm = 1; pc->mmio_base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(pc->mmio_base)) From 1c8135ad2534d85acdb5e5e64c47c6032df5773e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:03 +0100 Subject: [PATCH 0730/1406] pwm: tiehrpwm: Simplify code to determine the pwmchip's parent device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is already a pointer to the pwmchip, make use of it directly instead of using the struct ehrpwm_pwm_chip *pc just obtained from it. This also has the advantage of not using struct ehrpwm_pwm_chip::chip any more which will be dropped soon. Link: https://lore.kernel.org/r/b2b06a3aabf8c04969d59ddf7ba565b303855878.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-tiehrpwm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-tiehrpwm.c b/drivers/pwm/pwm-tiehrpwm.c index af231fa74fa901..9848493dee97e5 100644 --- a/drivers/pwm/pwm-tiehrpwm.c +++ b/drivers/pwm/pwm-tiehrpwm.c @@ -347,7 +347,7 @@ static int ehrpwm_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) ret = clk_enable(pc->tbclk); if (ret) { dev_err(chip->dev, "Failed to enable TBCLK for %s: %d\n", - dev_name(pc->chip.dev), ret); + dev_name(chip->dev), ret); return ret; } From ea9096aae7b888ec087b4556b5b05d6a8b7f063d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:04 +0100 Subject: [PATCH 0731/1406] pwm: tiehrpwm: Change prototype of helpers to prepare further changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the driver for further changes that will make it harder to determine the pwm_chip from a given ehrpwm_pwm_chip. To just not have to do that, rework ehrpwm_pwm_save_context() and ehrpwm_pwm_restore_context() to take a pwm_chip. Also use the pwm_chip as driver data instead of the ehrpwm_pwm_chip. 
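The suspend path reworked by the patch below walks the channels through the pwm_chip itself. Roughly (a sketch that mirrors the diff that follows, with the surrounding driver elided; foo_pwm_save_context() is the hypothetical helper sketched earlier):

static int foo_pwm_suspend(struct device *dev)
{
        struct pwm_chip *chip = dev_get_drvdata(dev);
        unsigned int i;

        foo_pwm_save_context(chip);

        for (i = 0; i < chip->npwm; i++) {
                struct pwm_device *pwm = &chip->pwms[i];

                /* Drop the runtime-PM reference held for each running channel */
                if (pwm_is_enabled(pwm))
                        pm_runtime_put_sync(dev);
        }

        return 0;
}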
Link: https://lore.kernel.org/r/79052207cdf71f0882ae13fe1a192ef6f6dba35b.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-tiehrpwm.c | 45 ++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/drivers/pwm/pwm-tiehrpwm.c b/drivers/pwm/pwm-tiehrpwm.c index 9848493dee97e5..6d7babf6fdb04c 100644 --- a/drivers/pwm/pwm-tiehrpwm.c +++ b/drivers/pwm/pwm-tiehrpwm.c @@ -450,6 +450,7 @@ static int ehrpwm_pwm_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; struct ehrpwm_pwm_chip *pc; + struct pwm_chip *chip; struct clk *clk; int ret; @@ -474,9 +475,10 @@ static int ehrpwm_pwm_probe(struct platform_device *pdev) return -EINVAL; } - pc->chip.dev = &pdev->dev; - pc->chip.ops = &ehrpwm_pwm_ops; - pc->chip.npwm = NUM_PWM_CHANNEL; + chip = &pc->chip; + chip->dev = &pdev->dev; + chip->ops = &ehrpwm_pwm_ops; + chip->npwm = NUM_PWM_CHANNEL; pc->mmio_base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(pc->mmio_base)) @@ -493,13 +495,13 @@ static int ehrpwm_pwm_probe(struct platform_device *pdev) return ret; } - ret = pwmchip_add(&pc->chip); + ret = pwmchip_add(chip); if (ret < 0) { dev_err(&pdev->dev, "pwmchip_add() failed: %d\n", ret); goto err_clk_unprepare; } - platform_set_drvdata(pdev, pc); + platform_set_drvdata(pdev, chip); pm_runtime_enable(&pdev->dev); return 0; @@ -512,18 +514,21 @@ static int ehrpwm_pwm_probe(struct platform_device *pdev) static void ehrpwm_pwm_remove(struct platform_device *pdev) { - struct ehrpwm_pwm_chip *pc = platform_get_drvdata(pdev); + struct pwm_chip *chip = platform_get_drvdata(pdev); + struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip); - pwmchip_remove(&pc->chip); + pwmchip_remove(chip); clk_unprepare(pc->tbclk); pm_runtime_disable(&pdev->dev); } -static void ehrpwm_pwm_save_context(struct ehrpwm_pwm_chip *pc) +static void ehrpwm_pwm_save_context(struct pwm_chip *chip) { - pm_runtime_get_sync(pc->chip.dev); + struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip); + + pm_runtime_get_sync(chip->dev); pc->ctx.tbctl = ehrpwm_read(pc->mmio_base, TBCTL); pc->ctx.tbprd = ehrpwm_read(pc->mmio_base, TBPRD); @@ -534,11 +539,13 @@ static void ehrpwm_pwm_save_context(struct ehrpwm_pwm_chip *pc) pc->ctx.aqsfrc = ehrpwm_read(pc->mmio_base, AQSFRC); pc->ctx.aqcsfrc = ehrpwm_read(pc->mmio_base, AQCSFRC); - pm_runtime_put_sync(pc->chip.dev); + pm_runtime_put_sync(chip->dev); } -static void ehrpwm_pwm_restore_context(struct ehrpwm_pwm_chip *pc) +static void ehrpwm_pwm_restore_context(struct pwm_chip *chip) { + struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip); + ehrpwm_write(pc->mmio_base, TBPRD, pc->ctx.tbprd); ehrpwm_write(pc->mmio_base, CMPA, pc->ctx.cmpa); ehrpwm_write(pc->mmio_base, CMPB, pc->ctx.cmpb); @@ -551,13 +558,13 @@ static void ehrpwm_pwm_restore_context(struct ehrpwm_pwm_chip *pc) static int ehrpwm_pwm_suspend(struct device *dev) { - struct ehrpwm_pwm_chip *pc = dev_get_drvdata(dev); + struct pwm_chip *chip = dev_get_drvdata(dev); unsigned int i; - ehrpwm_pwm_save_context(pc); + ehrpwm_pwm_save_context(chip); - for (i = 0; i < pc->chip.npwm; i++) { - struct pwm_device *pwm = &pc->chip.pwms[i]; + for (i = 0; i < chip->npwm; i++) { + struct pwm_device *pwm = &chip->pwms[i]; if (!pwm_is_enabled(pwm)) continue; @@ -571,11 +578,11 @@ static int ehrpwm_pwm_suspend(struct device *dev) static int ehrpwm_pwm_resume(struct device *dev) { - struct ehrpwm_pwm_chip *pc = dev_get_drvdata(dev); + struct pwm_chip *chip = dev_get_drvdata(dev); 
unsigned int i; - for (i = 0; i < pc->chip.npwm; i++) { - struct pwm_device *pwm = &pc->chip.pwms[i]; + for (i = 0; i < chip->npwm; i++) { + struct pwm_device *pwm = &chip->pwms[i]; if (!pwm_is_enabled(pwm)) continue; @@ -584,7 +591,7 @@ static int ehrpwm_pwm_resume(struct device *dev) pm_runtime_get_sync(dev); } - ehrpwm_pwm_restore_context(pc); + ehrpwm_pwm_restore_context(chip); return 0; } From d083ced47c062d31e852fbcfd0916570fda12bbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:05 +0100 Subject: [PATCH 0732/1406] pwm: tiehrpwm: Make use of pwmchip_parent() accessor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit struct pwm_chip::dev is about to change. To not have to touch this driver in the same commit as struct pwm_chip::dev, use the accessor function provided for exactly this purpose. Link: https://lore.kernel.org/r/9badd116d0e26a5656b222c5b4adad7e111a53c7.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-tiehrpwm.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/pwm/pwm-tiehrpwm.c b/drivers/pwm/pwm-tiehrpwm.c index 6d7babf6fdb04c..9a7c721961738b 100644 --- a/drivers/pwm/pwm-tiehrpwm.c +++ b/drivers/pwm/pwm-tiehrpwm.c @@ -256,7 +256,7 @@ static int ehrpwm_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, if (i == pwm->hwpwm) continue; - dev_err(chip->dev, + dev_err(pwmchip_parent(chip), "period value conflicts with channel %u\n", i); return -EINVAL; @@ -268,11 +268,11 @@ static int ehrpwm_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, /* Configure clock prescaler to support Low frequency PWM wave */ if (set_prescale_div(period_cycles/PERIOD_MAX, &ps_divval, &tb_divval)) { - dev_err(chip->dev, "Unsupported values\n"); + dev_err(pwmchip_parent(chip), "Unsupported values\n"); return -EINVAL; } - pm_runtime_get_sync(chip->dev); + pm_runtime_get_sync(pwmchip_parent(chip)); /* Update clock prescaler values */ ehrpwm_modify(pc->mmio_base, TBCTL, TBCTL_CLKDIV_MASK, tb_divval); @@ -299,7 +299,7 @@ static int ehrpwm_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, ehrpwm_write(pc->mmio_base, cmp_reg, duty_cycles); - pm_runtime_put_sync(chip->dev); + pm_runtime_put_sync(pwmchip_parent(chip)); return 0; } @@ -323,7 +323,7 @@ static int ehrpwm_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) int ret; /* Leave clock enabled on enabling PWM */ - pm_runtime_get_sync(chip->dev); + pm_runtime_get_sync(pwmchip_parent(chip)); /* Disabling Action Qualifier on PWM output */ if (pwm->hwpwm) { @@ -346,8 +346,8 @@ static int ehrpwm_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) /* Enable TBCLK */ ret = clk_enable(pc->tbclk); if (ret) { - dev_err(chip->dev, "Failed to enable TBCLK for %s: %d\n", - dev_name(chip->dev), ret); + dev_err(pwmchip_parent(chip), "Failed to enable TBCLK for %s: %d\n", + dev_name(pwmchip_parent(chip)), ret); return ret; } @@ -385,7 +385,7 @@ static void ehrpwm_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) clk_disable(pc->tbclk); /* Disable clock on PWM disable */ - pm_runtime_put_sync(chip->dev); + pm_runtime_put_sync(pwmchip_parent(chip)); } static void ehrpwm_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) @@ -393,8 +393,8 @@ static void ehrpwm_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip); if (pwm_is_enabled(pwm)) { - dev_warn(chip->dev, "Removing PWM device 
without disabling\n"); - pm_runtime_put_sync(chip->dev); + dev_warn(pwmchip_parent(chip), "Removing PWM device without disabling\n"); + pm_runtime_put_sync(pwmchip_parent(chip)); } /* set period value to zero on free */ @@ -528,7 +528,7 @@ static void ehrpwm_pwm_save_context(struct pwm_chip *chip) { struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip); - pm_runtime_get_sync(chip->dev); + pm_runtime_get_sync(pwmchip_parent(chip)); pc->ctx.tbctl = ehrpwm_read(pc->mmio_base, TBCTL); pc->ctx.tbprd = ehrpwm_read(pc->mmio_base, TBPRD); @@ -539,7 +539,7 @@ static void ehrpwm_pwm_save_context(struct pwm_chip *chip) pc->ctx.aqsfrc = ehrpwm_read(pc->mmio_base, AQSFRC); pc->ctx.aqcsfrc = ehrpwm_read(pc->mmio_base, AQCSFRC); - pm_runtime_put_sync(chip->dev); + pm_runtime_put_sync(pwmchip_parent(chip)); } static void ehrpwm_pwm_restore_context(struct pwm_chip *chip) From 420bfd50fd66fe2c6e8f631ff1b97850781ae8a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:06 +0100 Subject: [PATCH 0733/1406] pwm: tiehrpwm: Make use of devm_pwmchip_alloc() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the pwm-tiehrpwm driver to further changes of the pwm core outlined in the commit introducing devm_pwmchip_alloc(). There is no intended semantical change and the driver should behave as before. Link: https://lore.kernel.org/r/62fbac428cae0942f8e88234bf249537fcd890a3.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-tiehrpwm.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/pwm/pwm-tiehrpwm.c b/drivers/pwm/pwm-tiehrpwm.c index 9a7c721961738b..e5104725d9b702 100644 --- a/drivers/pwm/pwm-tiehrpwm.c +++ b/drivers/pwm/pwm-tiehrpwm.c @@ -105,7 +105,6 @@ struct ehrpwm_context { }; struct ehrpwm_pwm_chip { - struct pwm_chip chip; unsigned long clk_rate; void __iomem *mmio_base; unsigned long period_cycles[NUM_PWM_CHANNEL]; @@ -116,7 +115,7 @@ struct ehrpwm_pwm_chip { static inline struct ehrpwm_pwm_chip *to_ehrpwm_pwm_chip(struct pwm_chip *chip) { - return container_of(chip, struct ehrpwm_pwm_chip, chip); + return pwmchip_get_drvdata(chip); } static inline u16 ehrpwm_read(void __iomem *base, unsigned int offset) @@ -454,9 +453,10 @@ static int ehrpwm_pwm_probe(struct platform_device *pdev) struct clk *clk; int ret; - pc = devm_kzalloc(&pdev->dev, sizeof(*pc), GFP_KERNEL); - if (!pc) - return -ENOMEM; + chip = devm_pwmchip_alloc(&pdev->dev, NUM_PWM_CHANNEL, sizeof(*pc)); + if (IS_ERR(chip)) + return PTR_ERR(chip); + pc = to_ehrpwm_pwm_chip(chip); clk = devm_clk_get(&pdev->dev, "fck"); if (IS_ERR(clk)) { @@ -475,10 +475,7 @@ static int ehrpwm_pwm_probe(struct platform_device *pdev) return -EINVAL; } - chip = &pc->chip; - chip->dev = &pdev->dev; chip->ops = &ehrpwm_pwm_ops; - chip->npwm = NUM_PWM_CHANNEL; pc->mmio_base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(pc->mmio_base)) From 022f650598e9f948ebc6554e34449fd0f70fef1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:07 +0100 Subject: [PATCH 0734/1406] pwm: twl: Make use of pwmchip_parent() accessor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit struct pwm_chip::dev is about to change. To not have to touch this driver in the same commit as struct pwm_chip::dev, use the accessor function provided for exactly this purpose. 
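Why container_of() stops working after these conversions: with devm_pwmchip_alloc() the chip is no longer a member of the driver structure; both live in one allocation owned by the core. The layout below is illustrative only, the exact arrangement being an implementation detail of the PWM core:

/*
 *   chip = devm_pwmchip_alloc(dev, npwm, sizeof(struct foo_pwm_chip));
 *
 *   +---------------------+  <- chip (struct pwm_chip *)
 *   | struct pwm_chip     |
 *   +---------------------+  <- pwmchip_get_drvdata(chip)
 *   | struct foo_pwm_chip |
 *   +---------------------+
 *
 * The driver struct no longer embeds the chip, so container_of() on the
 * chip pointer has nothing to resolve against; pwmchip_get_drvdata() is
 * the supported way back to the private data.
 */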
Link: https://lore.kernel.org/r/f11beb6b3a398d1257219a635a78ed0b02263978.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-twl.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/pwm/pwm-twl.c b/drivers/pwm/pwm-twl.c index 68e02c9a6bf9a1..7233ae039bdb2e 100644 --- a/drivers/pwm/pwm-twl.c +++ b/drivers/pwm/pwm-twl.c @@ -86,7 +86,7 @@ static int twl_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, ret = twl_i2c_write(TWL_MODULE_PWM, pwm_config, base, 2); if (ret < 0) - dev_err(chip->dev, "%s: Failed to configure PWM\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to configure PWM\n", pwm->label); return ret; } @@ -100,7 +100,7 @@ static int twl4030_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) mutex_lock(&twl->mutex); ret = twl_i2c_read_u8(TWL4030_MODULE_INTBR, &val, TWL4030_GPBR1_REG); if (ret < 0) { - dev_err(chip->dev, "%s: Failed to read GPBR1\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to read GPBR1\n", pwm->label); goto out; } @@ -108,13 +108,13 @@ static int twl4030_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) ret = twl_i2c_write_u8(TWL4030_MODULE_INTBR, val, TWL4030_GPBR1_REG); if (ret < 0) - dev_err(chip->dev, "%s: Failed to enable PWM\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to enable PWM\n", pwm->label); val |= TWL4030_PWM_TOGGLE(pwm->hwpwm, TWL4030_PWMX_ENABLE); ret = twl_i2c_write_u8(TWL4030_MODULE_INTBR, val, TWL4030_GPBR1_REG); if (ret < 0) - dev_err(chip->dev, "%s: Failed to enable PWM\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to enable PWM\n", pwm->label); out: mutex_unlock(&twl->mutex); @@ -130,7 +130,7 @@ static void twl4030_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) mutex_lock(&twl->mutex); ret = twl_i2c_read_u8(TWL4030_MODULE_INTBR, &val, TWL4030_GPBR1_REG); if (ret < 0) { - dev_err(chip->dev, "%s: Failed to read GPBR1\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to read GPBR1\n", pwm->label); goto out; } @@ -138,13 +138,13 @@ static void twl4030_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) ret = twl_i2c_write_u8(TWL4030_MODULE_INTBR, val, TWL4030_GPBR1_REG); if (ret < 0) - dev_err(chip->dev, "%s: Failed to disable PWM\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to disable PWM\n", pwm->label); val &= ~TWL4030_PWM_TOGGLE(pwm->hwpwm, TWL4030_PWMXCLK_ENABLE); ret = twl_i2c_write_u8(TWL4030_MODULE_INTBR, val, TWL4030_GPBR1_REG); if (ret < 0) - dev_err(chip->dev, "%s: Failed to disable PWM\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to disable PWM\n", pwm->label); out: mutex_unlock(&twl->mutex); @@ -167,7 +167,7 @@ static int twl4030_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) mutex_lock(&twl->mutex); ret = twl_i2c_read_u8(TWL4030_MODULE_INTBR, &val, TWL4030_PMBR1_REG); if (ret < 0) { - dev_err(chip->dev, "%s: Failed to read PMBR1\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to read PMBR1\n", pwm->label); goto out; } @@ -181,7 +181,7 @@ static int twl4030_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) ret = twl_i2c_write_u8(TWL4030_MODULE_INTBR, val, TWL4030_PMBR1_REG); if (ret < 0) - dev_err(chip->dev, "%s: Failed to request PWM\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to request PWM\n", pwm->label); out: mutex_unlock(&twl->mutex); @@ -202,7 +202,7 @@ static void twl4030_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) 
mutex_lock(&twl->mutex); ret = twl_i2c_read_u8(TWL4030_MODULE_INTBR, &val, TWL4030_PMBR1_REG); if (ret < 0) { - dev_err(chip->dev, "%s: Failed to read PMBR1\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to read PMBR1\n", pwm->label); goto out; } @@ -212,7 +212,7 @@ static void twl4030_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) ret = twl_i2c_write_u8(TWL4030_MODULE_INTBR, val, TWL4030_PMBR1_REG); if (ret < 0) - dev_err(chip->dev, "%s: Failed to free PWM\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to free PWM\n", pwm->label); out: mutex_unlock(&twl->mutex); @@ -231,7 +231,7 @@ static int twl6030_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) ret = twl_i2c_write_u8(TWL6030_MODULE_ID1, val, TWL6030_TOGGLE3_REG); if (ret < 0) { - dev_err(chip->dev, "%s: Failed to enable PWM\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to enable PWM\n", pwm->label); goto out; } @@ -254,7 +254,7 @@ static void twl6030_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) ret = twl_i2c_write_u8(TWL6030_MODULE_ID1, val, TWL6030_TOGGLE3_REG); if (ret < 0) { - dev_err(chip->dev, "%s: Failed to disable PWM\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to disable PWM\n", pwm->label); goto out; } @@ -262,7 +262,7 @@ static void twl6030_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) ret = twl_i2c_write_u8(TWL6030_MODULE_ID1, val, TWL6030_TOGGLE3_REG); if (ret < 0) { - dev_err(chip->dev, "%s: Failed to disable PWM\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to disable PWM\n", pwm->label); goto out; } @@ -270,7 +270,7 @@ static void twl6030_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) ret = twl_i2c_write_u8(TWL6030_MODULE_ID1, val, TWL6030_TOGGLE3_REG); if (ret < 0) { - dev_err(chip->dev, "%s: Failed to disable PWM\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to disable PWM\n", pwm->label); goto out; } From a6b3691c0df1b7bf8aa55aef20a46f56d2cbff21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:08 +0100 Subject: [PATCH 0735/1406] pwm: twl: Make use of devm_pwmchip_alloc() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the pwm-twl driver to further changes of the pwm core outlined in the commit introducing devm_pwmchip_alloc(). There is no intended semantical change and the driver should behave as before. 
Link: https://lore.kernel.org/r/c65e796d46df71cd8d0d0941921997b9501f1cb3.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-twl.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/drivers/pwm/pwm-twl.c b/drivers/pwm/pwm-twl.c index 7233ae039bdb2e..8f981ffff4b4e2 100644 --- a/drivers/pwm/pwm-twl.c +++ b/drivers/pwm/pwm-twl.c @@ -46,7 +46,6 @@ #define TWL6030_PWM_TOGGLE(pwm, x) ((x) << (pwm * 3)) struct twl_pwm_chip { - struct pwm_chip chip; struct mutex mutex; u8 twl6030_toggle3; u8 twl4030_pwm_mux; @@ -54,7 +53,7 @@ struct twl_pwm_chip { static inline struct twl_pwm_chip *to_twl(struct pwm_chip *chip) { - return container_of(chip, struct twl_pwm_chip, chip); + return pwmchip_get_drvdata(chip); } static int twl_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, @@ -341,23 +340,22 @@ static const struct pwm_ops twl6030_pwm_ops = { static int twl_pwm_probe(struct platform_device *pdev) { + struct pwm_chip *chip; struct twl_pwm_chip *twl; - twl = devm_kzalloc(&pdev->dev, sizeof(*twl), GFP_KERNEL); - if (!twl) - return -ENOMEM; + chip = devm_pwmchip_alloc(&pdev->dev, 2, sizeof(*twl)); + if (IS_ERR(chip)) + return PTR_ERR(chip); + twl = to_twl(chip); if (twl_class_is_4030()) - twl->chip.ops = &twl4030_pwm_ops; + chip->ops = &twl4030_pwm_ops; else - twl->chip.ops = &twl6030_pwm_ops; - - twl->chip.dev = &pdev->dev; - twl->chip.npwm = 2; + chip->ops = &twl6030_pwm_ops; mutex_init(&twl->mutex); - return devm_pwmchip_add(&pdev->dev, &twl->chip); + return devm_pwmchip_add(&pdev->dev, chip); } #ifdef CONFIG_OF From a5c1791300442e5a2eda687b464fb3f30da3f79d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:09 +0100 Subject: [PATCH 0736/1406] pwm: twl-led: Make use of pwmchip_parent() accessor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit struct pwm_chip::dev is about to change. To not have to touch this driver in the same commit as struct pwm_chip::dev, use the accessor function provided for exactly this purpose. 
Link: https://lore.kernel.org/r/43c35b7116a637501b51ac93ec24c00ea92ee1af.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-twl-led.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/pwm/pwm-twl-led.c b/drivers/pwm/pwm-twl-led.c index c670ccb8165360..00ef798dae5203 100644 --- a/drivers/pwm/pwm-twl-led.c +++ b/drivers/pwm/pwm-twl-led.c @@ -100,7 +100,7 @@ static int twl4030_pwmled_config(struct pwm_chip *chip, struct pwm_device *pwm, ret = twl_i2c_write(TWL4030_MODULE_LED, pwm_config, base, 2); if (ret < 0) - dev_err(chip->dev, "%s: Failed to configure PWM\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to configure PWM\n", pwm->label); return ret; } @@ -114,7 +114,7 @@ static int twl4030_pwmled_enable(struct pwm_chip *chip, struct pwm_device *pwm) mutex_lock(&twl->mutex); ret = twl_i2c_read_u8(TWL4030_MODULE_LED, &val, TWL4030_LEDEN_REG); if (ret < 0) { - dev_err(chip->dev, "%s: Failed to read LEDEN\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to read LEDEN\n", pwm->label); goto out; } @@ -122,7 +122,7 @@ static int twl4030_pwmled_enable(struct pwm_chip *chip, struct pwm_device *pwm) ret = twl_i2c_write_u8(TWL4030_MODULE_LED, val, TWL4030_LEDEN_REG); if (ret < 0) - dev_err(chip->dev, "%s: Failed to enable PWM\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to enable PWM\n", pwm->label); out: mutex_unlock(&twl->mutex); @@ -139,7 +139,7 @@ static void twl4030_pwmled_disable(struct pwm_chip *chip, mutex_lock(&twl->mutex); ret = twl_i2c_read_u8(TWL4030_MODULE_LED, &val, TWL4030_LEDEN_REG); if (ret < 0) { - dev_err(chip->dev, "%s: Failed to read LEDEN\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to read LEDEN\n", pwm->label); goto out; } @@ -147,7 +147,7 @@ static void twl4030_pwmled_disable(struct pwm_chip *chip, ret = twl_i2c_write_u8(TWL4030_MODULE_LED, val, TWL4030_LEDEN_REG); if (ret < 0) - dev_err(chip->dev, "%s: Failed to disable PWM\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to disable PWM\n", pwm->label); out: mutex_unlock(&twl->mutex); @@ -203,7 +203,7 @@ static int twl6030_pwmled_config(struct pwm_chip *chip, struct pwm_device *pwm, ret = twl_i2c_write_u8(TWL6030_MODULE_ID1, on_time, TWL6030_LED_PWM_CTRL1); if (ret < 0) - dev_err(chip->dev, "%s: Failed to configure PWM\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to configure PWM\n", pwm->label); return ret; } @@ -217,7 +217,7 @@ static int twl6030_pwmled_enable(struct pwm_chip *chip, struct pwm_device *pwm) mutex_lock(&twl->mutex); ret = twl_i2c_read_u8(TWL6030_MODULE_ID1, &val, TWL6030_LED_PWM_CTRL2); if (ret < 0) { - dev_err(chip->dev, "%s: Failed to read PWM_CTRL2\n", + dev_err(pwmchip_parent(chip), "%s: Failed to read PWM_CTRL2\n", pwm->label); goto out; } @@ -227,7 +227,7 @@ static int twl6030_pwmled_enable(struct pwm_chip *chip, struct pwm_device *pwm) ret = twl_i2c_write_u8(TWL6030_MODULE_ID1, val, TWL6030_LED_PWM_CTRL2); if (ret < 0) - dev_err(chip->dev, "%s: Failed to enable PWM\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to enable PWM\n", pwm->label); out: mutex_unlock(&twl->mutex); @@ -244,7 +244,7 @@ static void twl6030_pwmled_disable(struct pwm_chip *chip, mutex_lock(&twl->mutex); ret = twl_i2c_read_u8(TWL6030_MODULE_ID1, &val, TWL6030_LED_PWM_CTRL2); if (ret < 0) { - dev_err(chip->dev, "%s: Failed to read PWM_CTRL2\n", + dev_err(pwmchip_parent(chip), "%s: Failed to read PWM_CTRL2\n", pwm->label); goto out; } @@ 
-254,7 +254,7 @@ static void twl6030_pwmled_disable(struct pwm_chip *chip, ret = twl_i2c_write_u8(TWL6030_MODULE_ID1, val, TWL6030_LED_PWM_CTRL2); if (ret < 0) - dev_err(chip->dev, "%s: Failed to disable PWM\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to disable PWM\n", pwm->label); out: mutex_unlock(&twl->mutex); @@ -295,7 +295,7 @@ static int twl6030_pwmled_request(struct pwm_chip *chip, struct pwm_device *pwm) mutex_lock(&twl->mutex); ret = twl_i2c_read_u8(TWL6030_MODULE_ID1, &val, TWL6030_LED_PWM_CTRL2); if (ret < 0) { - dev_err(chip->dev, "%s: Failed to read PWM_CTRL2\n", + dev_err(pwmchip_parent(chip), "%s: Failed to read PWM_CTRL2\n", pwm->label); goto out; } @@ -305,7 +305,7 @@ static int twl6030_pwmled_request(struct pwm_chip *chip, struct pwm_device *pwm) ret = twl_i2c_write_u8(TWL6030_MODULE_ID1, val, TWL6030_LED_PWM_CTRL2); if (ret < 0) - dev_err(chip->dev, "%s: Failed to request PWM\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to request PWM\n", pwm->label); out: mutex_unlock(&twl->mutex); @@ -321,7 +321,7 @@ static void twl6030_pwmled_free(struct pwm_chip *chip, struct pwm_device *pwm) mutex_lock(&twl->mutex); ret = twl_i2c_read_u8(TWL6030_MODULE_ID1, &val, TWL6030_LED_PWM_CTRL2); if (ret < 0) { - dev_err(chip->dev, "%s: Failed to read PWM_CTRL2\n", + dev_err(pwmchip_parent(chip), "%s: Failed to read PWM_CTRL2\n", pwm->label); goto out; } @@ -331,7 +331,7 @@ static void twl6030_pwmled_free(struct pwm_chip *chip, struct pwm_device *pwm) ret = twl_i2c_write_u8(TWL6030_MODULE_ID1, val, TWL6030_LED_PWM_CTRL2); if (ret < 0) - dev_err(chip->dev, "%s: Failed to free PWM\n", pwm->label); + dev_err(pwmchip_parent(chip), "%s: Failed to free PWM\n", pwm->label); out: mutex_unlock(&twl->mutex); From ef9faf9f868b0d1cb7b9739ca041dcbeab820ab9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:10 +0100 Subject: [PATCH 0737/1406] pwm: twl-led: Make use of devm_pwmchip_alloc() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the pwm-twl-led driver to further changes of the pwm core outlined in the commit introducing devm_pwmchip_alloc(). There is no intended semantical change and the driver should behave as before. 
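The conversion that follows has a small twist: both the ops table and the channel count depend on the chip variant, and both must be settled before the allocation. A hedged sketch of the resulting probe shape, with invented foo_* names standing in for the twl-led specifics:

static int foo_pwmled_probe(struct platform_device *pdev)
{
        const struct pwm_ops *ops;
        struct pwm_chip *chip;
        unsigned int npwm;

        if (foo_is_variant_a()) {       /* stands in for twl_class_is_4030() */
                ops = &foo_variant_a_ops;
                npwm = 2;
        } else {
                ops = &foo_variant_b_ops;
                npwm = 1;
        }

        chip = devm_pwmchip_alloc(&pdev->dev, npwm, sizeof(struct foo_chip));
        if (IS_ERR(chip))
                return PTR_ERR(chip);

        chip->ops = ops;
        return devm_pwmchip_add(&pdev->dev, chip);
}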
Link: https://lore.kernel.org/r/5bac90addb3a178ef958a2a524c6ec7e3eea3e6a.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-twl-led.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/drivers/pwm/pwm-twl-led.c b/drivers/pwm/pwm-twl-led.c index 00ef798dae5203..4b10a8dab31244 100644 --- a/drivers/pwm/pwm-twl-led.c +++ b/drivers/pwm/pwm-twl-led.c @@ -62,13 +62,12 @@ #define TWL6040_LED_MODE_MASK 0x03 struct twl_pwmled_chip { - struct pwm_chip chip; struct mutex mutex; }; static inline struct twl_pwmled_chip *to_twl(struct pwm_chip *chip) { - return container_of(chip, struct twl_pwmled_chip, chip); + return pwmchip_get_drvdata(chip); } static int twl4030_pwmled_config(struct pwm_chip *chip, struct pwm_device *pwm, @@ -345,25 +344,29 @@ static const struct pwm_ops twl6030_pwmled_ops = { static int twl_pwmled_probe(struct platform_device *pdev) { + struct pwm_chip *chip; struct twl_pwmled_chip *twl; - - twl = devm_kzalloc(&pdev->dev, sizeof(*twl), GFP_KERNEL); - if (!twl) - return -ENOMEM; + unsigned int npwm; + const struct pwm_ops *ops; if (twl_class_is_4030()) { - twl->chip.ops = &twl4030_pwmled_ops; - twl->chip.npwm = 2; + ops = &twl4030_pwmled_ops; + npwm = 2; } else { - twl->chip.ops = &twl6030_pwmled_ops; - twl->chip.npwm = 1; + ops = &twl6030_pwmled_ops; + npwm = 1; } - twl->chip.dev = &pdev->dev; + chip = devm_pwmchip_alloc(&pdev->dev, npwm, sizeof(*twl)); + if (IS_ERR(chip)) + return PTR_ERR(chip); + twl = to_twl(chip); + + chip->ops = ops; mutex_init(&twl->mutex); - return devm_pwmchip_add(&pdev->dev, &twl->chip); + return devm_pwmchip_add(&pdev->dev, chip); } #ifdef CONFIG_OF From cca8d1e0ae2179e62940a348d8d180c025c536dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:11 +0100 Subject: [PATCH 0738/1406] pwm: visconti: Make use of devm_pwmchip_alloc() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the pwm-visconti driver to further changes of the pwm core outlined in the commit introducing devm_pwmchip_alloc(). There is no intended semantical change and the driver should behave as before. 
Link: https://lore.kernel.org/r/24e779de69365686bb004742cd8f07cbda131212.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-visconti.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/pwm/pwm-visconti.c b/drivers/pwm/pwm-visconti.c index 8d736d55812216..9e55380957be5f 100644 --- a/drivers/pwm/pwm-visconti.c +++ b/drivers/pwm/pwm-visconti.c @@ -34,13 +34,12 @@ #define PIPGM_PWMC_POLARITY_MASK GENMASK(5, 5) struct visconti_pwm_chip { - struct pwm_chip chip; void __iomem *base; }; static inline struct visconti_pwm_chip *visconti_pwm_from_chip(struct pwm_chip *chip) { - return container_of(chip, struct visconti_pwm_chip, chip); + return pwmchip_get_drvdata(chip); } static int visconti_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, @@ -134,22 +133,22 @@ static const struct pwm_ops visconti_pwm_ops = { static int visconti_pwm_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; + struct pwm_chip *chip; struct visconti_pwm_chip *priv; int ret; - priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); - if (!priv) - return -ENOMEM; + chip = devm_pwmchip_alloc(dev, 4, sizeof(*priv)); + if (IS_ERR(chip)) + return PTR_ERR(chip); + priv = visconti_pwm_from_chip(chip); priv->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(priv->base)) return PTR_ERR(priv->base); - priv->chip.dev = dev; - priv->chip.ops = &visconti_pwm_ops; - priv->chip.npwm = 4; + chip->ops = &visconti_pwm_ops; - ret = devm_pwmchip_add(&pdev->dev, &priv->chip); + ret = devm_pwmchip_add(&pdev->dev, chip); if (ret < 0) return dev_err_probe(&pdev->dev, ret, "Cannot register visconti PWM\n"); From 05513ae63f6f048de399e2422a792cd4f74b749e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:12 +0100 Subject: [PATCH 0739/1406] pwm: vt8500: Change prototype of a helper to prepare further changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the driver for further changes that will make it harder to determine the pwm_chip from a given vt8500_chip. To just not have to do that, rework vt8500_pwm_busy_wait() to take a pwm_chip. 
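The helper rework described above keeps register access behind the private struct but reports timeouts against the chip's device, which at this stage of the series is still chip->dev. A simplified sketch (FOO_STATUS and the loop bound are invented):

static void foo_pwm_busy_wait(struct pwm_chip *chip, unsigned int hwpwm, u32 mask)
{
        struct foo_pwm_chip *fc = to_foo_pwm_chip(chip);
        unsigned int loops = 1000;

        /* Poll the hypothetical per-channel status register until clear */
        while ((readl(fc->base + FOO_STATUS(hwpwm)) & mask) && --loops)
                cpu_relax();

        if (!loops)
                dev_warn(chip->dev, "status bits 0x%x did not clear\n", mask);
}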
Link: https://lore.kernel.org/r/fb384c550b359e7707219f87872bcf36482875ff.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-vt8500.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/pwm/pwm-vt8500.c b/drivers/pwm/pwm-vt8500.c index 7bfeacee05d0fa..3646bd45b10172 100644 --- a/drivers/pwm/pwm-vt8500.c +++ b/drivers/pwm/pwm-vt8500.c @@ -53,8 +53,9 @@ struct vt8500_chip { #define to_vt8500_chip(chip) container_of(chip, struct vt8500_chip, chip) #define msecs_to_loops(t) (loops_per_jiffy / 1000 * HZ * t) -static inline void vt8500_pwm_busy_wait(struct vt8500_chip *vt8500, int nr, u8 bitmask) +static inline void vt8500_pwm_busy_wait(struct pwm_chip *chip, int nr, u8 bitmask) { + struct vt8500_chip *vt8500 = to_vt8500_chip(chip); int loops = msecs_to_loops(10); u32 mask = bitmask << (nr << 8); @@ -62,7 +63,7 @@ static inline void vt8500_pwm_busy_wait(struct vt8500_chip *vt8500, int nr, u8 b cpu_relax(); if (unlikely(!loops)) - dev_warn(vt8500->chip.dev, "Waiting for status bits 0x%x to clear timed out\n", + dev_warn(chip->dev, "Waiting for status bits 0x%x to clear timed out\n", mask); } @@ -103,18 +104,18 @@ static int vt8500_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, dc = div64_u64(c, period_ns); writel(prescale, vt8500->base + REG_SCALAR(pwm->hwpwm)); - vt8500_pwm_busy_wait(vt8500, pwm->hwpwm, STATUS_SCALAR_UPDATE); + vt8500_pwm_busy_wait(chip, pwm->hwpwm, STATUS_SCALAR_UPDATE); writel(pv, vt8500->base + REG_PERIOD(pwm->hwpwm)); - vt8500_pwm_busy_wait(vt8500, pwm->hwpwm, STATUS_PERIOD_UPDATE); + vt8500_pwm_busy_wait(chip, pwm->hwpwm, STATUS_PERIOD_UPDATE); writel(dc, vt8500->base + REG_DUTY(pwm->hwpwm)); - vt8500_pwm_busy_wait(vt8500, pwm->hwpwm, STATUS_DUTY_UPDATE); + vt8500_pwm_busy_wait(chip, pwm->hwpwm, STATUS_DUTY_UPDATE); val = readl(vt8500->base + REG_CTRL(pwm->hwpwm)); val |= CTRL_AUTOLOAD; writel(val, vt8500->base + REG_CTRL(pwm->hwpwm)); - vt8500_pwm_busy_wait(vt8500, pwm->hwpwm, STATUS_CTRL_UPDATE); + vt8500_pwm_busy_wait(chip, pwm->hwpwm, STATUS_CTRL_UPDATE); clk_disable(vt8500->clk); return 0; @@ -135,7 +136,7 @@ static int vt8500_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) val = readl(vt8500->base + REG_CTRL(pwm->hwpwm)); val |= CTRL_ENABLE; writel(val, vt8500->base + REG_CTRL(pwm->hwpwm)); - vt8500_pwm_busy_wait(vt8500, pwm->hwpwm, STATUS_CTRL_UPDATE); + vt8500_pwm_busy_wait(chip, pwm->hwpwm, STATUS_CTRL_UPDATE); return 0; } @@ -148,7 +149,7 @@ static void vt8500_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) val = readl(vt8500->base + REG_CTRL(pwm->hwpwm)); val &= ~CTRL_ENABLE; writel(val, vt8500->base + REG_CTRL(pwm->hwpwm)); - vt8500_pwm_busy_wait(vt8500, pwm->hwpwm, STATUS_CTRL_UPDATE); + vt8500_pwm_busy_wait(chip, pwm->hwpwm, STATUS_CTRL_UPDATE); clk_disable(vt8500->clk); } @@ -168,7 +169,7 @@ static int vt8500_pwm_set_polarity(struct pwm_chip *chip, val &= ~CTRL_INVERT; writel(val, vt8500->base + REG_CTRL(pwm->hwpwm)); - vt8500_pwm_busy_wait(vt8500, pwm->hwpwm, STATUS_CTRL_UPDATE); + vt8500_pwm_busy_wait(chip, pwm->hwpwm, STATUS_CTRL_UPDATE); return 0; } From 2dfea59c26984d4e82c70f9a8f18764477161284 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:13 +0100 Subject: [PATCH 0740/1406] pwm: vt8500: Introduce a local pwm_chip variable in .probe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This simplifies converting the driver to pwmchip_alloc() as there 
is only a single code line left that makes use of struct vt8500_chip::chip. Link: https://lore.kernel.org/r/7d903b608609d46cf1ee1e06530f516f42af1ebb.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-vt8500.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/pwm/pwm-vt8500.c b/drivers/pwm/pwm-vt8500.c index 3646bd45b10172..6da9b8f88afcb7 100644 --- a/drivers/pwm/pwm-vt8500.c +++ b/drivers/pwm/pwm-vt8500.c @@ -232,6 +232,7 @@ MODULE_DEVICE_TABLE(of, vt8500_pwm_dt_ids); static int vt8500_pwm_probe(struct platform_device *pdev) { + struct pwm_chip *chip; struct vt8500_chip *vt8500; struct device_node *np = pdev->dev.of_node; int ret; @@ -243,9 +244,10 @@ static int vt8500_pwm_probe(struct platform_device *pdev) if (vt8500 == NULL) return -ENOMEM; - vt8500->chip.dev = &pdev->dev; - vt8500->chip.ops = &vt8500_pwm_ops; - vt8500->chip.npwm = VT8500_NR_PWMS; + chip = &vt8500->chip; + chip->dev = &pdev->dev; + chip->ops = &vt8500_pwm_ops; + chip->npwm = VT8500_NR_PWMS; vt8500->clk = devm_clk_get_prepared(&pdev->dev, NULL); if (IS_ERR(vt8500->clk)) @@ -255,7 +257,7 @@ static int vt8500_pwm_probe(struct platform_device *pdev) if (IS_ERR(vt8500->base)) return PTR_ERR(vt8500->base); - ret = devm_pwmchip_add(&pdev->dev, &vt8500->chip); + ret = devm_pwmchip_add(&pdev->dev, chip); if (ret < 0) return dev_err_probe(&pdev->dev, ret, "failed to add PWM chip\n"); From 0b2a084ff7597899f832292df61b3930fcad0121 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:14 +0100 Subject: [PATCH 0741/1406] pwm: vt8500: Make use of pwmchip_parent() accessor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit struct pwm_chip::dev is about to change. To not have to touch this driver in the same commit as struct pwm_chip::dev, use the accessor function provided for exactly this purpose. 
Link: https://lore.kernel.org/r/c3c45a08f2ccb8bb13b4042c73f93064876586eb.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-vt8500.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pwm/pwm-vt8500.c b/drivers/pwm/pwm-vt8500.c index 6da9b8f88afcb7..fb4dbc88b0a3eb 100644 --- a/drivers/pwm/pwm-vt8500.c +++ b/drivers/pwm/pwm-vt8500.c @@ -63,7 +63,7 @@ static inline void vt8500_pwm_busy_wait(struct pwm_chip *chip, int nr, u8 bitmas cpu_relax(); if (unlikely(!loops)) - dev_warn(chip->dev, "Waiting for status bits 0x%x to clear timed out\n", + dev_warn(pwmchip_parent(chip), "Waiting for status bits 0x%x to clear timed out\n", mask); } @@ -78,7 +78,7 @@ static int vt8500_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, err = clk_enable(vt8500->clk); if (err < 0) { - dev_err(chip->dev, "failed to enable clock\n"); + dev_err(pwmchip_parent(chip), "failed to enable clock\n"); return err; } @@ -129,7 +129,7 @@ static int vt8500_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) err = clk_enable(vt8500->clk); if (err < 0) { - dev_err(chip->dev, "failed to enable clock\n"); + dev_err(pwmchip_parent(chip), "failed to enable clock\n"); return err; } From fb9fc6aa5f66ed86db7308efcc1c19ad2e1d3494 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:15 +0100 Subject: [PATCH 0742/1406] pwm: vt8500: Make use of devm_pwmchip_alloc() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the pwm-vt8500 driver to further changes of the pwm core outlined in the commit introducing devm_pwmchip_alloc(). There is no intended semantical change and the driver should behave as before. Also convert the to_vt8500_chip() helper macro to a static inline to get some type safety. 
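On the "type safety" remark above: a container_of()-based macro accepts any pointer and lets mistakes surface somewhere inside the expansion, while a static inline declares its parameter type. Shown side by side for contrast with invented names (the two forms are alternatives, not meant to be compiled together):

/* Macro: nothing checks that 'chip' really is a struct pwm_chip *. */
#define to_foo_chip(chip) container_of(chip, struct foo_chip, chip)

/* Static inline: the compiler enforces the parameter type. */
static inline struct foo_chip *to_foo_chip(struct pwm_chip *chip)
{
        return pwmchip_get_drvdata(chip);
}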
Link: https://lore.kernel.org/r/b203c4448db23ebad1165b7bce43ac41468c4e89.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-vt8500.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/pwm/pwm-vt8500.c b/drivers/pwm/pwm-vt8500.c index fb4dbc88b0a3eb..016c82d6552718 100644 --- a/drivers/pwm/pwm-vt8500.c +++ b/drivers/pwm/pwm-vt8500.c @@ -45,12 +45,14 @@ #define STATUS_ALL_UPDATE 0x0F struct vt8500_chip { - struct pwm_chip chip; void __iomem *base; struct clk *clk; }; -#define to_vt8500_chip(chip) container_of(chip, struct vt8500_chip, chip) +static inline struct vt8500_chip *to_vt8500_chip(struct pwm_chip *chip) +{ + return pwmchip_get_drvdata(chip); +} #define msecs_to_loops(t) (loops_per_jiffy / 1000 * HZ * t) static inline void vt8500_pwm_busy_wait(struct pwm_chip *chip, int nr, u8 bitmask) @@ -240,14 +242,12 @@ static int vt8500_pwm_probe(struct platform_device *pdev) if (!np) return dev_err_probe(&pdev->dev, -EINVAL, "invalid devicetree node\n"); - vt8500 = devm_kzalloc(&pdev->dev, sizeof(*vt8500), GFP_KERNEL); - if (vt8500 == NULL) - return -ENOMEM; + chip = devm_pwmchip_alloc(&pdev->dev, VT8500_NR_PWMS, sizeof(*vt8500)); + if (IS_ERR(chip)) + return PTR_ERR(chip); + vt8500 = to_vt8500_chip(chip); - chip = &vt8500->chip; - chip->dev = &pdev->dev; chip->ops = &vt8500_pwm_ops; - chip->npwm = VT8500_NR_PWMS; vt8500->clk = devm_clk_get_prepared(&pdev->dev, NULL); if (IS_ERR(vt8500->clk)) From c0098705d700ece27f6d5c98fb8ad5a049d44750 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:16 +0100 Subject: [PATCH 0743/1406] pwm: xilinx: Prepare removing pwm_chip from driver data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the driver for further changes that will drop struct pwm_chip chip from struct xilinx_pwm_device. Use the pwm_chip as driver data instead of the xilinx_pwm_device to get access to the pwm_chip in xilinx_pwm_remove() without using xilinx_pwm->chip. 
Link: https://lore.kernel.org/r/738b9929c1d13bde64050f8bbc4ce8d85f58cc7a.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-xilinx.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/pwm/pwm-xilinx.c b/drivers/pwm/pwm-xilinx.c index 5f3c2a6fed11c3..0ca79fe8c10520 100644 --- a/drivers/pwm/pwm-xilinx.c +++ b/drivers/pwm/pwm-xilinx.c @@ -214,6 +214,7 @@ static int xilinx_pwm_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct device_node *np = dev->of_node; struct xilinx_timer_priv *priv; + struct pwm_chip *chip; struct xilinx_pwm_device *xilinx_pwm; u32 pwm_cells, one_timer, width; void __iomem *regs; @@ -228,8 +229,9 @@ static int xilinx_pwm_probe(struct platform_device *pdev) xilinx_pwm = devm_kzalloc(dev, sizeof(*xilinx_pwm), GFP_KERNEL); if (!xilinx_pwm) return -ENOMEM; - platform_set_drvdata(pdev, xilinx_pwm); priv = &xilinx_pwm->priv; + chip = &xilinx_pwm->chip; + platform_set_drvdata(pdev, chip); regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(regs)) @@ -278,10 +280,10 @@ static int xilinx_pwm_probe(struct platform_device *pdev) return dev_err_probe(dev, ret, "Clock enable failed\n"); clk_rate_exclusive_get(priv->clk); - xilinx_pwm->chip.dev = dev; - xilinx_pwm->chip.ops = &xilinx_pwm_ops; - xilinx_pwm->chip.npwm = 1; - ret = pwmchip_add(&xilinx_pwm->chip); + chip->dev = dev; + chip->ops = &xilinx_pwm_ops; + chip->npwm = 1; + ret = pwmchip_add(chip); if (ret) { clk_rate_exclusive_put(priv->clk); clk_disable_unprepare(priv->clk); @@ -293,11 +295,12 @@ static int xilinx_pwm_probe(struct platform_device *pdev) static void xilinx_pwm_remove(struct platform_device *pdev) { - struct xilinx_pwm_device *xilinx_pwm = platform_get_drvdata(pdev); + struct pwm_chip *chip = platform_get_drvdata(pdev); + struct xilinx_timer_priv *priv = xilinx_pwm_chip_to_priv(chip); - pwmchip_remove(&xilinx_pwm->chip); - clk_rate_exclusive_put(xilinx_pwm->priv.clk); - clk_disable_unprepare(xilinx_pwm->priv.clk); + pwmchip_remove(chip); + clk_rate_exclusive_put(priv->clk); + clk_disable_unprepare(priv->clk); } static const struct of_device_id xilinx_pwm_of_match[] = { From 57721161c73bde0e8faa62a7c94dda7831791123 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:17 +0100 Subject: [PATCH 0744/1406] pwm: xilinx: Make use of devm_pwmchip_alloc() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the pwm-xilinx driver to further changes of the pwm core outlined in the commit introducing devm_pwmchip_alloc(). There is no intended semantical change and the driver should behave as before. 
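One detail of the xilinx conversion worth spelling out: the driver keeps non-devm clock handling, so a pwmchip_add() failure must unwind manually, and remove() mirrors the same steps. Condensed from the probe flow shown in the two xilinx patches (surrounding code elided):

        ret = clk_prepare_enable(priv->clk);
        if (ret)
                return dev_err_probe(dev, ret, "Clock enable failed\n");
        clk_rate_exclusive_get(priv->clk);

        chip->ops = &xilinx_pwm_ops;
        ret = pwmchip_add(chip);        /* not devm-managed */
        if (ret) {
                /* manual unwind of the non-devm resources */
                clk_rate_exclusive_put(priv->clk);
                clk_disable_unprepare(priv->clk);
                return ret;
        }

        return 0;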
Link: https://lore.kernel.org/r/7cbc32771e94103b8c1c817cfdd613d7a2fc01b9.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-xilinx.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/drivers/pwm/pwm-xilinx.c b/drivers/pwm/pwm-xilinx.c index 0ca79fe8c10520..3a7deebb0d0c8a 100644 --- a/drivers/pwm/pwm-xilinx.c +++ b/drivers/pwm/pwm-xilinx.c @@ -80,15 +80,10 @@ unsigned int xilinx_timer_get_period(struct xilinx_timer_priv *priv, #define TCSR_PWM_CLEAR (TCSR_MDT | TCSR_LOAD) #define TCSR_PWM_MASK (TCSR_PWM_SET | TCSR_PWM_CLEAR) -struct xilinx_pwm_device { - struct pwm_chip chip; - struct xilinx_timer_priv priv; -}; - static inline struct xilinx_timer_priv *xilinx_pwm_chip_to_priv(struct pwm_chip *chip) { - return &container_of(chip, struct xilinx_pwm_device, chip)->priv; + return pwmchip_get_drvdata(chip); } static bool xilinx_timer_pwm_enabled(u32 tcsr0, u32 tcsr1) @@ -215,7 +210,6 @@ static int xilinx_pwm_probe(struct platform_device *pdev) struct device_node *np = dev->of_node; struct xilinx_timer_priv *priv; struct pwm_chip *chip; - struct xilinx_pwm_device *xilinx_pwm; u32 pwm_cells, one_timer, width; void __iomem *regs; @@ -226,11 +220,10 @@ static int xilinx_pwm_probe(struct platform_device *pdev) if (ret) return dev_err_probe(dev, ret, "could not read #pwm-cells\n"); - xilinx_pwm = devm_kzalloc(dev, sizeof(*xilinx_pwm), GFP_KERNEL); - if (!xilinx_pwm) - return -ENOMEM; - priv = &xilinx_pwm->priv; - chip = &xilinx_pwm->chip; + chip = devm_pwmchip_alloc(dev, 1, sizeof(*priv)); + if (IS_ERR(chip)) + return PTR_ERR(chip); + priv = xilinx_pwm_chip_to_priv(chip); platform_set_drvdata(pdev, chip); regs = devm_platform_ioremap_resource(pdev, 0); @@ -280,9 +273,7 @@ static int xilinx_pwm_probe(struct platform_device *pdev) return dev_err_probe(dev, ret, "Clock enable failed\n"); clk_rate_exclusive_get(priv->clk); - chip->dev = dev; chip->ops = &xilinx_pwm_ops; - chip->npwm = 1; ret = pwmchip_add(chip); if (ret) { clk_rate_exclusive_put(priv->clk); From 2e68507e7090d39c3560c258f2e857d5b7770bae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:18 +0100 Subject: [PATCH 0745/1406] gpio: mvebu: Make use of devm_pwmchip_alloc() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the pwm sub-driver to further changes of the pwm core outlined in the commit introducing devm_pwmchip_alloc(). There is no intended semantical change and the driver should behave as before. 
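The gpio-mvebu patch below shows the same conversion outside drivers/pwm: the pwm_chip is one facet of a GPIO controller, and the channel count comes from the gpiochip rather than from PWM-specific match data. The key lines from the diff, pulled out for emphasis:

        /* one PWM per GPIO line; ngpio is known before the allocation */
        chip = devm_pwmchip_alloc(dev, mvchip->chip.ngpio, sizeof(*mvpwm));
        if (IS_ERR(chip))
                return PTR_ERR(chip);
        mvpwm = to_mvebu_pwm(chip);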
Acked-by: Linus Walleij Link: https://lore.kernel.org/r/2edc3adbb2c40b76b3b3dac82de82f3036bec1d5.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/gpio/gpio-mvebu.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c index a13f3c18ccd4ae..8cfd3a89c0184d 100644 --- a/drivers/gpio/gpio-mvebu.c +++ b/drivers/gpio/gpio-mvebu.c @@ -99,7 +99,6 @@ struct mvebu_pwm { u32 offset; unsigned long clk_rate; struct gpio_desc *gpiod; - struct pwm_chip chip; spinlock_t lock; struct mvebu_gpio_chip *mvchip; @@ -615,7 +614,7 @@ static const struct regmap_config mvebu_gpio_regmap_config = { */ static struct mvebu_pwm *to_mvebu_pwm(struct pwm_chip *chip) { - return container_of(chip, struct mvebu_pwm, chip); + return pwmchip_get_drvdata(chip); } static int mvebu_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) @@ -789,6 +788,7 @@ static int mvebu_pwm_probe(struct platform_device *pdev, { struct device *dev = &pdev->dev; struct mvebu_pwm *mvpwm; + struct pwm_chip *chip; void __iomem *base; u32 offset; u32 set; @@ -813,9 +813,11 @@ static int mvebu_pwm_probe(struct platform_device *pdev, if (IS_ERR(mvchip->clk)) return PTR_ERR(mvchip->clk); - mvpwm = devm_kzalloc(dev, sizeof(struct mvebu_pwm), GFP_KERNEL); - if (!mvpwm) - return -ENOMEM; + chip = devm_pwmchip_alloc(dev, mvchip->chip.ngpio, sizeof(*mvpwm)); + if (IS_ERR(chip)) + return PTR_ERR(chip); + mvpwm = to_mvebu_pwm(chip); + mvchip->mvpwm = mvpwm; mvpwm->mvchip = mvchip; mvpwm->offset = offset; @@ -868,13 +870,11 @@ static int mvebu_pwm_probe(struct platform_device *pdev, return -EINVAL; } - mvpwm->chip.dev = dev; - mvpwm->chip.ops = &mvebu_pwm_ops; - mvpwm->chip.npwm = mvchip->chip.ngpio; + chip->ops = &mvebu_pwm_ops; spin_lock_init(&mvpwm->lock); - return devm_pwmchip_add(dev, &mvpwm->chip); + return devm_pwmchip_add(dev, chip); } #ifdef CONFIG_DEBUG_FS From 47510523f563cfe3c81ce921050b766425021beb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:19 +0100 Subject: [PATCH 0746/1406] drm/bridge: ti-sn65dsi86: Make use of pwmchip_parent() accessor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit struct pwm_chip::dev is about to change. To not have to touch this driver in the same commit as struct pwm_chip::dev, use the accessor function provided for exactly this purpose. 
Acked-by: Douglas Anderson Link: https://lore.kernel.org/r/10a8d55110fc48a4759e65cc19556858587e94cc.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/gpu/drm/bridge/ti-sn65dsi86.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi86.c b/drivers/gpu/drm/bridge/ti-sn65dsi86.c index 1f6e929c2f6a3c..f1fffbef332470 100644 --- a/drivers/gpu/drm/bridge/ti-sn65dsi86.c +++ b/drivers/gpu/drm/bridge/ti-sn65dsi86.c @@ -1415,7 +1415,7 @@ static int ti_sn_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, int ret; if (!pdata->pwm_enabled) { - ret = pm_runtime_resume_and_get(chip->dev); + ret = pm_runtime_resume_and_get(pwmchip_parent(chip)); if (ret < 0) return ret; } @@ -1431,7 +1431,7 @@ static int ti_sn_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, SN_GPIO_MUX_MASK << (2 * SN_PWM_GPIO_IDX), SN_GPIO_MUX_SPECIAL << (2 * SN_PWM_GPIO_IDX)); if (ret) { - dev_err(chip->dev, "failed to mux in PWM function\n"); + dev_err(pwmchip_parent(chip), "failed to mux in PWM function\n"); goto out; } } @@ -1507,7 +1507,7 @@ static int ti_sn_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, ret = regmap_write(pdata->regmap, SN_PWM_PRE_DIV_REG, pre_div); if (ret) { - dev_err(chip->dev, "failed to update PWM_PRE_DIV\n"); + dev_err(pwmchip_parent(chip), "failed to update PWM_PRE_DIV\n"); goto out; } @@ -1519,7 +1519,7 @@ static int ti_sn_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, FIELD_PREP(SN_PWM_INV_MASK, state->polarity == PWM_POLARITY_INVERSED); ret = regmap_write(pdata->regmap, SN_PWM_EN_INV_REG, pwm_en_inv); if (ret) { - dev_err(chip->dev, "failed to update PWM_EN/PWM_INV\n"); + dev_err(pwmchip_parent(chip), "failed to update PWM_EN/PWM_INV\n"); goto out; } @@ -1527,7 +1527,7 @@ static int ti_sn_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, out: if (!pdata->pwm_enabled) - pm_runtime_put_sync(chip->dev); + pm_runtime_put_sync(pwmchip_parent(chip)); return ret; } From 41728f9725c0ab74b4e9d66907c33b6f28afe7d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:20 +0100 Subject: [PATCH 0747/1406] drm/bridge: ti-sn65dsi86: Make use of devm_pwmchip_alloc() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the pwm driver of the ti-sn65dsi86 for further changes of the pwm core outlined in the commit introducing devm_pwmchip_alloc(). There is no intended semantic change and the driver should behave as before.
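Unlike drivers whose private data is carved out of the chip allocation, this bridge driver keeps its state in the pre-existing struct ti_sn65dsi86, so the chip is allocated with zero bytes of private data and the two objects are linked explicitly. The relevant lines from the diff below:

	pdata->pchip = chip = devm_pwmchip_alloc(&adev->dev, 1, 0);
	if (IS_ERR(chip))
		return PTR_ERR(chip);

	/* Lets pwm_chip_to_ti_sn_bridge() find pdata again later. */
	pwmchip_set_drvdata(chip, pdata);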
Acked-by: Douglas Anderson Link: https://lore.kernel.org/r/a56cbaf049f5f23c0e0fe36b0799dd20189675e0.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/gpu/drm/bridge/ti-sn65dsi86.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi86.c b/drivers/gpu/drm/bridge/ti-sn65dsi86.c index f1fffbef332470..7fbc307cc02580 100644 --- a/drivers/gpu/drm/bridge/ti-sn65dsi86.c +++ b/drivers/gpu/drm/bridge/ti-sn65dsi86.c @@ -197,7 +197,7 @@ struct ti_sn65dsi86 { DECLARE_BITMAP(gchip_output, SN_NUM_GPIOS); #endif #if defined(CONFIG_PWM) - struct pwm_chip pchip; + struct pwm_chip *pchip; bool pwm_enabled; atomic_t pwm_pin_busy; #endif @@ -1374,7 +1374,7 @@ static void ti_sn_pwm_pin_release(struct ti_sn65dsi86 *pdata) static struct ti_sn65dsi86 *pwm_chip_to_ti_sn_bridge(struct pwm_chip *chip) { - return container_of(chip, struct ti_sn65dsi86, pchip); + return pwmchip_get_drvdata(chip); } static int ti_sn_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) @@ -1585,23 +1585,28 @@ static const struct pwm_ops ti_sn_pwm_ops = { static int ti_sn_pwm_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) { + struct pwm_chip *chip; struct ti_sn65dsi86 *pdata = dev_get_drvdata(adev->dev.parent); - pdata->pchip.dev = &adev->dev; - pdata->pchip.ops = &ti_sn_pwm_ops; - pdata->pchip.npwm = 1; - pdata->pchip.of_xlate = of_pwm_single_xlate; + pdata->pchip = chip = devm_pwmchip_alloc(&adev->dev, 1, 0); + if (IS_ERR(chip)) + return PTR_ERR(chip); + + pwmchip_set_drvdata(chip, pdata); + + chip->ops = &ti_sn_pwm_ops; + chip->of_xlate = of_pwm_single_xlate; devm_pm_runtime_enable(&adev->dev); - return pwmchip_add(&pdata->pchip); + return pwmchip_add(chip); } static void ti_sn_pwm_remove(struct auxiliary_device *adev) { struct ti_sn65dsi86 *pdata = dev_get_drvdata(adev->dev.parent); - pwmchip_remove(&pdata->pchip); + pwmchip_remove(pdata->pchip); if (pdata->pwm_enabled) pm_runtime_put_sync(&adev->dev); From 42bb2d3a8319a58684ca9e2196b489b04c83d5c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:21 +0100 Subject: [PATCH 0748/1406] leds: qcom-lpg: Make use of devm_pwmchip_alloc() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the pwm sub-driver for further changes of the pwm core outlined in the commit introducing devm_pwmchip_alloc(). There is no intended semantic change and the driver should behave as before.
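Note how the number of PWM lines moves from a member assignment into the allocation call itself; from the diff below:

	/* before */
	lpg->pwm.npwm = lpg->num_channels;

	/* after */
	lpg->pwm = chip = devm_pwmchip_alloc(lpg->dev, lpg->num_channels, 0);

so the core knows the number of lines from the moment the chip exists.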
Acked-by: Lee Jones Link: https://lore.kernel.org/r/0be073477092eeccaac6c021cf07e38fc30c74fc.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/leds/rgb/leds-qcom-lpg.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/leds/rgb/leds-qcom-lpg.c b/drivers/leds/rgb/leds-qcom-lpg.c index 156b73d1f4a29d..0a7acf59a42015 100644 --- a/drivers/leds/rgb/leds-qcom-lpg.c +++ b/drivers/leds/rgb/leds-qcom-lpg.c @@ -77,7 +77,7 @@ struct lpg { struct mutex lock; - struct pwm_chip pwm; + struct pwm_chip *pwm; const struct lpg_data *data; @@ -978,7 +978,7 @@ static int lpg_pattern_mc_clear(struct led_classdev *cdev) static inline struct lpg *lpg_pwm_from_chip(struct pwm_chip *chip) { - return container_of(chip, struct lpg, pwm); + return pwmchip_get_drvdata(chip); } static int lpg_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) @@ -1093,13 +1093,17 @@ static const struct pwm_ops lpg_pwm_ops = { static int lpg_add_pwm(struct lpg *lpg) { + struct pwm_chip *chip; int ret; - lpg->pwm.dev = lpg->dev; - lpg->pwm.npwm = lpg->num_channels; - lpg->pwm.ops = &lpg_pwm_ops; + lpg->pwm = chip = devm_pwmchip_alloc(lpg->dev, lpg->num_channels, 0); + if (IS_ERR(chip)) + return PTR_ERR(chip); - ret = devm_pwmchip_add(lpg->dev, &lpg->pwm); + chip->ops = &lpg_pwm_ops; + pwmchip_set_drvdata(chip, lpg); + + ret = devm_pwmchip_add(lpg->dev, chip); if (ret) dev_err_probe(lpg->dev, ret, "failed to add PWM chip\n"); From ecbc4f575e03228780591dececd73b09d5092b26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:22 +0100 Subject: [PATCH 0749/1406] staging: greybus: pwm: Change prototype of helpers to prepare further changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the driver for further changes that will make it harder to determine the pwm_chip from a given gb_pwm_chip. To just not have to do that, rework gb_pwm_activate_operation(), gb_pwm_deactivate_operation(), gb_pwm_config_operation(), gb_pwm_set_polarity_operation(), gb_pwm_enable_operation() and gb_pwm_disable_operation() to take a pwm_chip. Also use the pwm_chip as driver data instead of the gb_pwm_chip. 
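The effect is easiest to see at a call site such as gb_pwm_request(), where the pwm_chip-to-gb_pwm_chip conversion moves out of the caller and into the helper; from the diff below:

	/* before */
	struct gb_pwm_chip *pwmc = pwm_chip_to_gb_pwm_chip(chip);
	return gb_pwm_activate_operation(pwmc, pwm->hwpwm);

	/* after */
	return gb_pwm_activate_operation(chip, pwm->hwpwm);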
Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/ef9b346d5bab508d4ded81cf115bf244938d04f1.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/staging/greybus/pwm.c | 60 +++++++++++++++++------------------ 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/drivers/staging/greybus/pwm.c b/drivers/staging/greybus/pwm.c index a3cb68cfa0f9b7..97b49d436c5415 100644 --- a/drivers/staging/greybus/pwm.c +++ b/drivers/staging/greybus/pwm.c @@ -39,9 +39,9 @@ static int gb_pwm_count_operation(struct gb_pwm_chip *pwmc) return 0; } -static int gb_pwm_activate_operation(struct gb_pwm_chip *pwmc, - u8 which) +static int gb_pwm_activate_operation(struct pwm_chip *chip, u8 which) { + struct gb_pwm_chip *pwmc = pwm_chip_to_gb_pwm_chip(chip); struct gb_pwm_activate_request request; struct gbphy_device *gbphy_dev; int ret; @@ -51,7 +51,7 @@ static int gb_pwm_activate_operation(struct gb_pwm_chip *pwmc, request.which = which; - gbphy_dev = to_gbphy_dev(pwmc->chip.dev); + gbphy_dev = to_gbphy_dev(chip->dev); ret = gbphy_runtime_get_sync(gbphy_dev); if (ret) return ret; @@ -64,9 +64,9 @@ static int gb_pwm_activate_operation(struct gb_pwm_chip *pwmc, return ret; } -static int gb_pwm_deactivate_operation(struct gb_pwm_chip *pwmc, - u8 which) +static int gb_pwm_deactivate_operation(struct pwm_chip *chip, u8 which) { + struct gb_pwm_chip *pwmc = pwm_chip_to_gb_pwm_chip(chip); struct gb_pwm_deactivate_request request; struct gbphy_device *gbphy_dev; int ret; @@ -76,7 +76,7 @@ static int gb_pwm_deactivate_operation(struct gb_pwm_chip *pwmc, request.which = which; - gbphy_dev = to_gbphy_dev(pwmc->chip.dev); + gbphy_dev = to_gbphy_dev(chip->dev); ret = gbphy_runtime_get_sync(gbphy_dev); if (ret) return ret; @@ -89,9 +89,10 @@ static int gb_pwm_deactivate_operation(struct gb_pwm_chip *pwmc, return ret; } -static int gb_pwm_config_operation(struct gb_pwm_chip *pwmc, +static int gb_pwm_config_operation(struct pwm_chip *chip, u8 which, u32 duty, u32 period) { + struct gb_pwm_chip *pwmc = pwm_chip_to_gb_pwm_chip(chip); struct gb_pwm_config_request request; struct gbphy_device *gbphy_dev; int ret; @@ -103,7 +104,7 @@ static int gb_pwm_config_operation(struct gb_pwm_chip *pwmc, request.duty = cpu_to_le32(duty); request.period = cpu_to_le32(period); - gbphy_dev = to_gbphy_dev(pwmc->chip.dev); + gbphy_dev = to_gbphy_dev(chip->dev); ret = gbphy_runtime_get_sync(gbphy_dev); if (ret) return ret; @@ -116,9 +117,10 @@ static int gb_pwm_config_operation(struct gb_pwm_chip *pwmc, return ret; } -static int gb_pwm_set_polarity_operation(struct gb_pwm_chip *pwmc, +static int gb_pwm_set_polarity_operation(struct pwm_chip *chip, u8 which, u8 polarity) { + struct gb_pwm_chip *pwmc = pwm_chip_to_gb_pwm_chip(chip); struct gb_pwm_polarity_request request; struct gbphy_device *gbphy_dev; int ret; @@ -129,7 +131,7 @@ static int gb_pwm_set_polarity_operation(struct gb_pwm_chip *pwmc, request.which = which; request.polarity = polarity; - gbphy_dev = to_gbphy_dev(pwmc->chip.dev); + gbphy_dev = to_gbphy_dev(chip->dev); ret = gbphy_runtime_get_sync(gbphy_dev); if (ret) return ret; @@ -142,9 +144,9 @@ static int gb_pwm_set_polarity_operation(struct gb_pwm_chip *pwmc, return ret; } -static int gb_pwm_enable_operation(struct gb_pwm_chip *pwmc, - u8 which) +static int gb_pwm_enable_operation(struct pwm_chip *chip, u8 which) { + struct gb_pwm_chip *pwmc = pwm_chip_to_gb_pwm_chip(chip); struct gb_pwm_enable_request request; struct gbphy_device *gbphy_dev; int ret; @@ -154,7 +156,7 @@ static int 
gb_pwm_enable_operation(struct gb_pwm_chip *pwmc, request.which = which; - gbphy_dev = to_gbphy_dev(pwmc->chip.dev); + gbphy_dev = to_gbphy_dev(chip->dev); ret = gbphy_runtime_get_sync(gbphy_dev); if (ret) return ret; @@ -167,9 +169,9 @@ static int gb_pwm_enable_operation(struct gb_pwm_chip *pwmc, return ret; } -static int gb_pwm_disable_operation(struct gb_pwm_chip *pwmc, - u8 which) +static int gb_pwm_disable_operation(struct pwm_chip *chip, u8 which) { + struct gb_pwm_chip *pwmc = pwm_chip_to_gb_pwm_chip(chip); struct gb_pwm_disable_request request; struct gbphy_device *gbphy_dev; int ret; @@ -182,7 +184,7 @@ static int gb_pwm_disable_operation(struct gb_pwm_chip *pwmc, ret = gb_operation_sync(pwmc->connection, GB_PWM_TYPE_DISABLE, &request, sizeof(request), NULL, 0); - gbphy_dev = to_gbphy_dev(pwmc->chip.dev); + gbphy_dev = to_gbphy_dev(chip->dev); gbphy_runtime_put_autosuspend(gbphy_dev); return ret; @@ -190,19 +192,15 @@ static int gb_pwm_disable_operation(struct gb_pwm_chip *pwmc, static int gb_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) { - struct gb_pwm_chip *pwmc = pwm_chip_to_gb_pwm_chip(chip); - - return gb_pwm_activate_operation(pwmc, pwm->hwpwm); + return gb_pwm_activate_operation(chip, pwm->hwpwm); }; static void gb_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) { - struct gb_pwm_chip *pwmc = pwm_chip_to_gb_pwm_chip(chip); - if (pwm_is_enabled(pwm)) dev_warn(chip->dev, "freeing PWM device without disabling\n"); - gb_pwm_deactivate_operation(pwmc, pwm->hwpwm); + gb_pwm_deactivate_operation(chip, pwm->hwpwm); } static int gb_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, @@ -212,22 +210,21 @@ static int gb_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, bool enabled = pwm->state.enabled; u64 period = state->period; u64 duty_cycle = state->duty_cycle; - struct gb_pwm_chip *pwmc = pwm_chip_to_gb_pwm_chip(chip); /* Set polarity */ if (state->polarity != pwm->state.polarity) { if (enabled) { - gb_pwm_disable_operation(pwmc, pwm->hwpwm); + gb_pwm_disable_operation(chip, pwm->hwpwm); enabled = false; } - err = gb_pwm_set_polarity_operation(pwmc, pwm->hwpwm, state->polarity); + err = gb_pwm_set_polarity_operation(chip, pwm->hwpwm, state->polarity); if (err) return err; } if (!state->enabled) { if (enabled) - gb_pwm_disable_operation(pwmc, pwm->hwpwm); + gb_pwm_disable_operation(chip, pwm->hwpwm); return 0; } @@ -243,13 +240,13 @@ static int gb_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, if (duty_cycle > period) duty_cycle = period; - err = gb_pwm_config_operation(pwmc, pwm->hwpwm, duty_cycle, period); + err = gb_pwm_config_operation(chip, pwm->hwpwm, duty_cycle, period); if (err) return err; /* enable/disable */ if (!enabled) - return gb_pwm_enable_operation(pwmc, pwm->hwpwm); + return gb_pwm_enable_operation(chip, pwm->hwpwm); return 0; } @@ -282,7 +279,7 @@ static int gb_pwm_probe(struct gbphy_device *gbphy_dev, pwmc->connection = connection; gb_connection_set_data(connection, pwmc); - gb_gbphy_set_data(gbphy_dev, pwmc); + gb_gbphy_set_data(gbphy_dev, chip); ret = gb_connection_enable(connection); if (ret) @@ -320,7 +317,8 @@ static int gb_pwm_probe(struct gbphy_device *gbphy_dev, static void gb_pwm_remove(struct gbphy_device *gbphy_dev) { - struct gb_pwm_chip *pwmc = gb_gbphy_get_data(gbphy_dev); + struct pwm_chip *chip = gb_gbphy_get_data(gbphy_dev); + struct gb_pwm_chip *pwmc = pwm_chip_to_gb_pwm_chip(chip); struct gb_connection *connection = pwmc->connection; int ret; @@ -328,7 +326,7 @@ static void gb_pwm_remove(struct 
gbphy_device *gbphy_dev) if (ret) gbphy_runtime_get_noresume(gbphy_dev); - pwmchip_remove(&pwmc->chip); + pwmchip_remove(chip); gb_connection_disable(connection); gb_connection_destroy(connection); kfree(pwmc); From c5a4a04fbf7c3a2585b3a9f03cef4bd81fa0754b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:23 +0100 Subject: [PATCH 0750/1406] staging: greybus: pwm: Make use of pwmchip_parent() accessor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit struct pwm_chip::dev is about to change. To not have to touch this driver in the same commit as struct pwm_chip::dev, use the accessor function provided for exactly this purpose. Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/7e7517527b825a18ca10cb0faa837577d4f0ec8a.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/staging/greybus/pwm.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/staging/greybus/pwm.c b/drivers/staging/greybus/pwm.c index 97b49d436c5415..0cd1dab2d888c5 100644 --- a/drivers/staging/greybus/pwm.c +++ b/drivers/staging/greybus/pwm.c @@ -51,7 +51,7 @@ static int gb_pwm_activate_operation(struct pwm_chip *chip, u8 which) request.which = which; - gbphy_dev = to_gbphy_dev(chip->dev); + gbphy_dev = to_gbphy_dev(pwmchip_parent(chip)); ret = gbphy_runtime_get_sync(gbphy_dev); if (ret) return ret; @@ -76,7 +76,7 @@ static int gb_pwm_deactivate_operation(struct pwm_chip *chip, u8 which) request.which = which; - gbphy_dev = to_gbphy_dev(chip->dev); + gbphy_dev = to_gbphy_dev(pwmchip_parent(chip)); ret = gbphy_runtime_get_sync(gbphy_dev); if (ret) return ret; @@ -104,7 +104,7 @@ static int gb_pwm_config_operation(struct pwm_chip *chip, request.duty = cpu_to_le32(duty); request.period = cpu_to_le32(period); - gbphy_dev = to_gbphy_dev(chip->dev); + gbphy_dev = to_gbphy_dev(pwmchip_parent(chip)); ret = gbphy_runtime_get_sync(gbphy_dev); if (ret) return ret; @@ -131,7 +131,7 @@ static int gb_pwm_set_polarity_operation(struct pwm_chip *chip, request.which = which; request.polarity = polarity; - gbphy_dev = to_gbphy_dev(chip->dev); + gbphy_dev = to_gbphy_dev(pwmchip_parent(chip)); ret = gbphy_runtime_get_sync(gbphy_dev); if (ret) return ret; @@ -156,7 +156,7 @@ static int gb_pwm_enable_operation(struct pwm_chip *chip, u8 which) request.which = which; - gbphy_dev = to_gbphy_dev(chip->dev); + gbphy_dev = to_gbphy_dev(pwmchip_parent(chip)); ret = gbphy_runtime_get_sync(gbphy_dev); if (ret) return ret; @@ -184,7 +184,7 @@ static int gb_pwm_disable_operation(struct pwm_chip *chip, u8 which) ret = gb_operation_sync(pwmc->connection, GB_PWM_TYPE_DISABLE, &request, sizeof(request), NULL, 0); - gbphy_dev = to_gbphy_dev(chip->dev); + gbphy_dev = to_gbphy_dev(pwmchip_parent(chip)); gbphy_runtime_put_autosuspend(gbphy_dev); return ret; @@ -198,7 +198,7 @@ static int gb_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) static void gb_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) { if (pwm_is_enabled(pwm)) - dev_warn(chip->dev, "freeing PWM device without disabling\n"); + dev_warn(pwmchip_parent(chip), "freeing PWM device without disabling\n"); gb_pwm_deactivate_operation(chip, pwm->hwpwm); } From d61a7acda5305142675b51a114d5b3518f50a337 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:24 +0100 Subject: [PATCH 0751/1406] staging: greybus: pwm: Rely on pwm framework to pass a valid hwpwm MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pwm framework already asserts to only pass a hwpwm value (= which) less than npwm (= pwmc->pwm_max + 1). So there is no need to recheck this condition. Drop the respective checks. Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/e003bc5e8e66f27f2b8fdc525a536d865888cffe.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/staging/greybus/pwm.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/drivers/staging/greybus/pwm.c b/drivers/staging/greybus/pwm.c index 0cd1dab2d888c5..3099c2a3611cab 100644 --- a/drivers/staging/greybus/pwm.c +++ b/drivers/staging/greybus/pwm.c @@ -46,9 +46,6 @@ static int gb_pwm_activate_operation(struct pwm_chip *chip, u8 which) struct gbphy_device *gbphy_dev; int ret; - if (which > pwmc->pwm_max) - return -EINVAL; - request.which = which; gbphy_dev = to_gbphy_dev(pwmchip_parent(chip)); @@ -71,9 +68,6 @@ static int gb_pwm_deactivate_operation(struct pwm_chip *chip, u8 which) struct gbphy_device *gbphy_dev; int ret; - if (which > pwmc->pwm_max) - return -EINVAL; - request.which = which; gbphy_dev = to_gbphy_dev(pwmchip_parent(chip)); @@ -97,9 +91,6 @@ static int gb_pwm_config_operation(struct pwm_chip *chip, struct gbphy_device *gbphy_dev; int ret; - if (which > pwmc->pwm_max) - return -EINVAL; - request.which = which; request.duty = cpu_to_le32(duty); request.period = cpu_to_le32(period); @@ -125,9 +116,6 @@ static int gb_pwm_set_polarity_operation(struct pwm_chip *chip, struct gbphy_device *gbphy_dev; int ret; - if (which > pwmc->pwm_max) - return -EINVAL; - request.which = which; request.polarity = polarity; @@ -151,9 +139,6 @@ static int gb_pwm_enable_operation(struct pwm_chip *chip, u8 which) struct gbphy_device *gbphy_dev; int ret; - if (which > pwmc->pwm_max) - return -EINVAL; - request.which = which; gbphy_dev = to_gbphy_dev(pwmchip_parent(chip)); @@ -176,9 +161,6 @@ static int gb_pwm_disable_operation(struct pwm_chip *chip, u8 which) struct gbphy_device *gbphy_dev; int ret; - if (which > pwmc->pwm_max) - return -EINVAL; - request.which = which; ret = gb_operation_sync(pwmc->connection, GB_PWM_TYPE_DISABLE, From 6dad83c5b213e3c61a6efac503591a3d00aa2dc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:25 +0100 Subject: [PATCH 0752/1406] staging: greybus: pwm: Drop unused gb_connection_set_data() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The driver never calls gb_connection_get_data(). If there was another caller (say the greybus core) it cannot use the value because the type of pwmc (= struct gb_pwm_chip) is only defined in the pwm driver. So drop the call to gb_connection_set_data(). 
Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/bd2759c325c295f3d9f990609d97eb83a8ca88b8.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/staging/greybus/pwm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/staging/greybus/pwm.c b/drivers/staging/greybus/pwm.c index 3099c2a3611cab..c7a2e874a62be3 100644 --- a/drivers/staging/greybus/pwm.c +++ b/drivers/staging/greybus/pwm.c @@ -260,7 +260,6 @@ static int gb_pwm_probe(struct gbphy_device *gbphy_dev, } pwmc->connection = connection; - gb_connection_set_data(connection, pwmc); gb_gbphy_set_data(gbphy_dev, chip); ret = gb_connection_enable(connection); if (ret) From da8d54ad2ac317eb6f41054ad926b5085b66e247 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:26 +0100 Subject: [PATCH 0753/1406] staging: greybus: pwm: Rework how the number of PWM lines is determined MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With a later patch it becomes necessary to already know the number of PWM lines when pwmc is allocated. So make the function not use pwmc but a plain connection and return the number of lines instead of storing it in pwmc. This allows getting rid of the pwm_max member. Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/3efd84ac03e7dc288f20b0de20b142b6404cb1fa.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/staging/greybus/pwm.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/staging/greybus/pwm.c b/drivers/staging/greybus/pwm.c index c7a2e874a62be3..35e98e7c00c1a4 100644 --- a/drivers/staging/greybus/pwm.c +++ b/drivers/staging/greybus/pwm.c @@ -16,8 +16,6 @@ struct gb_pwm_chip { struct gb_connection *connection; - u8 pwm_max; /* max pwm number */ - struct pwm_chip chip; }; @@ -26,17 +24,21 @@ static inline struct gb_pwm_chip *pwm_chip_to_gb_pwm_chip(struct pwm_chip *chip) { return container_of(chip, struct gb_pwm_chip, chip); } -static int gb_pwm_count_operation(struct gb_pwm_chip *pwmc) +static int gb_pwm_get_npwm(struct gb_connection *connection) { struct gb_pwm_count_response response; int ret; - ret = gb_operation_sync(pwmc->connection, GB_PWM_TYPE_PWM_COUNT, + ret = gb_operation_sync(connection, GB_PWM_TYPE_PWM_COUNT, NULL, 0, &response, sizeof(response)); if (ret) return ret; - pwmc->pwm_max = response.count; - return 0; + + /* + * The request returns the highest allowed PWM id parameter. So add one + * to get the number of PWMs.
+ */ + return response.count + 1; } static int gb_pwm_activate_operation(struct pwm_chip *chip, u8 which) @@ -245,7 +247,7 @@ static int gb_pwm_probe(struct gbphy_device *gbphy_dev, struct gb_connection *connection; struct gb_pwm_chip *pwmc; struct pwm_chip *chip; - int ret; + int ret, npwm; pwmc = kzalloc(sizeof(*pwmc), GFP_KERNEL); if (!pwmc) @@ -267,15 +269,16 @@ static int gb_pwm_probe(struct gbphy_device *gbphy_dev, goto exit_connection_destroy; /* Query number of pwms present */ - ret = gb_pwm_count_operation(pwmc); - if (ret) + ret = gb_pwm_get_npwm(connection); + if (ret < 0) goto exit_connection_disable; + npwm = ret; chip = &pwmc->chip; chip->dev = &gbphy_dev->dev; chip->ops = &gb_pwm_ops; - chip->npwm = pwmc->pwm_max + 1; + chip->npwm = npwm; ret = pwmchip_add(chip); if (ret) { From 0a3e8c2d3c7ec31b2e5bf05fd606d672cf8ea533 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:27 +0100 Subject: [PATCH 0754/1406] staging: greybus: pwm: Make use of devm_pwmchip_alloc() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the greybus pwm driver for further changes of the pwm core outlined in the commit introducing pwmchip_alloc(). There is no intended semantic change and the driver should behave as before. Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/3206ab7f49c2c1704ea69446f3b7a7d1e71200fa.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/staging/greybus/pwm.c | 33 +++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/drivers/staging/greybus/pwm.c b/drivers/staging/greybus/pwm.c index 35e98e7c00c1a4..01883fbcd79b07 100644 --- a/drivers/staging/greybus/pwm.c +++ b/drivers/staging/greybus/pwm.c @@ -249,20 +249,11 @@ static int gb_pwm_probe(struct gbphy_device *gbphy_dev, struct pwm_chip *chip; int ret, npwm; - pwmc = kzalloc(sizeof(*pwmc), GFP_KERNEL); - if (!pwmc) - return -ENOMEM; - connection = gb_connection_create(gbphy_dev->bundle, le16_to_cpu(gbphy_dev->cport_desc->id), NULL); - if (IS_ERR(connection)) { - ret = PTR_ERR(connection); - goto exit_pwmc_free; - } - - pwmc->connection = connection; - gb_gbphy_set_data(gbphy_dev, chip); + if (IS_ERR(connection)) + return PTR_ERR(connection); ret = gb_connection_enable(connection); if (ret) @@ -274,28 +265,34 @@ static int gb_pwm_probe(struct gbphy_device *gbphy_dev, goto exit_connection_disable; npwm = ret; - chip = &pwmc->chip; + chip = pwmchip_alloc(&gbphy_dev->dev, npwm, sizeof(*pwmc)); + if (IS_ERR(chip)) { + ret = PTR_ERR(chip); + goto exit_connection_disable; + } + gb_gbphy_set_data(gbphy_dev, chip); + + pwmc = pwm_chip_to_gb_pwm_chip(chip); + pwmc->connection = connection; - chip->dev = &gbphy_dev->dev; chip->ops = &gb_pwm_ops; - chip->npwm = npwm; ret = pwmchip_add(chip); if (ret) { dev_err(&gbphy_dev->dev, "failed to register PWM: %d\n", ret); - goto exit_connection_disable; + goto exit_pwmchip_put; } gbphy_runtime_put_autosuspend(gbphy_dev); return 0; +exit_pwmchip_put: + pwmchip_put(chip); exit_connection_disable: gb_connection_disable(connection); exit_connection_destroy: gb_connection_destroy(connection); -exit_pwmc_free: - kfree(pwmc); return ret; } @@ -311,9 +308,9 @@ static void gb_pwm_remove(struct gbphy_device *gbphy_dev) gbphy_runtime_get_noresume(gbphy_dev); pwmchip_remove(chip); + pwmchip_put(chip); gb_connection_disable(connection); gb_connection_destroy(connection); } static const struct
gbphy_device_id gb_pwm_id_table[] = { From 2e2a26f3a92d84de2b37ba096e798d1592ede0c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:33:28 +0100 Subject: [PATCH 0755/1406] pwm: Ensure that pwm_chips are allocated using pwmchip_alloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Memory holding a struct device must not be freed before the reference count drops to zero. So a struct pwm_chip must not live in memory freed by a driver on unbind. All in-tree drivers were fixed accordingly, but out-of-tree drivers that were not adapted still compile fine, so catch these in pwmchip_add(). Link: https://lore.kernel.org/r/35f5b229c98f78b2f6ce2397259a4a936be477c0.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/core.c | 10 ++++++++++ include/linux/pwm.h | 2 ++ 2 files changed, 12 insertions(+) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index d70f793ce4b38d..fe83333c466adf 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -481,6 +481,7 @@ struct pwm_chip *pwmchip_alloc(struct device *parent, unsigned int npwm, size_t chip->dev = parent; chip->npwm = npwm; + chip->uses_pwmchip_alloc = true; pwmchip_set_drvdata(chip, pwmchip_priv(chip)); @@ -561,6 +562,15 @@ int __pwmchip_add(struct pwm_chip *chip, struct module *owner) if (!chip || !pwmchip_parent(chip) || !chip->ops || !chip->npwm) return -EINVAL; + /* + * a struct pwm_chip must be allocated using (devm_)pwmchip_alloc, + * otherwise the embedded struct device might disappear too early + * resulting in memory corruption. + * Catch drivers that were not converted appropriately. + */ + if (!chip->uses_pwmchip_alloc) + return -EINVAL; + if (!pwm_ops_check(chip)) return -EINVAL; diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 4a6568dfdf3fa6..94a642a8881796 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -272,6 +272,7 @@ struct pwm_ops { * @npwm: number of PWMs controlled by this chip * @of_xlate: request a PWM device given a device tree PWM specifier * @atomic: can the driver's ->apply() be called in atomic context + * @uses_pwmchip_alloc: signals if pwmchip_alloc was used to allocate this chip * @driver_data: Private pointer for driver specific info * @pwms: array of PWM devices allocated by the framework */ @@ -287,6 +288,7 @@ struct pwm_chip { bool atomic; /* only used internally by the PWM framework */ + bool uses_pwmchip_alloc; void *driver_data; struct pwm_device *pwms; }; From 409d0bf193bbf1014d47ce49bc5400e7f32a4be0 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 13 Feb 2024 09:13:56 +0800 Subject: [PATCH 0756/1406] btrfs: do not skip re-registration for the mounted device There are reports that since version 6.7 update-grub fails to find the root device on systems without initrd and on a single device. This looks like the device name changed in the output of /proc/self/mountinfo: 6.5-rc5 working 18 1 0:16 / / rw,noatime - btrfs /dev/sda8 ... 6.7 not working: 17 1 0:15 / / rw,noatime - btrfs /dev/root ... and "update-grub" shows this error: /usr/sbin/grub-probe: error: cannot find a device for / (is /dev mounted?) This looks like it's related to the device name, but grub-probe recognizes the "/dev/root" path and tries to find the underlying device. However there's a special case for some filesystems, for btrfs in particular. The generic root device detection heuristic is not done and it all relies on reading the device info by a btrfs-specific ioctl.
This ioctl returns the device name as it was saved at the time of device scan (in this case it's /dev/root). The change in 6.7 for temp_fsid to allow several single-device filesystems to exist with the same fsid (and transparently generate a new UUID at mount time) was to skip caching/registering such devices. This also skipped the mounted device. One step of scanning is to check if the device name hasn't changed, and if yes then update the cached value. This broke grub-probe as it always read the device /dev/root and couldn't find it in the system. A temporary workaround is to create a symlink but this does not survive reboot. The right fix is to allow updating the device path of a mounted filesystem even if this is a single device one. In the fix, check if the device's major:minor number matches that of the cached device. If they do, then we can allow the scan to happen so that device_list_add() can take care of updating the device path. The file descriptor remains unchanged. This does not affect the temp_fsid feature, the UUID of the mounted filesystem remains the same and the matching is based on device major:minor which is unique per mounted filesystem. This covers the path when the device (that exists for all mounted devices) name changes, updating /dev/root to /dev/sdx. Any other single device with a filesystem that is not mounted is still skipped. Note that if a system is booted and initial mount is done on the /dev/root device, this will be the cached name of the device. Only after the command "btrfs device scan" will it change, as the scan triggers the rename. The fix was verified by users whose systems were affected. Fixes: bc27d6f0aa0e ("btrfs: scan but don't register device on single device filesystem") Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=218353 Link: https://lore.kernel.org/lkml/CAKLYgeJ1tUuqLcsquwuFqjDXPSJpEiokrWK2gisPKDZLs8Y2TQ@mail.gmail.com/ CC: stable@vger.kernel.org # 6.7+ Tested-by: Alex Romosan Tested-by: CHECK_1234543212345@protonmail.com Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 44 ++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d67785be2c778c..9dd2ba724a7ea8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1301,6 +1301,31 @@ int btrfs_forget_devices(dev_t devt) return ret; } +static bool btrfs_skip_registration(struct btrfs_super_block *disk_super, + dev_t devt, bool mount_arg_dev) +{ + struct btrfs_fs_devices *fs_devices; + + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + struct btrfs_device *device; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (device->devt == devt) { + mutex_unlock(&fs_devices->device_list_mutex); + return false; + } + } + mutex_unlock(&fs_devices->device_list_mutex); + } + + if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 && + !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) + return true; + + return false; +} + /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan.
The superblock @@ -1318,6 +1343,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, struct btrfs_device *device = NULL; struct bdev_handle *bdev_handle; u64 bytenr, bytenr_orig; + dev_t devt = 0; int ret; lockdep_assert_held(&uuid_mutex); @@ -1357,18 +1383,16 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, goto error_bdev_put; } - if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 && - !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) { - dev_t devt; + ret = lookup_bdev(path, &devt); + if (ret) + btrfs_warn(NULL, "lookup bdev failed for path %s: %d", + path, ret); - ret = lookup_bdev(path, &devt); - if (ret) - btrfs_warn(NULL, "lookup bdev failed for path %s: %d", - path, ret); - else + if (btrfs_skip_registration(disk_super, devt, mount_arg_dev)) { + pr_debug("BTRFS: skip registering single non-seed device %s\n", + path); + if (devt) btrfs_free_stale_devices(devt, NULL); - - pr_debug("BTRFS: skip registering single non-seed device %s\n", path); device = NULL; goto free_disk_super; } From b14df970aa6c8cbae6ede4c77701eeb948716221 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Thu, 8 Feb 2024 14:42:55 +0200 Subject: [PATCH 0757/1406] arm64: dts: renesas: rzg3s-smarc-som: Guard Ethernet IRQ GPIO hogs Ethernet IRQ GPIOs are marked as gpio-hog. Thus, these GPIOs are requested at probe without considering whether other peripherals need them. The Ethernet IRQ GPIOs are shared with SDHI2. Selection between Ethernet and SDHI2 is done through a hardware switch. To avoid scenarios where one wants to boot with SDHI2 support but some SDHI pins are not properly configured because of the gpio-hogs, guard the Ethernet IRQ GPIO hogs with a proper build flag. Fixes: 932ff0c802c6 ("arm64: dts: renesas: rzg3s-smarc-som: Enable the Ethernet interfaces") Signed-off-by: Claudiu Beznea Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240208124300.2740313-13-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- arch/arm64/boot/dts/renesas/rzg3s-smarc-som.dtsi | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/rzg3s-smarc-som.dtsi b/arch/arm64/boot/dts/renesas/rzg3s-smarc-som.dtsi index 2b7fa5817d5831..acac4666ae59e3 100644 --- a/arch/arm64/boot/dts/renesas/rzg3s-smarc-som.dtsi +++ b/arch/arm64/boot/dts/renesas/rzg3s-smarc-som.dtsi @@ -193,12 +193,14 @@ #endif &pinctrl { +#if SW_CONFIG3 == SW_ON eth0-phy-irq-hog { gpio-hog; gpios = ; input; line-name = "eth0-phy-irq"; }; +#endif eth0_pins: eth0 { txc { @@ -234,12 +236,14 @@ }; }; +#if SW_CONFIG3 == SW_ON eth1-phy-irq-hog { gpio-hog; gpios = ; input; line-name = "eth1-phy-irq"; }; +#endif eth1_pins: eth1 { txc { From b4f97d1b5aeb6166ab3d5694a351cb6151daf30f Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 14 Feb 2024 15:57:42 +0100 Subject: [PATCH 0758/1406] ARM: dts: renesas: rcar-gen2: Add missing #interrupt-cells to DA9063 nodes make dtbs_check W=2: arch/arm/boot/dts/renesas/r8a7790-lager.dts:444.11-458.5: Warning (interrupt_provider): /i2c-mux4/pmic@58: Missing '#interrupt-cells' in interrupt provider ... Fix this by adding the missing #interrupt-cells properties.
Reported-by: Rob Herring Reviewed-by: Rob Herring Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/a351e503ea97fb1af68395843f513925ff1bdf26.1707922460.git.geert+renesas@glider.be --- arch/arm/boot/dts/renesas/r8a7790-lager.dts | 1 + arch/arm/boot/dts/renesas/r8a7790-stout.dts | 1 + arch/arm/boot/dts/renesas/r8a7791-koelsch.dts | 1 + arch/arm/boot/dts/renesas/r8a7791-porter.dts | 1 + arch/arm/boot/dts/renesas/r8a7792-blanche.dts | 1 + arch/arm/boot/dts/renesas/r8a7793-gose.dts | 1 + arch/arm/boot/dts/renesas/r8a7794-alt.dts | 1 + arch/arm/boot/dts/renesas/r8a7794-silk.dts | 1 + 8 files changed, 8 insertions(+) diff --git a/arch/arm/boot/dts/renesas/r8a7790-lager.dts b/arch/arm/boot/dts/renesas/r8a7790-lager.dts index 2fba4d084001b9..8590981245a620 100644 --- a/arch/arm/boot/dts/renesas/r8a7790-lager.dts +++ b/arch/arm/boot/dts/renesas/r8a7790-lager.dts @@ -447,6 +447,7 @@ interrupt-parent = <&irqc0>; interrupts = <2 IRQ_TYPE_LEVEL_LOW>; interrupt-controller; + #interrupt-cells = <2>; rtc { compatible = "dlg,da9063-rtc"; diff --git a/arch/arm/boot/dts/renesas/r8a7790-stout.dts b/arch/arm/boot/dts/renesas/r8a7790-stout.dts index f9bc5b4f019d02..683f7395fab0b6 100644 --- a/arch/arm/boot/dts/renesas/r8a7790-stout.dts +++ b/arch/arm/boot/dts/renesas/r8a7790-stout.dts @@ -347,6 +347,7 @@ interrupt-parent = <&irqc0>; interrupts = <2 IRQ_TYPE_LEVEL_LOW>; interrupt-controller; + #interrupt-cells = <2>; onkey { compatible = "dlg,da9063-onkey"; diff --git a/arch/arm/boot/dts/renesas/r8a7791-koelsch.dts b/arch/arm/boot/dts/renesas/r8a7791-koelsch.dts index e9c13bb03772af..0efd9f98c75ace 100644 --- a/arch/arm/boot/dts/renesas/r8a7791-koelsch.dts +++ b/arch/arm/boot/dts/renesas/r8a7791-koelsch.dts @@ -819,6 +819,7 @@ interrupt-parent = <&irqc0>; interrupts = <2 IRQ_TYPE_LEVEL_LOW>; interrupt-controller; + #interrupt-cells = <2>; rtc { compatible = "dlg,da9063-rtc"; diff --git a/arch/arm/boot/dts/renesas/r8a7791-porter.dts b/arch/arm/boot/dts/renesas/r8a7791-porter.dts index 7e8bc06715f656..93c86e92164555 100644 --- a/arch/arm/boot/dts/renesas/r8a7791-porter.dts +++ b/arch/arm/boot/dts/renesas/r8a7791-porter.dts @@ -413,6 +413,7 @@ interrupt-parent = <&irqc0>; interrupts = <2 IRQ_TYPE_LEVEL_LOW>; interrupt-controller; + #interrupt-cells = <2>; watchdog { compatible = "dlg,da9063-watchdog"; diff --git a/arch/arm/boot/dts/renesas/r8a7792-blanche.dts b/arch/arm/boot/dts/renesas/r8a7792-blanche.dts index 4f9838cf97ee4f..540a9ad28f28ac 100644 --- a/arch/arm/boot/dts/renesas/r8a7792-blanche.dts +++ b/arch/arm/boot/dts/renesas/r8a7792-blanche.dts @@ -381,6 +381,7 @@ interrupt-parent = <&irqc>; interrupts = <2 IRQ_TYPE_LEVEL_LOW>; interrupt-controller; + #interrupt-cells = <2>; rtc { compatible = "dlg,da9063-rtc"; diff --git a/arch/arm/boot/dts/renesas/r8a7793-gose.dts b/arch/arm/boot/dts/renesas/r8a7793-gose.dts index 1744fdbf9e0ce0..1ea6c757893bc0 100644 --- a/arch/arm/boot/dts/renesas/r8a7793-gose.dts +++ b/arch/arm/boot/dts/renesas/r8a7793-gose.dts @@ -759,6 +759,7 @@ interrupt-parent = <&irqc0>; interrupts = <2 IRQ_TYPE_LEVEL_LOW>; interrupt-controller; + #interrupt-cells = <2>; rtc { compatible = "dlg,da9063-rtc"; diff --git a/arch/arm/boot/dts/renesas/r8a7794-alt.dts b/arch/arm/boot/dts/renesas/r8a7794-alt.dts index c0d067df22a03d..b5ecafbb2e4de5 100644 --- a/arch/arm/boot/dts/renesas/r8a7794-alt.dts +++ b/arch/arm/boot/dts/renesas/r8a7794-alt.dts @@ -453,6 +453,7 @@ interrupt-parent = <&gpio3>; interrupts = <31 IRQ_TYPE_LEVEL_LOW>; interrupt-controller; + #interrupt-cells = <2>; rtc { 
compatible = "dlg,da9063-rtc"; diff --git a/arch/arm/boot/dts/renesas/r8a7794-silk.dts b/arch/arm/boot/dts/renesas/r8a7794-silk.dts index 43d480a7f3eacc..595e074085eb4c 100644 --- a/arch/arm/boot/dts/renesas/r8a7794-silk.dts +++ b/arch/arm/boot/dts/renesas/r8a7794-silk.dts @@ -439,6 +439,7 @@ interrupt-parent = <&gpio3>; interrupts = <31 IRQ_TYPE_LEVEL_LOW>; interrupt-controller; + #interrupt-cells = <2>; onkey { compatible = "dlg,da9063-onkey"; From 2d5452e7853d88aca7aab15e90970996ecc9b9b7 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 14 Feb 2024 03:12:14 +0000 Subject: [PATCH 0759/1406] ARM: dts: renesas: r8a7778: Add missing reg-names for sound Sound Driver requires "reg-names" to get register info. Current driver tries to get register info via "reg" instead of "reg-names" as backup plan, but this support will be removed soon. Use "reg-namess" for r8a7778 sound. Signed-off-by: Kuninori Morimoto Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/87cyszpwmp.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Geert Uytterhoeven --- arch/arm/boot/dts/renesas/r8a7778.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/boot/dts/renesas/r8a7778.dtsi b/arch/arm/boot/dts/renesas/r8a7778.dtsi index 97f71c1f69dd73..b80e832c927757 100644 --- a/arch/arm/boot/dts/renesas/r8a7778.dtsi +++ b/arch/arm/boot/dts/renesas/r8a7778.dtsi @@ -255,6 +255,8 @@ reg = <0xffd90000 0x1000>, /* SRU */ <0xffd91000 0x240>, /* SSI */ <0xfffe0000 0x24>; /* ADG */ + reg-names = "sru", "ssi", "adg"; + clocks = <&mstp3_clks R8A7778_CLK_SSI8>, <&mstp3_clks R8A7778_CLK_SSI7>, <&mstp3_clks R8A7778_CLK_SSI6>, From 4b7b36e49f3bb95898d1871ae80f53dfd75d5b03 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 13 Feb 2024 21:35:16 -0600 Subject: [PATCH 0760/1406] RAS: Introduce a FRU memory poison manager Memory errors are an expected occurrence on systems with high memory density. Generally, errors within a small number of unique physical locations are acceptable, based on manufacturer and/or admin policy. During run time, memory with errors may be retired so it is no longer used by the system. This is done in mm through page poisoning, and the effect will remain until the system is restarted. If a memory location is consistently faulty, then the same run time error handling may occur in the next reboot cycle, leading to terminating jobs due to that already known bad memory. This could be prevented if information from the previous boot was not lost. Some add-in cards with driver-managed memory have on-board persistent storage. Their driver saves memory error information to the persistent storage during run time. The information is then restored after reset, and known bad memory will be retired before the hardware is used. A running log of bad memory locations is kept across multiple resets. A similar solution is desirable for CPUs. However, this solution should leverage industry-standard components as much as possible, rather than a bespoke platform driver. Two components are needed: a record format and a persistent storage interface. Implement a new module to manage the record formats on persistent storage. Use the requirements for an AMD MI300-based system to start. Vendor- and platform-specific details can be abstracted later as needed. [ bp: Massage commit message and code, squash 30-ish more fixes from Yazen and me. 
] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240214033516.1344948-3-yazen.ghannam@amd.com --- MAINTAINERS | 6 + drivers/ras/Kconfig | 12 + drivers/ras/Makefile | 1 + drivers/ras/amd/fmpm.c | 812 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 831 insertions(+) create mode 100644 drivers/ras/amd/fmpm.c diff --git a/MAINTAINERS b/MAINTAINERS index fc5996feba70ce..76163f09e4e2bb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18363,6 +18363,12 @@ F: drivers/ras/ F: include/linux/ras.h F: include/ras/ras_event.h +RAS FRU MEMORY POISON MANAGER (FMPM) +M: Yazen Ghannam +L: linux-edac@vger.kernel.org +S: Maintained +F: drivers/ras/amd/fmpm.c + RC-CORE / LIRC FRAMEWORK M: Sean Young L: linux-media@vger.kernel.org diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig index 2e969f59c0cacb..fc4f4bb94a4c65 100644 --- a/drivers/ras/Kconfig +++ b/drivers/ras/Kconfig @@ -34,4 +34,16 @@ if RAS source "arch/x86/ras/Kconfig" source "drivers/ras/amd/atl/Kconfig" +config RAS_FMPM + tristate "FRU Memory Poison Manager" + default m + depends on AMD_ATL && ACPI_APEI + help + Support saving and restoring memory error information across reboot + using ACPI ERST as persistent storage. Error information is saved with + the UEFI CPER "FRU Memory Poison" section format. + + Memory will be retired during boot time and run time depending on + platform-specific policies. + endif diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile index 3fac80f580052d..11f95d59d3972d 100644 --- a/drivers/ras/Makefile +++ b/drivers/ras/Makefile @@ -3,4 +3,5 @@ obj-$(CONFIG_RAS) += ras.o obj-$(CONFIG_DEBUG_FS) += debugfs.o obj-$(CONFIG_RAS_CEC) += cec.o +obj-$(CONFIG_RAS_FMPM) += amd/fmpm.o obj-y += amd/atl/ diff --git a/drivers/ras/amd/fmpm.c b/drivers/ras/amd/fmpm.c new file mode 100644 index 00000000000000..80dd112b720aff --- /dev/null +++ b/drivers/ras/amd/fmpm.c @@ -0,0 +1,812 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * FRU (Field-Replaceable Unit) Memory Poison Manager + * + * Copyright (c) 2024, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Authors: + * Naveen Krishna Chatradhi + * Muralidhara M K + * Yazen Ghannam + * + * Implementation notes, assumptions, and limitations: + * + * - FRU memory poison section and memory poison descriptor definitions are not yet + * included in the UEFI specification. So they are defined here. Afterwards, they + * may be moved to linux/cper.h, if appropriate. + * + * - Platforms based on AMD MI300 systems will be the first to use these structures. + * There are a number of assumptions made here that will need to be generalized + * to support other platforms. + * + * AMD MI300-based platform(s) assumptions: + * - Memory errors are reported through x86 MCA. + * - The entire DRAM row containing a memory error should be retired. + * - There will be (1) FRU memory poison section per CPER. + * - The FRU will be the CPU package (processor socket). + * - The default number of memory poison descriptor entries should be (8). + * - The platform will use ACPI ERST for persistent storage. + * - All FRU records should be saved to persistent storage. Module init will + * fail if any FRU record is not successfully written. + * + * - Boot time memory retirement may occur later than ideal due to dependencies + * on other libraries and drivers. This leaves a gap where bad memory may be + * accessed during early boot stages. 
+ * + * - Enough memory should be pre-allocated for each FRU record to be able to hold + * the expected number of descriptor entries. This, mostly empty, record is + * written to storage during init time. Subsequent writes to the same record + * should allow the Platform to update the stored record in-place. Otherwise, + * if the record is extended, then the Platform may need to perform costly memory + * management operations on the storage. For example, the Platform may spend time + * in Firmware copying and invalidating memory on a relatively slow SPI ROM. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +#include + +#include +#include + +#define INVALID_CPU UINT_MAX + +/* Validation Bits */ +#define FMP_VALID_ARCH_TYPE BIT_ULL(0) +#define FMP_VALID_ARCH BIT_ULL(1) +#define FMP_VALID_ID_TYPE BIT_ULL(2) +#define FMP_VALID_ID BIT_ULL(3) +#define FMP_VALID_LIST_ENTRIES BIT_ULL(4) +#define FMP_VALID_LIST BIT_ULL(5) + +/* FRU Architecture Types */ +#define FMP_ARCH_TYPE_X86_CPUID_1_EAX 0 + +/* FRU ID Types */ +#define FMP_ID_TYPE_X86_PPIN 0 + +/* FRU Memory Poison Section */ +struct cper_sec_fru_mem_poison { + u32 checksum; + u64 validation_bits; + u32 fru_arch_type; + u64 fru_arch; + u32 fru_id_type; + u64 fru_id; + u32 nr_entries; +} __packed; + +/* FRU Descriptor ID Types */ +#define FPD_HW_ID_TYPE_MCA_IPID 0 + +/* FRU Descriptor Address Types */ +#define FPD_ADDR_TYPE_MCA_ADDR 0 + +/* Memory Poison Descriptor */ +struct cper_fru_poison_desc { + u64 timestamp; + u32 hw_id_type; + u64 hw_id; + u32 addr_type; + u64 addr; +} __packed; + +/* Collection of headers and sections for easy pointer use. */ +struct fru_rec { + struct cper_record_header hdr; + struct cper_section_descriptor sec_desc; + struct cper_sec_fru_mem_poison fmp; + struct cper_fru_poison_desc entries[]; +} __packed; + +/* + * Pointers to the complete CPER record of each FRU. + * + * Memory allocation will include padded space for descriptor entries. + */ +static struct fru_rec **fru_records; + +#define CPER_CREATOR_FMP \ + GUID_INIT(0xcd5c2993, 0xf4b2, 0x41b2, 0xb5, 0xd4, 0xf9, 0xc3, \ + 0xa0, 0x33, 0x08, 0x75) + +#define CPER_SECTION_TYPE_FMP \ + GUID_INIT(0x5e4706c1, 0x5356, 0x48c6, 0x93, 0x0b, 0x52, 0xf2, \ + 0x12, 0x0a, 0x44, 0x58) + +/** + * DOC: fru_poison_entries (byte) + * Maximum number of descriptor entries possible for each FRU. + * + * Values between '1' and '255' are valid. + * No input or '0' will default to FMPM_DEFAULT_MAX_NR_ENTRIES. + */ +static u8 max_nr_entries; +module_param(max_nr_entries, byte, 0644); +MODULE_PARM_DESC(max_nr_entries, + "Maximum number of memory poison descriptor entries per FRU"); + +#define FMPM_DEFAULT_MAX_NR_ENTRIES 8 + +/* Maximum number of FRUs in the system. */ +#define FMPM_MAX_NR_FRU 256 +static unsigned int max_nr_fru; + +/* Total length of record including headers and list of descriptor entries. */ +static size_t max_rec_len; + +/* + * Protect the local records cache in fru_records and prevent concurrent + * writes to storage. This is only needed after init once notifier block + * registration is done. 
+ */ +static DEFINE_MUTEX(fmpm_update_mutex); + +#define for_each_fru(i, rec) \ + for (i = 0; rec = fru_records[i], i < max_nr_fru; i++) + +static inline u32 get_fmp_len(struct fru_rec *rec) +{ + return rec->sec_desc.section_length - sizeof(struct cper_section_descriptor); +} + +static struct fru_rec *get_fru_record(u64 fru_id) +{ + struct fru_rec *rec; + unsigned int i; + + for_each_fru(i, rec) { + if (rec->fmp.fru_id == fru_id) + return rec; + } + + pr_debug("Record not found for FRU 0x%016llx\n", fru_id); + + return NULL; +} + +/* + * Sum up all bytes within the FRU Memory Poison Section including the Memory + * Poison Descriptor entries. + * + * Don't include the old checksum here. It's a u32 value, so summing each of its + * bytes will give the wrong total. + */ +static u32 do_fmp_checksum(struct cper_sec_fru_mem_poison *fmp, u32 len) +{ + u32 checksum = 0; + u8 *buf, *end; + + /* Skip old checksum. */ + buf = (u8 *)fmp + sizeof(u32); + end = buf + len; + + while (buf < end) + checksum += (u8)(*(buf++)); + + return checksum; +} + +static int update_record_on_storage(struct fru_rec *rec) +{ + u32 len, checksum; + int ret; + + /* Calculate a new checksum. */ + len = get_fmp_len(rec); + + /* Get the current total. */ + checksum = do_fmp_checksum(&rec->fmp, len); + + /* Use the complement value. */ + rec->fmp.checksum = -checksum; + + pr_debug("Writing to storage\n"); + + ret = erst_write(&rec->hdr); + if (ret) { + pr_warn("Storage update failed for FRU 0x%016llx\n", rec->fmp.fru_id); + + if (ret == -ENOSPC) + pr_warn("Not enough space on storage\n"); + } + + return ret; +} + +static bool rec_has_valid_entries(struct fru_rec *rec) +{ + if (!(rec->fmp.validation_bits & FMP_VALID_LIST_ENTRIES)) + return false; + + if (!(rec->fmp.validation_bits & FMP_VALID_LIST)) + return false; + + return true; +} + +static bool fpds_equal(struct cper_fru_poison_desc *old, struct cper_fru_poison_desc *new) +{ + /* + * Ignore timestamp field. + * The same physical error may be reported multiple times due to stuck bits, etc. + * + * Also, order the checks from most->least likely to fail to shortcut the code. + */ + if (old->addr != new->addr) + return false; + + if (old->hw_id != new->hw_id) + return false; + + if (old->addr_type != new->addr_type) + return false; + + if (old->hw_id_type != new->hw_id_type) + return false; + + return true; +} + +static bool rec_has_fpd(struct fru_rec *rec, struct cper_fru_poison_desc *fpd) +{ + unsigned int i; + + for (i = 0; i < rec->fmp.nr_entries; i++) { + struct cper_fru_poison_desc *fpd_i = &rec->entries[i]; + + if (fpds_equal(fpd_i, fpd)) { + pr_debug("Found duplicate record\n"); + return true; + } + } + + return false; +} + +static void update_fru_record(struct fru_rec *rec, struct mce *m) +{ + struct cper_sec_fru_mem_poison *fmp = &rec->fmp; + struct cper_fru_poison_desc fpd, *fpd_dest; + u32 entry = 0; + + mutex_lock(&fmpm_update_mutex); + + memset(&fpd, 0, sizeof(struct cper_fru_poison_desc)); + + fpd.timestamp = m->time; + fpd.hw_id_type = FPD_HW_ID_TYPE_MCA_IPID; + fpd.hw_id = m->ipid; + fpd.addr_type = FPD_ADDR_TYPE_MCA_ADDR; + fpd.addr = m->addr; + + /* This is the first entry, so just save it. */ + if (!rec_has_valid_entries(rec)) + goto save_fpd; + + /* Ignore already recorded errors. 
*/ + if (rec_has_fpd(rec, &fpd)) + goto out_unlock; + + if (rec->fmp.nr_entries >= max_nr_entries) { + pr_warn("Exceeded number of entries for FRU 0x%016llx\n", rec->fmp.fru_id); + goto out_unlock; + } + + entry = fmp->nr_entries; + +save_fpd: + fpd_dest = &rec->entries[entry]; + memcpy(fpd_dest, &fpd, sizeof(struct cper_fru_poison_desc)); + + fmp->nr_entries = entry + 1; + fmp->validation_bits |= FMP_VALID_LIST_ENTRIES; + fmp->validation_bits |= FMP_VALID_LIST; + + pr_debug("Updated FRU 0x%016llx entry #%u\n", fmp->fru_id, entry); + + update_record_on_storage(rec); + +out_unlock: + mutex_unlock(&fmpm_update_mutex); +} + +static void retire_dram_row(u64 addr, u64 id, u32 cpu) +{ + struct atl_err a_err; + + memset(&a_err, 0, sizeof(struct atl_err)); + + a_err.addr = addr; + a_err.ipid = id; + a_err.cpu = cpu; + + amd_retire_dram_row(&a_err); +} + +static int fru_handle_mem_poison(struct notifier_block *nb, unsigned long val, void *data) +{ + struct mce *m = (struct mce *)data; + struct fru_rec *rec; + + if (!mce_is_memory_error(m)) + return NOTIFY_DONE; + + retire_dram_row(m->addr, m->ipid, m->extcpu); + + /* + * An invalid FRU ID should not happen on real errors. But it + * could happen from software error injection, etc. + */ + rec = get_fru_record(m->ppin); + if (!rec) + return NOTIFY_DONE; + + update_fru_record(rec, m); + + return NOTIFY_OK; +} + +static struct notifier_block fru_mem_poison_nb = { + .notifier_call = fru_handle_mem_poison, + .priority = MCE_PRIO_LOWEST, +}; + +static void retire_mem_fmp(struct fru_rec *rec) +{ + struct cper_sec_fru_mem_poison *fmp = &rec->fmp; + unsigned int i, cpu; + + for (i = 0; i < fmp->nr_entries; i++) { + struct cper_fru_poison_desc *fpd = &rec->entries[i]; + unsigned int err_cpu = INVALID_CPU; + + if (fpd->hw_id_type != FPD_HW_ID_TYPE_MCA_IPID) + continue; + + if (fpd->addr_type != FPD_ADDR_TYPE_MCA_ADDR) + continue; + + cpus_read_lock(); + for_each_online_cpu(cpu) { + if (topology_ppin(cpu) == fmp->fru_id) { + err_cpu = cpu; + break; + } + } + cpus_read_unlock(); + + if (err_cpu == INVALID_CPU) + continue; + + retire_dram_row(fpd->addr, fpd->hw_id, err_cpu); + } +} + +static void retire_mem_records(void) +{ + struct fru_rec *rec; + unsigned int i; + + for_each_fru(i, rec) { + if (!rec_has_valid_entries(rec)) + continue; + + retire_mem_fmp(rec); + } +} + +/* Set the CPER Record Header and CPER Section Descriptor fields. */ +static void set_rec_fields(struct fru_rec *rec) +{ + struct cper_section_descriptor *sec_desc = &rec->sec_desc; + struct cper_record_header *hdr = &rec->hdr; + + memcpy(hdr->signature, CPER_SIG_RECORD, CPER_SIG_SIZE); + hdr->revision = CPER_RECORD_REV; + hdr->signature_end = CPER_SIG_END; + + /* + * Currently, it is assumed that there is one FRU Memory Poison + * section per CPER. But this may change for other implementations. + */ + hdr->section_count = 1; + + /* The logged errors are recoverable. Otherwise, they'd never make it here. 
*/ + hdr->error_severity = CPER_SEV_RECOVERABLE; + + hdr->validation_bits = 0; + hdr->record_length = max_rec_len; + hdr->creator_id = CPER_CREATOR_FMP; + hdr->notification_type = CPER_NOTIFY_MCE; + hdr->record_id = cper_next_record_id(); + hdr->flags = CPER_HW_ERROR_FLAGS_PREVERR; + + sec_desc->section_offset = sizeof(struct cper_record_header); + sec_desc->section_length = max_rec_len - sizeof(struct cper_record_header); + sec_desc->revision = CPER_SEC_REV; + sec_desc->validation_bits = 0; + sec_desc->flags = CPER_SEC_PRIMARY; + sec_desc->section_type = CPER_SECTION_TYPE_FMP; + sec_desc->section_severity = CPER_SEV_RECOVERABLE; +} + +static int save_new_records(void) +{ + DECLARE_BITMAP(new_records, FMPM_MAX_NR_FRU); + struct fru_rec *rec; + unsigned int i; + int ret = 0; + + for_each_fru(i, rec) { + if (rec->hdr.record_length) + continue; + + set_rec_fields(rec); + + ret = update_record_on_storage(rec); + if (ret) + goto out_clear; + + set_bit(i, new_records); + } + + return ret; + +out_clear: + for_each_fru(i, rec) { + if (!test_bit(i, new_records)) + continue; + + erst_clear(rec->hdr.record_id); + } + + return ret; +} + +/* Check that the record matches expected types for the current system.*/ +static bool fmp_is_usable(struct fru_rec *rec) +{ + struct cper_sec_fru_mem_poison *fmp = &rec->fmp; + u64 cpuid; + + pr_debug("Validation bits: 0x%016llx\n", fmp->validation_bits); + + if (!(fmp->validation_bits & FMP_VALID_ARCH_TYPE)) { + pr_debug("Arch type unknown\n"); + return false; + } + + if (fmp->fru_arch_type != FMP_ARCH_TYPE_X86_CPUID_1_EAX) { + pr_debug("Arch type not 'x86 Family/Model/Stepping'\n"); + return false; + } + + if (!(fmp->validation_bits & FMP_VALID_ARCH)) { + pr_debug("Arch value unknown\n"); + return false; + } + + cpuid = cpuid_eax(1); + if (fmp->fru_arch != cpuid) { + pr_debug("Arch value mismatch: record = 0x%016llx, system = 0x%016llx\n", + fmp->fru_arch, cpuid); + return false; + } + + if (!(fmp->validation_bits & FMP_VALID_ID_TYPE)) { + pr_debug("FRU ID type unknown\n"); + return false; + } + + if (fmp->fru_id_type != FMP_ID_TYPE_X86_PPIN) { + pr_debug("FRU ID type is not 'x86 PPIN'\n"); + return false; + } + + if (!(fmp->validation_bits & FMP_VALID_ID)) { + pr_debug("FRU ID value unknown\n"); + return false; + } + + return true; +} + +static bool fmp_is_valid(struct fru_rec *rec) +{ + struct cper_sec_fru_mem_poison *fmp = &rec->fmp; + u32 checksum, len; + + len = get_fmp_len(rec); + if (len < sizeof(struct cper_sec_fru_mem_poison)) { + pr_debug("fmp length is too small\n"); + return false; + } + + /* Checksum must sum to zero for the entire section. */ + checksum = do_fmp_checksum(fmp, len) + fmp->checksum; + if (checksum) { + pr_debug("fmp checksum failed: sum = 0x%x\n", checksum); + print_hex_dump_debug("fmp record: ", DUMP_PREFIX_NONE, 16, 1, fmp, len, false); + return false; + } + + if (!fmp_is_usable(rec)) + return false; + + return true; +} + +static struct fru_rec *get_valid_record(struct fru_rec *old) +{ + struct fru_rec *new; + + if (!fmp_is_valid(old)) { + pr_debug("Ignoring invalid record\n"); + return NULL; + } + + new = get_fru_record(old->fmp.fru_id); + if (!new) + pr_debug("Ignoring record for absent FRU\n"); + + return new; +} + +/* + * Fetch saved records from persistent storage. + * + * For each found record: + * - If it was not created by this module, then ignore it. + * - If it is valid, then copy its data to the local cache. + * - If it is not valid, then erase it. 
+ */ +static int get_saved_records(void) +{ + struct fru_rec *old, *new; + u64 record_id; + int ret, pos; + ssize_t len; + + /* + * Assume saved records match current max size. + * + * However, this may not be true depending on module parameters. + */ + old = kmalloc(max_rec_len, GFP_KERNEL); + if (!old) { + ret = -ENOMEM; + goto out; + } + + ret = erst_get_record_id_begin(&pos); + if (ret < 0) + goto out_end; + + while (!erst_get_record_id_next(&pos, &record_id)) { + if (record_id == APEI_ERST_INVALID_RECORD_ID) + goto out_end; + /* + * Make sure to clear temporary buffer between reads to avoid + * leftover data from records of various sizes. + */ + memset(old, 0, max_rec_len); + + len = erst_read_record(record_id, &old->hdr, max_rec_len, + sizeof(struct fru_rec), &CPER_CREATOR_FMP); + if (len < 0) + continue; + + if (len > max_rec_len) { + pr_debug("Found record larger than max_rec_len\n"); + continue; + } + + /* Invalid or unknown records are erased, not restored. */ + new = get_valid_record(old); + if (!new) { + erst_clear(record_id); + continue; + } + + /* Restore the record */ + memcpy(new, old, len); + } + +out_end: + erst_get_record_id_end(); + kfree(old); +out: + return ret; +} + +static void set_fmp_fields(struct fru_rec *rec, unsigned int cpu) +{ + struct cper_sec_fru_mem_poison *fmp = &rec->fmp; + + fmp->fru_arch_type = FMP_ARCH_TYPE_X86_CPUID_1_EAX; + fmp->validation_bits |= FMP_VALID_ARCH_TYPE; + + /* Assume all CPUs in the system have the same value for now. */ + fmp->fru_arch = cpuid_eax(1); + fmp->validation_bits |= FMP_VALID_ARCH; + + fmp->fru_id_type = FMP_ID_TYPE_X86_PPIN; + fmp->validation_bits |= FMP_VALID_ID_TYPE; + + fmp->fru_id = topology_ppin(cpu); + fmp->validation_bits |= FMP_VALID_ID; +} + +static int init_fmps(void) +{ + struct fru_rec *rec; + unsigned int i, cpu; + int ret = 0; + + for_each_fru(i, rec) { + unsigned int fru_cpu = INVALID_CPU; + + cpus_read_lock(); + for_each_online_cpu(cpu) { + if (topology_physical_package_id(cpu) == i) { + fru_cpu = cpu; + break; + } + } + cpus_read_unlock(); + + if (fru_cpu == INVALID_CPU) { + pr_debug("Failed to find matching CPU for FRU #%u\n", i); + ret = -ENODEV; + break; + } + + set_fmp_fields(rec, fru_cpu); + } + + return ret; +} + +static int get_system_info(void) +{ + /* Only load on MI300A systems for now. */ + if (!(boot_cpu_data.x86_model >= 0x90 && + boot_cpu_data.x86_model <= 0x9f)) + return -ENODEV; + + if (!cpu_feature_enabled(X86_FEATURE_AMD_PPIN)) { + pr_debug("PPIN feature not available\n"); + return -ENODEV; + } + + /* Use CPU socket as FRU for MI300 systems.
*/ + max_nr_fru = topology_max_packages(); + if (!max_nr_fru) + return -ENODEV; + + if (max_nr_fru > FMPM_MAX_NR_FRU) { + pr_warn("Too many FRUs to manage: found: %u, max: %u\n", + max_nr_fru, FMPM_MAX_NR_FRU); + return -ENODEV; + } + + if (!max_nr_entries) + max_nr_entries = FMPM_DEFAULT_MAX_NR_ENTRIES; + + max_rec_len = sizeof(struct fru_rec); + max_rec_len += sizeof(struct cper_fru_poison_desc) * max_nr_entries; + + pr_info("max FRUs: %u, max entries: %u, max record length: %lu\n", + max_nr_fru, max_nr_entries, max_rec_len); + + return 0; +} + +static void free_records(void) +{ + struct fru_rec *rec; + int i; + + for_each_fru(i, rec) + kfree(rec); + + kfree(fru_records); +} + +static int allocate_records(void) +{ + int i, ret = 0; + + fru_records = kcalloc(max_nr_fru, sizeof(struct fru_rec *), GFP_KERNEL); + if (!fru_records) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < max_nr_fru; i++) { + fru_records[i] = kzalloc(max_rec_len, GFP_KERNEL); + if (!fru_records[i]) { + ret = -ENOMEM; + goto out_free; + } + } + + return ret; + +out_free: + for (; i >= 0; i--) + kfree(fru_records[i]); + + kfree(fru_records); +out: + return ret; +} + +static const struct x86_cpu_id fmpm_cpuids[] = { + X86_MATCH_VENDOR_FAM(AMD, 0x19, NULL), + { } +}; +MODULE_DEVICE_TABLE(x86cpu, fmpm_cpuids); + +static int __init fru_mem_poison_init(void) +{ + int ret; + + if (!x86_match_cpu(fmpm_cpuids)) { + ret = -ENODEV; + goto out; + } + + if (erst_disable) { + pr_debug("ERST not available\n"); + ret = -ENODEV; + goto out; + } + + ret = get_system_info(); + if (ret) + goto out; + + ret = allocate_records(); + if (ret) + goto out; + + ret = init_fmps(); + if (ret) + goto out_free; + + ret = get_saved_records(); + if (ret) + goto out_free; + + ret = save_new_records(); + if (ret) + goto out_free; + + retire_mem_records(); + + mce_register_decode_chain(&fru_mem_poison_nb); + + pr_info("FRU Memory Poison Manager initialized\n"); + return 0; + +out_free: + free_records(); +out: + return ret; +} + +static void __exit fru_mem_poison_exit(void) +{ + mce_unregister_decode_chain(&fru_mem_poison_nb); + free_records(); +} + +module_init(fru_mem_poison_init); +module_exit(fru_mem_poison_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("FRU Memory Poison Manager"); From 3b6344d23e749182680227e1a762a4039f67aa5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Mon, 19 Feb 2024 20:03:45 +0100 Subject: [PATCH 0761/1406] landlock: Fix asymmetric private inodes referring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When linking or renaming a file, if only one of the source or destination directories is backed by an S_PRIVATE inode, then the related set of layer masks would be used as uninitialized by is_access_to_paths_allowed(). This would result in indeterministic access for one side instead of always being allowed. This bug could only be triggered with a mounted filesystem containing both S_PRIVATE and !S_PRIVATE inodes, which doesn't seem possible. The collect_domain_accesses() calls return early if is_nouser_or_private() returns false, which means that the directory's superblock has SB_NOUSER or its inode has S_PRIVATE. Because rename or link actions are only allowed on the same mounted filesystem, the superblock is always the same for both source and destination directories. However, it might be possible in theory to have an S_PRIVATE parent source inode with an !S_PRIVATE parent destination inode, or vice versa.
To make sure this case is not an issue, explicitly initialize both sets of layer masks to 0, which means to allow all actions on the related side. If at least one side has !S_PRIVATE, then collect_domain_accesses() and is_access_to_paths_allowed() check for the required access rights. Cc: Arnd Bergmann Cc: Christian Brauner Cc: Günther Noack Cc: Jann Horn Cc: Shervin Oloumi Cc: stable@vger.kernel.org Fixes: b91c3e4ea756 ("landlock: Add support for file reparenting with LANDLOCK_ACCESS_FS_REFER") Link: https://lore.kernel.org/r/20240219190345.2928627-1-mic@digikod.net Signed-off-by: Mickaël Salaün --- security/landlock/fs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/landlock/fs.c b/security/landlock/fs.c index fc520a06f9af10..0171f7eb6ee15d 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -737,8 +737,8 @@ static int current_check_refer_path(struct dentry *const old_dentry, bool allow_parent1, allow_parent2; access_mask_t access_request_parent1, access_request_parent2; struct path mnt_dir; - layer_mask_t layer_masks_parent1[LANDLOCK_NUM_ACCESS_FS], - layer_masks_parent2[LANDLOCK_NUM_ACCESS_FS]; + layer_mask_t layer_masks_parent1[LANDLOCK_NUM_ACCESS_FS] = {}, + layer_masks_parent2[LANDLOCK_NUM_ACCESS_FS] = {}; if (!dom) return 0; From 039fa41531ed2fedc9d197b7407b8eb75c898bfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Mon, 19 Feb 2024 20:18:04 +0100 Subject: [PATCH 0762/1406] landlock: Warn once if a Landlock action is requested while disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because sandboxing can be used as an opportunistic security measure, user space may not log unsupported features. Let the system administrator know if an application tries to use Landlock but fails because it isn't enabled at boot time. This may be caused by bootloader configurations with an outdated "lsm" kernel command-line parameter. Cc: Günther Noack Cc: stable@vger.kernel.org Fixes: 265885daf3e5 ("landlock: Add syscall implementations") Link: https://lore.kernel.org/r/20240219191804.2978911-1-mic@digikod.net Signed-off-by: Mickaël Salaün --- security/landlock/syscalls.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c index 898358f57fa085..ccb1c3e0897fd8 100644 --- a/security/landlock/syscalls.c +++ b/security/landlock/syscalls.c @@ -33,6 +33,18 @@ #include "ruleset.h" #include "setup.h" +static bool is_not_initialized(void) +{ + if (likely(landlock_initialized)) + return false; + + pr_warn_once( + "Disabled but requested by user space. " + "You should enable Landlock at boot time: " + "https://docs.kernel.org/userspace-api/landlock.html#kernel-support\n"); + return true; +} + /** * copy_min_struct_from_user - Safe future-proof argument copying * @@ -173,7 +185,7 @@ SYSCALL_DEFINE3(landlock_create_ruleset, /* Build-time checks. */ build_check_abi(); - if (!landlock_initialized) + if (is_not_initialized()) return -EOPNOTSUPP; if (flags) { @@ -398,7 +410,7 @@ SYSCALL_DEFINE4(landlock_add_rule, const int, ruleset_fd, struct landlock_ruleset *ruleset; int err; - if (!landlock_initialized) + if (is_not_initialized()) return -EOPNOTSUPP; /* No flag for now.
*/ @@ -458,7 +470,7 @@ SYSCALL_DEFINE2(landlock_restrict_self, const int, ruleset_fd, const __u32, struct landlock_cred_security *new_llcred; int err; - if (!landlock_initialized) + if (is_not_initialized()) return -EOPNOTSUPP; /* From eefcb4e500f918da0d27bd24c88254b10193e966 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 25 Jan 2024 16:32:30 +0100 Subject: [PATCH 0763/1406] selftests/landlock: Clean up error logs related to capabilities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It doesn't help to call TH_LOG() for every cap_*() error. Let's only log errors returned by the kernel, not by libcap specificities. Link: https://lore.kernel.org/r/20240125153230.3817165-3-mic@digikod.net Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/common.h | 39 ++++++----------------- 1 file changed, 9 insertions(+), 30 deletions(-) diff --git a/tools/testing/selftests/landlock/common.h b/tools/testing/selftests/landlock/common.h index e64bbdf0e86eac..425c5698aea239 100644 --- a/tools/testing/selftests/landlock/common.h +++ b/tools/testing/selftests/landlock/common.h @@ -128,31 +128,19 @@ static void _init_caps(struct __test_metadata *const _metadata, bool drop_all) EXPECT_EQ(0, cap_set_secbits(noroot)); cap_p = cap_get_proc(); - EXPECT_NE(NULL, cap_p) - { - TH_LOG("Failed to cap_get_proc: %s", strerror(errno)); - } - EXPECT_NE(-1, cap_clear(cap_p)) - { - TH_LOG("Failed to cap_clear: %s", strerror(errno)); - } + EXPECT_NE(NULL, cap_p); + EXPECT_NE(-1, cap_clear(cap_p)); if (!drop_all) { EXPECT_NE(-1, cap_set_flag(cap_p, CAP_PERMITTED, - ARRAY_SIZE(caps), caps, CAP_SET)) - { - TH_LOG("Failed to cap_set_flag: %s", strerror(errno)); - } + ARRAY_SIZE(caps), caps, CAP_SET)); } /* Automatically resets ambient capabilities. */ EXPECT_NE(-1, cap_set_proc(cap_p)) { - TH_LOG("Failed to cap_set_proc: %s", strerror(errno)); - } - EXPECT_NE(-1, cap_free(cap_p)) - { - TH_LOG("Failed to cap_free: %s", strerror(errno)); + TH_LOG("Failed to set capabilities: %s", strerror(errno)); } + EXPECT_NE(-1, cap_free(cap_p)); /* Quickly checks that ambient capabilities are cleared. */ EXPECT_NE(-1, cap_get_ambient(caps[0])); @@ -176,22 +164,13 @@ static void _change_cap(struct __test_metadata *const _metadata, cap_t cap_p; cap_p = cap_get_proc(); - EXPECT_NE(NULL, cap_p) - { - TH_LOG("Failed to cap_get_proc: %s", strerror(errno)); - } - EXPECT_NE(-1, cap_set_flag(cap_p, flag, 1, &cap, value)) - { - TH_LOG("Failed to cap_set_flag: %s", strerror(errno)); - } + EXPECT_NE(NULL, cap_p); + EXPECT_NE(-1, cap_set_flag(cap_p, flag, 1, &cap, value)); EXPECT_NE(-1, cap_set_proc(cap_p)) { - TH_LOG("Failed to cap_set_proc: %s", strerror(errno)); - } - EXPECT_NE(-1, cap_free(cap_p)) - { - TH_LOG("Failed to cap_free: %s", strerror(errno)); + TH_LOG("Failed to set capability %d: %s", cap, strerror(errno)); } + EXPECT_NE(-1, cap_free(cap_p)); } static void __maybe_unused set_cap(struct __test_metadata *const _metadata, From ecf9acf638f2a13e02b9a788930478c7d855bd7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 18 Jan 2024 12:36:32 +0100 Subject: [PATCH 0764/1406] landlock: Add support for KUnit tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the SECURITY_LANDLOCK_KUNIT_TEST option to enable KUnit tests for Landlock. The minimal required configuration is listed in the security/landlock/.kunitconfig file. 
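For readers unfamiliar with KUnit, a suite collocated with kernel code follows this general shape (an illustrative sketch only, not code from this patch; all names here are made up):

  #include <kunit/test.h>

  static void test_example(struct kunit *const test)
  {
  	/* KUNIT_EXPECT_*() records a failure but keeps the case running. */
  	KUNIT_EXPECT_EQ(test, 4, 2 + 2);
  }

  static struct kunit_case example_cases[] = {
  	KUNIT_CASE(test_example),
  	{}
  };

  static struct kunit_suite example_suite = {
  	.name = "example",
  	.test_cases = example_cases,
  };
  kunit_test_suite(example_suite);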
Add an initial landlock_fs KUnit test suite with 7 test cases for filesystem helpers. These are related to the LANDLOCK_ACCESS_FS_REFER right. There is one KUnit test case per: * mutated state (e.g. test_scope_to_request_*) or * shared state between tests (e.g. test_is_eacces_*). Add macros to improve readability of tests (i.e. one per line). Test cases are collocated with the tested functions to help maintenance and improve documentation. This is why SECURITY_LANDLOCK_KUNIT_TEST cannot be set as a module. This is a nice complement to Landlock's user space kselftests. We expect new Landlock features to come with KUnit tests as well. Thanks to UML support, we can run all KUnit tests for Landlock with: ./tools/testing/kunit/kunit.py run --kunitconfig security/landlock [00:00:00] ======================= landlock_fs ======================= [00:00:00] [PASSED] test_no_more_access [00:00:00] [PASSED] test_scope_to_request_with_exec_none [00:00:00] [PASSED] test_scope_to_request_with_exec_some [00:00:00] [PASSED] test_scope_to_request_without_access [00:00:00] [PASSED] test_is_eacces_with_none [00:00:00] [PASSED] test_is_eacces_with_refer [00:00:00] [PASSED] test_is_eacces_with_write [00:00:00] =================== [PASSED] landlock_fs =================== [00:00:00] ============================================================ [00:00:00] Testing complete. Ran 7 tests: passed: 7 Cc: Konstantin Meskhidze Reviewed-by: Günther Noack Link: https://lore.kernel.org/r/20240118113632.1948478-1-mic@digikod.net Signed-off-by: Mickaël Salaün --- security/landlock/.kunitconfig | 4 + security/landlock/Kconfig | 15 ++ security/landlock/common.h | 2 + security/landlock/fs.c | 234 +++++++++++++++++++ tools/testing/kunit/configs/all_tests.config | 1 + 5 files changed, 256 insertions(+) create mode 100644 security/landlock/.kunitconfig diff --git a/security/landlock/.kunitconfig b/security/landlock/.kunitconfig new file mode 100644 index 00000000000000..03e11946660429 --- /dev/null +++ b/security/landlock/.kunitconfig @@ -0,0 +1,4 @@ +CONFIG_KUNIT=y +CONFIG_SECURITY=y +CONFIG_SECURITY_LANDLOCK=y +CONFIG_SECURITY_LANDLOCK_KUNIT_TEST=y diff --git a/security/landlock/Kconfig b/security/landlock/Kconfig index c4bf0d5eff39f3..3f1493402052ec 100644 --- a/security/landlock/Kconfig +++ b/security/landlock/Kconfig @@ -20,3 +20,18 @@ config SECURITY_LANDLOCK If you are unsure how to answer this question, answer N. Otherwise, you should also prepend "landlock," to the content of CONFIG_LSM to enable Landlock at boot time. + +config SECURITY_LANDLOCK_KUNIT_TEST + bool "KUnit tests for Landlock" if !KUNIT_ALL_TESTS + depends on KUNIT=y + depends on SECURITY_LANDLOCK + default KUNIT_ALL_TESTS + help + Build KUnit tests for Landlock. + + See the KUnit documentation in Documentation/dev-tools/kunit + + Run all KUnit tests for Landlock with: + ./tools/testing/kunit/kunit.py run --kunitconfig security/landlock + + If you are unsure how to answer this question, answer N.
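The common.h hunk below introduces the BIT_INDEX() helper used throughout these tests: it maps a single access-right bit to its bit position, suitable as an index into the layer_masks[] arrays, because HWEIGHT(bit - 1) counts the set bits below the given bit. For illustration (a sketch, assuming the uapi values LANDLOCK_ACCESS_FS_EXECUTE == 1ULL << 0 and LANDLOCK_ACCESS_FS_WRITE_FILE == 1ULL << 1):

  /* Illustration only, not part of this patch: */
  BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE);    /* == HWEIGHT(0x0) == 0 */
  BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE); /* == HWEIGHT(0x1) == 1 */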
diff --git a/security/landlock/common.h b/security/landlock/common.h index 5dc0fe15707d6b..0eb1d34c2eaefe 100644 --- a/security/landlock/common.h +++ b/security/landlock/common.h @@ -17,4 +17,6 @@ #define pr_fmt(fmt) LANDLOCK_NAME ": " fmt +#define BIT_INDEX(bit) HWEIGHT(bit - 1) + #endif /* _SECURITY_LANDLOCK_COMMON_H */ diff --git a/security/landlock/fs.c b/security/landlock/fs.c index 0171f7eb6ee15d..6f0bf1434a2c35 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -7,6 +7,7 @@ * Copyright © 2021-2022 Microsoft Corporation */ +#include #include #include #include @@ -311,6 +312,119 @@ static bool no_more_access( return true; } +#define NMA_TRUE(...) KUNIT_EXPECT_TRUE(test, no_more_access(__VA_ARGS__)) +#define NMA_FALSE(...) KUNIT_EXPECT_FALSE(test, no_more_access(__VA_ARGS__)) + +#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST + +static void test_no_more_access(struct kunit *const test) +{ + const layer_mask_t rx0[LANDLOCK_NUM_ACCESS_FS] = { + [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0), + [BIT_INDEX(LANDLOCK_ACCESS_FS_READ_FILE)] = BIT_ULL(0), + }; + const layer_mask_t mx0[LANDLOCK_NUM_ACCESS_FS] = { + [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0), + [BIT_INDEX(LANDLOCK_ACCESS_FS_MAKE_REG)] = BIT_ULL(0), + }; + const layer_mask_t x0[LANDLOCK_NUM_ACCESS_FS] = { + [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0), + }; + const layer_mask_t x1[LANDLOCK_NUM_ACCESS_FS] = { + [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(1), + }; + const layer_mask_t x01[LANDLOCK_NUM_ACCESS_FS] = { + [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0) | + BIT_ULL(1), + }; + const layer_mask_t allows_all[LANDLOCK_NUM_ACCESS_FS] = {}; + + /* Checks without restriction. */ + NMA_TRUE(&x0, &allows_all, false, &allows_all, NULL, false); + NMA_TRUE(&allows_all, &x0, false, &allows_all, NULL, false); + NMA_FALSE(&x0, &x0, false, &allows_all, NULL, false); + + /* + * Checks that we can only refer a file if no more access could be + * inherited. + */ + NMA_TRUE(&x0, &x0, false, &rx0, NULL, false); + NMA_TRUE(&rx0, &rx0, false, &rx0, NULL, false); + NMA_FALSE(&rx0, &rx0, false, &x0, NULL, false); + NMA_FALSE(&rx0, &rx0, false, &x1, NULL, false); + + /* Checks allowed referring with different nested domains. */ + NMA_TRUE(&x0, &x1, false, &x0, NULL, false); + NMA_TRUE(&x1, &x0, false, &x0, NULL, false); + NMA_TRUE(&x0, &x01, false, &x0, NULL, false); + NMA_TRUE(&x0, &x01, false, &rx0, NULL, false); + NMA_TRUE(&x01, &x0, false, &x0, NULL, false); + NMA_TRUE(&x01, &x0, false, &rx0, NULL, false); + NMA_FALSE(&x01, &x01, false, &x0, NULL, false); + + /* Checks that file access rights are also enforced for a directory. */ + NMA_FALSE(&rx0, &rx0, true, &x0, NULL, false); + + /* Checks that directory access rights don't impact file referring... */ + NMA_TRUE(&mx0, &mx0, false, &x0, NULL, false); + /* ...but only directory referring. */ + NMA_FALSE(&mx0, &mx0, true, &x0, NULL, false); + + /* Checks directory exchange. */ + NMA_TRUE(&mx0, &mx0, true, &mx0, &mx0, true); + NMA_TRUE(&mx0, &mx0, true, &mx0, &x0, true); + NMA_FALSE(&mx0, &mx0, true, &x0, &mx0, true); + NMA_FALSE(&mx0, &mx0, true, &x0, &x0, true); + NMA_FALSE(&mx0, &mx0, true, &x1, &x1, true); + + /* Checks file exchange with directory access rights... */ + NMA_TRUE(&mx0, &mx0, false, &mx0, &mx0, false); + NMA_TRUE(&mx0, &mx0, false, &mx0, &x0, false); + NMA_TRUE(&mx0, &mx0, false, &x0, &mx0, false); + NMA_TRUE(&mx0, &mx0, false, &x0, &x0, false); + /* ...and with file access rights. 
*/ + NMA_TRUE(&rx0, &rx0, false, &rx0, &rx0, false); + NMA_TRUE(&rx0, &rx0, false, &rx0, &x0, false); + NMA_FALSE(&rx0, &rx0, false, &x0, &rx0, false); + NMA_FALSE(&rx0, &rx0, false, &x0, &x0, false); + NMA_FALSE(&rx0, &rx0, false, &x1, &x1, false); + + /* + * Allowing the following requests should not be a security risk + * because domain 0 denies execute access, and domain 1 is always + * nested with domain 0. However, adding an exception for this case + * would mean to check all nested domains to make sure none can get + * more privileges (e.g. processes only sandboxed by domain 0). + * Moreover, this behavior (i.e. composition of N domains) could then + * be inconsistent compared to domain 1's ruleset alone (e.g. it might + * be denied to link/rename with domain 1's ruleset, whereas it would + * be allowed if nested on top of domain 0). Another drawback would be + * to create a covert channel that could enable sandboxed processes to + * infer most of the filesystem restrictions from their domain. To + * make it simple, efficient, safe, and more consistent, this case is + * always denied. + */ + NMA_FALSE(&x1, &x1, false, &x0, NULL, false); + NMA_FALSE(&x1, &x1, false, &rx0, NULL, false); + NMA_FALSE(&x1, &x1, true, &x0, NULL, false); + NMA_FALSE(&x1, &x1, true, &rx0, NULL, false); + + /* Checks the same case of exclusive domains with a file... */ + NMA_TRUE(&x1, &x1, false, &x01, NULL, false); + NMA_FALSE(&x1, &x1, false, &x01, &x0, false); + NMA_FALSE(&x1, &x1, false, &x01, &x01, false); + NMA_FALSE(&x1, &x1, false, &x0, &x0, false); + /* ...and with a directory. */ + NMA_FALSE(&x1, &x1, false, &x0, &x0, true); + NMA_FALSE(&x1, &x1, true, &x0, &x0, false); + NMA_FALSE(&x1, &x1, true, &x0, &x0, true); +} + +#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */ + +#undef NMA_TRUE +#undef NMA_FALSE + /* * Removes @layer_masks accesses that are not requested. * @@ -331,6 +445,57 @@ scope_to_request(const access_mask_t access_request, return !memchr_inv(layer_masks, 0, sizeof(*layer_masks)); } +#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST + +static void test_scope_to_request_with_exec_none(struct kunit *const test) +{ + /* Allows everything. */ + layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {}; + + /* Checks and scopes with execute. */ + KUNIT_EXPECT_TRUE(test, scope_to_request(LANDLOCK_ACCESS_FS_EXECUTE, + &layer_masks)); + KUNIT_EXPECT_EQ(test, 0, + layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)]); + KUNIT_EXPECT_EQ(test, 0, + layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)]); +} + +static void test_scope_to_request_with_exec_some(struct kunit *const test) +{ + /* Denies execute and write. */ + layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = { + [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0), + [BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)] = BIT_ULL(1), + }; + + /* Checks and scopes with execute. */ + KUNIT_EXPECT_FALSE(test, scope_to_request(LANDLOCK_ACCESS_FS_EXECUTE, + &layer_masks)); + KUNIT_EXPECT_EQ(test, BIT_ULL(0), + layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)]); + KUNIT_EXPECT_EQ(test, 0, + layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)]); +} + +static void test_scope_to_request_without_access(struct kunit *const test) +{ + /* Denies execute and write. */ + layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = { + [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0), + [BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)] = BIT_ULL(1), + }; + + /* Checks and scopes without access request.
*/ + KUNIT_EXPECT_TRUE(test, scope_to_request(0, &layer_masks)); + KUNIT_EXPECT_EQ(test, 0, + layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)]); + KUNIT_EXPECT_EQ(test, 0, + layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)]); +} + +#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */ + /* * Returns true if there is at least one access right different than * LANDLOCK_ACCESS_FS_REFER. @@ -354,6 +519,51 @@ is_eacces(const layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS], return false; } +#define IE_TRUE(...) KUNIT_EXPECT_TRUE(test, is_eacces(__VA_ARGS__)) +#define IE_FALSE(...) KUNIT_EXPECT_FALSE(test, is_eacces(__VA_ARGS__)) + +#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST + +static void test_is_eacces_with_none(struct kunit *const test) +{ + const layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {}; + + IE_FALSE(&layer_masks, 0); + IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_REFER); + IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_EXECUTE); + IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_WRITE_FILE); +} + +static void test_is_eacces_with_refer(struct kunit *const test) +{ + const layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = { + [BIT_INDEX(LANDLOCK_ACCESS_FS_REFER)] = BIT_ULL(0), + }; + + IE_FALSE(&layer_masks, 0); + IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_REFER); + IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_EXECUTE); + IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_WRITE_FILE); +} + +static void test_is_eacces_with_write(struct kunit *const test) +{ + const layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = { + [BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)] = BIT_ULL(0), + }; + + IE_FALSE(&layer_masks, 0); + IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_REFER); + IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_EXECUTE); + + IE_TRUE(&layer_masks, LANDLOCK_ACCESS_FS_WRITE_FILE); +} + +#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */ + +#undef IE_TRUE +#undef IE_FALSE + /** * is_access_to_paths_allowed - Check accesses for requests with a common path * @@ -1225,3 +1435,27 @@ __init void landlock_add_fs_hooks(void) security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks), &landlock_lsmid); } + +#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST + +/* clang-format off */ +static struct kunit_case test_cases[] = { + KUNIT_CASE(test_no_more_access), + KUNIT_CASE(test_scope_to_request_with_exec_none), + KUNIT_CASE(test_scope_to_request_with_exec_some), + KUNIT_CASE(test_scope_to_request_without_access), + KUNIT_CASE(test_is_eacces_with_none), + KUNIT_CASE(test_is_eacces_with_refer), + KUNIT_CASE(test_is_eacces_with_write), + {} +}; +/* clang-format on */ + +static struct kunit_suite test_suite = { + .name = "landlock_fs", + .test_cases = test_cases, +}; + +kunit_test_suite(test_suite); + +#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */ diff --git a/tools/testing/kunit/configs/all_tests.config b/tools/testing/kunit/configs/all_tests.config index 3bf506d4a63ccd..1b8f1abfedf041 100644 --- a/tools/testing/kunit/configs/all_tests.config +++ b/tools/testing/kunit/configs/all_tests.config @@ -37,6 +37,7 @@ CONFIG_REGMAP_BUILD=y CONFIG_SECURITY=y CONFIG_SECURITY_APPARMOR=y +CONFIG_SECURITY_LANDLOCK=y CONFIG_SOUND=y CONFIG_SND=y From b47eb78ff80022e51795dde249962843b8fe16bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Noack?= Date: Fri, 9 Feb 2024 18:06:05 +0100 Subject: [PATCH 0765/1406] landlock: Add IOCTL access right MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces the LANDLOCK_ACCESS_FS_IOCTL access right and increments the Landlock ABI version to 5. 
Like the truncate right, these rights are associated with a file descriptor at the time of open(2), and get respected even when the file descriptor is used outside of the thread which it was originally opened in. A newly enabled Landlock policy therefore does not apply to file descriptors which are already open. If the LANDLOCK_ACCESS_FS_IOCTL right is handled, only a small number of safe IOCTL commands will be permitted on newly opened files. The permitted IOCTLs can be configured through the ruleset in limited ways now. (See documentation for details.) Specifically, when LANDLOCK_ACCESS_FS_IOCTL is handled, granting this right on a file or directory will *not* permit to do all IOCTL commands, but only influence the IOCTL commands which are not already handled through other access rights. The intent is to keep the groups of IOCTL commands more fine-grained. Noteworthy scenarios which require special attention: TTY devices are often passed into a process from the parent process, and so a newly enabled Landlock policy does not retroactively apply to them automatically. In the past, TTY devices have often supported IOCTL commands like TIOCSTI and some TIOCLINUX subcommands, which were letting callers control the TTY input buffer (and simulate keypresses). This should be restricted to CAP_SYS_ADMIN programs on modern kernels though. Some legitimate file system features, like setting up fscrypt, are exposed as IOCTL commands on regular files and directories -- users of Landlock are advised to double check that the sandboxed process does not need to invoke these IOCTLs. Known limitations: The LANDLOCK_ACCESS_FS_IOCTL access right is a coarse-grained control over IOCTL commands. Future work will enable a more fine-grained access control for IOCTLs. In the meantime, Landlock users may use path-based restrictions in combination with their knowledge about the file system layout to control what IOCTLs can be done. Mounting file systems with the nodev option can help to distinguish regular files and devices, and give guarantees about the affected files, which Landlock alone can not give yet. Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20240209170612.1638517-2-gnoack@google.com Signed-off-by: Mickaël Salaün --- include/uapi/linux/landlock.h | 55 ++++- security/landlock/fs.c | 227 ++++++++++++++++++- security/landlock/fs.h | 3 + security/landlock/limits.h | 11 +- security/landlock/ruleset.h | 2 +- security/landlock/syscalls.c | 19 +- tools/testing/selftests/landlock/base_test.c | 2 +- tools/testing/selftests/landlock/fs_test.c | 5 +- 8 files changed, 302 insertions(+), 22 deletions(-) diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index 25c8d76775393a..16d7d72804f81b 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -128,7 +128,7 @@ struct landlock_net_port_attr { * files and directories. Files or directories opened before the sandboxing * are not subject to these restrictions. * - * A file can only receive these access rights: + * The following access rights apply only to files: * * - %LANDLOCK_ACCESS_FS_EXECUTE: Execute a file. * - %LANDLOCK_ACCESS_FS_WRITE_FILE: Open a file with write access. Note that @@ -138,12 +138,13 @@ struct landlock_net_port_attr { * - %LANDLOCK_ACCESS_FS_READ_FILE: Open a file with read access. * - %LANDLOCK_ACCESS_FS_TRUNCATE: Truncate a file with :manpage:`truncate(2)`, * :manpage:`ftruncate(2)`, :manpage:`creat(2)`, or :manpage:`open(2)` with - * ``O_TRUNC``. 
Whether an opened file can be truncated with - :manpage:`ftruncate(2)` is determined during :manpage:`open(2)`, in the - same way as read and write permissions are checked during - :manpage:`open(2)` using %LANDLOCK_ACCESS_FS_READ_FILE and - %LANDLOCK_ACCESS_FS_WRITE_FILE. This access right is available since the - third version of the Landlock ABI. + ``O_TRUNC``. This access right is available since the third version of the + Landlock ABI. + + * Whether an opened file can be truncated with :manpage:`ftruncate(2)` or used + * with `ioctl(2)` is determined during :manpage:`open(2)`, in the same way as + * read and write permissions are checked during :manpage:`open(2)` using + * %LANDLOCK_ACCESS_FS_READ_FILE and %LANDLOCK_ACCESS_FS_WRITE_FILE. * * A directory can receive access rights related to files or directories. The * following access right is applied to the directory itself, and the @@ -198,13 +199,50 @@ struct landlock_net_port_attr { * If multiple requirements are not met, the ``EACCES`` error code takes * precedence over ``EXDEV``. * + * The following access right applies both to files and directories: + * + * - %LANDLOCK_ACCESS_FS_IOCTL: Invoke :manpage:`ioctl(2)` commands on an opened + * file or directory. + * + * This access right applies to all :manpage:`ioctl(2)` commands, except for + * ``FIOCLEX``, ``FIONCLEX``, ``FIONBIO`` and ``FIOASYNC``. These commands + * continue to be invokable independent of the %LANDLOCK_ACCESS_FS_IOCTL + * access right. + * + * When certain other access rights are handled in the ruleset, in addition to + * %LANDLOCK_ACCESS_FS_IOCTL, granting these access rights will unlock access + * to additional groups of IOCTL commands, on the affected files: + * + * * %LANDLOCK_ACCESS_FS_READ_FILE and %LANDLOCK_ACCESS_FS_WRITE_FILE unlock + * access to ``FIOQSIZE``, ``FIONREAD``, ``FIGETBSZ``, ``FS_IOC_FIEMAP``, + * ``FIBMAP``, ``FIDEDUPERANGE``, ``FICLONE``, ``FICLONERANGE``, + * ``FS_IOC_RESVSP``, ``FS_IOC_RESVSP64``, ``FS_IOC_UNRESVSP``, + * ``FS_IOC_UNRESVSP64``, ``FS_IOC_ZERO_RANGE``. + * + * * %LANDLOCK_ACCESS_FS_READ_DIR unlocks access to ``FIOQSIZE``, + * ``FIONREAD``, ``FIGETBSZ``. + * + * When these access rights are handled in the ruleset, the availability of + * the affected IOCTL commands is not governed by %LANDLOCK_ACCESS_FS_IOCTL + * any more, but by the respective access right. + * + * All other IOCTL commands are not handled specially, and are governed by + * %LANDLOCK_ACCESS_FS_IOCTL. This includes %FS_IOC_GETFLAGS and + * %FS_IOC_SETFLAGS for manipulating inode flags (:manpage:`ioctl_iflags(2)`), + * %FS_IOC_FSGETXATTR and %FS_IOC_FSSETXATTR for manipulating extended + * attributes, as well as %FIFREEZE and %FITHAW for freezing and thawing file + * systems. + * + * This access right is available since the fifth version of the Landlock + * ABI. + * * .. warning:: * * It is currently not possible to restrict some file-related actions * accessible through these syscall families: :manpage:`chdir(2)`, * :manpage:`stat(2)`, :manpage:`flock(2)`, :manpage:`chmod(2)`, * :manpage:`chown(2)`, :manpage:`setxattr(2)`, :manpage:`utime(2)`, - * :manpage:`ioctl(2)`, :manpage:`fcntl(2)`, :manpage:`access(2)`. + * :manpage:`fcntl(2)`, :manpage:`access(2)`. * Future Landlock evolutions will enable to restrict them.
*/ /* clang-format off */ @@ -223,6 +261,7 @@ struct landlock_net_port_attr { #define LANDLOCK_ACCESS_FS_MAKE_SYM (1ULL << 12) #define LANDLOCK_ACCESS_FS_REFER (1ULL << 13) #define LANDLOCK_ACCESS_FS_TRUNCATE (1ULL << 14) +#define LANDLOCK_ACCESS_FS_IOCTL (1ULL << 15) /* clang-format on */ /** diff --git a/security/landlock/fs.c b/security/landlock/fs.c index 6f0bf1434a2c35..e0d0bd350ab033 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -7,6 +7,7 @@ * Copyright © 2021-2022 Microsoft Corporation */ +#include #include #include #include @@ -14,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +31,7 @@ #include #include #include +#include #include #include "common.h" @@ -84,6 +87,186 @@ static const struct landlock_object_underops landlock_fs_underops = { .release = release_inode }; +/* IOCTL helpers */ + +/* + * These are synthetic access rights, which are only used within the kernel, but + * not exposed to callers in userspace. The mapping between these access rights + * and IOCTL commands is defined in the get_required_ioctl_access() helper function. + */ +#define LANDLOCK_ACCESS_FS_IOCTL_RW (LANDLOCK_LAST_PUBLIC_ACCESS_FS << 1) +#define LANDLOCK_ACCESS_FS_IOCTL_RW_FILE (LANDLOCK_LAST_PUBLIC_ACCESS_FS << 2) + +/* ioctl_groups - all synthetic access rights for IOCTL command groups */ /* clang-format off */ +#define IOCTL_GROUPS ( \ + LANDLOCK_ACCESS_FS_IOCTL_RW | \ + LANDLOCK_ACCESS_FS_IOCTL_RW_FILE) /* clang-format on */ + +static_assert((IOCTL_GROUPS & LANDLOCK_MASK_ACCESS_FS) == IOCTL_GROUPS); + +/** + * get_required_ioctl_access(): Determine required IOCTL access rights. + * + * @cmd: The IOCTL command that is supposed to be run. + * + * Any new IOCTL commands that are implemented in fs/ioctl.c's do_vfs_ioctl() + * should be considered for inclusion here. + * + * Returns: The access rights that must be granted on an opened file in order to + * use the given @cmd. + */ +static __attribute_const__ access_mask_t +get_required_ioctl_access(const unsigned int cmd) +{ + switch (cmd) { + case FIOCLEX: + case FIONCLEX: + case FIONBIO: + case FIOASYNC: + /* + * FIOCLEX, FIONCLEX, FIONBIO and FIOASYNC manipulate the FD's + * close-on-exec and the file's buffered-IO and async flags. + * These operations are also available through fcntl(2), and are + * unconditionally permitted in Landlock. + */ + return 0; + case FIONREAD: + case FIOQSIZE: + case FIGETBSZ: + /* + * FIONREAD returns the number of immediately readable bytes for + * a file. + * + * FIOQSIZE queries the size of a file or directory. + * + * FIGETBSZ queries the file system's block size for a file or + * directory. + * + * These IOCTL commands are permitted for files which are opened + * with LANDLOCK_ACCESS_FS_READ_DIR, + * LANDLOCK_ACCESS_FS_READ_FILE, or + * LANDLOCK_ACCESS_FS_WRITE_FILE. + */ + return LANDLOCK_ACCESS_FS_IOCTL_RW; + case FS_IOC_FIEMAP: + case FIBMAP: + /* + * FS_IOC_FIEMAP and FIBMAP query information about the + * allocation of blocks within a file. They are permitted for + * files which are opened with LANDLOCK_ACCESS_FS_READ_FILE or + * LANDLOCK_ACCESS_FS_WRITE_FILE. + */ + fallthrough; + case FIDEDUPERANGE: + case FICLONE: + case FICLONERANGE: + /* + * FIDEDUPERANGE, FICLONE and FICLONERANGE make files share + * their underlying storage ("reflink") between source and + * destination FDs, on file systems which support that.
+ * + * The underlying implementations are already checking whether + * the involved files are opened with the appropriate read/write + * modes. We rely on this being implemented correctly. + * + * These IOCTLs are permitted for files which are opened with + * LANDLOCK_ACCESS_FS_READ_FILE or + * LANDLOCK_ACCESS_FS_WRITE_FILE. + */ + fallthrough; + case FS_IOC_RESVSP: + case FS_IOC_RESVSP64: + case FS_IOC_UNRESVSP: + case FS_IOC_UNRESVSP64: + case FS_IOC_ZERO_RANGE: + /* + * These IOCTLs reserve space, or create holes like + * fallocate(2). We rely on the implementations checking the + * files' read/write modes. + * + * These IOCTLs are permitted for files which are opened with + * LANDLOCK_ACCESS_FS_READ_FILE or + * LANDLOCK_ACCESS_FS_WRITE_FILE. + */ + return LANDLOCK_ACCESS_FS_IOCTL_RW_FILE; + default: + /* + * Other commands are guarded by the catch-all access right. + */ + return LANDLOCK_ACCESS_FS_IOCTL; + } +} + +/** + * expand_ioctl() - Return the dst flags from either the src flag or the + * %LANDLOCK_ACCESS_FS_IOCTL flag, depending on whether the + * %LANDLOCK_ACCESS_FS_IOCTL and src access rights are handled or not. + * + * @handled: Handled access rights. + * @access: The access mask to copy values from. + * @src: A single access right to copy from in @access. + * @dst: One or more access rights to copy to. + * + * Returns: @dst, or 0. + */ +static __attribute_const__ access_mask_t +expand_ioctl(const access_mask_t handled, const access_mask_t access, + const access_mask_t src, const access_mask_t dst) +{ + access_mask_t copy_from; + + if (!(handled & LANDLOCK_ACCESS_FS_IOCTL)) + return 0; + + copy_from = (handled & src) ? src : LANDLOCK_ACCESS_FS_IOCTL; + if (access & copy_from) + return dst; + + return 0; +} + +/** + * landlock_expand_access_fs() - Returns @access with the synthetic IOCTL group + * flags enabled if necessary. + * + * @handled: Handled FS access rights. + * @access: FS access rights to expand. + * + * Returns: @access expanded by the necessary flags for the synthetic IOCTL + * access rights. + */ +static __attribute_const__ access_mask_t landlock_expand_access_fs( + const access_mask_t handled, const access_mask_t access) +{ + return access | + expand_ioctl(handled, access, LANDLOCK_ACCESS_FS_WRITE_FILE, + LANDLOCK_ACCESS_FS_IOCTL_RW | + LANDLOCK_ACCESS_FS_IOCTL_RW_FILE) | + expand_ioctl(handled, access, LANDLOCK_ACCESS_FS_READ_FILE, + LANDLOCK_ACCESS_FS_IOCTL_RW | + LANDLOCK_ACCESS_FS_IOCTL_RW_FILE) | + expand_ioctl(handled, access, LANDLOCK_ACCESS_FS_READ_DIR, + LANDLOCK_ACCESS_FS_IOCTL_RW); +} + +/** + * landlock_expand_handled_access_fs() - add synthetic IOCTL access rights to an + * access mask of handled accesses. + * + * @handled: The handled accesses of a ruleset that is being created. + * + * Returns: @handled, with the bits for the synthetic IOCTL access rights set, + * if %LANDLOCK_ACCESS_FS_IOCTL is handled. 
+ */ +__attribute_const__ access_mask_t +landlock_expand_handled_access_fs(const access_mask_t handled) +{ + return landlock_expand_access_fs(handled, handled); +} + /* Ruleset management */ static struct landlock_object *get_inode_object(struct inode *const inode) @@ -148,7 +331,8 @@ static struct landlock_object *get_inode_object(struct inode *const inode) LANDLOCK_ACCESS_FS_EXECUTE | \ LANDLOCK_ACCESS_FS_WRITE_FILE | \ LANDLOCK_ACCESS_FS_READ_FILE | \ - LANDLOCK_ACCESS_FS_TRUNCATE) + LANDLOCK_ACCESS_FS_TRUNCATE | \ + LANDLOCK_ACCESS_FS_IOCTL) /* clang-format on */ /* @@ -158,6 +342,7 @@ int landlock_append_fs_rule(struct landlock_ruleset *const ruleset, const struct path *const path, access_mask_t access_rights) { + access_mask_t handled; int err; struct landlock_id id = { .type = LANDLOCK_KEY_INODE, @@ -170,9 +355,11 @@ int landlock_append_fs_rule(struct landlock_ruleset *const ruleset, if (WARN_ON_ONCE(ruleset->num_layers != 1)) return -EINVAL; + handled = landlock_get_fs_access_mask(ruleset, 0); + /* Expands the synthetic IOCTL groups. */ + access_rights |= landlock_expand_access_fs(handled, access_rights); /* Transforms relative access rights to absolute ones. */ - access_rights |= LANDLOCK_MASK_ACCESS_FS & - ~landlock_get_fs_access_mask(ruleset, 0); + access_rights |= LANDLOCK_MASK_ACCESS_FS & ~handled; id.key.object = get_inode_object(d_backing_inode(path->dentry)); if (IS_ERR(id.key.object)) return PTR_ERR(id.key.object); @@ -1333,7 +1520,9 @@ static int hook_file_open(struct file *const file) { layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {}; access_mask_t open_access_request, full_access_request, allowed_access; - const access_mask_t optional_access = LANDLOCK_ACCESS_FS_TRUNCATE; + const access_mask_t optional_access = LANDLOCK_ACCESS_FS_TRUNCATE | + LANDLOCK_ACCESS_FS_IOCTL | + IOCTL_GROUPS; const struct landlock_ruleset *const dom = get_current_fs_domain(); if (!dom) @@ -1375,6 +1564,16 @@ static int hook_file_open(struct file *const file) } } + /* + * Named pipes should be treated just like anonymous pipes. + * Therefore, we permit all IOCTLs on them. + */ + if (S_ISFIFO(file_inode(file)->i_mode)) { + allowed_access |= LANDLOCK_ACCESS_FS_IOCTL | + LANDLOCK_ACCESS_FS_IOCTL_RW | + LANDLOCK_ACCESS_FS_IOCTL_RW_FILE; + } + /* * For operations on already opened files (i.e. ftruncate()), it is the * access rights at the time of open() which decide whether the @@ -1406,6 +1605,25 @@ static int hook_file_truncate(struct file *const file) return -EACCES; } +static int hook_file_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + const access_mask_t required_access = get_required_ioctl_access(cmd); + const access_mask_t allowed_access = + landlock_file(file)->allowed_access; + + /* + * It is the access rights at the time of opening the file which + * determine whether IOCTL can be used on the opened file later. + * + * The access right is attached to the opened file in hook_file_open(). 
+ */ + if ((allowed_access & required_access) == required_access) + return 0; + + return -EACCES; +} + static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(inode_free_security, hook_inode_free_security), @@ -1428,6 +1646,7 @@ static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(file_alloc_security, hook_file_alloc_security), LSM_HOOK_INIT(file_open, hook_file_open), LSM_HOOK_INIT(file_truncate, hook_file_truncate), + LSM_HOOK_INIT(file_ioctl, hook_file_ioctl), }; __init void landlock_add_fs_hooks(void) diff --git a/security/landlock/fs.h b/security/landlock/fs.h index 488e4813680ab7..086576b8386bef 100644 --- a/security/landlock/fs.h +++ b/security/landlock/fs.h @@ -92,4 +92,7 @@ int landlock_append_fs_rule(struct landlock_ruleset *const ruleset, const struct path *const path, access_mask_t access_hierarchy); +__attribute_const__ access_mask_t +landlock_expand_handled_access_fs(const access_mask_t handled); + #endif /* _SECURITY_LANDLOCK_FS_H */ diff --git a/security/landlock/limits.h b/security/landlock/limits.h index 93c9c6f915567e..ecbdc8bbf906a0 100644 --- a/security/landlock/limits.h +++ b/security/landlock/limits.h @@ -18,7 +18,16 @@ #define LANDLOCK_MAX_NUM_LAYERS 16 #define LANDLOCK_MAX_NUM_RULES U32_MAX -#define LANDLOCK_LAST_ACCESS_FS LANDLOCK_ACCESS_FS_TRUNCATE +/* + * For file system access rights, Landlock distinguishes between the publicly + * visible access rights (1 to LANDLOCK_LAST_PUBLIC_ACCESS_FS) and the private + * ones which are not exposed to userspace (LANDLOCK_LAST_PUBLIC_ACCESS_FS + 1 + * to LANDLOCK_LAST_ACCESS_FS). The private access rights are defined in fs.c. + */ +#define LANDLOCK_LAST_PUBLIC_ACCESS_FS LANDLOCK_ACCESS_FS_IOCTL +#define LANDLOCK_MASK_PUBLIC_ACCESS_FS ((LANDLOCK_LAST_PUBLIC_ACCESS_FS << 1) - 1) + +#define LANDLOCK_LAST_ACCESS_FS (LANDLOCK_LAST_PUBLIC_ACCESS_FS << 2) #define LANDLOCK_MASK_ACCESS_FS ((LANDLOCK_LAST_ACCESS_FS << 1) - 1) #define LANDLOCK_NUM_ACCESS_FS __const_hweight64(LANDLOCK_MASK_ACCESS_FS) #define LANDLOCK_SHIFT_ACCESS_FS 0 diff --git a/security/landlock/ruleset.h b/security/landlock/ruleset.h index c7f1526784fd10..5a28ea8e1c3d50 100644 --- a/security/landlock/ruleset.h +++ b/security/landlock/ruleset.h @@ -30,7 +30,7 @@ LANDLOCK_ACCESS_FS_REFER) /* clang-format on */ -typedef u16 access_mask_t; +typedef u32 access_mask_t; /* Makes sure all filesystem access rights can be stored. */ static_assert(BITS_PER_TYPE(access_mask_t) >= LANDLOCK_NUM_ACCESS_FS); /* Makes sure all network access rights can be stored. */ diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c index ccb1c3e0897fd8..b5b424819deec6 100644 --- a/security/landlock/syscalls.c +++ b/security/landlock/syscalls.c @@ -149,7 +149,7 @@ static const struct file_operations ruleset_fops = { .write = fop_dummy_write, }; -#define LANDLOCK_ABI_VERSION 4 +#define LANDLOCK_ABI_VERSION 5 /** * sys_landlock_create_ruleset - Create a new ruleset @@ -204,8 +204,8 @@ SYSCALL_DEFINE3(landlock_create_ruleset, return err; /* Checks content (and 32-bits cast). */ - if ((ruleset_attr.handled_access_fs | LANDLOCK_MASK_ACCESS_FS) != - LANDLOCK_MASK_ACCESS_FS) + if ((ruleset_attr.handled_access_fs | LANDLOCK_MASK_PUBLIC_ACCESS_FS) != + LANDLOCK_MASK_PUBLIC_ACCESS_FS) return -EINVAL; /* Checks network content (and 32-bits cast). */ @@ -213,6 +213,10 @@ SYSCALL_DEFINE3(landlock_create_ruleset, LANDLOCK_MASK_ACCESS_NET) return -EINVAL; + /* Expands synthetic IOCTL groups. 
*/ + ruleset_attr.handled_access_fs = landlock_expand_handled_access_fs( + ruleset_attr.handled_access_fs); + /* Checks arguments and transforms to kernel struct. */ ruleset = landlock_create_ruleset(ruleset_attr.handled_access_fs, ruleset_attr.handled_access_net); @@ -321,8 +325,13 @@ static int add_rule_path_beneath(struct landlock_ruleset *const ruleset, if (!path_beneath_attr.allowed_access) return -ENOMSG; - /* Checks that allowed_access matches the @ruleset constraints. */ - mask = landlock_get_raw_fs_access_mask(ruleset, 0); + /* + * Checks that allowed_access matches the @ruleset constraints and only + * consists of publicly visible access rights (as opposed to synthetic + * ones). + */ + mask = landlock_get_raw_fs_access_mask(ruleset, 0) & + LANDLOCK_MASK_PUBLIC_ACCESS_FS; if ((path_beneath_attr.allowed_access | mask) != mask) return -EINVAL; diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c index 646f778dfb1eee..d292b419ccba40 100644 --- a/tools/testing/selftests/landlock/base_test.c +++ b/tools/testing/selftests/landlock/base_test.c @@ -75,7 +75,7 @@ TEST(abi_version) const struct landlock_ruleset_attr ruleset_attr = { .handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE, }; - ASSERT_EQ(4, landlock_create_ruleset(NULL, 0, + ASSERT_EQ(5, landlock_create_ruleset(NULL, 0, LANDLOCK_CREATE_RULESET_VERSION)); ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, 0, diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 2d6d9b43d958cf..3203f4a5bc8595 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -527,9 +527,10 @@ TEST_F_FORK(layout1, inval) LANDLOCK_ACCESS_FS_EXECUTE | \ LANDLOCK_ACCESS_FS_WRITE_FILE | \ LANDLOCK_ACCESS_FS_READ_FILE | \ - LANDLOCK_ACCESS_FS_TRUNCATE) + LANDLOCK_ACCESS_FS_TRUNCATE | \ + LANDLOCK_ACCESS_FS_IOCTL) -#define ACCESS_LAST LANDLOCK_ACCESS_FS_TRUNCATE +#define ACCESS_LAST LANDLOCK_ACCESS_FS_IOCTL #define ACCESS_ALL ( \ ACCESS_FILE | \ From cae8fd4f1fa76faafa8ac5a492f7200ec604df80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Noack?= Date: Fri, 9 Feb 2024 18:06:06 +0100 Subject: [PATCH 0766/1406] selftests/landlock: Test IOCTL support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Exercises Landlock's IOCTL feature in different combinations of handling and permitting the rights LANDLOCK_ACCESS_FS_IOCTL, LANDLOCK_ACCESS_FS_READ_FILE, LANDLOCK_ACCESS_FS_WRITE_FILE and LANDLOCK_ACCESS_FS_READ_DIR, and in different combinations of using files and directories. Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20240209170612.1638517-3-gnoack@google.com Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 379 ++++++++++++++++++++- 1 file changed, 376 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 3203f4a5bc8595..6ff1026c26c283 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -23,6 +23,12 @@ #include #include +/* + * Intentionally included last to work around header conflict. + * See https://sourceware.org/glibc/wiki/Synchronizing_Headers. 
+ */ +#include + #include "common.h" #ifndef renameat2 @@ -735,6 +741,9 @@ static int create_ruleset(struct __test_metadata *const _metadata, } for (i = 0; rules[i].path; i++) { + if (!rules[i].access) + continue; + add_path_beneath(_metadata, ruleset_fd, rules[i].access, rules[i].path); } @@ -3443,7 +3452,7 @@ TEST_F_FORK(layout1, truncate_unhandled) LANDLOCK_ACCESS_FS_WRITE_FILE; int ruleset_fd; - /* Enable Landlock. */ + /* Enables Landlock. */ ruleset_fd = create_ruleset(_metadata, handled, rules); ASSERT_LE(0, ruleset_fd); @@ -3526,7 +3535,7 @@ TEST_F_FORK(layout1, truncate) LANDLOCK_ACCESS_FS_TRUNCATE; int ruleset_fd; - /* Enable Landlock. */ + /* Enables Landlock. */ ruleset_fd = create_ruleset(_metadata, handled, rules); ASSERT_LE(0, ruleset_fd); @@ -3752,7 +3761,7 @@ TEST_F_FORK(ftruncate, open_and_ftruncate) }; int fd, ruleset_fd; - /* Enable Landlock. */ + /* Enables Landlock. */ ruleset_fd = create_ruleset(_metadata, variant->handled, rules); ASSERT_LE(0, ruleset_fd); enforce_ruleset(_metadata, ruleset_fd); @@ -3829,6 +3838,16 @@ TEST_F_FORK(ftruncate, open_and_ftruncate_in_different_processes) ASSERT_EQ(0, close(socket_fds[1])); } +/* Invokes the FS_IOC_GETFLAGS IOCTL and returns its errno or 0. */ +static int test_fs_ioc_getflags_ioctl(int fd) +{ + uint32_t flags; + + if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) + return errno; + return 0; +} + TEST(memfd_ftruncate) { int fd; @@ -3845,6 +3864,360 @@ TEST(memfd_ftruncate) ASSERT_EQ(0, close(fd)); } +/* clang-format off */ +FIXTURE(ioctl) {}; +/* clang-format on */ + +FIXTURE_SETUP(ioctl) +{ + prepare_layout(_metadata); + create_file(_metadata, file1_s1d1); +} + +FIXTURE_TEARDOWN(ioctl) +{ + EXPECT_EQ(0, remove_path(file1_s1d1)); + cleanup_layout(_metadata); +} + +FIXTURE_VARIANT(ioctl) +{ + const __u64 handled; + const __u64 allowed; + const mode_t open_mode; + /* + * These are the expected IOCTL results for a representative IOCTL from + * each of the IOCTL groups. We only distinguish the 0 and EACCES + * results here, and treat other errors as 0. + */ + const int expected_fioqsize_result; /* RW */ + const int expected_fibmap_result; /* RW_FILE */ + const int expected_fionread_result; /* special */ +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_i_allowed_none) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_IOCTL, + .allowed = 0, + .open_mode = O_RDWR, + /* + * If LANDLOCK_ACCESS_FS_IOCTL is handled, but nothing else is + * explicitly handled, almost all IOCTL commands will be governed by the + * LANDLOCK_ACCESS_FS_IOCTL right. Files can be opened, but IOCTLs are + * disallowed. 
+ */ + .expected_fioqsize_result = EACCES, + .expected_fibmap_result = EACCES, + .expected_fionread_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_i_allowed_i) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_IOCTL, + .allowed = LANDLOCK_ACCESS_FS_IOCTL, + .open_mode = O_RDWR, + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, unhandled) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_EXECUTE, + .allowed = LANDLOCK_ACCESS_FS_EXECUTE, + .open_mode = O_RDWR, + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_rwd_allowed_r) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_READ_DIR, + .allowed = LANDLOCK_ACCESS_FS_READ_FILE, + .open_mode = O_RDONLY, + /* If LANDLOCK_ACCESS_FS_IOCTL is not handled, all IOCTLs work. */ + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_rwd_allowed_w) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_READ_DIR, + .allowed = LANDLOCK_ACCESS_FS_WRITE_FILE, + .open_mode = O_WRONLY, + /* If LANDLOCK_ACCESS_FS_IOCTL is not handled, all IOCTLs work. */ + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_ri_allowed_r) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_READ_FILE | LANDLOCK_ACCESS_FS_IOCTL, + .allowed = LANDLOCK_ACCESS_FS_READ_FILE, + .open_mode = O_RDONLY, + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_wi_allowed_w) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_IOCTL, + .allowed = LANDLOCK_ACCESS_FS_WRITE_FILE, + .open_mode = O_WRONLY, + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_di_allowed_d) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_READ_DIR | LANDLOCK_ACCESS_FS_IOCTL, + .allowed = LANDLOCK_ACCESS_FS_READ_DIR, + .open_mode = O_RDWR, + .expected_fioqsize_result = 0, + .expected_fibmap_result = EACCES, + .expected_fionread_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_rwi_allowed_rw) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_IOCTL, + .allowed = LANDLOCK_ACCESS_FS_READ_FILE | LANDLOCK_ACCESS_FS_WRITE_FILE, + .open_mode = O_RDWR, + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_rwi_allowed_r) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_IOCTL, + .allowed = LANDLOCK_ACCESS_FS_READ_FILE, + .open_mode = O_RDONLY, + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_rwi_allowed_ri) { + /* clang-format on */ + .handled = 
LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_IOCTL, + .allowed = LANDLOCK_ACCESS_FS_READ_FILE | LANDLOCK_ACCESS_FS_IOCTL, + .open_mode = O_RDONLY, + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_rwi_allowed_w) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_IOCTL, + .allowed = LANDLOCK_ACCESS_FS_WRITE_FILE, + .open_mode = O_WRONLY, + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_rwi_allowed_wi) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_IOCTL, + .allowed = LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_IOCTL, + .open_mode = O_WRONLY, + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = 0, +}; + +static int test_fioqsize_ioctl(int fd) +{ + size_t sz; + + if (ioctl(fd, FIOQSIZE, &sz) < 0) + return errno; + return 0; +} + +static int test_fibmap_ioctl(int fd) +{ + int blk = 0; + + /* + * We only want to distinguish here whether Landlock already caught it, + * so we treat anything but EACCES as success. (It commonly returns + * EPERM when missing CAP_SYS_RAWIO.) + */ + if (ioctl(fd, FIBMAP, &blk) < 0 && errno == EACCES) + return errno; + return 0; +} + +static int test_fionread_ioctl(int fd) +{ + size_t sz = 0; + + if (ioctl(fd, FIONREAD, &sz) < 0 && errno == EACCES) + return errno; + return 0; +} + +TEST_F_FORK(ioctl, handle_dir_access_file) +{ + const int flag = 0; + const struct rule rules[] = { + { + .path = dir_s1d1, + .access = variant->allowed, + }, + {}, + }; + int file_fd, ruleset_fd; + + /* Enables Landlock. */ + ruleset_fd = create_ruleset(_metadata, variant->handled, rules); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + file_fd = open(file1_s1d1, variant->open_mode); + ASSERT_LE(0, file_fd); + + /* + * Checks that IOCTL commands in each IOCTL group return the expected + * errors. + */ + EXPECT_EQ(variant->expected_fioqsize_result, + test_fioqsize_ioctl(file_fd)); + EXPECT_EQ(variant->expected_fibmap_result, test_fibmap_ioctl(file_fd)); + EXPECT_EQ(variant->expected_fionread_result, + test_fionread_ioctl(file_fd)); + + /* Checks that unrestrictable commands are unrestricted. */ + EXPECT_EQ(0, ioctl(file_fd, FIOCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONBIO, &flag)); + EXPECT_EQ(0, ioctl(file_fd, FIOASYNC, &flag)); + + ASSERT_EQ(0, close(file_fd)); +} + +TEST_F_FORK(ioctl, handle_dir_access_dir) +{ + const char *const path = dir_s1d1; + const int flag = 0; + const struct rule rules[] = { + { + .path = path, + .access = variant->allowed, + }, + {}, + }; + int dir_fd, ruleset_fd; + + /* Enables Landlock. */ + ruleset_fd = create_ruleset(_metadata, variant->handled, rules); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* + * Ignore variant->open_mode for this test, as we intend to open a + * directory. If the directory can not be opened, the variant is + * infeasible to test with an opened directory. + */ + dir_fd = open(path, O_RDONLY); + if (dir_fd < 0) + return; + + /* + * Checks that IOCTL commands in each IOCTL group return the expected + * errors.
+ */ + EXPECT_EQ(variant->expected_fioqsize_result, + test_fioqsize_ioctl(dir_fd)); + EXPECT_EQ(variant->expected_fibmap_result, test_fibmap_ioctl(dir_fd)); + EXPECT_EQ(variant->expected_fionread_result, + test_fionread_ioctl(dir_fd)); + + /* Checks that unrestrictable commands are unrestricted. */ + EXPECT_EQ(0, ioctl(dir_fd, FIOCLEX)); + EXPECT_EQ(0, ioctl(dir_fd, FIONCLEX)); + EXPECT_EQ(0, ioctl(dir_fd, FIONBIO, &flag)); + EXPECT_EQ(0, ioctl(dir_fd, FIOASYNC, &flag)); + + ASSERT_EQ(0, close(dir_fd)); +} + +TEST_F_FORK(ioctl, handle_file_access_file) +{ + const char *const path = file1_s1d1; + const int flag = 0; + const struct rule rules[] = { + { + .path = path, + .access = variant->allowed, + }, + {}, + }; + int file_fd, ruleset_fd; + + if (variant->allowed & LANDLOCK_ACCESS_FS_READ_DIR) { + SKIP(return, "LANDLOCK_ACCESS_FS_READ_DIR " + "can not be granted on files"); + } + + /* Enables Landlock. */ + ruleset_fd = create_ruleset(_metadata, variant->handled, rules); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + file_fd = open(path, variant->open_mode); + ASSERT_LE(0, file_fd); + + /* + * Checks that IOCTL commands in each IOCTL group return the expected + * errors. + */ + EXPECT_EQ(variant->expected_fioqsize_result, + test_fioqsize_ioctl(file_fd)); + EXPECT_EQ(variant->expected_fibmap_result, test_fibmap_ioctl(file_fd)); + EXPECT_EQ(variant->expected_fionread_result, + test_fionread_ioctl(file_fd)); + + /* Checks that unrestrictable commands are unrestricted. */ + EXPECT_EQ(0, ioctl(file_fd, FIOCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONBIO, &flag)); + EXPECT_EQ(0, ioctl(file_fd, FIOASYNC, &flag)); + + ASSERT_EQ(0, close(file_fd)); +} + /* clang-format off */ FIXTURE(layout1_bind) {}; /* clang-format on */ From ab2d265595551ea2c4bf999c8f5da9b5160ddae4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Noack?= Date: Fri, 9 Feb 2024 18:06:07 +0100 Subject: [PATCH 0767/1406] selftests/landlock: Test IOCTL with memfds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because the LANDLOCK_ACCESS_FS_IOCTL right is associated with the opened file during open(2), IOCTLs are supposed to work with files which are opened by means other than open(2). Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20240209170612.1638517-4-gnoack@google.com Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 36 ++++++++++++++++------ 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 6ff1026c26c283..aa4e5524b22f53 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -3848,20 +3848,38 @@ static int test_fs_ioc_getflags_ioctl(int fd) return 0; } -TEST(memfd_ftruncate) +TEST(memfd_ftruncate_and_ioctl) { - int fd; - - fd = memfd_create("name", MFD_CLOEXEC); - ASSERT_LE(0, fd); + const struct landlock_ruleset_attr attr = { + .handled_access_fs = ACCESS_ALL, + }; + int ruleset_fd, fd, i; /* - * Checks that ftruncate is permitted on file descriptors that are - * created in ways other than open(2). + * We exercise the same test both with and without Landlock enabled, to + * ensure that it behaves the same in both cases. */ - EXPECT_EQ(0, test_ftruncate(fd)); + for (i = 0; i < 2; i++) { + /* Creates a new memfd. 
*/ + fd = memfd_create("name", MFD_CLOEXEC); + ASSERT_LE(0, fd); - ASSERT_EQ(0, close(fd)); + /* + * Checks that operations associated with the opened file + * (ftruncate, ioctl) are permitted on file descriptors that are + * created in ways other than open(2). + */ + EXPECT_EQ(0, test_ftruncate(fd)); + EXPECT_EQ(0, test_fs_ioc_getflags_ioctl(fd)); + + ASSERT_EQ(0, close(fd)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + } } /* clang-format off */ From ec3f9edf2d176b0faf5332c5ced418c605806c45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Noack?= Date: Fri, 9 Feb 2024 18:06:08 +0100 Subject: [PATCH 0768/1406] selftests/landlock: Test ioctl(2) and ftruncate(2) with open(O_PATH) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ioctl(2) and ftruncate(2) operations on files opened with O_PATH should always return EBADF, independent of the LANDLOCK_ACCESS_FS_TRUNCATE and LANDLOCK_ACCESS_FS_IOCTL access rights in that file hierarchy. Suggested-by: Mickaël Salaün Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20240209170612.1638517-5-gnoack@google.com Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 40 ++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index aa4e5524b22f53..9e9b828a898b72 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -3882,6 +3882,46 @@ TEST(memfd_ftruncate_and_ioctl) } } +TEST_F_FORK(layout1, o_path_ftruncate_and_ioctl) +{ + const struct landlock_ruleset_attr attr = { + .handled_access_fs = ACCESS_ALL, + }; + int ruleset_fd, fd; + + /* + * Checks that for files opened with O_PATH, both ioctl(2) and + * ftruncate(2) yield EBADF, as it is documented in open(2) for the + * O_PATH flag. + */ + fd = open(dir_s1d1, O_PATH | O_CLOEXEC); + ASSERT_LE(0, fd); + + EXPECT_EQ(EBADF, test_ftruncate(fd)); + EXPECT_EQ(EBADF, test_fs_ioc_getflags_ioctl(fd)); + + ASSERT_EQ(0, close(fd)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* + * Checks that after enabling Landlock, + * - the file can still be opened with O_PATH + * - both ioctl and truncate still yield EBADF (not EACCES). + */ + fd = open(dir_s1d1, O_PATH | O_CLOEXEC); + ASSERT_LE(0, fd); + + EXPECT_EQ(EBADF, test_ftruncate(fd)); + EXPECT_EQ(EBADF, test_fs_ioc_getflags_ioctl(fd)); + + ASSERT_EQ(0, close(fd)); +} + /* clang-format off */ FIXTURE(ioctl) {}; /* clang-format on */ From 5a0efb9865cd81ab803ed42b8ed36b78086ff43c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Noack?= Date: Fri, 9 Feb 2024 18:06:09 +0100 Subject: [PATCH 0769/1406] selftests/landlock: Test IOCTLs on named pipes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Named pipes should behave like pipes created with pipe(2), so we don't want to restrict IOCTLs on them. 
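For illustration, this is the invariant the new test locks down (a
minimal sketch with error handling omitted; the FIFO path is made up
for the example, and a writer is assumed to have the FIFO open):

	int nbytes;
	int fifo_fd, pipe_fds[2];

	/* Anonymous pipes are not Landlock objects; FIONREAD works. */
	pipe(pipe_fds);
	ioctl(pipe_fds[0], FIONREAD, &nbytes);

	/*
	 * A named pipe (mkfifo) should behave the same way, even under
	 * a ruleset that handles LANDLOCK_ACCESS_FS_IOCTL without
	 * allowing it: FIONREAD is implemented by pipefifo_fops.
	 */
	fifo_fd = open("/tmp/test_fifo", O_RDONLY);
	ioctl(fifo_fd, FIONREAD, &nbytes);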
Suggested-by: Mickaël Salaün Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20240209170612.1638517-6-gnoack@google.com Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 70 +++++++++++++++++++--- 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 9e9b828a898b72..ae8b8b41282823 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -3922,6 +3922,67 @@ TEST_F_FORK(layout1, o_path_ftruncate_and_ioctl) ASSERT_EQ(0, close(fd)); } +static int test_fionread_ioctl(int fd) +{ + size_t sz = 0; + + if (ioctl(fd, FIONREAD, &sz) < 0 && errno == EACCES) + return errno; + return 0; +} + +/* + * For named pipes, the same rules should apply as for anonymous pipes. + * + * That means, if the pipe is opened, we should permit the IOCTLs which are + * implemented by pipefifo_fops (fs/pipe.c), even if they were otherwise + * forbidden by Landlock policy. + */ +TEST_F_FORK(layout1, named_pipe_ioctl) +{ + pid_t child_pid; + int fd, ruleset_fd; + const char *const path = file1_s1d1; + const struct landlock_ruleset_attr attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_IOCTL, + }; + + ASSERT_EQ(0, unlink(path)); + ASSERT_EQ(0, mkfifo(path, 0600)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* The child process opens the pipe for writing. */ + child_pid = fork(); + ASSERT_NE(-1, child_pid); + if (child_pid == 0) { + fd = open(path, O_WRONLY); + close(fd); + exit(0); + } + + fd = open(path, O_RDONLY); + ASSERT_LE(0, fd); + + /* FIONREAD is implemented by pipefifo_fops. */ + EXPECT_EQ(0, test_fionread_ioctl(fd)); + + ASSERT_EQ(0, close(fd)); + ASSERT_EQ(0, unlink(path)); + + /* Under the same conditions, FIONREAD on a regular file fails. 
*/ + fd = open(file2_s1d1, O_RDONLY); + ASSERT_LE(0, fd); + EXPECT_EQ(EACCES, test_fionread_ioctl(fd)); + ASSERT_EQ(0, close(fd)); + + ASSERT_EQ(child_pid, waitpid(child_pid, NULL, 0)); +} + /* clang-format off */ FIXTURE(ioctl) {}; /* clang-format on */ @@ -4134,15 +4195,6 @@ static int test_fibmap_ioctl(int fd) return 0; } -static int test_fionread_ioctl(int fd) -{ - size_t sz = 0; - - if (ioctl(fd, FIONREAD, &sz) < 0 && errno == EACCES) - return errno; - return 0; -} - TEST_F_FORK(ioctl, handle_dir_access_file) { const int flag = 0; From 3561598624dbea67886b8179cd1fbc98f7ff1550 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Noack?= Date: Fri, 9 Feb 2024 18:06:10 +0100 Subject: [PATCH 0770/1406] selftests/landlock: Check IOCTL restrictions for named UNIX domain sockets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Suggested-by: Mickaël Salaün Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20240209170612.1638517-7-gnoack@google.com Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 53 ++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index ae8b8b41282823..59b57ff6915b56 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -18,8 +18,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -3983,6 +3985,57 @@ TEST_F_FORK(layout1, named_pipe_ioctl) ASSERT_EQ(child_pid, waitpid(child_pid, NULL, 0)); } +/* For named UNIX domain sockets, no IOCTL restrictions apply. */ +TEST_F_FORK(layout1, named_unix_domain_socket_ioctl) +{ + const char *const path = file1_s1d1; + int srv_fd, cli_fd, ruleset_fd; + socklen_t size; + struct sockaddr_un srv_un, cli_un; + const struct landlock_ruleset_attr attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_IOCTL, + }; + + /* Sets up a server */ + srv_un.sun_family = AF_UNIX; + strncpy(srv_un.sun_path, path, sizeof(srv_un.sun_path)); + + ASSERT_EQ(0, unlink(path)); + ASSERT_LE(0, (srv_fd = socket(AF_UNIX, SOCK_STREAM, 0))); + + size = offsetof(struct sockaddr_un, sun_path) + strlen(srv_un.sun_path); + ASSERT_EQ(0, bind(srv_fd, (struct sockaddr *)&srv_un, size)); + ASSERT_EQ(0, listen(srv_fd, 10 /* qlen */)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Sets up a client connection to it */ + cli_un.sun_family = AF_UNIX; + snprintf(cli_un.sun_path, sizeof(cli_un.sun_path), "%s%ld", path, + (long)getpid()); + + ASSERT_LE(0, (cli_fd = socket(AF_UNIX, SOCK_STREAM, 0))); + + size = offsetof(struct sockaddr_un, sun_path) + strlen(cli_un.sun_path); + ASSERT_EQ(0, bind(cli_fd, (struct sockaddr *)&cli_un, size)); + + bzero(&cli_un, sizeof(cli_un)); + cli_un.sun_family = AF_UNIX; + strncpy(cli_un.sun_path, path, sizeof(cli_un.sun_path)); + size = offsetof(struct sockaddr_un, sun_path) + strlen(cli_un.sun_path); + + ASSERT_EQ(0, connect(cli_fd, (struct sockaddr *)&cli_un, size)); + + /* FIONREAD and other IOCTLs should not be forbidden. 
*/ + EXPECT_EQ(0, test_fionread_ioctl(cli_fd)); + + ASSERT_EQ(0, close(cli_fd)); +} + /* clang-format off */ FIXTURE(ioctl) {}; /* clang-format on */ From 3eaaf3c9f802574fc7394f45f6aa180d45e0322a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Noack?= Date: Fri, 9 Feb 2024 18:06:11 +0100 Subject: [PATCH 0771/1406] samples/landlock: Add support for LANDLOCK_ACCESS_FS_IOCTL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ioctl support to the Landlock sample tool. The ioctl right is grouped with the read-write rights in the sample tool, as some ioctl requests provide features that mutate state. Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20240209170612.1638517-8-gnoack@google.com Signed-off-by: Mickaël Salaün --- samples/landlock/sandboxer.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/samples/landlock/sandboxer.c b/samples/landlock/sandboxer.c index 08596c0ef0707c..d7323e5526be29 100644 --- a/samples/landlock/sandboxer.c +++ b/samples/landlock/sandboxer.c @@ -81,7 +81,8 @@ static int parse_path(char *env_path, const char ***const path_list) LANDLOCK_ACCESS_FS_EXECUTE | \ LANDLOCK_ACCESS_FS_WRITE_FILE | \ LANDLOCK_ACCESS_FS_READ_FILE | \ - LANDLOCK_ACCESS_FS_TRUNCATE) + LANDLOCK_ACCESS_FS_TRUNCATE | \ + LANDLOCK_ACCESS_FS_IOCTL) /* clang-format on */ @@ -199,11 +200,12 @@ static int populate_ruleset_net(const char *const env_var, const int ruleset_fd, LANDLOCK_ACCESS_FS_MAKE_BLOCK | \ LANDLOCK_ACCESS_FS_MAKE_SYM | \ LANDLOCK_ACCESS_FS_REFER | \ - LANDLOCK_ACCESS_FS_TRUNCATE) + LANDLOCK_ACCESS_FS_TRUNCATE | \ + LANDLOCK_ACCESS_FS_IOCTL) /* clang-format on */ -#define LANDLOCK_ABI_LAST 4 +#define LANDLOCK_ABI_LAST 5 int main(const int argc, char *const argv[], char *const *const envp) { @@ -317,6 +319,11 @@ int main(const int argc, char *const argv[], char *const *const envp) ruleset_attr.handled_access_net &= ~(LANDLOCK_ACCESS_NET_BIND_TCP | LANDLOCK_ACCESS_NET_CONNECT_TCP); + __attribute__((fallthrough)); + case 4: + /* Removes LANDLOCK_ACCESS_FS_IOCTL for ABI < 5 */ + ruleset_attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_IOCTL; + fprintf(stderr, "Hint: You should update the running kernel " "to leverage Landlock features " From 28c2be13a1e03127a01d8cc6015ab36063cbb715 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Noack?= Date: Fri, 9 Feb 2024 18:06:12 +0100 Subject: [PATCH 0772/1406] landlock: Document IOCTL support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the paragraph above the fallback logic, use the shorter phrasing from the landlock(7) man page. Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20240209170612.1638517-9-gnoack@google.com Signed-off-by: Mickaël Salaün --- Documentation/userspace-api/landlock.rst | 121 ++++++++++++++++++++--- 1 file changed, 106 insertions(+), 15 deletions(-) diff --git a/Documentation/userspace-api/landlock.rst b/Documentation/userspace-api/landlock.rst index 2e38226770615d..a6e55912139b57 100644 --- a/Documentation/userspace-api/landlock.rst +++ b/Documentation/userspace-api/landlock.rst @@ -75,7 +75,8 @@ to be explicit about the denied-by-default access rights. 
LANDLOCK_ACCESS_FS_MAKE_BLOCK | LANDLOCK_ACCESS_FS_MAKE_SYM | LANDLOCK_ACCESS_FS_REFER | - LANDLOCK_ACCESS_FS_TRUNCATE, + LANDLOCK_ACCESS_FS_TRUNCATE | + LANDLOCK_ACCESS_FS_IOCTL, .handled_access_net = LANDLOCK_ACCESS_NET_BIND_TCP | LANDLOCK_ACCESS_NET_CONNECT_TCP, @@ -84,10 +85,10 @@ to be explicit about the denied-by-default access rights. Because we may not know on which kernel version an application will be executed, it is safer to follow a best-effort security approach. Indeed, we should try to protect users as much as possible whatever the kernel they are -using. To avoid binary enforcement (i.e. either all security features or -none), we can leverage a dedicated Landlock command to get the current version -of the Landlock ABI and adapt the handled accesses. Let's check if we should -remove access rights which are only supported in higher versions of the ABI. +using. + +To be compatible with older Linux versions, we detect the available Landlock ABI +version, and only use the available subset of access rights: .. code-block:: c @@ -113,6 +114,10 @@ remove access rights which are only supported in higher versions of the ABI. ruleset_attr.handled_access_net &= ~(LANDLOCK_ACCESS_NET_BIND_TCP | LANDLOCK_ACCESS_NET_CONNECT_TCP); + __attribute__((fallthrough)); + case 4: + /* Removes LANDLOCK_ACCESS_FS_IOCTL for ABI < 5 */ + ruleset_attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_IOCTL; } This enables to create an inclusive ruleset that will contain our rules. @@ -224,6 +229,7 @@ access rights per directory enables to change the location of such directory without relying on the destination directory access rights (except those that are required for this operation, see ``LANDLOCK_ACCESS_FS_REFER`` documentation). + Having self-sufficient hierarchies also helps to tighten the required access rights to the minimal set of data. This also helps avoid sinkhole directories, i.e. directories where data can be linked to but not linked from. However, @@ -317,18 +323,72 @@ It should also be noted that truncating files does not require the system call, this can also be done through :manpage:`open(2)` with the flags ``O_RDONLY | O_TRUNC``. -When opening a file, the availability of the ``LANDLOCK_ACCESS_FS_TRUNCATE`` -right is associated with the newly created file descriptor and will be used for -subsequent truncation attempts using :manpage:`ftruncate(2)`. The behavior is -similar to opening a file for reading or writing, where permissions are checked -during :manpage:`open(2)`, but not during the subsequent :manpage:`read(2)` and +The truncate right is associated with the opened file (see below). + +Rights associated with file descriptors +--------------------------------------- + +When opening a file, the availability of the ``LANDLOCK_ACCESS_FS_TRUNCATE`` and +``LANDLOCK_ACCESS_FS_IOCTL`` rights is associated with the newly created file +descriptor and will be used for subsequent truncation and ioctl attempts using +:manpage:`ftruncate(2)` and :manpage:`ioctl(2)`. The behavior is similar to +opening a file for reading or writing, where permissions are checked during +:manpage:`open(2)`, but not during the subsequent :manpage:`read(2)` and :manpage:`write(2)` calls. -As a consequence, it is possible to have multiple open file descriptors for the -same file, where one grants the right to truncate the file and the other does -not. It is also possible to pass such file descriptors between processes, -keeping their Landlock properties, even when these processes do not have an -enforced Landlock ruleset. 
+As a consequence, it is possible that a process has multiple open file
+descriptors referring to the same file, and that Landlock enforces different
+restrictions through each of them. This can happen when a Landlock ruleset
+gets enforced and the process keeps file descriptors which were opened both
+before and after the enforcement. It is also possible to pass such file
+descriptors between processes, keeping their Landlock properties, even when
+some of the involved processes do not have an enforced Landlock ruleset.
+
+Restricting IOCTL commands
+--------------------------
+
+When the ``LANDLOCK_ACCESS_FS_IOCTL`` right is handled, Landlock will restrict
+the invocation of IOCTL commands. However, some of these IOCTL commands can be
+*allowed* again through other, preexisting access rights.
+
+For example, consider a program which handles ``LANDLOCK_ACCESS_FS_IOCTL`` and
+``LANDLOCK_ACCESS_FS_READ_FILE``. The program *allows*
+``LANDLOCK_ACCESS_FS_READ_FILE`` on a file ``foo.log``.
+
+By virtue of granting this access on the ``foo.log`` file, it is now possible
+to use common and harmless IOCTL commands which are useful when reading files,
+such as ``FIONREAD``.
+
+When both ``LANDLOCK_ACCESS_FS_IOCTL`` and other access rights are
+handled in the ruleset, these other access rights may start governing
+the use of individual IOCTL commands instead of
+``LANDLOCK_ACCESS_FS_IOCTL``. For instance, if both
+``LANDLOCK_ACCESS_FS_IOCTL`` and ``LANDLOCK_ACCESS_FS_READ_FILE`` are
+handled, allowing ``LANDLOCK_ACCESS_FS_READ_FILE`` will make it
+possible to use ``FIONREAD`` and other IOCTL commands.
+
+The following table illustrates how IOCTL attempts for ``FIONREAD`` are
+filtered, depending on how a Landlock ruleset handles and allows the
+``LANDLOCK_ACCESS_FS_IOCTL`` and ``LANDLOCK_ACCESS_FS_READ_FILE`` rights:
+
++-------------------------+--------------+--------------+--------------+
+|                         | ``FS_IOCTL`` | ``FS_IOCTL`` | ``FS_IOCTL`` |
+|                         | not handled  | handled and  | handled and  |
+|                         |              | allowed      | not allowed  |
++-------------------------+--------------+--------------+--------------+
+| ``FS_READ_FILE``        | allow        | allow        | deny         |
+| not handled             |              |              |              |
++-------------------------+              +--------------+--------------+
+| ``FS_READ_FILE``        |              | allow                       |
+| handled and allowed     |              |                             |
++-------------------------+              +-----------------------------+
+| ``FS_READ_FILE``        |              | deny                        |
+| handled and not allowed |              |                             |
++-------------------------+--------------+-----------------------------+
+
+The full list of IOCTL commands and the access rights which affect them is
+documented below.
 
 Compatibility
 =============
@@ -457,6 +517,27 @@ Memory usage
 Kernel memory allocated to create rulesets is accounted and can be restricted
 by the Documentation/admin-guide/cgroup-v1/memory.rst.
 
+IOCTL support
+-------------
+
+The ``LANDLOCK_ACCESS_FS_IOCTL`` right restricts the use of :manpage:`ioctl(2)`,
+but it only applies to newly opened files. This means specifically that
+pre-existing file descriptors like stdin, stdout and stderr are unaffected.
+
+Users should be aware that TTY devices have traditionally permitted the control
+of other processes on the same TTY through the ``TIOCSTI`` and ``TIOCLINUX``
+IOCTL commands. It is therefore recommended to close inherited TTY file
+descriptors, or to reopen them from ``/proc/self/fd/*`` without the
+``LANDLOCK_ACCESS_FS_IOCTL`` right, if possible. The :manpage:`isatty(3)`
+function checks whether a given file descriptor is a TTY.
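+
+For example, a sandboxed process may shed the IOCTL right on an inherited TTY
+by reopening it once the ruleset is enforced. This is an illustrative sketch
+only (error handling omitted, and it assumes the ruleset still permits
+reopening the file for writing):
+
+.. code-block:: c
+
+    /*
+     * Reopen stderr; the new descriptor is opened under the enforced
+     * ruleset and thus lacks the LANDLOCK_ACCESS_FS_IOCTL right.
+     */
+    int fd = open("/proc/self/fd/2", O_WRONLY | O_CLOEXEC);
+
+    if (fd >= 0) {
+        dup2(fd, 2);
+        close(fd);
+    }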
+ +Landlock's IOCTL support is coarse-grained at the moment, but may become more +fine-grained in the future. Until then, users are advised to establish the +guarantees that they need through the file hierarchy, by only allowing the +``LANDLOCK_ACCESS_FS_IOCTL`` right on files where it is really harmless. In +cases where you can control the mounts, the ``nodev`` mount option can help to +rule out that device files can be accessed. + Previous limitations ==================== @@ -494,6 +575,16 @@ bind and connect actions to only a set of allowed ports thanks to the new ``LANDLOCK_ACCESS_NET_BIND_TCP`` and ``LANDLOCK_ACCESS_NET_CONNECT_TCP`` access rights. +IOCTL (ABI < 5) +--------------- + +IOCTL operations could not be denied before the fifth Landlock ABI, so +:manpage:`ioctl(2)` is always allowed when using a kernel that only supports an +earlier ABI. + +Starting with the Landlock ABI version 5, it is possible to restrict the use of +:manpage:`ioctl(2)` using the new ``LANDLOCK_ACCESS_FS_IOCTL`` access right. + .. _kernel_support: Kernel support From 1f784abd2eb152b446a212e0f133433510ece419 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 29 Jan 2024 10:21:58 -0800 Subject: [PATCH 0773/1406] overflow: Adjust check_*_overflow() kern-doc to reflect results The check_*_overflow() helpers will return results with potentially wrapped-around values. These values have always been checked by the selftests, so avoid the confusing language in the kern-doc. The idea of "safe for use" was relative to the expectation of whether or not the caller wants a wrapped value -- the calculation itself will always follow arithmetic wrapping rules. Reviewed-by: Gustavo A. R. Silva Acked-by: Mark Rutland Signed-off-by: Kees Cook --- Cc: Mark Rutland Cc: linux-hardening@vger.kernel.org --- include/linux/overflow.h | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/include/linux/overflow.h b/include/linux/overflow.h index 7b5cf4a5cd1914..ad64d810c8aa81 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -57,11 +57,10 @@ static inline bool __must_check __must_check_overflow(bool overflow) * @b: second addend * @d: pointer to store sum * - * Returns 0 on success. + * Returns true on wrap-around, false otherwise. * - * *@d holds the results of the attempted addition, but is not considered - * "safe for use" on a non-zero return value, which indicates that the - * sum has overflowed or been truncated. + * *@d holds the results of the attempted addition, regardless of whether + * wrap-around occurred. */ #define check_add_overflow(a, b, d) \ __must_check_overflow(__builtin_add_overflow(a, b, d)) @@ -72,11 +71,10 @@ static inline bool __must_check __must_check_overflow(bool overflow) * @b: subtrahend; value to subtract from @a * @d: pointer to store difference * - * Returns 0 on success. + * Returns true on wrap-around, false otherwise. * - * *@d holds the results of the attempted subtraction, but is not considered - * "safe for use" on a non-zero return value, which indicates that the - * difference has underflowed or been truncated. + * *@d holds the results of the attempted subtraction, regardless of whether + * wrap-around occurred. */ #define check_sub_overflow(a, b, d) \ __must_check_overflow(__builtin_sub_overflow(a, b, d)) @@ -87,11 +85,10 @@ static inline bool __must_check __must_check_overflow(bool overflow) * @b: second factor * @d: pointer to store product * - * Returns 0 on success. + * Returns true on wrap-around, false otherwise. 
 *
- * *@d holds the results of the attempted multiplication, but is not
- * considered "safe for use" on a non-zero return value, which indicates
- * that the product has overflowed or been truncated.
+ * *@d holds the results of the attempted multiplication, regardless of whether
+ * wrap-around occurred.
  */
 #define check_mul_overflow(a, b, d)	\
 	__must_check_overflow(__builtin_mul_overflow(a, b, d))

From 952d7c7edc9524a7efef3f2913ab4c5130f21d4d Mon Sep 17 00:00:00 2001
From: Kees Cook
Date: Thu, 18 Jan 2024 16:05:52 -0800
Subject: [PATCH 0774/1406] overflow: Introduce wrapping_add(), wrapping_sub(), and wrapping_mul()

Provide helpers that will perform wrapping addition, subtraction, or
multiplication without tripping the arithmetic wrap-around sanitizers.
The first argument is the type under which the wrap-around should
happen. In other words, these two calls will get very different
results:

	wrapping_mul(int, 50, 50) == 2500
	wrapping_mul(u8,  50, 50) == 196

Add to the selftests to validate behavior and lack of side-effects.

Reviewed-by: Gustavo A. R. Silva
Reviewed-by: Marco Elver
Acked-by: Mark Rutland
Signed-off-by: Kees Cook
---
Cc: Rasmus Villemoes
Cc: Marco Elver
Cc: Eric Biggers
Cc: Mark Rutland
Cc: linux-hardening@vger.kernel.org
---
 include/linux/overflow.h | 48 ++++++++++++++++++++++++++++++++++++++++
 lib/overflow_kunit.c     | 24 ++++++++++++++++----
 2 files changed, 68 insertions(+), 4 deletions(-)

diff --git a/include/linux/overflow.h b/include/linux/overflow.h
index ad64d810c8aa81..d3ff8e2bec2972 100644
--- a/include/linux/overflow.h
+++ b/include/linux/overflow.h
@@ -65,6 +65,22 @@ static inline bool __must_check __must_check_overflow(bool overflow)
 #define check_add_overflow(a, b, d)	\
 	__must_check_overflow(__builtin_add_overflow(a, b, d))
 
+/**
+ * wrapping_add() - Intentionally perform a wrapping addition
+ * @type: type for result of calculation
+ * @a: first addend
+ * @b: second addend
+ *
+ * Return the potentially wrapped-around addition without
+ * tripping any wrap-around sanitizers that may be enabled.
+ */
+#define wrapping_add(type, a, b)			\
+	({						\
+		type __val;				\
+		__builtin_add_overflow(a, b, &__val);	\
+		__val;					\
+	})
+
 /**
  * check_sub_overflow() - Calculate subtraction with overflow checking
  * @a: minuend; value to subtract from
@@ -79,6 +95,22 @@ static inline bool __must_check __must_check_overflow(bool overflow)
 #define check_sub_overflow(a, b, d)	\
 	__must_check_overflow(__builtin_sub_overflow(a, b, d))
 
+/**
+ * wrapping_sub() - Intentionally perform a wrapping subtraction
+ * @type: type for result of calculation
+ * @a: minuend; value to subtract from
+ * @b: subtrahend; value to subtract from @a
+ *
+ * Return the potentially wrapped-around subtraction without
+ * tripping any wrap-around sanitizers that may be enabled.
+ */
+#define wrapping_sub(type, a, b)			\
+	({						\
+		type __val;				\
+		__builtin_sub_overflow(a, b, &__val);	\
+		__val;					\
+	})
+
 /**
  * check_mul_overflow() - Calculate multiplication with overflow checking
  * @a: first factor
@@ -93,6 +125,22 @@ static inline bool __must_check __must_check_overflow(bool overflow)
 #define check_mul_overflow(a, b, d)	\
 	__must_check_overflow(__builtin_mul_overflow(a, b, d))
 
+/**
+ * wrapping_mul() - Intentionally perform a wrapping multiplication
+ * @type: type for result of calculation
+ * @a: first factor
+ * @b: second factor
+ *
+ * Return the potentially wrapped-around multiplication without
+ * tripping any wrap-around sanitizers that may be enabled.
+ */ +#define wrapping_mul(type, a, b) \ + ({ \ + type __val; \ + __builtin_mul_overflow(a, b, &__val); \ + __val; \ + }) + /** * check_shl_overflow() - Calculate a left-shifted value and check overflow * @a: Value to be shifted diff --git a/lib/overflow_kunit.c b/lib/overflow_kunit.c index c527f6b7578946..d3fdb906d3fef7 100644 --- a/lib/overflow_kunit.c +++ b/lib/overflow_kunit.c @@ -258,20 +258,36 @@ DEFINE_TEST_ARRAY(s64) = { \ _of = check_ ## op ## _overflow(a, b, &_r); \ KUNIT_EXPECT_EQ_MSG(test, _of, of, \ - "expected "fmt" "sym" "fmt" to%s overflow (type %s)\n", \ + "expected check "fmt" "sym" "fmt" to%s overflow (type %s)\n", \ a, b, of ? "" : " not", #t); \ KUNIT_EXPECT_EQ_MSG(test, _r, r, \ - "expected "fmt" "sym" "fmt" == "fmt", got "fmt" (type %s)\n", \ + "expected check "fmt" "sym" "fmt" == "fmt", got "fmt" (type %s)\n", \ a, b, r, _r, #t); \ /* Check for internal macro side-effects. */ \ _of = check_ ## op ## _overflow(_a_orig++, _b_orig++, &_r); \ - KUNIT_EXPECT_EQ_MSG(test, _a_orig, _a_bump, "Unexpected " #op " macro side-effect!\n"); \ - KUNIT_EXPECT_EQ_MSG(test, _b_orig, _b_bump, "Unexpected " #op " macro side-effect!\n"); \ + KUNIT_EXPECT_EQ_MSG(test, _a_orig, _a_bump, \ + "Unexpected check " #op " macro side-effect!\n"); \ + KUNIT_EXPECT_EQ_MSG(test, _b_orig, _b_bump, \ + "Unexpected check " #op " macro side-effect!\n"); \ + \ + _r = wrapping_ ## op(t, a, b); \ + KUNIT_EXPECT_TRUE_MSG(test, _r == r, \ + "expected wrap "fmt" "sym" "fmt" == "fmt", got "fmt" (type %s)\n", \ + a, b, r, _r, #t); \ + /* Check for internal macro side-effects. */ \ + _a_orig = a; \ + _b_orig = b; \ + _r = wrapping_ ## op(t, _a_orig++, _b_orig++); \ + KUNIT_EXPECT_EQ_MSG(test, _a_orig, _a_bump, \ + "Unexpected wrap " #op " macro side-effect!\n"); \ + KUNIT_EXPECT_EQ_MSG(test, _b_orig, _b_bump, \ + "Unexpected wrap " #op " macro side-effect!\n"); \ } while (0) #define DEFINE_TEST_FUNC_TYPED(n, t, fmt) \ static void do_test_ ## n(struct kunit *test, const struct test_ ## n *p) \ { \ + /* check_{add,sub,mul}_overflow() and wrapping_{add,sub,mul} */ \ check_one_op(t, fmt, add, "+", p->a, p->b, p->sum, p->s_of); \ check_one_op(t, fmt, add, "+", p->b, p->a, p->sum, p->s_of); \ check_one_op(t, fmt, sub, "-", p->a, p->b, p->diff, p->d_of); \ From 87c2b25bda06e5c79532a4ad56d904be857229a1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 26 Jan 2024 22:09:50 -0800 Subject: [PATCH 0775/1406] overflow: Introduce wrapping_assign_add() and wrapping_assign_sub() This allows replacements of the idioms "var += offset" and "var -= offset" with the wrapping_assign_add() and wrapping_assign_sub() helpers respectively. They will avoid wrap-around sanitizer instrumentation. Add to the selftests to validate behavior and lack of side-effects. Reviewed-by: Marco Elver Acked-by: Mark Rutland Signed-off-by: Kees Cook --- Cc: Rasmus Villemoes Cc: Eric Biggers Cc: Mark Rutland Cc: "Gustavo A. R. 
Silva" Cc: linux-hardening@vger.kernel.org --- include/linux/overflow.h | 32 ++++++++++++++++++++++++++++++ lib/overflow_kunit.c | 43 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/include/linux/overflow.h b/include/linux/overflow.h index d3ff8e2bec2972..dede374832c9b9 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -81,6 +81,22 @@ static inline bool __must_check __must_check_overflow(bool overflow) __val; \ }) +/** + * wrapping_assign_add() - Intentionally perform a wrapping increment assignment + * @var: variable to be incremented + * @offset: amount to add + * + * Increments @var by @offset with wrap-around. Returns the resulting + * value of @var. Will not trip any wrap-around sanitizers. + * + * Returns the new value of @var. + */ +#define wrapping_assign_add(var, offset) \ + ({ \ + typeof(var) *__ptr = &(var); \ + *__ptr = wrapping_add(typeof(var), *__ptr, offset); \ + }) + /** * check_sub_overflow() - Calculate subtraction with overflow checking * @a: minuend; value to subtract from @@ -111,6 +127,22 @@ static inline bool __must_check __must_check_overflow(bool overflow) __val; \ }) +/** + * wrapping_assign_sub() - Intentionally perform a wrapping decrement assign + * @var: variable to be decremented + * @offset: amount to subtract + * + * Decrements @var by @offset with wrap-around. Returns the resulting + * value of @var. Will not trip any wrap-around sanitizers. + * + * Returns the new value of @var. + */ +#define wrapping_assign_sub(var, offset) \ + ({ \ + typeof(var) *__ptr = &(var); \ + *__ptr = wrapping_sub(typeof(var), *__ptr, offset); \ + }) + /** * check_mul_overflow() - Calculate multiplication with overflow checking * @a: first factor diff --git a/lib/overflow_kunit.c b/lib/overflow_kunit.c index d3fdb906d3fef7..65e8a72a83bfaa 100644 --- a/lib/overflow_kunit.c +++ b/lib/overflow_kunit.c @@ -284,6 +284,45 @@ DEFINE_TEST_ARRAY(s64) = { "Unexpected wrap " #op " macro side-effect!\n"); \ } while (0) +static int global_counter; +static void bump_counter(void) +{ + global_counter++; +} + +static int get_index(void) +{ + volatile int index = 0; + bump_counter(); + return index; +} + +#define check_self_op(fmt, op, sym, a, b) do { \ + typeof(a + 0) _a = a; \ + typeof(b + 0) _b = b; \ + typeof(a + 0) _a_sym = a; \ + typeof(a + 0) _a_orig[1] = { a }; \ + typeof(b + 0) _b_orig = b; \ + typeof(b + 0) _b_bump = b + 1; \ + typeof(a + 0) _r; \ + \ + _a_sym sym _b; \ + _r = wrapping_ ## op(_a, _b); \ + KUNIT_EXPECT_TRUE_MSG(test, _r == _a_sym, \ + "expected "fmt" "#op" "fmt" == "fmt", got "fmt"\n", \ + a, b, _a_sym, _r); \ + KUNIT_EXPECT_TRUE_MSG(test, _a == _a_sym, \ + "expected "fmt" "#op" "fmt" == "fmt", got "fmt"\n", \ + a, b, _a_sym, _a); \ + /* Check for internal macro side-effects. 
*/ \ + global_counter = 0; \ + wrapping_ ## op(_a_orig[get_index()], _b_orig++); \ + KUNIT_EXPECT_EQ_MSG(test, global_counter, 1, \ + "Unexpected wrapping_" #op " macro side-effect on arg1!\n"); \ + KUNIT_EXPECT_EQ_MSG(test, _b_orig, _b_bump, \ + "Unexpected wrapping_" #op " macro side-effect on arg2!\n"); \ +} while (0) + #define DEFINE_TEST_FUNC_TYPED(n, t, fmt) \ static void do_test_ ## n(struct kunit *test, const struct test_ ## n *p) \ { \ @@ -293,6 +332,10 @@ static void do_test_ ## n(struct kunit *test, const struct test_ ## n *p) \ check_one_op(t, fmt, sub, "-", p->a, p->b, p->diff, p->d_of); \ check_one_op(t, fmt, mul, "*", p->a, p->b, p->prod, p->p_of); \ check_one_op(t, fmt, mul, "*", p->b, p->a, p->prod, p->p_of); \ + /* wrapping_assign_{add,sub}() */ \ + check_self_op(fmt, assign_add, +=, p->a, p->b); \ + check_self_op(fmt, assign_add, +=, p->b, p->a); \ + check_self_op(fmt, assign_sub, -=, p->a, p->b); \ } \ \ static void n ## _overflow_test(struct kunit *test) { \ From d7501cbe17bff053d87feb8c8793d14890eb905a Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Wed, 14 Feb 2024 17:50:15 +0100 Subject: [PATCH 0776/1406] lib/string_choices: Add str_plural() helper Add str_plural() helper to replace existing open implementations used by many drivers and help improve future user facing messages. Signed-off-by: Michal Wajdeczko Link: https://lore.kernel.org/r/20240214165015.1656-1-michal.wajdeczko@intel.com Signed-off-by: Kees Cook --- Cc: Andy Shevchenko Cc: Jani Nikula --- include/linux/string_choices.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h index 3c1091941eb897..d9ebe20229f810 100644 --- a/include/linux/string_choices.h +++ b/include/linux/string_choices.h @@ -42,4 +42,15 @@ static inline const char *str_yes_no(bool v) return v ? "yes" : "no"; } +/** + * str_plural - Return the simple pluralization based on English counts + * @num: Number used for deciding pluralization + * + * If @num is 1, returns empty string, otherwise returns "s". + */ +static inline const char *str_plural(size_t num) +{ + return num == 1 ? "" : "s"; +} + #endif From bb532a78a33c2963b716b178bb74ff3c05af1543 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 15 Feb 2024 09:58:10 -0800 Subject: [PATCH 0777/1406] coccinelle: Add rules to find str_plural() replacements Add rules for finding places where str_plural() can be used. This currently finds: 54 files changed, 62 insertions(+), 61 deletions(-) Co-developed-by: Michal Wajdeczko Signed-off-by: Michal Wajdeczko Link: https://lore.kernel.org/all/fc1b25a8-6381-47c2-831c-ab6b8201a82b@intel.com/ Signed-off-by: Kees Cook --- Cc: Michal Wajdeczko Cc: Jani Nikula Cc: Andy Shevchenko Cc: Julia Lawall Cc: Nicolas Palix Cc: cocci@inria.fr --- MAINTAINERS | 1 + scripts/coccinelle/api/string_choices.cocci | 41 +++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 scripts/coccinelle/api/string_choices.cocci diff --git a/MAINTAINERS b/MAINTAINERS index d0df728734c190..216d02a3fed5b0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8979,6 +8979,7 @@ F: lib/string.c F: lib/string_helpers.c F: lib/test-string_helpers.c F: lib/test_string.c +F: scripts/coccinelle/api/string_choices.cocci GENERIC UIO DRIVER FOR PCI DEVICES M: "Michael S. 
Tsirkin" diff --git a/scripts/coccinelle/api/string_choices.cocci b/scripts/coccinelle/api/string_choices.cocci new file mode 100644 index 00000000000000..a71966c0494ef6 --- /dev/null +++ b/scripts/coccinelle/api/string_choices.cocci @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0-only +/// Find places to use string_choices.h's various helpers. +// +// Confidence: Medium +// Options: --no-includes --include-headers +virtual patch +virtual context +virtual report + +@str_plural depends on patch@ +expression E; +@@ +( +- ((E == 1) ? "" : "s") ++ str_plural(E) +| +- ((E != 1) ? "s" : "") ++ str_plural(E) +| +- ((E > 1) ? "s" : "") ++ str_plural(E) +) + +@str_plural_r depends on !patch exists@ +expression E; +position P; +@@ +( +* ((E@P == 1) ? "" : "s") +| +* ((E@P != 1) ? "s" : "") +| +* ((E@P > 1) ? "s" : "") +) + +@script:python depends on report@ +p << str_plural_r.P; +e << str_plural_r.E; +@@ + +coccilib.report.print_report(p[0], "opportunity for str_plural(%s)" % e) From 7f49a68b4f997cd81b8e7d18713aa9031b733f08 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 27 Feb 2023 12:24:28 -0800 Subject: [PATCH 0778/1406] coccinelle: semantic patch to check for potential struct_size calls include/linux/overflow.h includes helper macros intended for calculating sizes of allocations. These macros prevent accidental overflow by saturating at SIZE_MAX. In general when calculating such sizes use of the macros is preferred. Add a semantic patch which can detect code patterns which can be replaced by struct_size. Note that I set the confidence to medium because this patch doesn't make an attempt to ensure that the relevant array is actually a flexible array. The struct_size macro does specifically require a flexible array. In many cases the detected code could be refactored to a flexible array, but this is not always possible (such as if there are multiple over-allocations). Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20230227202428.3657443-1-jacob.e.keller@intel.com Signed-off-by: Kees Cook --- Cc: Julia Lawall Cc: Kees Cook Cc: Gustavo A. R. Silva Cc: cocci@systeme.lip6.fr Cc: linux-kernel@vger.kernel.org --- scripts/coccinelle/misc/struct_size.cocci | 74 +++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 scripts/coccinelle/misc/struct_size.cocci diff --git a/scripts/coccinelle/misc/struct_size.cocci b/scripts/coccinelle/misc/struct_size.cocci new file mode 100644 index 00000000000000..9b02c37438e407 --- /dev/null +++ b/scripts/coccinelle/misc/struct_size.cocci @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-only +/// +/// Check for code that could use struct_size(). +/// +// Confidence: Medium +// Author: Jacob Keller +// Copyright: (C) 2023 Intel Corporation +// Options: --no-includes --include-headers + +virtual patch +virtual context +virtual org +virtual report + +// the overflow Kunit tests have some code which intentionally does not use +// the macros, so we want to ignore this code when reporting potential +// issues. 
+@overflow_tests@ +identifier f = overflow_size_helpers_test; +@@ + +f + +//---------------------------------------------------------- +// For context mode +//---------------------------------------------------------- + +@depends on !overflow_tests && context@ +expression E1, E2; +identifier m; +@@ +( +* (sizeof(*E1) + (E2 * sizeof(*E1->m))) +) + +//---------------------------------------------------------- +// For patch mode +//---------------------------------------------------------- + +@depends on !overflow_tests && patch@ +expression E1, E2; +identifier m; +@@ +( +- (sizeof(*E1) + (E2 * sizeof(*E1->m))) ++ struct_size(E1, m, E2) +) + +//---------------------------------------------------------- +// For org and report mode +//---------------------------------------------------------- + +@r depends on !overflow_tests && (org || report)@ +expression E1, E2; +identifier m; +position p; +@@ +( + (sizeof(*E1)@p + (E2 * sizeof(*E1->m))) +) + +@script:python depends on org@ +p << r.p; +@@ + +coccilib.org.print_todo(p[0], "WARNING should use struct_size") + +@script:python depends on report@ +p << r.p; +@@ + +msg="WARNING: Use struct_size" +coccilib.report.print_report(p[0], msg) + From f0f4273404295c368efd3054e58dd0ec681ca175 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 18 Feb 2024 09:31:48 -0800 Subject: [PATCH 0779/1406] leaking_addresses: Provide mechanism to scan binary files Introduce --kallsyms argument for scanning binary files for known symbol addresses. This would have found the exposure in /sys/kernel/notes: $ scripts/leaking_addresses.pl --kallsyms=<(sudo cat /proc/kallsyms) /sys/kernel/notes: hypercall_page @ 156 /sys/kernel/notes: xen_hypercall_set_trap_table @ 156 /sys/kernel/notes: startup_xen @ 132 Acked-by: Greg Kroah-Hartman Signed-off-by: Kees Cook --- Cc: "Tobin C. Harding" Cc: Tycho Andersen Cc: Guixiong Wei Cc: linux-hardening@vger.kernel.org --- scripts/leaking_addresses.pl | 53 ++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/scripts/leaking_addresses.pl b/scripts/leaking_addresses.pl index e695634d153d63..cbaa17c244cc68 100755 --- a/scripts/leaking_addresses.pl +++ b/scripts/leaking_addresses.pl @@ -51,10 +51,13 @@ my $suppress_dmesg = 0; # Don't show dmesg in output. my $squash_by_path = 0; # Summary report grouped by absolute path. my $squash_by_filename = 0; # Summary report grouped by filename. +my $kallsyms_file = ""; # Kernel symbols file. my $kernel_config_file = ""; # Kernel configuration file. my $opt_32bit = 0; # Scan 32-bit kernel. my $page_offset_32bit = 0; # Page offset for 32-bit kernel. +my @kallsyms = (); + # Skip these absolute paths. my @skip_abs = ( '/proc/kmsg', @@ -95,6 +98,8 @@ sub help --squash-by-path Show one result per unique path. --squash-by-filename Show one result per unique filename. --kernel-config-file= Kernel configuration file (e.g /boot/config) + --kallsyms= Read kernel symbol addresses from file (for + scanning binary files). --32-bit Scan 32-bit kernel. --page-offset-32-bit=o Page offset (for 32-bit kernel 0xABCD1234). -d, --debug Display debugging output. 
@@ -115,6 +120,7 @@ sub help 'squash-by-path' => \$squash_by_path, 'squash-by-filename' => \$squash_by_filename, 'raw' => \$raw, + 'kallsyms=s' => \$kallsyms_file, 'kernel-config-file=s' => \$kernel_config_file, '32-bit' => \$opt_32bit, 'page-offset-32-bit=o' => \$page_offset_32bit, @@ -155,6 +161,25 @@ sub help select $fh; } +if ($kallsyms_file) { + open my $fh, '<', $kallsyms_file or die "$0: $kallsyms_file: $!\n"; + while (<$fh>) { + chomp; + my @entry = split / /, $_; + my $addr_text = $entry[0]; + # TODO: Why is hex() so impossibly slow? + my $addr = hex($addr_text); + my $symbol = $entry[2]; + # Only keep kernel text addresses. + if ($addr_text !~ /^0/) { + my $long = pack("J", $addr); + my $entry = [$long, $symbol]; + push @kallsyms, $entry; + } + } + close $fh; +} + parse_dmesg(); walk(@DIRS); @@ -442,6 +467,25 @@ sub timed_parse_file } } +sub parse_binary +{ + my ($file) = @_; + + open my $fh, "<:raw", $file or return; + local $/ = undef; + my $bytes = <$fh>; + close $fh; + + foreach my $entry (@kallsyms) { + my $addr = $entry->[0]; + my $symbol = $entry->[1]; + my $offset = index($bytes, $addr); + if ($offset != -1) { + printf("$file: $symbol @ $offset\n"); + } + } +} + sub parse_file { my ($file) = @_; @@ -451,6 +495,15 @@ sub parse_file } if (! -T $file) { + if ($file =~ m|^/sys/kernel/btf/| or + $file =~ m|^/sys/devices/pci| or + $file =~ m|^/sys/firmware/efi/efivars/| or + $file =~ m|^/proc/bus/pci/|) { + return; + } + if (scalar @kallsyms > 0) { + parse_binary($file); + } return; } From 3e9876812f3a747622edf0a734a8419d441067d1 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 15 Feb 2024 15:28:56 +0000 Subject: [PATCH 0780/1406] KVM: pfncache: Add a map helper function There is a pfncache unmap helper but mapping is open-coded. Arguably this is fine because mapping is done in only one place, hva_to_pfn_retry(), but adding the helper does make that function more readable. No functional change intended. Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240215152916.1158-2-paul@xen.org Signed-off-by: Sean Christopherson --- virt/kvm/pfncache.c | 47 ++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 2d6aba67783078..10842f1eeeae9b 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -96,17 +96,32 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len) } EXPORT_SYMBOL_GPL(kvm_gpc_check); -static void gpc_unmap_khva(kvm_pfn_t pfn, void *khva) +static void *gpc_map(kvm_pfn_t pfn) { - /* Unmap the old pfn/page if it was mapped before. */ - if (!is_error_noslot_pfn(pfn) && khva) { - if (pfn_valid(pfn)) - kunmap(pfn_to_page(pfn)); + if (pfn_valid(pfn)) + return kmap(pfn_to_page(pfn)); + #ifdef CONFIG_HAS_IOMEM - else - memunmap(khva); + return memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB); +#else + return NULL; #endif +} + +static void gpc_unmap(kvm_pfn_t pfn, void *khva) +{ + /* Unmap the old pfn/page if it was mapped before. */ + if (is_error_noslot_pfn(pfn) || !khva) + return; + + if (pfn_valid(pfn)) { + kunmap(pfn_to_page(pfn)); + return; } + +#ifdef CONFIG_HAS_IOMEM + memunmap(khva); +#endif } static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq) @@ -175,7 +190,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) * the existing mapping and didn't create a new one. 
*/ if (new_khva != old_khva) - gpc_unmap_khva(new_pfn, new_khva); + gpc_unmap(new_pfn, new_khva); kvm_release_pfn_clean(new_pfn); @@ -193,15 +208,11 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) * too must be done outside of gpc->lock! */ if (gpc->usage & KVM_HOST_USES_PFN) { - if (new_pfn == gpc->pfn) { + if (new_pfn == gpc->pfn) new_khva = old_khva; - } else if (pfn_valid(new_pfn)) { - new_khva = kmap(pfn_to_page(new_pfn)); -#ifdef CONFIG_HAS_IOMEM - } else { - new_khva = memremap(pfn_to_hpa(new_pfn), PAGE_SIZE, MEMREMAP_WB); -#endif - } + else + new_khva = gpc_map(new_pfn); + if (!new_khva) { kvm_release_pfn_clean(new_pfn); goto out_error; @@ -326,7 +337,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, mutex_unlock(&gpc->refresh_lock); if (unmap_old) - gpc_unmap_khva(old_pfn, old_khva); + gpc_unmap(old_pfn, old_khva); return ret; } @@ -412,7 +423,7 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc) list_del(&gpc->list); spin_unlock(&kvm->gpc_lock); - gpc_unmap_khva(old_pfn, old_khva); + gpc_unmap(old_pfn, old_khva); } } EXPORT_SYMBOL_GPL(kvm_gpc_deactivate); From cc823798c9eb9e8acd03efeabc9b3681a5bb05c3 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 15 Feb 2024 15:28:57 +0000 Subject: [PATCH 0781/1406] KVM: pfncache: remove unnecessary exports There is no need for the existing kvm_gpc_XXX() functions to be exported. Clean up now before additional functions are added in subsequent patches. Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240215152916.1158-3-paul@xen.org Signed-off-by: Sean Christopherson --- virt/kvm/pfncache.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 10842f1eeeae9b..f3571f44d9af14 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -94,7 +94,6 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len) return true; } -EXPORT_SYMBOL_GPL(kvm_gpc_check); static void *gpc_map(kvm_pfn_t pfn) { @@ -346,7 +345,6 @@ int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len) { return __kvm_gpc_refresh(gpc, gpc->gpa, len); } -EXPORT_SYMBOL_GPL(kvm_gpc_refresh); void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm, struct kvm_vcpu *vcpu, enum pfn_cache_usage usage) @@ -363,7 +361,6 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm, gpc->pfn = KVM_PFN_ERR_FAULT; gpc->uhva = KVM_HVA_ERR_BAD; } -EXPORT_SYMBOL_GPL(kvm_gpc_init); int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) { @@ -388,7 +385,6 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) } return __kvm_gpc_refresh(gpc, gpa, len); } -EXPORT_SYMBOL_GPL(kvm_gpc_activate); void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc) { @@ -426,4 +422,3 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc) gpc_unmap(old_pfn, old_khva); } } -EXPORT_SYMBOL_GPL(kvm_gpc_deactivate); From 0248d6bba2979dff766961ae158b08697c874e59 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 15 Feb 2024 15:28:58 +0000 Subject: [PATCH 0782/1406] KVM: x86/xen: mark guest pages dirty with the pfncache lock held Sampling gpa and memslot from an unlocked pfncache may yield inconsistent values so, since there is no problem with calling mark_page_dirty_in_slot() with the pfncache lock held, relocate the calls in kvm_xen_update_runstate_guest() and kvm_xen_inject_pending_events() accordingly. 
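In simplified form, the change moves the dirty marking inside the
locked region (a sketch only, not the literal diff):

	read_lock_irqsave(&gpc->lock, flags);
	/* ... write guest data through gpc->khva ... */
	mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
	read_unlock_irqrestore(&gpc->lock, flags);

rather than sampling gpc->memslot and gpc->gpa after the lock has been
dropped.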
Signed-off-by: Paul Durrant
Reviewed-by: David Woodhouse
Link: https://lore.kernel.org/r/20240215152916.1158-4-paul@xen.org
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/xen.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 4b4e738c6f1b79..f3327508ae415c 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -452,14 +452,13 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
 		smp_wmb();
 	}
 
-	if (user_len2)
+	if (user_len2) {
+		mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT);
 		read_unlock(&gpc2->lock);
-
-	read_unlock_irqrestore(&gpc1->lock, flags);
+	}
 
 	mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT);
-	if (user_len2)
-		mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT);
+	read_unlock_irqrestore(&gpc1->lock, flags);
 }
 
 void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
@@ -565,13 +564,13 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
 			     : "0" (evtchn_pending_sel32));
 		WRITE_ONCE(vi->evtchn_upcall_pending, 1);
 	}
+
+	mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
 	read_unlock_irqrestore(&gpc->lock, flags);
 
 	/* For the per-vCPU lapic vector, deliver it as MSI. */
 	if (v->arch.xen.upcall_vector)
 		kvm_xen_inject_vcpu_vector(v);
-
-	mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
 }
 
 int __kvm_xen_has_interrupt(struct kvm_vcpu *v)

From a4c3cb8eff589a61a8d59e6b37eac74e6c76c777 Mon Sep 17 00:00:00 2001
From: Paul Durrant
Date: Thu, 15 Feb 2024 15:29:00 +0000
Subject: [PATCH 0783/1406] KVM: pfncache: add a mark-dirty helper

At the moment pages are marked dirty by open-coded calls to
mark_page_dirty_in_slot(), directly dereferencing the gpa and memslot
from the cache. After a subsequent patch these may not always be set,
so add a helper now so that the caller will be protected from the need
to know about this detail.
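With the helper in place, callers can mark the backing page dirty
without touching the cache internals; a typical (sketched) usage is:

	read_lock_irqsave(&gpc->lock, flags);
	/* ... update guest-visible data through gpc->khva ... */
	kvm_gpc_mark_dirty_in_slot(gpc);
	read_unlock_irqrestore(&gpc->lock, flags);

The lockdep assertion in the helper documents that gpc->lock must be
held, matching the locking change made in the previous patch.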
Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240215152916.1158-5-paul@xen.org [sean: decrease indentation, use gpa_to_gfn()] Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- arch/x86/kvm/xen.c | 6 +++--- include/linux/kvm_host.h | 10 ++++++++++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bf10a9073a0928..f0f37c769a3afa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3160,7 +3160,7 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, guest_hv_clock->version = ++vcpu->hv_clock.version; - mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); + kvm_gpc_mark_dirty_in_slot(gpc); read_unlock_irqrestore(&gpc->lock, flags); trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index f3327508ae415c..2d001a9c63787e 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -453,11 +453,11 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) } if (user_len2) { - mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT); + kvm_gpc_mark_dirty_in_slot(gpc2); read_unlock(&gpc2->lock); } - mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT); + kvm_gpc_mark_dirty_in_slot(gpc1); read_unlock_irqrestore(&gpc1->lock, flags); } @@ -565,7 +565,7 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v) WRITE_ONCE(vi->evtchn_upcall_pending, 1); } - mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); + kvm_gpc_mark_dirty_in_slot(gpc); read_unlock_irqrestore(&gpc->lock, flags); /* For the per-vCPU lapic vector, deliver it as MSI. */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7e7fd25b09b3eb..604ae285d9a993 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1795,6 +1795,16 @@ static inline bool kvm_is_error_gpa(struct kvm *kvm, gpa_t gpa) return kvm_is_error_hva(hva); } +static inline void kvm_gpc_mark_dirty_in_slot(struct gfn_to_pfn_cache *gpc) +{ + lockdep_assert_held(&gpc->lock); + + if (!gpc->memslot) + return; + + mark_page_dirty_in_slot(gpc->kvm, gpc->memslot, gpa_to_gfn(gpc->gpa)); +} + enum kvm_stat_kind { KVM_STAT_VM, KVM_STAT_VCPU, From d555fa26ce03a411a59f0f3ee16a5328ed983b4a Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 15 Feb 2024 15:29:00 +0000 Subject: [PATCH 0784/1406] KVM: pfncache: remove KVM_GUEST_USES_PFN usage As noted in [1] the KVM_GUEST_USES_PFN usage flag is never set by any callers of kvm_gpc_init(), and for good reason: the implementation is incomplete/broken. And it's not clear that there will ever be a user of KVM_GUEST_USES_PFN, as coordinating vCPUs with mmu_notifier events is non-trivial. Remove KVM_GUEST_USES_PFN and all related code, e.g. dropping KVM_GUEST_USES_PFN also makes the 'vcpu' argument redundant, to avoid having to reason about broken code as __kvm_gpc_refresh() evolves. Moreover, all existing callers specify KVM_HOST_USES_PFN so the usage check in hva_to_pfn_retry() and hence the 'usage' argument to kvm_gpc_init() are also redundant. 
[1] https://lore.kernel.org/all/ZQiR8IpqOZrOpzHC@google.com Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240215152916.1158-6-paul@xen.org [sean: explicitly call out that guest usage is incomplete] Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- arch/x86/kvm/xen.c | 14 ++++----- include/linux/kvm_host.h | 11 +------ include/linux/kvm_types.h | 8 ----- virt/kvm/pfncache.c | 61 ++++++--------------------------------- 5 files changed, 16 insertions(+), 80 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f0f37c769a3afa..415723a28dcec7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12056,7 +12056,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; - kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN); + kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm); if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 2d001a9c63787e..e904642254677c 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -2108,14 +2108,10 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); - kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm, NULL, - KVM_HOST_USES_PFN); - kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm, NULL, - KVM_HOST_USES_PFN); - kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm, NULL, - KVM_HOST_USES_PFN); - kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm, NULL, - KVM_HOST_USES_PFN); + kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm); + kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm); + kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm); + kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm); } void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) @@ -2158,7 +2154,7 @@ void kvm_xen_init_vm(struct kvm *kvm) { mutex_init(&kvm->arch.xen.xen_lock); idr_init(&kvm->arch.xen.evtchn_ports); - kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN); + kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm); } void kvm_xen_destroy_vm(struct kvm *kvm) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 604ae285d9a993..3e1c04608c67d4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1319,21 +1319,12 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); * * @gpc: struct gfn_to_pfn_cache object. * @kvm: pointer to kvm instance. - * @vcpu: vCPU to be used for marking pages dirty and to be woken on - * invalidation. - * @usage: indicates if the resulting host physical PFN is used while - * the @vcpu is IN_GUEST_MODE (in which case invalidation of - * the cache from MMU notifiers---but not for KVM memslot - * changes!---will also force @vcpu to exit the guest and - * refresh the cache); and/or if the PFN used directly - * by KVM (and thus needs a kernel virtual mapping). * * This sets up a gfn_to_pfn_cache by initializing locks and assigning the * immutable attributes. Note, the cache must be zero-allocated (or zeroed by * the caller before init). 
*/ -void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm, - struct kvm_vcpu *vcpu, enum pfn_cache_usage usage); +void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm); /** * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 9d1f7835d8c139..d93f6522b2c34c 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -49,12 +49,6 @@ typedef u64 hfn_t; typedef hfn_t kvm_pfn_t; -enum pfn_cache_usage { - KVM_GUEST_USES_PFN = BIT(0), - KVM_HOST_USES_PFN = BIT(1), - KVM_GUEST_AND_HOST_USE_PFN = KVM_GUEST_USES_PFN | KVM_HOST_USES_PFN, -}; - struct gfn_to_hva_cache { u64 generation; gpa_t gpa; @@ -69,13 +63,11 @@ struct gfn_to_pfn_cache { unsigned long uhva; struct kvm_memory_slot *memslot; struct kvm *kvm; - struct kvm_vcpu *vcpu; struct list_head list; rwlock_t lock; struct mutex refresh_lock; void *khva; kvm_pfn_t pfn; - enum pfn_cache_usage usage; bool active; bool valid; }; diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index f3571f44d9af14..6f4b537eb25b1d 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -25,9 +25,7 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, unsigned long end, bool may_block) { - DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); struct gfn_to_pfn_cache *gpc; - bool evict_vcpus = false; spin_lock(&kvm->gpc_lock); list_for_each_entry(gpc, &kvm->gpc_list, list) { @@ -37,43 +35,10 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) && gpc->uhva >= start && gpc->uhva < end) { gpc->valid = false; - - /* - * If a guest vCPU could be using the physical address, - * it needs to be forced out of guest mode. - */ - if (gpc->usage & KVM_GUEST_USES_PFN) { - if (!evict_vcpus) { - evict_vcpus = true; - bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS); - } - __set_bit(gpc->vcpu->vcpu_idx, vcpu_bitmap); - } } write_unlock_irq(&gpc->lock); } spin_unlock(&kvm->gpc_lock); - - if (evict_vcpus) { - /* - * KVM needs to ensure the vCPU is fully out of guest context - * before allowing the invalidation to continue. - */ - unsigned int req = KVM_REQ_OUTSIDE_GUEST_MODE; - bool called; - - /* - * If the OOM reaper is active, then all vCPUs should have - * been stopped already, so perform the request without - * KVM_REQUEST_WAIT and be sad if any needed to be IPI'd. - */ - if (!may_block) - req &= ~KVM_REQUEST_WAIT; - - called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap); - - WARN_ON_ONCE(called && !may_block); - } } bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len) @@ -206,16 +171,14 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) * pfn. Note, kmap() and memremap() can both sleep, so this * too must be done outside of gpc->lock! 
*/ - if (gpc->usage & KVM_HOST_USES_PFN) { - if (new_pfn == gpc->pfn) - new_khva = old_khva; - else - new_khva = gpc_map(new_pfn); - - if (!new_khva) { - kvm_release_pfn_clean(new_pfn); - goto out_error; - } + if (new_pfn == gpc->pfn) + new_khva = old_khva; + else + new_khva = gpc_map(new_pfn); + + if (!new_khva) { + kvm_release_pfn_clean(new_pfn); + goto out_error; } write_lock_irq(&gpc->lock); @@ -346,18 +309,12 @@ int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len) return __kvm_gpc_refresh(gpc, gpc->gpa, len); } -void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm, - struct kvm_vcpu *vcpu, enum pfn_cache_usage usage) +void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm) { - WARN_ON_ONCE(!usage || (usage & KVM_GUEST_AND_HOST_USE_PFN) != usage); - WARN_ON_ONCE((usage & KVM_GUEST_USES_PFN) && !vcpu); - rwlock_init(&gpc->lock); mutex_init(&gpc->refresh_lock); gpc->kvm = kvm; - gpc->vcpu = vcpu; - gpc->usage = usage; gpc->pfn = KVM_PFN_ERR_FAULT; gpc->uhva = KVM_HVA_ERR_BAD; } From 5dec17213be3448588d10ceb27466e48145f6fdf Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 15 Feb 2024 15:29:01 +0000 Subject: [PATCH 0785/1406] KVM: pfncache: stop open-coding offset_in_page() Some code in pfncache uses offset_in_page() but in other places it is open- coded. Use offset_in_page() consistently everywhere. Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240215152916.1158-7-paul@xen.org Signed-off-by: Sean Christopherson --- virt/kvm/pfncache.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 6f4b537eb25b1d..0eeb034d06749c 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -48,7 +48,7 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len) if (!gpc->active) return false; - if ((gpc->gpa & ~PAGE_MASK) + len > PAGE_SIZE) + if (offset_in_page(gpc->gpa) + len > PAGE_SIZE) return false; if (gpc->generation != slots->generation || kvm_is_error_hva(gpc->uhva)) @@ -192,7 +192,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) gpc->valid = true; gpc->pfn = new_pfn; - gpc->khva = new_khva + (gpc->gpa & ~PAGE_MASK); + gpc->khva = new_khva + offset_in_page(gpc->gpa); /* * Put the reference to the _new_ pfn. The pfn is now tracked by the @@ -213,7 +213,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) { struct kvm_memslots *slots = kvm_memslots(gpc->kvm); - unsigned long page_offset = gpa & ~PAGE_MASK; + unsigned long page_offset = offset_in_page(gpa); bool unmap_old = false; unsigned long old_uhva; kvm_pfn_t old_pfn; From 328aa4cee95f0f820e964d590a9855ffd93ea9ab Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 15 Feb 2024 15:29:02 +0000 Subject: [PATCH 0786/1406] KVM: pfncache: include page offset in uhva and use it consistently Currently the pfncache page offset is sometimes determined using the gpa and sometimes the khva, whilst the uhva is always page-aligned. After a subsequent patch is applied the gpa will not always be valid so adjust the code to include the page offset in the uhva and use it consistently as the source of truth. Also, where a page-aligned address is required, use PAGE_ALIGN_DOWN() for clarity. No functional change intended. 
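As an aside (illustration only, not part of this patch): the invariant relied on here is that any address decomposes into a page-aligned base plus an in-page offset, so the offset can safely travel inside the uhva itself. A minimal userspace model, assuming 4KiB pages and stand-in versions of the kernel macros:

    #include <assert.h>

    #define PAGE_SIZE           4096UL
    #define PAGE_MASK           (~(PAGE_SIZE - 1))
    #define offset_in_page(a)   ((unsigned long)(a) & (PAGE_SIZE - 1))
    #define PAGE_ALIGN_DOWN(a)  ((unsigned long)(a) & PAGE_MASK)

    int main(void)
    {
            unsigned long uhva = 0x7f0012345abcUL;  /* arbitrary example */

            /* base + offset always reconstructs the original address */
            assert(uhva == PAGE_ALIGN_DOWN(uhva) + offset_in_page(uhva));
            /* an access fits in one page iff offset + len <= PAGE_SIZE */
            assert(offset_in_page(uhva) + 100 <= PAGE_SIZE);
            return 0;
    }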
Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240215152916.1158-8-paul@xen.org Signed-off-by: Sean Christopherson --- virt/kvm/pfncache.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 0eeb034d06749c..97eec8ee34493a 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -48,10 +48,10 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len) if (!gpc->active) return false; - if (offset_in_page(gpc->gpa) + len > PAGE_SIZE) + if (gpc->generation != slots->generation || kvm_is_error_hva(gpc->uhva)) return false; - if (gpc->generation != slots->generation || kvm_is_error_hva(gpc->uhva)) + if (offset_in_page(gpc->uhva) + len > PAGE_SIZE) return false; if (!gpc->valid) @@ -119,7 +119,7 @@ static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_s static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) { /* Note, the new page offset may be different than the old! */ - void *old_khva = gpc->khva - offset_in_page(gpc->khva); + void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva); kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT; void *new_khva = NULL; unsigned long mmu_seq; @@ -192,7 +192,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) gpc->valid = true; gpc->pfn = new_pfn; - gpc->khva = new_khva + offset_in_page(gpc->gpa); + gpc->khva = new_khva + offset_in_page(gpc->uhva); /* * Put the reference to the _new_ pfn. The pfn is now tracked by the @@ -217,6 +217,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, bool unmap_old = false; unsigned long old_uhva; kvm_pfn_t old_pfn; + bool hva_change = false; void *old_khva; int ret; @@ -242,10 +243,10 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, } old_pfn = gpc->pfn; - old_khva = gpc->khva - offset_in_page(gpc->khva); - old_uhva = gpc->uhva; + old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva); + old_uhva = PAGE_ALIGN_DOWN(gpc->uhva); - /* If the userspace HVA is invalid, refresh that first */ + /* Refresh the userspace HVA if necessary */ if (gpc->gpa != gpa || gpc->generation != slots->generation || kvm_is_error_hva(gpc->uhva)) { gfn_t gfn = gpa_to_gfn(gpa); @@ -259,13 +260,25 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, ret = -EFAULT; goto out; } + + /* + * Even if the GPA and/or the memslot generation changed, the + * HVA may still be the same. + */ + if (gpc->uhva != old_uhva) + hva_change = true; + } else { + gpc->uhva = old_uhva; } + /* Note: the offset must be correct before calling hva_to_pfn_retry() */ + gpc->uhva += page_offset; + /* * If the userspace HVA changed or the PFN was already invalid, * drop the lock and do the HVA to PFN lookup again. */ - if (!gpc->valid || old_uhva != gpc->uhva) { + if (!gpc->valid || hva_change) { ret = hva_to_pfn_retry(gpc); } else { /* From 1491322ad1cbfc6292ce75a071aa34a96341f912 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 15 Feb 2024 15:29:03 +0000 Subject: [PATCH 0787/1406] KVM: s390: Refactor kvm_is_error_gpa() into kvm_is_gpa_in_memslot() Rename kvm_is_error_gpa() to kvm_is_gpa_in_memslot() and invert the polarity accordingly in order to (a) free up kvm_is_error_gpa() to match with kvm_is_error_{hva,page}(), and (b) to make it more obvious that the helper is doing a memslot lookup, i.e. not simply checking for INVALID_GPA. No functional change intended. 
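Purely for illustration (this snippet is not itself part of the patch), the polarity inversion reads naturally at a call site such as the s390 prefix check in the hunks below:

    /* before: a double negation hides behind the "error" helper */
    if (kvm_is_error_gpa(vcpu->kvm, address))
            return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);

    /* after: the helper names the actual question being asked */
    if (!kvm_is_gpa_in_memslot(vcpu->kvm, address))
            return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);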
Link: https://lore.kernel.org/r/20240215152916.1158-9-paul@xen.org Signed-off-by: Sean Christopherson --- arch/s390/kvm/diag.c | 2 +- arch/s390/kvm/gaccess.c | 14 +++++++------- arch/s390/kvm/kvm-s390.c | 4 ++-- arch/s390/kvm/priv.c | 4 ++-- arch/s390/kvm/sigp.c | 2 +- include/linux/kvm_host.h | 4 ++-- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 3c65b8258ae67a..2a32438e09ceba 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -102,7 +102,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu) parm.token_addr & 7 || parm.zarch != 0x8000000000000000ULL) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - if (kvm_is_error_gpa(vcpu->kvm, parm.token_addr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, parm.token_addr)) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); vcpu->arch.pfault_token = parm.token_addr; diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 5bfcc50c1a6828..415c99649e43eb 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -664,7 +664,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, case ASCE_TYPE_REGION1: { union region1_table_entry rfte; - if (kvm_is_error_gpa(vcpu->kvm, ptr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr)) return PGM_ADDRESSING; if (deref_table(vcpu->kvm, ptr, &rfte.val)) return -EFAULT; @@ -682,7 +682,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, case ASCE_TYPE_REGION2: { union region2_table_entry rste; - if (kvm_is_error_gpa(vcpu->kvm, ptr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr)) return PGM_ADDRESSING; if (deref_table(vcpu->kvm, ptr, &rste.val)) return -EFAULT; @@ -700,7 +700,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, case ASCE_TYPE_REGION3: { union region3_table_entry rtte; - if (kvm_is_error_gpa(vcpu->kvm, ptr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr)) return PGM_ADDRESSING; if (deref_table(vcpu->kvm, ptr, &rtte.val)) return -EFAULT; @@ -728,7 +728,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, case ASCE_TYPE_SEGMENT: { union segment_table_entry ste; - if (kvm_is_error_gpa(vcpu->kvm, ptr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr)) return PGM_ADDRESSING; if (deref_table(vcpu->kvm, ptr, &ste.val)) return -EFAULT; @@ -748,7 +748,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, ptr = ste.fc0.pto * (PAGE_SIZE / 2) + vaddr.px * 8; } } - if (kvm_is_error_gpa(vcpu->kvm, ptr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr)) return PGM_ADDRESSING; if (deref_table(vcpu->kvm, ptr, &pte.val)) return -EFAULT; @@ -770,7 +770,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, *prot = PROT_TYPE_IEP; return PGM_PROTECTION; } - if (kvm_is_error_gpa(vcpu->kvm, raddr.addr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, raddr.addr)) return PGM_ADDRESSING; *gpa = raddr.addr; return 0; @@ -957,7 +957,7 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, return rc; } else { gpa = kvm_s390_real_to_abs(vcpu, ga); - if (kvm_is_error_gpa(vcpu->kvm, gpa)) { + if (!kvm_is_gpa_in_memslot(vcpu->kvm, gpa)) { rc = PGM_ADDRESSING; prot = PROT_NONE; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ea63ac76988914..3e5a1d7aa81ac2 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2878,7 +2878,7 @@ static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct 
kvm_s390_mem_op *mop) srcu_idx = srcu_read_lock(&kvm->srcu); - if (kvm_is_error_gpa(kvm, mop->gaddr)) { + if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) { r = PGM_ADDRESSING; goto out_unlock; } @@ -2940,7 +2940,7 @@ static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *m srcu_idx = srcu_read_lock(&kvm->srcu); - if (kvm_is_error_gpa(kvm, mop->gaddr)) { + if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) { r = PGM_ADDRESSING; goto out_unlock; } diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index f875a404a0a025..1be19cc9d73c19 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -149,7 +149,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu) * first page, since address is 8k aligned and memory pieces are always * at least 1MB aligned and have at least a size of 1MB. */ - if (kvm_is_error_gpa(vcpu->kvm, address)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, address)) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); kvm_s390_set_prefix(vcpu, address); @@ -464,7 +464,7 @@ static int handle_test_block(struct kvm_vcpu *vcpu) return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); addr = kvm_s390_real_to_abs(vcpu, addr); - if (kvm_is_error_gpa(vcpu->kvm, addr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, addr)) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); /* * We don't expect errors on modern systems, and do not care diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index d9696b5300647c..55c34cb354281e 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -172,7 +172,7 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu, * first page, since address is 8k aligned and memory pieces are always * at least 1MB aligned and have at least a size of 1MB. */ - if (kvm_is_error_gpa(vcpu->kvm, irq.u.prefix.address)) { + if (!kvm_is_gpa_in_memslot(vcpu->kvm, irq.u.prefix.address)) { *reg &= 0xffffffff00000000UL; *reg |= SIGP_STATUS_INVALID_PARAMETER; return SIGP_CC_STATUS_STORED; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3e1c04608c67d4..81a9d1cf91a260 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1779,11 +1779,11 @@ static inline hpa_t pfn_to_hpa(kvm_pfn_t pfn) return (hpa_t)pfn << PAGE_SHIFT; } -static inline bool kvm_is_error_gpa(struct kvm *kvm, gpa_t gpa) +static inline bool kvm_is_gpa_in_memslot(struct kvm *kvm, gpa_t gpa) { unsigned long hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); - return kvm_is_error_hva(hva); + return !kvm_is_error_hva(hva); } static inline void kvm_gpc_mark_dirty_in_slot(struct gfn_to_pfn_cache *gpc) From 3e1c946954d7481d0962ba6d8f253fc40ef53c59 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 15 Feb 2024 14:48:24 +0100 Subject: [PATCH 0788/1406] scripts/kernel-doc: add modeline for vim users Set 'softtabstop' to 4 spaces, which will hopefully help keep the indentation in this file consistent going forwards. This mirrors the modeline in scripts such as recordmcount.pl, ktest.pl, and others. Emacs seems to use 4 spaces to indent by default, so it doesn't require anything special here. No functional change. 
Signed-off-by: Vegard Nossum Reviewed-by: Randy Dunlap Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240215134828.1277109-2-vegard.nossum@oracle.com --- scripts/kernel-doc | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index a9947080ead3ca..4dc5c3909d8c42 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -1,5 +1,6 @@ #!/usr/bin/env perl # SPDX-License-Identifier: GPL-2.0 +# vim: softtabstop=4 use warnings; use strict; From 11fed183359d26d3fe1b17fcecca9cb4e0f6e16f Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 15 Feb 2024 14:48:25 +0100 Subject: [PATCH 0789/1406] scripts/kernel-doc: simplify function printing Get rid of the $start variable, since it's really not necessary. No functional change. Signed-off-by: Vegard Nossum Reviewed-by: Randy Dunlap Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240215134828.1277109-3-vegard.nossum@oracle.com --- scripts/kernel-doc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 4dc5c3909d8c42..9b7441e776697f 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -820,7 +820,6 @@ sub output_function_rst(%) { my %args = %{$_[0]}; my ($parameter, $section); my $oldprefix = $lineprefix; - my $start = ""; my $is_macro = 0; if ($sphinx_major < 3) { @@ -830,7 +829,7 @@ sub output_function_rst(%) { print " **Typedef**: "; $lineprefix = ""; output_highlight_rst($args{'purpose'}); - $start = "\n\n**Syntax**\n\n ``"; + print "\n\n**Syntax**\n\n ``"; $is_macro = 1; } else { print ".. c:function:: "; @@ -848,17 +847,16 @@ sub output_function_rst(%) { print " **Typedef**: "; $lineprefix = ""; output_highlight_rst($args{'purpose'}); - $start = "\n\n**Syntax**\n\n ``"; + print "\n\n**Syntax**\n\n ``"; } else { print "``" if ($is_macro); } } if ($args{'functiontype'} ne "") { - $start .= $args{'functiontype'} . " " . $args{'function'} . " ("; + print $args{'functiontype'} . " " . $args{'function'} . " ("; } else { - $start .= $args{'function'} . " ("; + print $args{'function'} . " ("; } - print $start; my $count = 0; foreach my $parameter (@{$args{'parameterlist'}}) { From df00f872ab6643b342ff901c4c1dcb04ad09c40d Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 15 Feb 2024 14:48:26 +0100 Subject: [PATCH 0790/1406] scripts/kernel-doc: separate out function signature Format the entire function signature and place it in a separate variable; this both makes it easier to understand what these lines of code are doing and will allow us to simplify the code further in the following patch. No functional change. Signed-off-by: Vegard Nossum Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240215134828.1277109-4-vegard.nossum@oracle.com --- scripts/kernel-doc | 49 ++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 9b7441e776697f..1af2c68f6bd8eb 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -822,6 +822,31 @@ sub output_function_rst(%) { my $oldprefix = $lineprefix; my $is_macro = 0; + my $signature = ""; + if ($args{'functiontype'} ne "") { + $signature = $args{'functiontype'} . " " . $args{'function'} . " ("; + } else { + $signature = $args{'function'} . 
" ("; + } + + my $count = 0; + foreach my $parameter (@{$args{'parameterlist'}}) { + if ($count ne 0) { + $signature .= ", "; + } + $count++; + $type = $args{'parametertypes'}{$parameter}; + + if ($type =~ m/$function_pointer/) { + # pointer-to-function + $signature .= $1 . $parameter . ") (" . $2 . ")"; + } else { + $signature .= $type; + } + } + + $signature .= ")"; + if ($sphinx_major < 3) { if ($args{'typedef'}) { print ".. c:type:: ". $args{'function'} . "\n\n"; @@ -852,31 +877,13 @@ sub output_function_rst(%) { print "``" if ($is_macro); } } - if ($args{'functiontype'} ne "") { - print $args{'functiontype'} . " " . $args{'function'} . " ("; - } else { - print $args{'function'} . " ("; - } - my $count = 0; - foreach my $parameter (@{$args{'parameterlist'}}) { - if ($count ne 0) { - print ", "; - } - $count++; - $type = $args{'parametertypes'}{$parameter}; + print $signature; - if ($type =~ m/$function_pointer/) { - # pointer-to-function - print $1 . $parameter . ") (" . $2 . ")"; - } else { - print $type; - } - } if ($is_macro) { - print ")``\n\n"; + print "``\n\n"; } else { - print ")\n\n"; + print "\n\n"; } if (!$args{'typedef'}) { print_lineno($declaration_start_line); From c9fe5e24994968fd0c96721b537dbe82dfc3fd8f Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 15 Feb 2024 14:48:27 +0100 Subject: [PATCH 0791/1406] scripts/kernel-doc: simplify signature printing Untangle some of the $is_macro logic and the nested conditionals. This makes it easier to see where and how the signature is actually printed. No functional change. Signed-off-by: Vegard Nossum Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240215134828.1277109-5-vegard.nossum@oracle.com --- scripts/kernel-doc | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 1af2c68f6bd8eb..7acb7554abb9c5 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -820,7 +820,6 @@ sub output_function_rst(%) { my %args = %{$_[0]}; my ($parameter, $section); my $oldprefix = $lineprefix; - my $is_macro = 0; my $signature = ""; if ($args{'functiontype'} ne "") { @@ -854,37 +853,30 @@ sub output_function_rst(%) { print " **Typedef**: "; $lineprefix = ""; output_highlight_rst($args{'purpose'}); - print "\n\n**Syntax**\n\n ``"; - $is_macro = 1; + print "\n\n**Syntax**\n\n"; + print " ``$signature``\n\n"; } else { - print ".. c:function:: "; + print ".. c:function:: $signature\n\n"; } } else { if ($args{'typedef'} || $args{'functiontype'} eq "") { - $is_macro = 1; print ".. c:macro:: ". $args{'function'} . "\n\n"; - } else { - print ".. c:function:: "; - } - if ($args{'typedef'}) { - print_lineno($declaration_start_line); - print " **Typedef**: "; - $lineprefix = ""; - output_highlight_rst($args{'purpose'}); - print "\n\n**Syntax**\n\n ``"; + if ($args{'typedef'}) { + print_lineno($declaration_start_line); + print " **Typedef**: "; + $lineprefix = ""; + output_highlight_rst($args{'purpose'}); + print "\n\n**Syntax**\n\n"; + print " ``$signature``\n\n"; + } else { + print "``$signature``\n\n"; + } } else { - print "``" if ($is_macro); + print ".. 
c:function:: $signature\n\n"; } } - print $signature; - - if ($is_macro) { - print "``\n\n"; - } else { - print "\n\n"; - } if (!$args{'typedef'}) { print_lineno($declaration_start_line); $lineprefix = " "; From 2cc094037766898068734e30624203cd1be81ae5 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 15 Feb 2024 14:48:28 +0100 Subject: [PATCH 0792/1406] doc: kerneldoc.py: fix indentation kerneldoc.py is mostly indented with 4 spaces (like PEP8 suggests); replace the last remaining tabs for consistency. No functional change. Cc: Jani Nikula Cc: Mauro Carvalho Chehab Signed-off-by: Vegard Nossum Reviewed-by: Randy Dunlap Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240215134828.1277109-6-vegard.nossum@oracle.com --- Documentation/sphinx/kerneldoc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/sphinx/kerneldoc.py b/Documentation/sphinx/kerneldoc.py index 7acf09963daa52..ec1ddfff1863f8 100644 --- a/Documentation/sphinx/kerneldoc.py +++ b/Documentation/sphinx/kerneldoc.py @@ -61,9 +61,9 @@ def run(self): env = self.state.document.settings.env cmd = [env.config.kerneldoc_bin, '-rst', '-enable-lineno'] - # Pass the version string to kernel-doc, as it needs to use a different - # dialect, depending what the C domain supports for each specific - # Sphinx versions + # Pass the version string to kernel-doc, as it needs to use a different + # dialect, depending what the C domain supports for each specific + # Sphinx versions cmd += ['-sphinx-version', sphinx.__version__] filename = env.config.kerneldoc_srctree + '/' + self.arguments[0] From abb560df34205dadfd254df0a947858e1a2ded94 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 10 Feb 2024 09:55:26 -0800 Subject: [PATCH 0793/1406] parisc: Fix ip_fast_csum IP checksum unit tests report the following error when run on hppa/hppa64. # test_ip_fast_csum: ASSERTION FAILED at lib/checksum_kunit.c:463 Expected ( u64)csum_result == ( u64)expected, but ( u64)csum_result == 33754 (0x83da) ( u64)expected == 10946 (0x2ac2) not ok 4 test_ip_fast_csum 0x83da is the expected result if the IP header length is 20 bytes. 0x2ac2 is the expected result if the IP header length is 24 bytes. The test fails with an IP header length of 24 bytes. It appears that ip_fast_csum() always returns the checksum for a 20-byte header, no matter how long the header actually is. Code analysis shows a suspicious assembler sequence in ip_fast_csum(). " addc %0, %3, %0\n" "1: ldws,ma 4(%1), %3\n" " addib,< 0, %2, 1b\n" <--- While my understanding of HPPA assembler is limited, it does not seem to make much sense to subtract 0 from a register and to expect the result to ever be negative. Subtracting 1 from the length parameter makes more sense. On top of that, the operation should be repeated if and only if the result is still > 0, so change the suspicious instruction to " addib,> -1, %2, 1b\n" The IP checksum unit test passes after this change. 
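For reference, the value ip_fast_csum() must produce can be modelled in portable C (a sketch of the semantics only, not the kernel's implementation):

    #include <stdint.h>

    /* 16-bit one's-complement checksum over an IPv4 header of ihl
     * 32-bit words, where ihl is 5 (20 bytes) up to 15 (60 bytes). */
    static uint16_t ip_fast_csum_ref(const void *iph, unsigned int ihl)
    {
            const uint16_t *p = iph;
            uint32_t sum = 0;
            unsigned int i;

            for (i = 0; i < ihl * 2; i++)   /* two halfwords per word */
                    sum += p[i];
            while (sum >> 16)               /* fold carries back in */
                    sum = (sum & 0xffff) + (sum >> 16);
            return (uint16_t)~sum;
    }

A 24-byte header (ihl = 6) feeds one more word into the sum than a 20-byte one (ihl = 5), which is precisely the difference the failing unit test exposed.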
Cc: Palmer Dabbelt Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Signed-off-by: Guenter Roeck Tested-by: Charlie Jenkins Reviewed-by: Charlie Jenkins Signed-off-by: Helge Deller --- arch/parisc/include/asm/checksum.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/include/asm/checksum.h b/arch/parisc/include/asm/checksum.h index 3c43baca7b397d..f705e5dd107421 100644 --- a/arch/parisc/include/asm/checksum.h +++ b/arch/parisc/include/asm/checksum.h @@ -40,7 +40,7 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) " addc %0, %5, %0\n" " addc %0, %3, %0\n" "1: ldws,ma 4(%1), %3\n" -" addib,< 0, %2, 1b\n" +" addib,> -1, %2, 1b\n" " addc %0, %3, %0\n" "\n" " extru %0, 31, 16, %4\n" From bb89c7937dd1e26bd420dae6c0c08cb8531e92ee Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 10 Feb 2024 11:15:56 -0800 Subject: [PATCH 0794/1406] parisc: Fix csum_ipv6_magic on 32-bit systems Calculating the IPv6 checksum on 32-bit systems missed overflows when adding the proto+len fields into the checksum. This results in the following unit test failure. # test_csum_ipv6_magic: ASSERTION FAILED at lib/checksum_kunit.c:506 Expected ( u64)csum_result == ( u64)expected, but ( u64)csum_result == 46722 (0xb682) ( u64)expected == 46721 (0xb681) not ok 5 test_csum_ipv6_magic This is probably rarely seen in the real world because proto+len are usually small values which will rarely result in overflows when calculating the checksum. However, the unit test code uses large values for the length field, causing the test to fail. Fix the problem by adding the missing carry into the final checksum. Cc: Palmer Dabbelt Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Signed-off-by: Guenter Roeck Tested-by: Charlie Jenkins Reviewed-by: Charlie Jenkins Signed-off-by: Helge Deller --- arch/parisc/include/asm/checksum.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/parisc/include/asm/checksum.h b/arch/parisc/include/asm/checksum.h index f705e5dd107421..e619e67440db95 100644 --- a/arch/parisc/include/asm/checksum.h +++ b/arch/parisc/include/asm/checksum.h @@ -163,7 +163,8 @@ static __inline__ __sum16 csum_ipv6_magic(const struct in6_addr *saddr, " ldw,ma 4(%2), %7\n" /* 4th daddr */ " addc %6, %0, %0\n" " addc %7, %0, %0\n" -" addc %3, %0, %0\n" /* fold in proto+len, catch carry */ +" addc %3, %0, %0\n" /* fold in proto+len */ +" addc 0, %0, %0\n" /* add carry */ #endif : "=r" (sum), "=r" (saddr), "=r" (daddr), "=r" (len), From 3a34e3fcdd835cc18e7e54bd835451a82828b72e Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 13 Feb 2024 15:46:31 -0800 Subject: [PATCH 0795/1406] parisc: Fix csum_ipv6_magic on 64-bit systems hppa 64-bit systems calculates the IPv6 checksum using 64-bit add operations. The last add folds protocol and length fields into the 64-bit result. While unlikely, this operation can overflow. The overflow can be triggered with a code sequence such as the following. /* try to trigger massive overflows */ memset(tmp_buf, 0xff, sizeof(struct in6_addr)); csum_result = csum_ipv6_magic((struct in6_addr *)tmp_buf, (struct in6_addr *)tmp_buf, 0xffff, 0xff, 0xffffffff); Fix the problem by adding any overflows from the final add operation into the calculated checksum. Fortunately, we can do this without additional cost by replacing the add operation used to fold the checksum into 32 bit with "add,dc" to add in the missing carry. 
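The missing operation can be modelled in C as an end-around-carry add (an illustrative sketch, not the kernel code):

    #include <stdint.h>

    /* one's-complement add: a carry out of bit 31 must be added back */
    static uint32_t csum_add32(uint32_t sum, uint32_t addend)
    {
            sum += addend;
            if (sum < addend)       /* unsigned wrap == carry out */
                    sum++;          /* the "addc 0, %0, %0" of the fix */
            return sum;
    }

Without the final increment, a large proto+len value added to an already-large accumulator silently drops the carry, which shows up as the off-by-one the unit test observed.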
Cc: Palmer Dabbelt Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Signed-off-by: Guenter Roeck Reviewed-by: Charlie Jenkins Tested-by: Guenter Roeck Signed-off-by: Helge Deller --- arch/parisc/include/asm/checksum.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/parisc/include/asm/checksum.h b/arch/parisc/include/asm/checksum.h index e619e67440db95..c949aa20fa1622 100644 --- a/arch/parisc/include/asm/checksum.h +++ b/arch/parisc/include/asm/checksum.h @@ -137,8 +137,8 @@ static __inline__ __sum16 csum_ipv6_magic(const struct in6_addr *saddr, " add,dc %3, %0, %0\n" /* fold in proto+len | carry bit */ " extrd,u %0, 31, 32, %4\n"/* copy upper half down */ " depdi 0, 31, 32, %0\n"/* clear upper half */ -" add %4, %0, %0\n" /* fold into 32-bits */ -" addc 0, %0, %0\n" /* add carry */ +" add,dc %4, %0, %0\n" /* fold into 32-bits, plus carry */ +" addc 0, %0, %0\n" /* add final carry */ #else From 93ce205487e79f182f305a2ac4ff2a9330f63f70 Mon Sep 17 00:00:00 2001 From: Konstantin Ryabitsev Date: Wed, 14 Feb 2024 15:09:53 -0500 Subject: [PATCH 0796/1406] Documentation: update mailing list addresses The mailman2 server running on lists.linuxfoundation.org will be shut down in very imminent future. Update all instances of obsolete list addresses throughout the tree with their new destinations. Signed-off-by: Konstantin Ryabitsev Reviewed-by: Kees Cook Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240214-lf-org-list-migration-v1-1-ef1eab4b1543@linuxfoundation.org --- Documentation/ABI/testing/sysfs-bus-vdpa | 10 +++++----- Documentation/networking/bridge.rst | 2 +- Documentation/process/researcher-guidelines.rst | 2 +- .../sp_SP/process/researcher-guidelines.rst | 2 +- MAINTAINERS | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-bus-vdpa b/Documentation/ABI/testing/sysfs-bus-vdpa index 4da53878bff6b8..2c833b5163f217 100644 --- a/Documentation/ABI/testing/sysfs-bus-vdpa +++ b/Documentation/ABI/testing/sysfs-bus-vdpa @@ -1,6 +1,6 @@ What: /sys/bus/vdpa/drivers_autoprobe Date: March 2020 -Contact: virtualization@lists.linux-foundation.org +Contact: virtualization@lists.linux.dev Description: This file determines whether new devices are immediately bound to a driver after the creation. It initially contains 1, which @@ -12,7 +12,7 @@ Description: What: /sys/bus/vdpa/driver_probe Date: March 2020 -Contact: virtualization@lists.linux-foundation.org +Contact: virtualization@lists.linux.dev Description: Writing a device name to this file will cause the kernel binds devices to a compatible driver. @@ -22,7 +22,7 @@ Description: What: /sys/bus/vdpa/drivers/.../bind Date: March 2020 -Contact: virtualization@lists.linux-foundation.org +Contact: virtualization@lists.linux.dev Description: Writing a device name to this file will cause the driver to attempt to bind to the device. This is useful for overriding @@ -30,7 +30,7 @@ Description: What: /sys/bus/vdpa/drivers/.../unbind Date: March 2020 -Contact: virtualization@lists.linux-foundation.org +Contact: virtualization@lists.linux.dev Description: Writing a device name to this file will cause the driver to attempt to unbind from the device. This may be useful when @@ -38,7 +38,7 @@ Description: What: /sys/bus/vdpa/devices/.../driver_override Date: November 2021 -Contact: virtualization@lists.linux-foundation.org +Contact: virtualization@lists.linux.dev Description: This file allows the driver for a device to be specified. 
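The corrected fold can be sketched in C as follows (illustrative only; the actual fix additionally carries in a pending carry bit from the preceding add via "add,dc"):

    #include <stdint.h>

    /* fold a 64-bit one's-complement sum to 32 bits, keeping the carry */
    static uint32_t csum_fold64(uint64_t sum)
    {
            uint32_t lo = (uint32_t)sum;
            uint32_t hi = (uint32_t)(sum >> 32);
            uint32_t folded = lo + hi;

            if (folded < hi)        /* carry out of the 32-bit add */
                    folded++;
            return folded;
    }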
When specified, only a driver with a name matching the value diff --git a/Documentation/networking/bridge.rst b/Documentation/networking/bridge.rst index ba14e7b07869c9..ef8b73e157b268 100644 --- a/Documentation/networking/bridge.rst +++ b/Documentation/networking/bridge.rst @@ -324,7 +324,7 @@ Contact Info The code is currently maintained by Roopa Prabhu and Nikolay Aleksandrov . Bridge bugs and enhancements are discussed on the linux-netdev mailing list netdev@vger.kernel.org and -bridge@lists.linux-foundation.org. +bridge@lists.linux.dev. The list is open to anyone interested: http://vger.kernel.org/vger-lists.html#netdev diff --git a/Documentation/process/researcher-guidelines.rst b/Documentation/process/researcher-guidelines.rst index d159cd4f5e5b3b..beb484c5965dcf 100644 --- a/Documentation/process/researcher-guidelines.rst +++ b/Documentation/process/researcher-guidelines.rst @@ -167,4 +167,4 @@ If no one can be found to internally review patches and you need help finding such a person, or if you have any other questions related to this document and the developer community's expectations, please reach out to the private Technical Advisory Board mailing list: -. +. diff --git a/Documentation/translations/sp_SP/process/researcher-guidelines.rst b/Documentation/translations/sp_SP/process/researcher-guidelines.rst index 462b3290b7b835..deccc908a68de3 100644 --- a/Documentation/translations/sp_SP/process/researcher-guidelines.rst +++ b/Documentation/translations/sp_SP/process/researcher-guidelines.rst @@ -147,4 +147,4 @@ Si no se puede encontrar a nadie para revisar internamente los parches y necesit ayuda para encontrar a esa persona, o si tiene alguna otra pregunta relacionada con este documento y las expectativas de la comunidad de desarrolladores, por favor contacte con la lista de correo privada Technical Advisory Board: -. +. diff --git a/MAINTAINERS b/MAINTAINERS index 8d1052fa6a6924..106fb189de2013 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14007,7 +14007,7 @@ F: include/uapi/rdma/mlx5-abi.h MELLANOX MLX5 VDPA DRIVER M: Dragos Tatulea -L: virtualization@lists.linux-foundation.org +L: virtualization@lists.linux.dev S: Supported F: drivers/vdpa/mlx5/ @@ -21515,7 +21515,7 @@ F: tools/testing/selftests/drivers/net/team/ TECHNICAL ADVISORY BOARD PROCESS DOCS M: "Theodore Ts'o" M: Greg Kroah-Hartman -L: tech-board-discuss@lists.linux-foundation.org +L: tech-board-discuss@lists.linux.dev S: Maintained F: Documentation/process/contribution-maturity-model.rst F: Documentation/process/researcher-guidelines.rst @@ -23074,7 +23074,7 @@ F: drivers/vfio/pci/mlx5/ VFIO VIRTIO PCI DRIVER M: Yishai Hadas L: kvm@vger.kernel.org -L: virtualization@lists.linux-foundation.org +L: virtualization@lists.linux.dev S: Maintained F: drivers/vfio/pci/virtio From 2117e9017b128bd8796cd892c5eb4b99be615572 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 14 Feb 2024 14:29:38 +0100 Subject: [PATCH 0797/1406] kernel-doc: handle #if in enums as well In addition to #ifdef, #define and #endif, also handle any #if since we may be using e.g. #if IS_ENABLED(...). I didn't find any instances of this in the kernel now, there are enums with such ifs inside, but I didn't find any with kernel-doc as well. However, it came up as we were adding such a construct in our driver and warnings from kernel-doc were the result. 
Signed-off-by: Johannes Berg Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240214142937.80ee86a3beae.Ibcc5bd97a20cd10a792663e4b254cd46c7e8b520@changeid --- scripts/kernel-doc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 7acb7554abb9c5..71a89d8832c1a8 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -1328,7 +1328,7 @@ sub dump_enum($$) { $x =~ s@/\*.*?\*/@@gos; # strip comments. # strip #define macros inside enums - $x =~ s@#\s*((define|ifdef)\s+|endif)[^;]*;@@gos; + $x =~ s@#\s*((define|ifdef|if)\s+|endif)[^;]*;@@gos; if ($x =~ /typedef\s+enum\s*\{(.*)\}\s*(\w*)\s*;/) { $declaration_name = $2; From 4bc4185104cd443914d6976e0c312ba2c39fc067 Mon Sep 17 00:00:00 2001 From: "Ran.Park" Date: Wed, 14 Feb 2024 19:40:08 +0800 Subject: [PATCH 0798/1406] Fixed case issue with 'fault-injection' in documentation In the 'fault-injection' subdirectory, the first letter F is capitalized, whereas in index.rst f is lowercase, but in index.rst all other elements in the same column are capitalized. Signed-off-by: "Ran.Park" Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/tencent_3EA07E65C43816C2A8402DC655CF98916B06@qq.com --- Documentation/fault-injection/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/fault-injection/index.rst b/Documentation/fault-injection/index.rst index 8408a8a91b3411..a6ea1d1902225c 100644 --- a/Documentation/fault-injection/index.rst +++ b/Documentation/fault-injection/index.rst @@ -1,7 +1,7 @@ .. SPDX-License-Identifier: GPL-2.0 =============== -fault-injection +Fault-injection =============== .. toctree:: From 98c8db91b4d577ae7ef31cd7558729d9978d8d67 Mon Sep 17 00:00:00 2001 From: Juntong Deng Date: Thu, 15 Feb 2024 19:17:23 +0000 Subject: [PATCH 0799/1406] kasan: Add documentation for CONFIG_KASAN_EXTRA_INFO This patch adds CONFIG_KASAN_EXTRA_INFO introduction information to KASAN documentation. Signed-off-by: Juntong Deng Reviewed-by: Andrey Konovalov Link: https://lore.kernel.org/r/AM6PR03MB5848C52B871DA67455F0B2F2994D2@AM6PR03MB5848.eurprd03.prod.outlook.com Signed-off-by: Jonathan Corbet --- Documentation/dev-tools/kasan.rst | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 858c77fe7dc46c..d56f298a9d7cae 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -277,6 +277,27 @@ traces point to places in code that interacted with the object but that are not directly present in the bad access stack trace. Currently, this includes call_rcu() and workqueue queuing. +CONFIG_KASAN_EXTRA_INFO +~~~~~~~~~~~~~~~~~~~~~~~ + +Enabling CONFIG_KASAN_EXTRA_INFO allows KASAN to record and report more +information. The extra information currently supported is the CPU number and +timestamp at allocation and free. More information can help find the cause of +the bug and correlate the error with other system events, at the cost of using +extra memory to record more information (more cost details in the help text of +CONFIG_KASAN_EXTRA_INFO). + +Here is the report with CONFIG_KASAN_EXTRA_INFO enabled (only the +different parts are shown):: + + ================================================================== + ... + Allocated by task 134 on cpu 5 at 229.133855s: + ... + Freed by task 136 on cpu 3 at 230.199335s: + ... 
+ ================================================================== + Implementation details ---------------------- From 7ef45a747a133a1340d38ad95b8aa7e230616aa4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Feb 2024 12:19:02 -0800 Subject: [PATCH 0800/1406] MAINTAINERS: Set the field name for subsystem profile section Subsystem profile section entry identifier is not having its field name that can be parsed by maintainers_include.py, unlike other sections which have their own human-readable field names. As a result, profile sections on rendered rst file is having weird name, 'P:'. Set the field name as 'Subsystem Profile'. Fixes: 4699c504e603 ("Maintainer Handbook: Maintainer Entry Profile") Signed-off-by: SeongJae Park Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240216201902.10095-1-sj@kernel.org --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 106fb189de2013..36d69d90aa8ebd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24,7 +24,7 @@ Descriptions of section entries and preferred order filing info, a direct bug tracker link, or a mailto: URI. C: URI for *chat* protocol, server and channel where developers usually hang out, for example irc://server/channel. - P: Subsystem Profile document for more details submitting + P: *Subsystem Profile* document for more details submitting patches to the given subsystem. This is either an in-tree file, or a URI. See Documentation/maintainer/maintainer-entry-profile.rst for details. From 47aeda1d5a4716fe7f6184e6a749d860ab59cb84 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 15 Feb 2024 15:29:04 +0000 Subject: [PATCH 0801/1406] KVM: pfncache: allow a cache to be activated with a fixed (userspace) HVA Some pfncache pages may actually be overlays on guest memory that have a fixed HVA within the VMM. It's pointless to invalidate such cached mappings if the overlay is moved so allow a cache to be activated directly with the HVA to cater for such cases. A subsequent patch will make use of this facility. Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240215152916.1158-10-paul@xen.org Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 31 +++++++++++++ virt/kvm/pfncache.c | 98 ++++++++++++++++++++++++++++------------ 2 files changed, 101 insertions(+), 28 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 81a9d1cf91a260..fa070c36f98a2d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -148,6 +148,11 @@ static inline bool kvm_is_error_hva(unsigned long addr) #endif +static inline bool kvm_is_error_gpa(gpa_t gpa) +{ + return gpa == INVALID_GPA; +} + #define KVM_ERR_PTR_BAD_PAGE (ERR_PTR(-ENOENT)) static inline bool is_error_page(struct page *page) @@ -1344,6 +1349,22 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm); */ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); +/** + * kvm_gpc_activate_hva - prepare a cached kernel mapping and HPA for a given HVA. + * + * @gpc: struct gfn_to_pfn_cache object. + * @hva: userspace virtual address to map. + * @len: sanity check; the range being access must fit a single page. + * + * @return: 0 for success. + * -EINVAL for a mapping which would cross a page boundary. + * -EFAULT for an untranslatable guest physical address. + * + * The semantics of this function are the same as those of kvm_gpc_activate(). 
It + * merely bypasses a layer of address translation. + */ +int kvm_gpc_activate_hva(struct gfn_to_pfn_cache *gpc, unsigned long hva, unsigned long len); + /** * kvm_gpc_check - check validity of a gfn_to_pfn_cache. * @@ -1390,6 +1411,16 @@ int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len); */ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc); +static inline bool kvm_gpc_is_gpa_active(struct gfn_to_pfn_cache *gpc) +{ + return gpc->active && !kvm_is_error_gpa(gpc->gpa); +} + +static inline bool kvm_gpc_is_hva_active(struct gfn_to_pfn_cache *gpc) +{ + return gpc->active && kvm_is_error_gpa(gpc->gpa); +} + void kvm_sigset_activate(struct kvm_vcpu *vcpu); void kvm_sigset_deactivate(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 97eec8ee34493a..a47ca6fd75c27b 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -48,7 +48,14 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len) if (!gpc->active) return false; - if (gpc->generation != slots->generation || kvm_is_error_hva(gpc->uhva)) + /* + * If the page was cached from a memslot, make sure the memslots have + * not been re-configured. + */ + if (!kvm_is_error_gpa(gpc->gpa) && gpc->generation != slots->generation) + return false; + + if (kvm_is_error_hva(gpc->uhva)) return false; if (offset_in_page(gpc->uhva) + len > PAGE_SIZE) @@ -209,11 +216,10 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) return -EFAULT; } -static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, +static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva, unsigned long len) { - struct kvm_memslots *slots = kvm_memslots(gpc->kvm); - unsigned long page_offset = offset_in_page(gpa); + unsigned long page_offset; bool unmap_old = false; unsigned long old_uhva; kvm_pfn_t old_pfn; @@ -221,10 +227,16 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, void *old_khva; int ret; + /* Either gpa or uhva must be valid, but not both */ + if (WARN_ON_ONCE(kvm_is_error_gpa(gpa) == kvm_is_error_hva(uhva))) + return -EINVAL; + /* - * If must fit within a single page. The 'len' argument is - * only to enforce that. + * The cached acces must fit within a single page. The 'len' argument + * exists only to enforce that. */ + page_offset = kvm_is_error_gpa(gpa) ? offset_in_page(uhva) : + offset_in_page(gpa); if (page_offset + len > PAGE_SIZE) return -EINVAL; @@ -246,29 +258,39 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva); old_uhva = PAGE_ALIGN_DOWN(gpc->uhva); - /* Refresh the userspace HVA if necessary */ - if (gpc->gpa != gpa || gpc->generation != slots->generation || - kvm_is_error_hva(gpc->uhva)) { - gfn_t gfn = gpa_to_gfn(gpa); - - gpc->gpa = gpa; - gpc->generation = slots->generation; - gpc->memslot = __gfn_to_memslot(slots, gfn); - gpc->uhva = gfn_to_hva_memslot(gpc->memslot, gfn); + if (kvm_is_error_gpa(gpa)) { + gpc->gpa = INVALID_GPA; + gpc->memslot = NULL; + gpc->uhva = PAGE_ALIGN_DOWN(uhva); - if (kvm_is_error_hva(gpc->uhva)) { - ret = -EFAULT; - goto out; - } - - /* - * Even if the GPA and/or the memslot generation changed, the - * HVA may still be the same. 
- */ if (gpc->uhva != old_uhva) hva_change = true; } else { - gpc->uhva = old_uhva; + struct kvm_memslots *slots = kvm_memslots(gpc->kvm); + + if (gpc->gpa != gpa || gpc->generation != slots->generation || + kvm_is_error_hva(gpc->uhva)) { + gfn_t gfn = gpa_to_gfn(gpa); + + gpc->gpa = gpa; + gpc->generation = slots->generation; + gpc->memslot = __gfn_to_memslot(slots, gfn); + gpc->uhva = gfn_to_hva_memslot(gpc->memslot, gfn); + + if (kvm_is_error_hva(gpc->uhva)) { + ret = -EFAULT; + goto out; + } + + /* + * Even if the GPA and/or the memslot generation changed, the + * HVA may still be the same. + */ + if (gpc->uhva != old_uhva) + hva_change = true; + } else { + gpc->uhva = old_uhva; + } } /* Note: the offset must be correct before calling hva_to_pfn_retry() */ @@ -319,7 +341,15 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len) { - return __kvm_gpc_refresh(gpc, gpc->gpa, len); + /* + * If the GPA is valid then ignore the HVA, as a cache can be GPA-based + * or HVA-based, not both. For GPA-based caches, the HVA will be + * recomputed during refresh if necessary. + */ + unsigned long uhva = kvm_is_error_gpa(gpc->gpa) ? gpc->uhva : + KVM_HVA_ERR_BAD; + + return __kvm_gpc_refresh(gpc, gpc->gpa, uhva, len); } void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm) @@ -329,10 +359,12 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm) gpc->kvm = kvm; gpc->pfn = KVM_PFN_ERR_FAULT; + gpc->gpa = INVALID_GPA; gpc->uhva = KVM_HVA_ERR_BAD; } -int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) +static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva, + unsigned long len) { struct kvm *kvm = gpc->kvm; @@ -353,7 +385,17 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) gpc->active = true; write_unlock_irq(&gpc->lock); } - return __kvm_gpc_refresh(gpc, gpa, len); + return __kvm_gpc_refresh(gpc, gpa, uhva, len); +} + +int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) +{ + return __kvm_gpc_activate(gpc, gpa, KVM_HVA_ERR_BAD, len); +} + +int kvm_gpc_activate_hva(struct gfn_to_pfn_cache *gpc, unsigned long uhva, unsigned long len) +{ + return __kvm_gpc_activate(gpc, INVALID_GPA, uhva, len); } void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc) From 5550854a7ab69fafde989cfc20758133e0f93d8b Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 15 Feb 2024 15:29:05 +0000 Subject: [PATCH 0802/1406] KVM: x86/xen: separate initialization of shared_info cache and content A subsequent patch will allow shared_info to be initialized using either a GPA or a user-space (i.e. VMM) HVA. To make that patch cleaner, separate the initialization of the shared_info content from the activation of the pfncache. 
Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240215152916.1158-11-paul@xen.org Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 55 +++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index e904642254677c..031e98d88ba22e 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -34,41 +34,32 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r); DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ); -static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) +static int kvm_xen_shared_info_init(struct kvm *kvm) { struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; struct pvclock_wall_clock *wc; - gpa_t gpa = gfn_to_gpa(gfn); u32 *wc_sec_hi; u32 wc_version; u64 wall_nsec; int ret = 0; int idx = srcu_read_lock(&kvm->srcu); - if (gfn == KVM_XEN_INVALID_GFN) { - kvm_gpc_deactivate(gpc); - goto out; - } + read_lock_irq(&gpc->lock); + while (!kvm_gpc_check(gpc, PAGE_SIZE)) { + read_unlock_irq(&gpc->lock); - do { - ret = kvm_gpc_activate(gpc, gpa, PAGE_SIZE); + ret = kvm_gpc_refresh(gpc, PAGE_SIZE); if (ret) goto out; - /* - * This code mirrors kvm_write_wall_clock() except that it writes - * directly through the pfn cache and doesn't mark the page dirty. - */ - wall_nsec = kvm_get_wall_clock_epoch(kvm); - - /* It could be invalid again already, so we need to check */ read_lock_irq(&gpc->lock); + } - if (gpc->valid) - break; - - read_unlock_irq(&gpc->lock); - } while (1); + /* + * This code mirrors kvm_write_wall_clock() except that it writes + * directly through the pfn cache and doesn't mark the page dirty. + */ + wall_nsec = kvm_get_wall_clock_epoch(kvm); /* Paranoia checks on the 32-bit struct layout */ BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900); @@ -639,12 +630,30 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) } break; - case KVM_XEN_ATTR_TYPE_SHARED_INFO: + case KVM_XEN_ATTR_TYPE_SHARED_INFO: { + int idx; + mutex_lock(&kvm->arch.xen.xen_lock); - r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn); + + idx = srcu_read_lock(&kvm->srcu); + + if (data->u.shared_info.gfn == KVM_XEN_INVALID_GFN) { + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); + r = 0; + } else { + r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache, + gfn_to_gpa(data->u.shared_info.gfn), + PAGE_SIZE); + } + + srcu_read_unlock(&kvm->srcu, idx); + + if (!r && kvm->arch.xen.shinfo_cache.active) + r = kvm_xen_shared_info_init(kvm); + mutex_unlock(&kvm->arch.xen.xen_lock); break; - + } case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: if (data->u.vector && data->u.vector < 0x10) r = -EINVAL; From 90a9038a4669d9561a54a2720b42259bcc4a879d Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 15 Feb 2024 15:29:06 +0000 Subject: [PATCH 0803/1406] KVM: x86/xen: re-initialize shared_info if guest (32/64-bit) mode is set If the shared_info PFN cache has already been initialized then the content of the shared_info page needs to be re-initialized whenever the guest mode is (re)set. Setting the guest mode is either done explicitly by the VMM via the KVM_XEN_ATTR_TYPE_LONG_MODE attribute, or implicitly when the guest writes the MSR to set up the hypercall page. 
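The reason a mode change forces a rewrite is that the wallclock lands at an ABI-dependent offset. Illustratively (the compat value is quoted from the paranoia check shown earlier in this series; the 64-bit value is stated here from the corresponding check in the source and should be treated as indicative):

    BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900); /* 32-bit */
    BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00);        /* 64-bit */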
Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240215152916.1158-12-paul@xen.org Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 031e98d88ba22e..52edf676c47127 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -625,8 +625,16 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) } else { mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.long_mode = !!data->u.long_mode; + + /* + * Re-initialize shared_info to put the wallclock in the + * correct place. Whilst it's not necessary to do this + * unless the mode is actually changed, it does no harm + * to make the call anyway. + */ + r = kvm->arch.xen.shinfo_cache.active ? + kvm_xen_shared_info_init(kvm) : 0; mutex_unlock(&kvm->arch.xen.xen_lock); - r = 0; } break; @@ -1101,9 +1109,24 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) u32 page_num = data & ~PAGE_MASK; u64 page_addr = data & PAGE_MASK; bool lm = is_long_mode(vcpu); + int r = 0; + + mutex_lock(&kvm->arch.xen.xen_lock); + if (kvm->arch.xen.long_mode != lm) { + kvm->arch.xen.long_mode = lm; + + /* + * Re-initialize shared_info to put the wallclock in the + * correct place. + */ + if (kvm->arch.xen.shinfo_cache.active && + kvm_xen_shared_info_init(kvm)) + r = 1; + } + mutex_unlock(&kvm->arch.xen.xen_lock); - /* Latch long_mode for shared_info pages etc. */ - vcpu->kvm->arch.xen.long_mode = lm; + if (r) + return r; /* * If Xen hypercall intercept is enabled, fill the hypercall From 01a871852b1133fc5f611ad7c0c072870c90aede Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 15 Feb 2024 15:29:07 +0000 Subject: [PATCH 0804/1406] KVM: x86/xen: allow shared_info to be mapped by fixed HVA The shared_info page is not guest memory as such. It is a dedicated page allocated by the VMM and overlaid onto guest memory in a GFN chosen by the guest and specified in the XENMEM_add_to_physmap hypercall. The guest may even request that shared_info be moved from one GFN to another by re-issuing that hypercall, but the HVA is never going to change. Because the shared_info page is an overlay the memory slots need to be updated in response to the hypercall. However, memory slot adjustment is not atomic and, whilst all vCPUs are paused, there is still the possibility that events may be delivered (which requires the shared_info page to be updated) whilst the shared_info GPA is absent. The HVA is never absent though, so it makes much more sense to use that as the basis for the kernel's mapping. Hence add a new KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA attribute type for this purpose and a KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA flag to advertize its availability. Don't actually advertize it yet though. That will be done in a subsequent patch, which will also add tests for the new attribute type. Also update the KVM API documentation with the new attribute and also fix it up to consistently refer to 'shared_info' (with the underscore). 
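From the VMM side, the new attribute would be exercised along these lines (a hypothetical sketch; vm_fd and shinfo_page stand in for the VMM's own state):

    #include <err.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    void set_shinfo_hva(int vm_fd, void *shinfo_page)
    {
            struct kvm_xen_hvm_attr attr = {
                    .type = KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA,
                    .u.shared_info.hva = (uint64_t)(uintptr_t)shinfo_page,
            };

            if (ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &attr))
                    err(1, "KVM_XEN_HVM_SET_ATTR");
    }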
Signed-off-by: Paul Durrant
Reviewed-by: David Woodhouse
Link: https://lore.kernel.org/r/20240215152916.1158-13-paul@xen.org
[sean: store "hva" as a user pointer, use kvm_gpc_is_{gpa,hva}_active()]
Signed-off-by: Sean Christopherson
---
 Documentation/virt/kvm/api.rst | 25 ++++++++++++++++++++-----
 arch/x86/kvm/xen.c | 40 +++++++++++++++++++++++++++-------
 include/uapi/linux/kvm.h | 6 ++++-
 3 files changed, 56 insertions(+), 15 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 3ec0b7a455a0cf..3372be85b33557 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -372,7 +372,7 @@ The bits in the dirty bitmap are cleared before the ioctl returns, unless
 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is enabled. For more information, see
 the description of the capability.
-Note that the Xen shared info page, if configured, shall always be assumed
+Note that the Xen shared_info page, if configured, shall always be assumed
 to be dirty. KVM will not explicitly mark it such.
@@ -5487,8 +5487,9 @@ KVM_PV_ASYNC_CLEANUP_PERFORM
 __u8 long_mode;
 __u8 vector;
 __u8 runstate_update_flag;
- struct {
+ union {
 __u64 gfn;
+ __u64 hva;
 } shared_info;
 struct {
 __u32 send_port;
@@ -5516,10 +5517,10 @@ type values:
 KVM_XEN_ATTR_TYPE_LONG_MODE
 Sets the ABI mode of the VM to 32-bit or 64-bit (long mode). This
- determines the layout of the shared info pages exposed to the VM.
+ determines the layout of the shared_info page exposed to the VM.
 KVM_XEN_ATTR_TYPE_SHARED_INFO
- Sets the guest physical frame number at which the Xen "shared info"
+ Sets the guest physical frame number at which the Xen shared_info
 page resides. Note that although Xen places vcpu_info for the first
 32 vCPUs in the shared_info page, KVM does not automatically do so
 and instead requires that KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO be used
@@ -5528,7 +5529,7 @@ KVM_XEN_ATTR_TYPE_SHARED_INFO
 not be aware of the Xen CPU id which is used as the index into the
 vcpu_info[] array, so may know the correct default location.
- Note that the shared info page may be constantly written to by KVM;
+ Note that the shared_info page may be constantly written to by KVM;
 it contains the event channel bitmap used to deliver interrupts to
 a Xen guest, amongst other things. It is exempt from dirty tracking
 mechanisms — KVM will not explicitly mark the page as dirty each
@@ -5537,9 +5538,21 @@ KVM_XEN_ATTR_TYPE_SHARED_INFO
 any vCPU has been running or any event channel interrupts can be
 routed to the guest.
- Setting the gfn to KVM_XEN_INVALID_GFN will disable the shared info
+ Setting the gfn to KVM_XEN_INVALID_GFN will disable the shared_info
 page.
+KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA
+ If the KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA flag is also set in the
+ Xen capabilities, then this attribute may be used to set the
+ userspace address at which the shared_info page resides, which
+ will always be fixed in the VMM regardless of where it is mapped
+ in guest physical address space. This attribute should be used in
+ preference to KVM_XEN_ATTR_TYPE_SHARED_INFO as it avoids
+ unnecessary invalidation of an internal cache when the page is
+ re-mapped in guest physical address space.
+
+ Setting the hva to zero will disable the shared_info page.
+
 KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
 Sets the exception vector used to deliver Xen event channel upcalls.
This is the HVM-wide vector injected directly by the hypervisor diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 52edf676c47127..cca72e6022bfd0 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -638,20 +638,36 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) } break; - case KVM_XEN_ATTR_TYPE_SHARED_INFO: { + case KVM_XEN_ATTR_TYPE_SHARED_INFO: + case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: { int idx; mutex_lock(&kvm->arch.xen.xen_lock); idx = srcu_read_lock(&kvm->srcu); - if (data->u.shared_info.gfn == KVM_XEN_INVALID_GFN) { - kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); - r = 0; + if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) { + gfn_t gfn = data->u.shared_info.gfn; + + if (gfn == KVM_XEN_INVALID_GFN) { + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); + r = 0; + } else { + r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache, + gfn_to_gpa(gfn), PAGE_SIZE); + } } else { - r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache, - gfn_to_gpa(data->u.shared_info.gfn), - PAGE_SIZE); + void __user *hva = (void __user *)data->u.shared_info.hva; + + if (!PAGE_ALIGNED(hva) || !access_ok(hva, PAGE_SIZE)) { + r = -EINVAL; + } else if (!hva) { + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); + r = 0; + } else { + r = kvm_gpc_activate_hva(&kvm->arch.xen.shinfo_cache, + (unsigned long)hva, PAGE_SIZE); + } } srcu_read_unlock(&kvm->srcu, idx); @@ -715,13 +731,21 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - if (kvm->arch.xen.shinfo_cache.active) + if (kvm_gpc_is_gpa_active(&kvm->arch.xen.shinfo_cache)) data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa); else data->u.shared_info.gfn = KVM_XEN_INVALID_GFN; r = 0; break; + case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: + if (kvm_gpc_is_hva_active(&kvm->arch.xen.shinfo_cache)) + data->u.shared_info.hva = kvm->arch.xen.shinfo_cache.uhva; + else + data->u.shared_info.hva = 0; + r = 0; + break; + case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: data->u.vector = kvm->arch.xen.upcall_vector; r = 0; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index c3308536482bdb..ac5caba313d1e8 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1246,6 +1246,7 @@ struct kvm_x86_mce { #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) #define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) #define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7) +#define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA (1 << 8) struct kvm_xen_hvm_config { __u32 flags; @@ -1744,9 +1745,10 @@ struct kvm_xen_hvm_attr { __u8 long_mode; __u8 vector; __u8 runstate_update_flag; - struct { + union { __u64 gfn; #define KVM_XEN_INVALID_GFN ((__u64)-1) + __u64 hva; } shared_info; struct { __u32 send_port; @@ -1788,6 +1790,8 @@ struct kvm_xen_hvm_attr { #define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4 /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */ #define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5 /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ +#define KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA 0x6 /* Per-vCPU Xen attributes */ #define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr) From 3a0c9c41959d11ab217e93403bacf40fe57c96a0 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 15 Feb 2024 15:29:08 +0000 Subject: [PATCH 0805/1406] KVM: x86/xen: allow vcpu_info to be mapped by fixed HVA If the guest does not explicitly set the GPA of the vcpu_info structure in memory then, for guests with 32 vCPUs or fewer, the
vcpu_info embedded in the shared_info page may be used. As described in a previous commit, the shared_info page is an overlay at a fixed HVA within the VMM, so in this case it is also more optimal to activate the vcpu_info cache with a fixed HVA to avoid unnecessary invalidation if the guest memory layout is modified. Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240215152916.1158-14-paul@xen.org [sean: use kvm_gpc_is_{gpa,hva}_active()] Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 26 ++++++++++++++++++++----- arch/x86/kvm/xen.c | 35 +++++++++++++++++++++++++++------- include/uapi/linux/kvm.h | 3 +++ 3 files changed, 52 insertions(+), 12 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 3372be85b33557..bd93cafd3e4e3e 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5523,11 +5523,12 @@ KVM_XEN_ATTR_TYPE_SHARED_INFO Sets the guest physical frame number at which the Xen shared_info page resides. Note that although Xen places vcpu_info for the first 32 vCPUs in the shared_info page, KVM does not automatically do so - and instead requires that KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO be used - explicitly even when the vcpu_info for a given vCPU resides at the - "default" location in the shared_info page. This is because KVM may - not be aware of the Xen CPU id which is used as the index into the - vcpu_info[] array, so may not know the correct default location. + and instead requires that KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO or + KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA be used explicitly even when + the vcpu_info for a given vCPU resides at the "default" location + in the shared_info page. This is because KVM may not be aware of + the Xen CPU id which is used as the index into the vcpu_info[] + array, so may not know the correct default location. Note that the shared_info page may be constantly written to by KVM; it contains the event channel bitmap used to deliver interrupts to @@ -5649,6 +5650,21 @@ KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO on dirty logging. Setting the gpa to KVM_XEN_INVALID_GPA will disable the vcpu_info. +KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA + If the KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA flag is also set in the + Xen capabilities, then this attribute may be used to set the + userspace address of the vcpu_info for a given vCPU. It should + only be used when the vcpu_info resides at the "default" location + in the shared_info page. In this case it is safe to assume the + userspace address will not change, because the shared_info page is + an overlay on guest memory and remains at a fixed host address + regardless of where it is mapped in guest physical address space + and hence unnecessary invalidation of an internal cache may be + avoided if the guest memory layout is modified. + If the vcpu_info does not reside at the "default" location then + it is not guaranteed to remain at the same host address and + hence the aforementioned cache invalidation is required. + KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO Sets the guest physical address of an additional pvclock structure for a given vCPU. This is typically used for guest vsyscall support.
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index cca72e6022bfd0..16e76a6279efee 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -782,20 +782,33 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) switch (data->type) { case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA: /* No compat necessary here. */ BUILD_BUG_ON(sizeof(struct vcpu_info) != sizeof(struct compat_vcpu_info)); BUILD_BUG_ON(offsetof(struct vcpu_info, time) != offsetof(struct compat_vcpu_info, time)); - if (data->u.gpa == KVM_XEN_INVALID_GPA) { - kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); - r = 0; - break; + if (data->type == KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO) { + if (data->u.gpa == KVM_XEN_INVALID_GPA) { + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); + r = 0; + break; + } + + r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache, + data->u.gpa, sizeof(struct vcpu_info)); + } else { + if (data->u.hva == 0) { + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); + r = 0; + break; + } + + r = kvm_gpc_activate_hva(&vcpu->arch.xen.vcpu_info_cache, + data->u.hva, sizeof(struct vcpu_info)); } - r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache, - data->u.gpa, sizeof(struct vcpu_info)); if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -1017,13 +1030,21 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) switch (data->type) { case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: - if (vcpu->arch.xen.vcpu_info_cache.active) + if (kvm_gpc_is_gpa_active(&vcpu->arch.xen.vcpu_info_cache)) data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa; else data->u.gpa = KVM_XEN_INVALID_GPA; r = 0; break; + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA: + if (kvm_gpc_is_hva_active(&vcpu->arch.xen.vcpu_info_cache)) + data->u.hva = vcpu->arch.xen.vcpu_info_cache.uhva; + else + data->u.hva = 0; + r = 0; + break; + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: if (vcpu->arch.xen.vcpu_time_info_cache.active) data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ac5caba313d1e8..d2665319db6e1c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1809,6 +1809,7 @@ struct kvm_xen_vcpu_attr { union { __u64 gpa; #define KVM_XEN_INVALID_GPA ((__u64)-1) + __u64 hva; __u64 pad[8]; struct { __u64 state; @@ -1839,6 +1840,8 @@ struct kvm_xen_vcpu_attr { #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6 #define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7 #define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA 0x9 /* Secure Encrypted Virtualization command */ enum sev_cmd_id { From 44e4a616b5a6b175fc0c093ee2ec65fc9c0bc480 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 15 Feb 2024 15:29:09 +0000 Subject: [PATCH 0806/1406] KVM: selftests: map Xen's shared_info page using HVA rather than GFN Using the HVA of the shared_info page is more efficient, so if the capability (KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA) is present use that method to do the mapping. NOTE: Have the juggle_shinfo_state() thread map and unmap using both GFN and HVA, to make sure the older mechanism is not broken. 
Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240215152916.1158-15-paul@xen.org Signed-off-by: Sean Christopherson --- .../selftests/kvm/x86_64/xen_shinfo_test.c | 44 +++++++++++++++---- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index 9ec9ab60b63ee2..a61500ff0822b7 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -389,6 +389,7 @@ static int cmp_timespec(struct timespec *a, struct timespec *b) return 0; } +static struct shared_info *shinfo; static struct vcpu_info *vinfo; static struct kvm_vcpu *vcpu; @@ -404,20 +405,38 @@ static void *juggle_shinfo_state(void *arg) { struct kvm_vm *vm = (struct kvm_vm *)arg; - struct kvm_xen_hvm_attr cache_activate = { + struct kvm_xen_hvm_attr cache_activate_gfn = { .type = KVM_XEN_ATTR_TYPE_SHARED_INFO, .u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE }; - struct kvm_xen_hvm_attr cache_deactivate = { + struct kvm_xen_hvm_attr cache_deactivate_gfn = { .type = KVM_XEN_ATTR_TYPE_SHARED_INFO, .u.shared_info.gfn = KVM_XEN_INVALID_GFN }; + struct kvm_xen_hvm_attr cache_activate_hva = { + .type = KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA, + .u.shared_info.hva = (unsigned long)shinfo + }; + + struct kvm_xen_hvm_attr cache_deactivate_hva = { + .type = KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA, + .u.shared_info.hva = 0 + }; + + int xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM); + for (;;) { - __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_activate); - __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_deactivate); + __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_activate_gfn); pthread_testcancel(); + __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_deactivate_gfn); + + if (xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA) { + __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_activate_hva); + pthread_testcancel(); + __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_deactivate_hva); + } } return NULL; @@ -442,6 +461,7 @@ int main(int argc, char *argv[]) bool do_runstate_flag = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG); bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL); bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND); + bool has_shinfo_hva = !!(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA); clock_gettime(CLOCK_REALTIME, &min_ts); @@ -452,7 +472,7 @@ int main(int argc, char *argv[]) SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 3, 0); virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 3); - struct shared_info *shinfo = addr_gpa2hva(vm, SHINFO_VADDR); + shinfo = addr_gpa2hva(vm, SHINFO_VADDR); int zero_fd = open("/dev/zero", O_RDONLY); TEST_ASSERT(zero_fd != -1, "Failed to open /dev/zero"); @@ -488,10 +508,16 @@ int main(int argc, char *argv[]) "Failed to read back RUNSTATE_UPDATE_FLAG attr"); } - struct kvm_xen_hvm_attr ha = { - .type = KVM_XEN_ATTR_TYPE_SHARED_INFO, - .u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE, - }; + struct kvm_xen_hvm_attr ha = {}; + + if (has_shinfo_hva) { + ha.type = KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA; + ha.u.shared_info.hva = (unsigned long)shinfo; + } else { + ha.type = KVM_XEN_ATTR_TYPE_SHARED_INFO; + ha.u.shared_info.gfn = SHINFO_ADDR / PAGE_SIZE; + } + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ha); /* From 7508f933cb2571d75244b05736174be590b3c3b9 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 15 Feb 2024 15:29:10 +0000 Subject: [PATCH 0807/1406] KVM: selftests: re-map
Xen's vcpu_info using HVA rather than GPA If the relevant capability (KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA) is present then re-map vcpu_info using the HVA part way through the tests to make sure that there is no functional change. Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240215152916.1158-16-paul@xen.org Signed-off-by: Sean Christopherson --- .../selftests/kvm/x86_64/xen_shinfo_test.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index a61500ff0822b7..d2ea0435f4f763 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -62,6 +62,7 @@ enum { TEST_POLL_TIMEOUT, TEST_POLL_MASKED, TEST_POLL_WAKE, + SET_VCPU_INFO, TEST_TIMER_PAST, TEST_LOCKING_SEND_RACE, TEST_LOCKING_POLL_RACE, @@ -321,6 +322,10 @@ static void guest_code(void) GUEST_SYNC(TEST_POLL_WAKE); + /* Set the vcpu_info to point at exactly the place it already is to + * make sure the attribute is functional. */ + GUEST_SYNC(SET_VCPU_INFO); + /* A timer wake an *unmasked* port which should wake us with an * actual interrupt, while we're polling on a different port. */ ports[0]++; @@ -888,6 +893,16 @@ int main(int argc, char *argv[]) alarm(1); break; + case SET_VCPU_INFO: + if (has_shinfo_hva) { + struct kvm_xen_vcpu_attr vih = { + .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA, + .u.hva = (unsigned long)vinfo + }; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &vih); + } + break; + case TEST_TIMER_PAST: TEST_ASSERT(!evtchn_irq_expected, "Expected event channel IRQ but it didn't happen"); From cb3f9af9c117f5848c0fbfc14f0b9aaf61c53394 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 15 Feb 2024 15:29:11 +0000 Subject: [PATCH 0808/1406] KVM: x86/xen: advertize the KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA capability Now that all relevant kernel changes and selftests are in place, enable the new capability. Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240215152916.1158-17-paul@xen.org Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 415723a28dcec7..2911e6383fef30 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4682,7 +4682,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) KVM_XEN_HVM_CONFIG_SHARED_INFO | KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL | KVM_XEN_HVM_CONFIG_EVTCHN_SEND | - KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE; + KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE | + KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA; if (sched_info_on()) r |= KVM_XEN_HVM_CONFIG_RUNSTATE | KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG; From 939880d9bbdeb60af9acbda60b70a24a19f638d8 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 15 Feb 2024 15:29:14 +0000 Subject: [PATCH 0809/1406] KVM: pfncache: check the need for invalidation under read lock first When processing mmu_notifier invalidations for gpc caches, pre-check for overlap with the invalidation event while holding gpc->lock for read, and only take gpc->lock for write if the cache needs to be invalidated. Doing a pre-check without taking gpc->lock for write avoids unnecessarily contending the lock for unrelated invalidations, which is very beneficial for caches that are heavily used (but rarely subjected to mmu_notifier invalidations).
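Distilled, the per-cache body of the invalidation walk follows the classic double-checked locking shape (a sketch only; overlaps() is shorthand for the valid/pfn/uhva range test that the diff below spells out):

	read_lock_irq(&gpc->lock);
	if (!overlaps(gpc, start, end)) {
		/* Common case: no overlap, so no write-lock contention. */
		read_unlock_irq(&gpc->lock);
		continue;
	}
	read_unlock_irq(&gpc->lock);

	/*
	 * The cache may change between dropping the read lock and
	 * taking the write lock, so the overlap must be re-checked
	 * before invalidating.
	 */
	write_lock_irq(&gpc->lock);
	if (overlaps(gpc, start, end))
		gpc->valid = false;
	write_unlock_irq(&gpc->lock);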
Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240215152916.1158-20-paul@xen.org Signed-off-by: Sean Christopherson --- virt/kvm/pfncache.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index a47ca6fd75c27b..9ac8c9da4eda11 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -29,14 +29,30 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, spin_lock(&kvm->gpc_lock); list_for_each_entry(gpc, &kvm->gpc_list, list) { - write_lock_irq(&gpc->lock); + read_lock_irq(&gpc->lock); /* Only a single page so no need to care about length */ if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) && gpc->uhva >= start && gpc->uhva < end) { - gpc->valid = false; + read_unlock_irq(&gpc->lock); + + /* + * There is a small window here where the cache could + * be modified, and invalidation would no longer be + * necessary. Hence check again whether invalidation + * is still necessary once the write lock has been + * acquired. + */ + + write_lock_irq(&gpc->lock); + if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) && + gpc->uhva >= start && gpc->uhva < end) + gpc->valid = false; + write_unlock_irq(&gpc->lock); + continue; } - write_unlock_irq(&gpc->lock); + + read_unlock_irq(&gpc->lock); } spin_unlock(&kvm->gpc_lock); } From 498eebba628b8de860b22534e5764a5cf6a00372 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 15 Feb 2024 15:29:15 +0000 Subject: [PATCH 0810/1406] KVM: x86/xen: allow vcpu_info content to be 'safely' copied If the guest sets an explicit vcpu_info GPA then, for any of the first 32 vCPUs, the content of the default vcpu_info in the shared_info page must be copied into the new location. Because this copy may race with event delivery (which updates the 'evtchn_pending_sel' field in vcpu_info), event delivery needs to be deferred until the copy is complete. Happily there is already a shadow of 'evtchn_pending_sel' in kvm_vcpu_xen that is used in atomic context if the vcpu_info PFN cache has been invalidated so that the update of vcpu_info can be deferred until the cache can be refreshed (on the vCPU thread's way back into guest context). Use this shadow if the vcpu_info cache has been *deactivated*, so that the VMM can safely copy the vcpu_info content and then re-activate the cache with the new GPA. To do this, stop considering an inactive vcpu_info cache as a hard error in kvm_xen_set_evtchn_fast(), and let the existing kvm_gpc_check() fail and kick the vCPU (if necessary).
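The VMM-side sequence this enables looks roughly as follows (a sketch only; vcpu_fd, old_hva, new_hva and new_gpa are placeholders for the VMM's own bookkeeping, and error handling is elided):

	struct kvm_xen_vcpu_attr va = {
		.type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
		.u.gpa = KVM_XEN_INVALID_GPA,
	};

	/* 1. Deactivate the cache; events delivered from here on are
	 *    captured in the 'evtchn_pending_sel' shadow. */
	ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &va);

	/* 2. Copy the default vcpu_info out of the shared_info page. */
	memcpy(new_hva, old_hva, sizeof(struct vcpu_info));

	/* 3. Re-activate at the guest's chosen GPA; the shadow is folded
	 *    back into vcpu_info on the way back into the guest. */
	va.u.gpa = new_gpa;
	ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &va);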
Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240215152916.1158-21-paul@xen.org [sean: add a bit of verbosity to the changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 16e76a6279efee..01c0fd138d2fb7 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1697,9 +1697,6 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx); } - if (!vcpu->arch.xen.vcpu_info_cache.active) - return -EINVAL; - if (xe->port >= max_evtchn_port(kvm)) return -EINVAL; From 5d71c8016898b87d4ac11a3454fd11d1769d6099 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 18 Jan 2024 16:14:22 +0100 Subject: [PATCH 0811/1406] kunit: tool: Print UML command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As for the Qemu command, print the command used to run tests with UML. Cc: Brendan Higgins Cc: David Gow Signed-off-by: Mickaël Salaün Reviewed-by: David Gow Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit_kernel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py index 0b6488efed47ac..7254c110ff23ab 100644 --- a/tools/testing/kunit/kunit_kernel.py +++ b/tools/testing/kunit/kunit_kernel.py @@ -146,6 +146,7 @@ def start(self, params: List[str], build_dir: str) -> subprocess.Popen: """Runs the Linux UML binary. Must be named 'linux'.""" linux_bin = os.path.join(build_dir, 'linux') params.extend(['mem=1G', 'console=tty', 'kunit_shutdown=halt']) + print('Running tests with:\n$', linux_bin, ' '.join(shlex.quote(arg) for arg in params)) return subprocess.Popen([linux_bin] + params, stdin=subprocess.PIPE, stdout=subprocess.PIPE, From 08c454e26daab6f843e5883fb96f680f11784fa6 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Mon, 22 Jan 2024 09:14:08 -0800 Subject: [PATCH 0812/1406] kunit: Mark filter* params as rw By allowing the filter_glob parameter to be written to, it's possible to tweak the testsuites that will be executed on new module loads. This makes it easier to run specific tests without having to reload kunit and provides a way to filter tests on real HW even if kunit is builtin. Example for xe driver: 1) Run just 1 test # echo -n xe_bo > /sys/module/kunit/parameters/filter_glob # modprobe -r xe_live_test # modprobe xe_live_test # ls /sys/kernel/debug/kunit/ xe_bo 2) Run all tests # echo \* > /sys/module/kunit/parameters/filter_glob # modprobe -r xe_live_test # modprobe xe_live_test # ls /sys/kernel/debug/kunit/ xe_bo xe_dma_buf xe_migrate xe_mocs For completeness and to cover other use cases, also change filter and filter_action to rw. 
Link: https://lore.kernel.org/intel-xe/dzacvbdditbneiu3e3fmstjmttcbne44yspumpkd6sjn56jqpk@vxu7sksbqrp6/ Reviewed-by: Rae Moar Signed-off-by: Lucas De Marchi Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/executor.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/kunit/executor.c b/lib/kunit/executor.c index 689fff2b2b106a..70b9a43cd25716 100644 --- a/lib/kunit/executor.c +++ b/lib/kunit/executor.c @@ -33,13 +33,13 @@ static char *filter_glob_param; static char *filter_param; static char *filter_action_param; -module_param_named(filter_glob, filter_glob_param, charp, 0400); +module_param_named(filter_glob, filter_glob_param, charp, 0600); MODULE_PARM_DESC(filter_glob, "Filter which KUnit test suites/tests run at boot-time, e.g. list* or list*.*del_test"); -module_param_named(filter, filter_param, charp, 0400); +module_param_named(filter, filter_param, charp, 0600); MODULE_PARM_DESC(filter, "Filter which KUnit test suites/tests run at boot-time using attributes, e.g. speed>slow"); -module_param_named(filter_action, filter_action_param, charp, 0400); +module_param_named(filter_action, filter_action_param, charp, 0600); MODULE_PARM_DESC(filter_action, "Changes behavior of filtered tests using attributes, valid values are:\n" ": do not run filtered tests as normal\n" From babb46746cc5683fc930fea7d0ef6d5323d6a6cd Mon Sep 17 00:00:00 2001 From: Jan Kratochvil Date: Mon, 12 Feb 2024 21:32:56 +0800 Subject: [PATCH 0813/1406] Fix cpupower-frequency-info.1 man page typo The option table in utils/cpufreq-info.c reads {"related-cpus", no_argument, NULL, 'r'}, {"affected-cpus", no_argument, NULL, 'a'}, so the short option for --related-cpus is -r, not the -a that the man page documents. Fix the man page accordingly. Signed-off-by: Jan Kratochvil Signed-off-by: Shuah Khan --- tools/power/cpupower/man/cpupower-frequency-info.1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/cpupower/man/cpupower-frequency-info.1 b/tools/power/cpupower/man/cpupower-frequency-info.1 index dd545b499480bd..47fdd72187487e 100644 --- a/tools/power/cpupower/man/cpupower-frequency-info.1 +++ b/tools/power/cpupower/man/cpupower-frequency-info.1 @@ -32,7 +32,7 @@ Gets the currently used cpufreq policy. \fB\-g\fR \fB\-\-governors\fR Determines available cpufreq governors. .TP -\fB\-a\fR \fB\-\-related\-cpus\fR +\fB\-r\fR \fB\-\-related\-cpus\fR Determines which CPUs run at the same hardware frequency.
.TP \fB\-a\fR \fB\-\-affected\-cpus\fR From bb8576deed306ec1ccebdccc4aa9664ed2db5514 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 12 Feb 2024 16:00:50 +0100 Subject: [PATCH 0814/1406] pidfd: move struct pidfd_fops Signed-off-by: Christian Brauner --- fs/Makefile | 2 +- fs/pidfs.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/fork.c | 110 --------------------------------------------- 3 files changed, 123 insertions(+), 111 deletions(-) create mode 100644 fs/pidfs.c diff --git a/fs/Makefile b/fs/Makefile index c09016257f05e8..12530167398583 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -15,7 +15,7 @@ obj-y := open.o read_write.o file_table.o super.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ - kernel_read_file.o mnt_idmapping.o remap_range.o + kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o obj-$(CONFIG_PROC_FS) += proc_namespace.o diff --git a/fs/pidfs.c b/fs/pidfs.c new file mode 100644 index 00000000000000..eccb291862a06d --- /dev/null +++ b/fs/pidfs.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int pidfd_release(struct inode *inode, struct file *file) +{ + struct pid *pid = file->private_data; + + file->private_data = NULL; + put_pid(pid); + return 0; +} + +#ifdef CONFIG_PROC_FS +/** + * pidfd_show_fdinfo - print information about a pidfd + * @m: proc fdinfo file + * @f: file referencing a pidfd + * + * Pid: + * This function will print the pid that a given pidfd refers to in the + * pid namespace of the procfs instance. + * If the pid namespace of the process is not a descendant of the pid + * namespace of the procfs instance 0 will be shown as its pid. This is + * similar to calling getppid() on a process whose parent is outside of + * its pid namespace. + * + * NSpid: + * If pid namespaces are supported then this function will also print + * the pid of a given pidfd refers to for all descendant pid namespaces + * starting from the current pid namespace of the instance, i.e. the + * Pid field and the first entry in the NSpid field will be identical. + * If the pid namespace of the process is not a descendant of the pid + * namespace of the procfs instance 0 will be shown as its first NSpid + * entry and no others will be shown. + * Note that this differs from the Pid and NSpid fields in + * /proc//status where Pid and NSpid are always shown relative to + * the pid namespace of the procfs instance. The difference becomes + * obvious when sending around a pidfd between pid namespaces from a + * different branch of the tree, i.e. 
where no ancestral relation is + * present between the pid namespaces: + * - create two new pid namespaces ns1 and ns2 in the initial pid + * namespace (also take care to create new mount namespaces in the + * new pid namespace and mount procfs) + * - create a process with a pidfd in ns1 + * - send pidfd from ns1 to ns2 + * - read /proc/self/fdinfo/ and observe that both Pid and NSpid + * have exactly one entry, which is 0 + */ +static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct pid *pid = f->private_data; + struct pid_namespace *ns; + pid_t nr = -1; + + if (likely(pid_has_task(pid, PIDTYPE_PID))) { + ns = proc_pid_ns(file_inode(m->file)->i_sb); + nr = pid_nr_ns(pid, ns); + } + + seq_put_decimal_ll(m, "Pid:\t", nr); + +#ifdef CONFIG_PID_NS + seq_put_decimal_ll(m, "\nNSpid:\t", nr); + if (nr > 0) { + int i; + + /* If nr is non-zero it means that 'pid' is valid and that + * ns, i.e. the pid namespace associated with the procfs + * instance, is in the pid namespace hierarchy of pid. + * Start at one below the already printed level. + */ + for (i = ns->level + 1; i <= pid->level; i++) + seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); + } +#endif + seq_putc(m, '\n'); +} +#endif + +/* + * Poll support for process exit notification. + */ +static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) +{ + struct pid *pid = file->private_data; + bool thread = file->f_flags & PIDFD_THREAD; + struct task_struct *task; + __poll_t poll_flags = 0; + + poll_wait(file, &pid->wait_pidfd, pts); + /* + * Depending on PIDFD_THREAD, inform pollers when the thread + * or the whole thread-group exits. + */ + guard(rcu)(); + task = pid_task(pid, PIDTYPE_PID); + if (!task) + poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; + else if (task->exit_state && (thread || thread_group_empty(task))) + poll_flags = EPOLLIN | EPOLLRDNORM; + + return poll_flags; +} + +const struct file_operations pidfd_fops = { + .release = pidfd_release, + .poll = pidfd_poll, +#ifdef CONFIG_PROC_FS + .show_fdinfo = pidfd_show_fdinfo, +#endif +}; diff --git a/kernel/fork.c b/kernel/fork.c index 3f22ec90c5c615..662a61f340ce86 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1993,116 +1993,6 @@ struct pid *pidfd_pid(const struct file *file) return ERR_PTR(-EBADF); } -static int pidfd_release(struct inode *inode, struct file *file) -{ - struct pid *pid = file->private_data; - - file->private_data = NULL; - put_pid(pid); - return 0; -} - -#ifdef CONFIG_PROC_FS -/** - * pidfd_show_fdinfo - print information about a pidfd - * @m: proc fdinfo file - * @f: file referencing a pidfd - * - * Pid: - * This function will print the pid that a given pidfd refers to in the - * pid namespace of the procfs instance. - * If the pid namespace of the process is not a descendant of the pid - * namespace of the procfs instance 0 will be shown as its pid. This is - * similar to calling getppid() on a process whose parent is outside of - * its pid namespace. - * - * NSpid: - * If pid namespaces are supported then this function will also print - * the pid of a given pidfd refers to for all descendant pid namespaces - * starting from the current pid namespace of the instance, i.e. the - * Pid field and the first entry in the NSpid field will be identical. - * If the pid namespace of the process is not a descendant of the pid - * namespace of the procfs instance 0 will be shown as its first NSpid - * entry and no others will be shown. 
- * Note that this differs from the Pid and NSpid fields in - * /proc//status where Pid and NSpid are always shown relative to - * the pid namespace of the procfs instance. The difference becomes - * obvious when sending around a pidfd between pid namespaces from a - * different branch of the tree, i.e. where no ancestral relation is - * present between the pid namespaces: - * - create two new pid namespaces ns1 and ns2 in the initial pid - * namespace (also take care to create new mount namespaces in the - * new pid namespace and mount procfs) - * - create a process with a pidfd in ns1 - * - send pidfd from ns1 to ns2 - * - read /proc/self/fdinfo/ and observe that both Pid and NSpid - * have exactly one entry, which is 0 - */ -static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) -{ - struct pid *pid = f->private_data; - struct pid_namespace *ns; - pid_t nr = -1; - - if (likely(pid_has_task(pid, PIDTYPE_PID))) { - ns = proc_pid_ns(file_inode(m->file)->i_sb); - nr = pid_nr_ns(pid, ns); - } - - seq_put_decimal_ll(m, "Pid:\t", nr); - -#ifdef CONFIG_PID_NS - seq_put_decimal_ll(m, "\nNSpid:\t", nr); - if (nr > 0) { - int i; - - /* If nr is non-zero it means that 'pid' is valid and that - * ns, i.e. the pid namespace associated with the procfs - * instance, is in the pid namespace hierarchy of pid. - * Start at one below the already printed level. - */ - for (i = ns->level + 1; i <= pid->level; i++) - seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); - } -#endif - seq_putc(m, '\n'); -} -#endif - -/* - * Poll support for process exit notification. - */ -static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) -{ - struct pid *pid = file->private_data; - bool thread = file->f_flags & PIDFD_THREAD; - struct task_struct *task; - __poll_t poll_flags = 0; - - poll_wait(file, &pid->wait_pidfd, pts); - /* - * Depending on PIDFD_THREAD, inform pollers when the thread - * or the whole thread-group exits. - */ - rcu_read_lock(); - task = pid_task(pid, PIDTYPE_PID); - if (!task) - poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; - else if (task->exit_state && (thread || thread_group_empty(task))) - poll_flags = EPOLLIN | EPOLLRDNORM; - rcu_read_unlock(); - - return poll_flags; -} - -const struct file_operations pidfd_fops = { - .release = pidfd_release, - .poll = pidfd_poll, -#ifdef CONFIG_PROC_FS - .show_fdinfo = pidfd_show_fdinfo, -#endif -}; - /** * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd * @pid: the struct pid for which to create a pidfd From 193d98b1d3aa72761cf7a9cae5f8ac9933ef4d0a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 12 Feb 2024 16:32:38 +0100 Subject: [PATCH 0815/1406] pidfd: add pidfs This moves pidfds from the anonymous inode infrastructure to a tiny pseudo filesystem. This has been on my todo for quite a while as it will unblock further work that we weren't able to do simply because of the very justified limitations of anonymous inodes. Moving pidfds to a tiny pseudo filesystem allows: * statx() on pidfds becomes useful for the first time. * pidfds can be compared simply via statx() and then comparing inode numbers. * pidfds have unique inode numbers for the system lifetime. * struct pid is now stashed in inode->i_private instead of file->private_data. This means it is now possible to introduce concepts that operate on a process once all file descriptors have been closed. A concrete example is kill-on-last-close. * file->private_data is freed up for per-file options for pidfds. 
* Each struct pid will refer to a different inode but the same struct pid will refer to the same inode if it's opened multiple times. In contrast to now where each struct pid refers to the same inode. Even if we were to move to anon_inode_create_getfile() which creates new inodes we'd still be associating the same struct pid with multiple different inodes. The tiny pseudo filesystem is not visible anywhere in userspace exactly like e.g., pipefs and sockfs. There's no lookup, there's no complex inode operations, nothing. Dentries and inodes are always deleted when the last pidfd is closed. We allocate a new inode for each struct pid and we reuse that inode for all pidfds. We use iget_locked() to find that inode again based on the inode number which isn't recycled. We allocate a new dentry for each pidfd that uses the same inode. That is similar to anonymous inodes which reuse the same inode for thousands of dentries. For pidfds we're talking way less than that. There usually won't be a lot of concurrent openers of the same struct pid. They can probably often be counted on two hands. I know that systemd does use separate pidfd for the same struct pid for various complex process tracking issues. So I think with that things actually become way simpler. Especially because we don't have to care about lookup. Dentries and inodes continue to be always deleted. The code is entirely optional and fairly small. If it's not selected we fallback to anonymous inodes. Heavily inspired by nsfs which uses a similar stashing mechanism just for namespaces. Signed-off-by: Christian Brauner --- fs/Kconfig | 7 ++ fs/pidfs.c | 127 ++++++++++++++++++++++++++++++++++++- include/linux/pid.h | 5 +- include/linux/pidfs.h | 10 +++ include/uapi/linux/magic.h | 1 + init/main.c | 2 + kernel/fork.c | 13 +--- kernel/nsproxy.c | 2 +- kernel/pid.c | 7 ++ 9 files changed, 158 insertions(+), 16 deletions(-) create mode 100644 include/linux/pidfs.h diff --git a/fs/Kconfig b/fs/Kconfig index 89fdbefd1075f8..f3dbd84a0e40a0 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -174,6 +174,13 @@ source "fs/proc/Kconfig" source "fs/kernfs/Kconfig" source "fs/sysfs/Kconfig" +config FS_PID + bool "Pseudo filesystem for process file descriptors" + depends on 64BIT + default y + help + Pidfs implements advanced features for process file descriptors. 
+ config TMPFS bool "Tmpfs virtual memory file system support (former shm fs)" depends on SHMEM diff --git a/fs/pidfs.c b/fs/pidfs.c index eccb291862a06d..9796a15dec1ce3 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -1,9 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include #include #include +#include #include #include #include @@ -12,12 +14,25 @@ #include #include +struct pid *pidfd_pid(const struct file *file) +{ + if (file->f_op != &pidfd_fops) + return ERR_PTR(-EBADF); +#ifdef CONFIG_FS_PID + return file_inode(file)->i_private; +#else + return file->private_data; +#endif +} + static int pidfd_release(struct inode *inode, struct file *file) { +#ifndef CONFIG_FS_PID struct pid *pid = file->private_data; file->private_data = NULL; put_pid(pid); +#endif return 0; } @@ -59,7 +74,7 @@ static int pidfd_release(struct inode *inode, struct file *file) */ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) { - struct pid *pid = f->private_data; + struct pid *pid = pidfd_pid(f); struct pid_namespace *ns; pid_t nr = -1; @@ -93,7 +108,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) */ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { - struct pid *pid = file->private_data; + struct pid *pid = pidfd_pid(file); bool thread = file->f_flags & PIDFD_THREAD; struct task_struct *task; __poll_t poll_flags = 0; @@ -120,3 +135,111 @@ const struct file_operations pidfd_fops = { .show_fdinfo = pidfd_show_fdinfo, #endif }; + +#ifdef CONFIG_FS_PID +static struct vfsmount *pidfs_mnt __ro_after_init; +static struct super_block *pidfs_sb __ro_after_init; + +static void pidfs_evict_inode(struct inode *inode) +{ + struct pid *pid = inode->i_private; + + clear_inode(inode); + put_pid(pid); +} + +static const struct super_operations pidfs_sops = { + .drop_inode = generic_delete_inode, + .evict_inode = pidfs_evict_inode, + .statfs = simple_statfs, +}; + +static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) +{ + return dynamic_dname(buffer, buflen, "pidfd:[%lu]", + d_inode(dentry)->i_ino); +} + +const struct dentry_operations pidfs_dentry_operations = { + .d_delete = always_delete_dentry, + .d_dname = pidfs_dname, +}; + +static int pidfs_init_fs_context(struct fs_context *fc) +{ + struct pseudo_fs_context *ctx; + + ctx = init_pseudo(fc, PID_FS_MAGIC); + if (!ctx) + return -ENOMEM; + + ctx->ops = &pidfs_sops; + ctx->dops = &pidfs_dentry_operations; + return 0; +} + +static struct file_system_type pidfs_type = { + .name = "pidfs", + .init_fs_context = pidfs_init_fs_context, + .kill_sb = kill_anon_super, +}; + +struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) +{ + + struct inode *inode; + struct file *pidfd_file; + + inode = iget_locked(pidfs_sb, pid->ino); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + inode->i_ino = pid->ino; + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_fop = &pidfd_fops; + inode->i_flags |= S_IMMUTABLE; + inode->i_private = get_pid(pid); + simple_inode_init_ts(inode); + unlock_new_inode(inode); + } + + pidfd_file = alloc_file_pseudo(inode, pidfs_mnt, "", flags, &pidfd_fops); + if (IS_ERR(pidfd_file)) + iput(inode); + + return pidfd_file; +} + +void __init pidfs_init(void) +{ + int err; + + err = register_filesystem(&pidfs_type); + if (err) + panic("Failed to register pidfs pseudo filesystem"); + + pidfs_mnt = kern_mount(&pidfs_type); + if (IS_ERR(pidfs_mnt)) + panic("Failed to mount pidfs pseudo filesystem"); + + pidfs_sb = pidfs_mnt->mnt_sb; 
+} + +#else /* !CONFIG_FS_PID */ + +struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) +{ + struct file *pidfd_file; + + pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, + flags | O_RDWR); + if (IS_ERR(pidfd_file)) + return pidfd_file; + + get_pid(pid); + return pidfd_file; +} + +void __init pidfs_init(void) { } +#endif diff --git a/include/linux/pid.h b/include/linux/pid.h index 8124d57752b938..956481128e8d42 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -55,6 +55,9 @@ struct pid refcount_t count; unsigned int level; spinlock_t lock; +#ifdef CONFIG_FS_PID + unsigned long ino; +#endif /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; struct hlist_head inodes; @@ -66,8 +69,6 @@ struct pid extern struct pid init_struct_pid; -extern const struct file_operations pidfd_fops; - struct file; struct pid *pidfd_pid(const struct file *file); diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h new file mode 100644 index 00000000000000..ca6f4023a829b9 --- /dev/null +++ b/include/linux/pidfs.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PID_FS_H +#define _LINUX_PID_FS_H + +extern const struct file_operations pidfd_fops; + +struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); +void __init pidfs_init(void); + +#endif /* _LINUX_PID_FS_H */ diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 6325d1d0e90f5d..1b40a968ba91fc 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -101,5 +101,6 @@ #define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */ #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ +#define PID_FS_MAGIC 0x50494446 /* "PIDF" */ #endif /* __LINUX_MAGIC_H__ */ diff --git a/init/main.c b/init/main.c index e24b0780fdff7a..2fbf6a3114d57a 100644 --- a/init/main.c +++ b/init/main.c @@ -99,6 +99,7 @@ #include #include #include +#include #include #include @@ -1059,6 +1060,7 @@ void start_kernel(void) seq_file_init(); proc_root_init(); nsfs_init(); + pidfs_init(); cpuset_init(); cgroup_init(); taskstats_init_early(); diff --git a/kernel/fork.c b/kernel/fork.c index 662a61f340ce86..2f839c290dcf29 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -102,6 +102,7 @@ #include #include #include +#include #include #include @@ -1985,14 +1986,6 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } -struct pid *pidfd_pid(const struct file *file) -{ - if (file->f_op == &pidfd_fops) - return file->private_data; - - return ERR_PTR(-EBADF); -} - /** * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd * @pid: the struct pid for which to create a pidfd @@ -2030,13 +2023,11 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re if (pidfd < 0) return pidfd; - pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, - flags | O_RDWR); + pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR); if (IS_ERR(pidfd_file)) { put_unused_fd(pidfd); return PTR_ERR(pidfd_file); } - get_pid(pid); /* held by pidfd_file now */ /* * anon_inode_getfile() ignores everything outside of the * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually. 
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 15781acaac1cee..6ec3deec68c200 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -573,7 +573,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) if (proc_ns_file(f.file)) err = validate_ns(&nsset, ns); else - err = validate_nsset(&nsset, f.file->private_data); + err = validate_nsset(&nsset, pidfd_pid(f.file)); if (!err) { commit_nsset(&nsset); perf_event_namespaces(current); diff --git a/kernel/pid.c b/kernel/pid.c index c1d940fbd3140a..7f8d029d5ad138 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -65,6 +66,9 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; +#ifdef CONFIG_FS_PID +static u64 pidfs_ino = 0; +#endif /* * PID-map pages start out as NULL, they get allocated upon @@ -272,6 +276,9 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; +#ifdef CONFIG_FS_PID + pid->ino = ++pidfs_ino; +#endif for ( ; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); From 5d910367b75254981f5ebb40b0dadca91a8c2912 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 18 Feb 2024 14:50:13 +0100 Subject: [PATCH 0816/1406] libfs: add path_from_stashed() Add a helper for both nsfs and pidfs to reuse an already stashed dentry or to add and stash a new dentry. Link: https://lore.kernel.org/r/20240218-neufahrzeuge-brauhaus-fb0eb6459771@brauner Signed-off-by: Christian Brauner --- fs/internal.h | 3 ++ fs/libfs.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/fs/internal.h b/fs/internal.h index b67406435fc027..cfddaec6fbf641 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -310,3 +310,6 @@ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *po struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns); struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap); void mnt_idmap_put(struct mnt_idmap *idmap); +int path_from_stashed(struct dentry **stashed, unsigned long ino, + struct vfsmount *mnt, const struct file_operations *fops, + void *data, struct path *path); diff --git a/fs/libfs.c b/fs/libfs.c index eec6031b015544..8f115b6d212bed 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1973,3 +1973,96 @@ struct timespec64 simple_inode_init_ts(struct inode *inode) return ts; } EXPORT_SYMBOL(simple_inode_init_ts); + +static inline struct dentry *get_stashed_dentry(struct dentry *stashed) +{ + struct dentry *dentry; + + rcu_read_lock(); + dentry = READ_ONCE(stashed); + if (!dentry || !lockref_get_not_dead(&dentry->d_lockref)) + dentry = NULL; + rcu_read_unlock(); + return dentry; +} + +static struct dentry *stash_dentry(struct dentry **stashed, unsigned long ino, + struct super_block *sb, + const struct file_operations *fops, + void *data) +{ + struct dentry *dentry; + struct inode *inode; + + dentry = d_alloc_anon(sb); + if (!dentry) + return ERR_PTR(-ENOMEM); + + inode = new_inode_pseudo(sb); + if (!inode) { + dput(dentry); + return ERR_PTR(-ENOMEM); + } + + inode->i_ino = ino; + inode->i_flags |= S_IMMUTABLE; + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_fop = fops; + inode->i_private = data; + simple_inode_init_ts(inode); + + /* @data is now owned by the fs */ + d_instantiate(dentry, inode); + + if (cmpxchg(stashed, NULL, dentry)) { + d_delete(dentry); /* make sure 
->d_prune() does nothing */ + dput(dentry); + cpu_relax(); + return ERR_PTR(-EAGAIN); + } + + return dentry; +} + +/** + * path_from_stashed - create path from stashed or new dentry + * @stashed: where to retrieve or stash dentry + * @ino: inode number to use + * @mnt: mnt of the filesystems to use + * @fops: file operations to use + * @data: data to store in inode->i_private + * @path: path to create + * + * The function tries to retrieve a stashed dentry from @stashed. If the dentry + * is still valid then it will be reused. If the dentry isn't usable the function + * will allocate a new dentry and inode. It will then try to update @stashed + * with the newly added dentry. If it fails -EAGAIN is returned and the caller + * may retry. + * + * Special-purpose helper for nsfs and pidfs. + * + * Return: If 0 or an error is returned the caller can be sure that @data must + * be cleaned up. If 1 or -EAGAIN is returned @data is owned by the + * filesystem. + */ +int path_from_stashed(struct dentry **stashed, unsigned long ino, + struct vfsmount *mnt, const struct file_operations *fops, + void *data, struct path *path) +{ + struct dentry *dentry; + int ret = 0; + + dentry = get_stashed_dentry(*stashed); + if (dentry) + goto out_path; + + dentry = stash_dentry(stashed, ino, mnt->mnt_sb, fops, data); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + ret = 1; + +out_path: + path->dentry = dentry; + path->mnt = mntget(mnt); + return ret; +} From b46d6924a876dbc445e63e8ce1fed2b3d2f54081 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 18 Feb 2024 14:51:23 +0100 Subject: [PATCH 0817/1406] nsfs: convert to path_from_stashed() helper Link: https://lore.kernel.org/r/20240218-neufahrzeuge-brauhaus-fb0eb6459771@brauner Signed-off-by: Christian Brauner --- fs/nsfs.c | 73 ++++++++++----------------------------- include/linux/ns_common.h | 2 +- include/linux/proc_ns.h | 2 +- 3 files changed, 20 insertions(+), 57 deletions(-) diff --git a/fs/nsfs.c b/fs/nsfs.c index 34e1e3e36733da..39dc2604bec01e 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -27,7 +27,8 @@ static const struct file_operations ns_file_operations = { static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) { struct inode *inode = d_inode(dentry); - const struct proc_ns_operations *ns_ops = dentry->d_fsdata; + struct ns_common *ns = inode->i_private; + const struct proc_ns_operations *ns_ops = ns->ops; return dynamic_dname(buffer, buflen, "%s:[%lu]", ns_ops->name, inode->i_ino); @@ -38,7 +39,7 @@ static void ns_prune_dentry(struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (inode) { struct ns_common *ns = inode->i_private; - atomic_long_set(&ns->stashed, 0); + WRITE_ONCE(ns->stashed, NULL); } } @@ -56,54 +57,6 @@ static void nsfs_evict(struct inode *inode) ns->ops->put(ns); } -static int __ns_get_path(struct path *path, struct ns_common *ns) -{ - struct vfsmount *mnt = nsfs_mnt; - struct dentry *dentry; - struct inode *inode; - unsigned long d; - - rcu_read_lock(); - d = atomic_long_read(&ns->stashed); - if (!d) - goto slow; - dentry = (struct dentry *)d; - if (!lockref_get_not_dead(&dentry->d_lockref)) - goto slow; - rcu_read_unlock(); - ns->ops->put(ns); -got_it: - path->mnt = mntget(mnt); - path->dentry = dentry; - return 0; -slow: - rcu_read_unlock(); - inode = new_inode_pseudo(mnt->mnt_sb); - if (!inode) { - ns->ops->put(ns); - return -ENOMEM; - } - inode->i_ino = ns->inum; - simple_inode_init_ts(inode); - inode->i_flags |= S_IMMUTABLE; - inode->i_mode = S_IFREG | S_IRUGO; - inode->i_fop = &ns_file_operations;
- inode->i_private = ns; - - dentry = d_make_root(inode); /* not the normal use, but... */ - if (!dentry) - return -ENOMEM; - dentry->d_fsdata = (void *)ns->ops; - d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry); - if (d) { - d_delete(dentry); /* make sure ->d_prune() does nothing */ - dput(dentry); - cpu_relax(); - return -EAGAIN; - } - goto got_it; -} - int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb, void *private_data) { @@ -113,10 +66,16 @@ int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb, struct ns_common *ns = ns_get_cb(private_data); if (!ns) return -ENOENT; - ret = __ns_get_path(path, ns); + ret = path_from_stashed(&ns->stashed, ns->inum, nsfs_mnt, + &ns_file_operations, ns, path); + if (ret <= 0 && ret != -EAGAIN) + ns->ops->put(ns); } while (ret == -EAGAIN); - return ret; + if (ret < 0) + return ret; + + return 0; } struct ns_get_path_task_args { @@ -163,10 +122,13 @@ int open_related_ns(struct ns_common *ns, return PTR_ERR(relative); } - err = __ns_get_path(&path, relative); + err = path_from_stashed(&relative->stashed, relative->inum, nsfs_mnt, + &ns_file_operations, relative, &path); + if (err <= 0 && err != -EAGAIN) + relative->ops->put(relative); } while (err == -EAGAIN); - if (err) { + if (err < 0) { put_unused_fd(fd); return err; } @@ -249,7 +211,8 @@ bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino) static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - const struct proc_ns_operations *ns_ops = dentry->d_fsdata; + const struct ns_common *ns = inode->i_private; + const struct proc_ns_operations *ns_ops = ns->ops; seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino); return 0; diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 0f1d024bd95826..7d22ea50b09841 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -7,7 +7,7 @@ struct proc_ns_operations; struct ns_common { - atomic_long_t stashed; + struct dentry *stashed; const struct proc_ns_operations *ops; unsigned int inum; refcount_t count; diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 49539bc416cecb..5ea470eb4d768a 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -66,7 +66,7 @@ static inline void proc_free_inum(unsigned int inum) {} static inline int ns_alloc_inum(struct ns_common *ns) { - atomic_long_set(&ns->stashed, 0); + WRITE_ONCE(ns->stashed, NULL); return proc_alloc_inum(&ns->inum); } From 26c747b339da14f146425672a0f6058e917628b8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 19 Feb 2024 16:30:57 +0100 Subject: [PATCH 0818/1406] pidfdfs: convert to path_from_stashed() helper Link: https://lore.kernel.org/r/20240218-neufahrzeuge-brauhaus-fb0eb6459771@brauner Signed-off-by: Christian Brauner --- fs/pidfs.c | 43 ++++++++++++++++++++++++++----------------- include/linux/pid.h | 1 + kernel/pid.c | 1 + 3 files changed, 28 insertions(+), 17 deletions(-) diff --git a/fs/pidfs.c b/fs/pidfs.c index 9796a15dec1ce3..ce539aa0341185 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -14,6 +14,8 @@ #include #include +#include "internal.h" + struct pid *pidfd_pid(const struct file *file) { if (file->f_op != &pidfd_fops) @@ -160,9 +162,21 @@ static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) d_inode(dentry)->i_ino); } +static void pidfdfs_prune_dentry(struct dentry *dentry) +{ + struct inode *inode; + + inode = d_inode(dentry); + if (inode) { + struct pid *pid = inode->i_private; + 
WRITE_ONCE(pid->stashed, NULL); + } +} + const struct dentry_operations pidfs_dentry_operations = { .d_delete = always_delete_dentry, .d_dname = pidfs_dname, + .d_prune = pidfdfs_prune_dentry, }; static int pidfs_init_fs_context(struct fs_context *fc) @@ -187,27 +201,22 @@ static struct file_system_type pidfs_type = { struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) { - struct inode *inode; struct file *pidfd_file; + struct path path; + int ret; - inode = iget_locked(pidfs_sb, pid->ino); - if (!inode) - return ERR_PTR(-ENOMEM); - - if (inode->i_state & I_NEW) { - inode->i_ino = pid->ino; - inode->i_mode = S_IFREG | S_IRUGO; - inode->i_fop = &pidfd_fops; - inode->i_flags |= S_IMMUTABLE; - inode->i_private = get_pid(pid); - simple_inode_init_ts(inode); - unlock_new_inode(inode); - } + do { + ret = path_from_stashed(&pid->stashed, pid->ino, pidfs_mnt, + &pidfd_fops, get_pid(pid), &path); + if (ret <= 0 && ret != -EAGAIN) + put_pid(pid); + } while (ret == -EAGAIN); - pidfd_file = alloc_file_pseudo(inode, pidfs_mnt, "", flags, &pidfd_fops); - if (IS_ERR(pidfd_file)) - iput(inode); + if (ret < 0) + return ERR_PTR(ret); + pidfd_file = dentry_open(&path, flags, current_cred()); + path_put(&path); return pidfd_file; } diff --git a/include/linux/pid.h b/include/linux/pid.h index 956481128e8d42..c79a0efd02586b 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -56,6 +56,7 @@ struct pid unsigned int level; spinlock_t lock; #ifdef CONFIG_FS_PID + struct dentry *stashed; unsigned long ino; #endif /* lists of tasks that use this pid */ diff --git a/kernel/pid.c b/kernel/pid.c index 7f8d029d5ad138..8ab095e9ff76a4 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -277,6 +277,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; #ifdef CONFIG_FS_PID + pid->stashed = NULL; + pid->ino = ++pidfs_ino; #endif for ( ; upid >= pid->numbers; --upid) { From 73d98ca0b54579908fa6a0a8df7f5bf2ddc38e54 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 18 Feb 2024 14:52:24 +0100 Subject: [PATCH 0819/1406] libfs: improve path_from_stashed() helper In earlier patches we moved both nsfs and pidfs to path_from_stashed(). The helper currently tries to add and stash a new dentry if a reusable dentry couldn't be found and returns EAGAIN if it lost the race to stash the dentry. The caller can use EAGAIN to retry. The helper and the two filesystems can be written in a way that makes returning EAGAIN unnecessary. To do this we need to change the dentry->d_prune() implementation of nsfs and pidfs to not simply replace the stashed dentry with NULL but to use a cmpxchg() and only replace their own dentry. path_from_stashed() can then be changed to not just stash a new dentry when no dentry is currently stashed but also when an already dead dentry is stashed. If another task managed to install a dentry in the meantime it can simply be reused. Pack that into a loop and call it a day.
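Reduced to its essentials, the resulting install-or-reuse loop (shown in full in the stash_dentry() hunk of the diff below) is:

	for (;;) {
		struct dentry *old;

		/* Nothing stashed (or the slot was cleared): install ours. */
		old = cmpxchg(stashed, NULL, dentry);
		if (!old)
			return dentry;

		/* Somebody else stashed a dentry; reuse it if still live. */
		if (lockref_get_not_dead(&old->d_lockref))
			return old;

		/* A dead dentry is stashed: try to take over the slot. */
		if (try_cmpxchg(stashed, &old, dentry))
			return dentry;
	}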
Suggested-by: Linus Torvalds Link: https://lore.kernel.org/r/CAHk-=wgtLF5Z5=15-LKAczWm=-tUjHO+Bpf7WjBG+UU3s=fEQw@mail.gmail.com Signed-off-by: Christian Brauner --- fs/internal.h | 1 + fs/libfs.c | 68 ++++++++++++++++++++++++++++++++++++--------------- fs/nsfs.c | 49 +++++++++++++++++-------------------- fs/pidfs.c | 16 ++++-------- 4 files changed, 76 insertions(+), 58 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index cfddaec6fbf641..070959e12a343d 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -313,3 +313,4 @@ void mnt_idmap_put(struct mnt_idmap *idmap); int path_from_stashed(struct dentry **stashed, unsigned long ino, struct vfsmount *mnt, const struct file_operations *fops, void *data, struct path *path); +void prune_stashed_dentry(struct dentry **stashed, struct dentry *dentry); diff --git a/fs/libfs.c b/fs/libfs.c index 8f115b6d212bed..38ccbeb103f12b 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1986,10 +1986,10 @@ static inline struct dentry *get_stashed_dentry(struct dentry *stashed) return dentry; } -static struct dentry *stash_dentry(struct dentry **stashed, unsigned long ino, - struct super_block *sb, - const struct file_operations *fops, - void *data) +static struct dentry *prepare_anon_dentry(unsigned long ino, + struct super_block *sb, + const struct file_operations *fops, + void *data) { struct dentry *dentry; struct inode *inode; @@ -2013,15 +2013,29 @@ static struct dentry *stash_dentry(struct dentry **stashed, unsigned long ino, /* @data is now owned by the fs */ d_instantiate(dentry, inode); + return dentry; +} - if (cmpxchg(stashed, NULL, dentry)) { - d_delete(dentry); /* make sure ->d_prune() does nothing */ - dput(dentry); - cpu_relax(); - return ERR_PTR(-EAGAIN); - } +static struct dentry *stash_dentry(struct dentry **stashed, + struct dentry *dentry) +{ + guard(rcu)(); + for (;;) { + struct dentry *old; - return dentry; + /* Assume any old dentry was cleared out. */ + old = cmpxchg(stashed, NULL, dentry); + if (likely(!old)) + return dentry; + + /* Check if somebody else installed a reusable dentry. */ + if (lockref_get_not_dead(&old->d_lockref)) + return old; + + /* There's an old dead dentry there, try to take it over. */ + if (likely(try_cmpxchg(stashed, &old, dentry))) + return dentry; + } } /** @@ -2035,15 +2049,14 @@ static struct dentry *stash_dentry(struct dentry **stashed, unsigned long ino, * * The function tries to retrieve a stashed dentry from @stashed. If the dentry * is still valid then it will be reused. If the dentry isn't able the function - * will allocate a new dentry and inode. It will then try to update @stashed - * with the newly added dentry. If it fails -EAGAIN is returned and the caller - * my retry. + * will allocate a new dentry and inode. It will then check again whether it + * can reuse an existing dentry in case one has been added in the meantime or + * update @stashed with the newly added dentry. * * Special-purpose helper for nsfs and pidfs. * * Return: If 0 or an error is returned the caller can be sure that @data must - * be cleaned up. If 1 or -EAGAIN is returned @data is owned by the - * filesystem. + * be cleaned up. If 1 is returned @data is owned by the filesystem. */ int path_from_stashed(struct dentry **stashed, unsigned long ino, struct vfsmount *mnt, const struct file_operations *fops, @@ -2052,17 +2065,32 @@ int path_from_stashed(struct dentry **stashed, unsigned long ino, struct dentry *dentry; int ret = 0; - dentry = get_stashed_dentry(*stashed); - if (dentry) + /* See if dentry can be reused. 
*/ + path->dentry = get_stashed_dentry(*stashed); + if (path->dentry) goto out_path; - dentry = stash_dentry(stashed, ino, mnt->mnt_sb, fops, data); + /* Allocate a new dentry. */ + dentry = prepare_anon_dentry(ino, mnt->mnt_sb, fops, data); if (IS_ERR(dentry)) return PTR_ERR(dentry); + + /* Added a new dentry. @data is now owned by the filesystem. */ + path->dentry = stash_dentry(stashed, dentry); + if (path->dentry != dentry) + dput(dentry); ret = 1; out_path: - path->dentry = dentry; path->mnt = mntget(mnt); return ret; } + +void prune_stashed_dentry(struct dentry **stashed, struct dentry *dentry) +{ + /* + * Only replace our own @dentry as someone else might've already + * cleared out @dentry and stashed their own dentry in there. + */ + cmpxchg(stashed, dentry, NULL); +} diff --git a/fs/nsfs.c b/fs/nsfs.c index 39dc2604bec01e..572e34dc2f37f0 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -36,10 +36,12 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) static void ns_prune_dentry(struct dentry *dentry) { - struct inode *inode = d_inode(dentry); + struct inode *inode; + + inode = d_inode(dentry); if (inode) { struct ns_common *ns = inode->i_private; - WRITE_ONCE(ns->stashed, NULL); + prune_stashed_dentry(&ns->stashed, dentry); } } @@ -61,20 +63,17 @@ int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb, void *private_data) { int ret; + struct ns_common *ns; - do { - struct ns_common *ns = ns_get_cb(private_data); - if (!ns) - return -ENOENT; - ret = path_from_stashed(&ns->stashed, ns->inum, nsfs_mnt, - &ns_file_operations, ns, path); - if (ret <= 0 && ret != -EAGAIN) - ns->ops->put(ns); - } while (ret == -EAGAIN); - + ns = ns_get_cb(private_data); + if (!ns) + return -ENOENT; + ret = path_from_stashed(&ns->stashed, ns->inum, nsfs_mnt, + &ns_file_operations, ns, path); + if (ret <= 0) + ns->ops->put(ns); if (ret < 0) return ret; - return 0; } @@ -105,6 +104,7 @@ int open_related_ns(struct ns_common *ns, struct ns_common *(*get_ns)(struct ns_common *ns)) { struct path path = {}; + struct ns_common *relative; struct file *f; int err; int fd; @@ -113,21 +113,16 @@ int open_related_ns(struct ns_common *ns, if (fd < 0) return fd; - do { - struct ns_common *relative; - - relative = get_ns(ns); - if (IS_ERR(relative)) { - put_unused_fd(fd); - return PTR_ERR(relative); - } - - err = path_from_stashed(&relative->stashed, relative->inum, nsfs_mnt, - &ns_file_operations, relative, &path); - if (err <= 0 && err != -EAGAIN) - relative->ops->put(relative); - } while (err == -EAGAIN); + relative = get_ns(ns); + if (IS_ERR(relative)) { + put_unused_fd(fd); + return PTR_ERR(relative); + } + err = path_from_stashed(&relative->stashed, relative->inum, nsfs_mnt, + &ns_file_operations, relative, &path); + if (err <= 0) + relative->ops->put(relative); if (err < 0) { put_unused_fd(fd); return err; diff --git a/fs/pidfs.c b/fs/pidfs.c index ce539aa0341185..c33501c9cd8bda 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -140,7 +140,6 @@ const struct file_operations pidfd_fops = { #ifdef CONFIG_FS_PID static struct vfsmount *pidfs_mnt __ro_after_init; -static struct super_block *pidfs_sb __ro_after_init; static void pidfs_evict_inode(struct inode *inode) { @@ -169,7 +168,7 @@ static void pidfdfs_prune_dentry(struct dentry *dentry) inode = d_inode(dentry); if (inode) { struct pid *pid = inode->i_private; - WRITE_ONCE(pid->stashed, NULL); + prune_stashed_dentry(&pid->stashed, dentry); } } @@ -205,13 +204,10 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) struct 
path path; int ret; - do { - ret = path_from_stashed(&pid->stashed, pid->ino, pidfs_mnt, - &pidfd_fops, get_pid(pid), &path); - if (ret <= 0 && ret != -EAGAIN) - put_pid(pid); - } while (ret == -EAGAIN); - + ret = path_from_stashed(&pid->stashed, pid->ino, pidfs_mnt, + &pidfd_fops, get_pid(pid), &path); + if (ret <= 0) + put_pid(pid); if (ret < 0) return ERR_PTR(ret); @@ -231,8 +227,6 @@ void __init pidfs_init(void) pidfs_mnt = kern_mount(&pidfs_type); if (IS_ERR(pidfs_mnt)) panic("Failed to mount pidfs pseudo filesystem"); - - pidfs_sb = pidfs_mnt->mnt_sb; } #else /* !CONFIG_FS_PID */ From f3dd8c812c240ae1f4162b9494b625e6cf1f6cfa Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 16 Feb 2024 21:23:34 +0100 Subject: [PATCH 0820/1406] fs/select: rework stack allocation hack for clang A while ago, we changed the way that select() and poll() preallocate a temporary buffer just under the size of the static warning limit of 1024 bytes, as clang was frequently going slightly above that limit. The warnings have recently returned and I took another look. As it turns out, clang is not actually inherently worse at reserving stack space, it just happens to inline do_select() into core_sys_select(), while gcc never inlines it. Annotate do_select() to never be inlined and in turn remove the special case for the allocation size. This should give the same behavior for both clang and gcc all the time and once more avoids those warnings. Fixes: ad312f95d41c ("fs/select: avoid clang stack usage warning") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240216202352.2492798-1-arnd@kernel.org Reviewed-by: Kees Cook Reviewed-by: Andi Kleen Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/select.c | 2 +- include/linux/poll.h | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/select.c b/fs/select.c index 11a3b1312abeff..9515c3fa1a03e8 100644 --- a/fs/select.c +++ b/fs/select.c @@ -476,7 +476,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in, wait->_key |= POLLOUT_SET; } -static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) +static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; diff --git a/include/linux/poll.h b/include/linux/poll.h index a9e0e1c2d1f2ff..d1ea4f3714a848 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -14,11 +14,7 @@ /* ~832 bytes of stack space used max in sys_select/sys_poll before allocating additional memory. */ -#ifdef __clang__ -#define MAX_STACK_ALLOC 768 -#else #define MAX_STACK_ALLOC 832 -#endif #define FRONTEND_STACK_ALLOC 256 #define SELECT_STACK_ALLOC FRONTEND_STACK_ALLOC #define POLL_STACK_ALLOC FRONTEND_STACK_ALLOC From 27b63e8ee376b8e3649dcb8d9eff7c6e5040f678 Mon Sep 17 00:00:00 2001 From: Bill O'Donnell Date: Mon, 19 Feb 2024 18:33:18 -0600 Subject: [PATCH 0821/1406] efs: convert efs to use the new mount api Convert the efs filesystem to use the new mount API. 
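The shape of this conversion is common to most filesystems moving off the
legacy interface: the .mount/.remount_fs hooks are dropped in favour of an
init_fs_context() entry point that installs a set of fs_context_operations.
A minimal sketch of the pattern (the foo_* names are placeholders, not the
exact efs code in the diff below):

  static const struct fs_context_operations foo_context_ops = {
          .parse_param  = foo_parse_param,  /* fs_parse() against a parameter spec */
          .get_tree     = foo_get_tree,     /* calls get_tree_bdev(fc, foo_fill_super) */
          .reconfigure  = foo_reconfigure,  /* replaces .remount_fs */
          .free         = foo_free_fc,      /* frees fc->fs_private */
  };

  static int foo_init_fs_context(struct fs_context *fc)
  {
          fc->ops = &foo_context_ops;
          return 0;
  }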
Signed-off-by: Bill O'Donnell Link: https://lore.kernel.org/r/20240220003318.166143-1-bodonnel@redhat.com Signed-off-by: Christian Brauner --- fs/efs/super.c | 114 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 84 insertions(+), 30 deletions(-) diff --git a/fs/efs/super.c b/fs/efs/super.c index f17fdac76b2eea..c837ac89b384f1 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -14,19 +14,14 @@ #include #include #include - +#include +#include #include "efs.h" #include #include static int efs_statfs(struct dentry *dentry, struct kstatfs *buf); -static int efs_fill_super(struct super_block *s, void *d, int silent); - -static struct dentry *efs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super); -} +static int efs_init_fs_context(struct fs_context *fc); static void efs_kill_sb(struct super_block *s) { @@ -35,15 +30,6 @@ static void efs_kill_sb(struct super_block *s) kfree(sbi); } -static struct file_system_type efs_fs_type = { - .owner = THIS_MODULE, - .name = "efs", - .mount = efs_mount, - .kill_sb = efs_kill_sb, - .fs_flags = FS_REQUIRES_DEV, -}; -MODULE_ALIAS_FS("efs"); - static struct pt_types sgi_pt_types[] = { {0x00, "SGI vh"}, {0x01, "SGI trkrepl"}, @@ -63,6 +49,27 @@ static struct pt_types sgi_pt_types[] = { {0, NULL} }; +enum { + Opt_explicit_open, +}; + +static const struct fs_parameter_spec efs_param_spec[] = { + fsparam_flag ("explicit-open", Opt_explicit_open), + {} +}; + +/* + * File system definition and registration. + */ +static struct file_system_type efs_fs_type = { + .owner = THIS_MODULE, + .name = "efs", + .kill_sb = efs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = efs_init_fs_context, + .parameters = efs_param_spec, +}; +MODULE_ALIAS_FS("efs"); static struct kmem_cache * efs_inode_cachep; @@ -108,18 +115,10 @@ static void destroy_inodecache(void) kmem_cache_destroy(efs_inode_cachep); } -static int efs_remount(struct super_block *sb, int *flags, char *data) -{ - sync_filesystem(sb); - *flags |= SB_RDONLY; - return 0; -} - static const struct super_operations efs_superblock_operations = { .alloc_inode = efs_alloc_inode, .free_inode = efs_free_inode, .statfs = efs_statfs, - .remount_fs = efs_remount, }; static const struct export_operations efs_export_ops = { @@ -249,26 +248,26 @@ static int efs_validate_super(struct efs_sb_info *sb, struct efs_super *super) { return 0; } -static int efs_fill_super(struct super_block *s, void *d, int silent) +static int efs_fill_super(struct super_block *s, struct fs_context *fc) { struct efs_sb_info *sb; struct buffer_head *bh; struct inode *root; - sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL); + sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL); if (!sb) return -ENOMEM; s->s_fs_info = sb; s->s_time_min = 0; s->s_time_max = U32_MAX; - + s->s_magic = EFS_SUPER_MAGIC; if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { pr_err("device does not support %d byte blocks\n", EFS_BLOCKSIZE); return -EINVAL; } - + /* read the vh (volume header) block */ bh = sb_bread(s, 0); @@ -294,7 +293,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) pr_err("cannot read superblock\n"); return -EIO; } - + if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { #ifdef DEBUG pr_warn("invalid superblock at block %u\n", @@ -328,6 +327,61 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) return 0; } +static void efs_free_fc(struct fs_context *fc) +{ + kfree(fc->fs_private); 
+} + +static int efs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, efs_fill_super); +} + +static int efs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + int token; + struct fs_parse_result result; + + token = fs_parse(fc, efs_param_spec, param, &result); + if (token < 0) + return token; + return 0; +} + +static int efs_reconfigure(struct fs_context *fc) +{ + sync_filesystem(fc->root->d_sb); + + return 0; +} + +struct efs_context { + unsigned long s_mount_opts; +}; + +static const struct fs_context_operations efs_context_opts = { + .parse_param = efs_parse_param, + .get_tree = efs_get_tree, + .reconfigure = efs_reconfigure, + .free = efs_free_fc, +}; + +/* + * Set up the filesystem mount context. + */ +static int efs_init_fs_context(struct fs_context *fc) +{ + struct efs_context *ctx; + + ctx = kzalloc(sizeof(struct efs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + fc->fs_private = ctx; + fc->ops = &efs_context_opts; + + return 0; +} + static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct efs_sb_info *sbi = SUPER_INFO(sb); From d27816b7548c0ea24f9656cab6163446f356842a Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Wed, 7 Feb 2024 17:42:41 +0000 Subject: [PATCH 0822/1406] soc: mediatek: mtk-socinfo: depends on CONFIG_SOC_BUS The mtk-socinfo driver uses symbols 'soc_device_register' and 'soc_device_unregister' which are part of the bus driver for System-on-Chip devices. Select SOC_BUS to make sure that driver is built and the symbols are available. Fixes: 423a54da3c7e ("soc: mediatek: mtk-socinfo: Add driver for getting chip information") Signed-off-by: Daniel Golle Reviewed-by: Chen-Yu Tsai Link: https://lore.kernel.org/r/cc8f7f7da5bdccce514a320e0ae7468659cf7346.1707327680.git.daniel@makrotopia.org Signed-off-by: AngeloGioacchino Del Regno --- drivers/soc/mediatek/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/soc/mediatek/Kconfig b/drivers/soc/mediatek/Kconfig index 50c664b65f4d44..1b7afb19ccd637 100644 --- a/drivers/soc/mediatek/Kconfig +++ b/drivers/soc/mediatek/Kconfig @@ -72,6 +72,7 @@ config MTK_SOCINFO tristate "MediaTek SoC Information" default y depends on NVMEM_MTK_EFUSE + select SOC_BUS help The MediaTek SoC Information (mtk-socinfo) driver provides information about the SoC to the userspace including the From 0c5f7c2f302fe2f695ba1d8709c9657e3de0cea0 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Wed, 31 Jan 2024 16:39:29 +0800 Subject: [PATCH 0823/1406] arm64: dts: mediatek: mt8183-kukui-jacuzzi: Add ports node for anx7625 The anx7625 binding requires a "ports" node as a container for the "port" nodes. The jacuzzi dtsi file is missing it. Add a "ports" node under the anx7625 node, and move the port related nodes and properties under it. 
Fixes: cabc71b08eb5 ("arm64: dts: mt8183: Add kukui-jacuzzi-damu board") Signed-off-by: Chen-Yu Tsai Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20240131083931.3970388-1-wenst@chromium.org Signed-off-by: AngeloGioacchino Del Regno --- .../dts/mediatek/mt8183-kukui-jacuzzi.dtsi | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi b/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi index 7592e3b860377e..fa4ab4d2899f9b 100644 --- a/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi @@ -155,21 +155,24 @@ vdd18-supply = <&pp1800_mipibrdg>; vdd33-supply = <&vddio_mipibrdg>; - #address-cells = <1>; - #size-cells = <0>; - port@0 { - reg = <0>; + ports { + #address-cells = <1>; + #size-cells = <0>; - anx7625_in: endpoint { - remote-endpoint = <&dsi_out>; + port@0 { + reg = <0>; + + anx7625_in: endpoint { + remote-endpoint = <&dsi_out>; + }; }; - }; - port@1 { - reg = <1>; + port@1 { + reg = <1>; - anx7625_out: endpoint { - remote-endpoint = <&panel_in>; + anx7625_out: endpoint { + remote-endpoint = <&panel_in>; + }; }; }; From ba90af39ba57b3fe3ecfdba0c87a80d20c7b788d Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Wed, 31 Jan 2024 16:40:41 +0800 Subject: [PATCH 0824/1406] arm64: dts: mediatek: mt8183-pico6: Fix wake-on-X event node names The wake-on-bt and wake-on-wlan nodes don't have a button- or event- prefix that the gpio-keys binding requires. Fix up the node names to satisfy the binding. While at it, also fix up the GPIO overriding structure for the wake-on-wlan node. Instead of referencing the gpio-keys node and then open coding the node, add a label for the event node, and use that to reference and override the GPIO settings. 
Fixes: 055ef10ccdd4 ("arm64: dts: mt8183: Add jacuzzi pico/pico6 board") Signed-off-by: Chen-Yu Tsai Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20240131084043.3970576-1-wenst@chromium.org Signed-off-by: AngeloGioacchino Del Regno --- .../boot/dts/mediatek/mt8183-kukui-jacuzzi-pico6.dts | 8 +++----- arch/arm64/boot/dts/mediatek/mt8183-kukui.dtsi | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi-pico6.dts b/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi-pico6.dts index a2e74b82932064..a3f10898adae58 100644 --- a/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi-pico6.dts +++ b/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi-pico6.dts @@ -17,7 +17,7 @@ pinctrl-names = "default"; pinctrl-0 = <&bt_pins_wakeup>; - wobt { + event-wobt { label = "Wake on BT"; gpios = <&pio 42 GPIO_ACTIVE_HIGH>; linux,code = ; @@ -47,10 +47,8 @@ }; }; -&wifi_wakeup { - wowlan { - gpios = <&pio 113 GPIO_ACTIVE_LOW>; - }; +&wifi_wakeup_event { + gpios = <&pio 113 GPIO_ACTIVE_LOW>; }; &wifi_pwrseq { diff --git a/arch/arm64/boot/dts/mediatek/mt8183-kukui.dtsi b/arch/arm64/boot/dts/mediatek/mt8183-kukui.dtsi index 869b1a3203297d..3830daf7ea8519 100644 --- a/arch/arm64/boot/dts/mediatek/mt8183-kukui.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt8183-kukui.dtsi @@ -152,7 +152,7 @@ pinctrl-names = "default"; pinctrl-0 = <&wifi_pins_wakeup>; - button-wowlan { + wifi_wakeup_event: event-wowlan { label = "Wake on WiFi"; gpios = <&pio 113 GPIO_ACTIVE_HIGH>; linux,code = ; From 297ff2f5a0e47d3bdc432bf507f35a3a8d69efa1 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Tue, 20 Feb 2024 14:20:30 +0800 Subject: [PATCH 0825/1406] =?UTF-8?q?libfs:=20Remove=20unnecessary=20?= =?UTF-8?q?=E2=80=980=E2=80=99=20values=20from=20ret?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ret is assigned before it is first used, so initializing it to 0 is unnecessary. Signed-off-by: Li zeming Link: https://lore.kernel.org/r/20240220062030.114203-1-zeming@nfschina.com Signed-off-by: Christian Brauner --- fs/libfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/libfs.c b/fs/libfs.c index eec6031b015544..6fb8244b259e81 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1752,7 +1752,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) const struct inode *dir = READ_ONCE(dentry->d_inode); struct super_block *sb = dentry->d_sb; const struct unicode_map *um = sb->s_encoding; - int ret = 0; + int ret; if (!dir || !IS_CASEFOLDED(dir)) return 0; From 1b76e7291c1b958992d7c8a823b919b3b312780e Mon Sep 17 00:00:00 2001 From: Hai Pham Date: Fri, 26 Jan 2024 11:54:57 +0100 Subject: [PATCH 0826/1406] arm64: dts: renesas: r8a779h0: Add pinctrl device node Add a device node for the Pin Function Controller on the Renesas R-Car V4M (R8A779H0) SoC. 
Signed-off-by: Hai Pham Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/0ab32290014b64ddbee5c9ec2808c8294d0b6192.1706266286.git.geert+renesas@glider.be --- arch/arm64/boot/dts/renesas/r8a779h0.dtsi | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi index 09cf4fe97d81a1..95574bbebd9c86 100644 --- a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi @@ -169,6 +169,14 @@ status = "disabled"; }; + pfc: pinctrl@e6050000 { + compatible = "renesas,pfc-r8a779h0"; + reg = <0 0xe6050000 0 0x16c>, <0 0xe6050800 0 0x16c>, + <0 0xe6058000 0 0x16c>, <0 0xe6058800 0 0x16c>, + <0 0xe6060000 0 0x16c>, <0 0xe6060800 0 0x16c>, + <0 0xe6061000 0 0x16c>, <0 0xe6061800 0 0x16c>; + }; + cpg: clock-controller@e6150000 { compatible = "renesas,r8a779h0-cpg-mssr"; reg = <0 0xe6150000 0 0x4000>; From bce7b55b2a9a47150d82bc141ec2f34c01fc0095 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 26 Jan 2024 11:54:58 +0100 Subject: [PATCH 0827/1406] arm64: dts: renesas: gray-hawk-single: Add serial console pin control Complete the descriptions of the serial console and the external serial clock by adding pin control. Based on patches for Gray Hawk in the BSP by Hai Pham and Nghia Nguyen. Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/b1eb2d3364d5ead7f7bcf7a737c5914971db64d3.1706266286.git.geert+renesas@glider.be --- .../dts/renesas/r8a779h0-gray-hawk-single.dts | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts b/arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts index e04de1d74825ea..e68cd962f90833 100644 --- a/arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts +++ b/arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts @@ -43,10 +43,28 @@ }; &hscif0 { + pinctrl-0 = <&hscif0_pins>; + pinctrl-names = "default"; + uart-has-rtscts; status = "okay"; }; +&pfc { + pinctrl-0 = <&scif_clk_pins>; + pinctrl-names = "default"; + + hscif0_pins: hscif0 { + groups = "hscif0_data", "hscif0_ctrl"; + function = "hscif0"; + }; + + scif_clk_pins: scif-clk { + groups = "scif_clk"; + function = "scif_clk"; + }; +}; + &rwdt { timeout-sec = <60>; status = "okay"; From 775d3714d8644edf29ed53cb4ad7205e22fe9a9d Mon Sep 17 00:00:00 2001 From: Hai Pham Date: Thu, 1 Feb 2024 15:14:33 +0100 Subject: [PATCH 0828/1406] arm64: dts: renesas: r8a779h0: Add I2C nodes Add device nodes for the I2C Bus Interfaces on the Renesas R-Car V4M (R8A779H0) SoC. 
Signed-off-by: Hai Pham Signed-off-by: Geert Uytterhoeven Reviewed-by: Wolfram Sang Link: https://lore.kernel.org/r/7dbbe13428273c5786ddff6ea7af6724fcdd4de8.1706796660.git.geert+renesas@glider.be --- arch/arm64/boot/dts/renesas/r8a779h0.dtsi | 56 +++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi index 95574bbebd9c86..5a075a975fb6b5 100644 --- a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi @@ -198,6 +198,62 @@ #power-domain-cells = <1>; }; + i2c0: i2c@e6500000 { + compatible = "renesas,i2c-r8a779h0", + "renesas,rcar-gen4-i2c"; + reg = <0 0xe6500000 0 0x40>; + interrupts = ; + clocks = <&cpg CPG_MOD 518>; + power-domains = <&sysc R8A779H0_PD_ALWAYS_ON>; + resets = <&cpg 518>; + i2c-scl-internal-delay-ns = <110>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + + i2c1: i2c@e6508000 { + compatible = "renesas,i2c-r8a779h0", + "renesas,rcar-gen4-i2c"; + reg = <0 0xe6508000 0 0x40>; + interrupts = ; + clocks = <&cpg CPG_MOD 519>; + power-domains = <&sysc R8A779H0_PD_ALWAYS_ON>; + resets = <&cpg 519>; + i2c-scl-internal-delay-ns = <110>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + + i2c2: i2c@e6510000 { + compatible = "renesas,i2c-r8a779h0", + "renesas,rcar-gen4-i2c"; + reg = <0 0xe6510000 0 0x40>; + interrupts = ; + clocks = <&cpg CPG_MOD 520>; + power-domains = <&sysc R8A779H0_PD_ALWAYS_ON>; + resets = <&cpg 520>; + i2c-scl-internal-delay-ns = <110>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + + i2c3: i2c@e66d0000 { + compatible = "renesas,i2c-r8a779h0", + "renesas,rcar-gen4-i2c"; + reg = <0 0xe66d0000 0 0x40>; + interrupts = ; + clocks = <&cpg CPG_MOD 521>; + power-domains = <&sysc R8A779H0_PD_ALWAYS_ON>; + resets = <&cpg 521>; + i2c-scl-internal-delay-ns = <110>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + hscif0: serial@e6540000 { compatible = "renesas,hscif-r8a779h0", "renesas,rcar-gen4-hscif", "renesas,hscif"; From 7102e3f9ef71c21dc25813cb4fb42f04fea6367d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 1 Feb 2024 15:14:34 +0100 Subject: [PATCH 0829/1406] arm64: dts: renesas: gray-hawk-single: Add I2C0 and EEPROMs Enable the I2C0 bus on the Gray Hawk Single board, and describe the I2C EEPROMs present. Based on patches for Gray Hawk in the BSP by Hai Pham. 
Signed-off-by: Geert Uytterhoeven Reviewed-by: Wolfram Sang Link: https://lore.kernel.org/r/960595394a274b675f1ec9ec1c324e4cc1ac1f77.1706796660.git.geert+renesas@glider.be --- .../dts/renesas/r8a779h0-gray-hawk-single.dts | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts b/arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts index e68cd962f90833..5a7e1bea9f6601 100644 --- a/arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts +++ b/arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts @@ -50,6 +50,42 @@ status = "okay"; }; +&i2c0 { + pinctrl-0 = <&i2c0_pins>; + pinctrl-names = "default"; + + status = "okay"; + clock-frequency = <400000>; + + eeprom@50 { + compatible = "rohm,br24g01", "atmel,24c01"; + label = "cpu-board"; + reg = <0x50>; + pagesize = <8>; + }; + + eeprom@51 { + compatible = "rohm,br24g01", "atmel,24c01"; + label = "breakout-board"; + reg = <0x51>; + pagesize = <8>; + }; + + eeprom@52 { + compatible = "rohm,br24g01", "atmel,24c01"; + label = "csi-dsi-sub-board-id"; + reg = <0x52>; + pagesize = <8>; + }; + + eeprom@53 { + compatible = "rohm,br24g01", "atmel,24c01"; + label = "ethernet-sub-board-id"; + reg = <0x53>; + pagesize = <8>; + }; +}; + &pfc { pinctrl-0 = <&scif_clk_pins>; pinctrl-names = "default"; @@ -59,6 +95,11 @@ function = "hscif0"; }; + i2c0_pins: i2c0 { + groups = "i2c0"; + function = "i2c0"; + }; + scif_clk_pins: scif-clk { groups = "scif_clk"; function = "scif_clk"; From 12171b8475e6c116ac148c39163db1ad584fd77a Mon Sep 17 00:00:00 2001 From: Cong Dang Date: Thu, 1 Feb 2024 15:15:56 +0100 Subject: [PATCH 0830/1406] arm64: dts: renesas: r8a779h0: Add GPIO nodes Add device nodes for the General Purpose Input/Output (GPIO) blocks on the Renesas R-Car V4M (R8A779H0) SoC. 
Signed-off-by: Cong Dang Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/d4c1c40404ab84c7e7c07612077ca1a319ae7283.1706796918.git.geert+renesas@glider.be --- arch/arm64/boot/dts/renesas/r8a779h0.dtsi | 120 ++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi index 5a075a975fb6b5..4e9e487ec51661 100644 --- a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi @@ -177,6 +177,126 @@ <0 0xe6061000 0 0x16c>, <0 0xe6061800 0 0x16c>; }; + gpio0: gpio@e6050180 { + compatible = "renesas,gpio-r8a779h0", + "renesas,rcar-gen4-gpio"; + reg = <0 0xe6050180 0 0x54>; + interrupts = ; + #gpio-cells = <2>; + gpio-controller; + gpio-ranges = <&pfc 0 0 19>; + #interrupt-cells = <2>; + interrupt-controller; + clocks = <&cpg CPG_MOD 915>; + power-domains = <&sysc R8A779H0_PD_ALWAYS_ON>; + resets = <&cpg 915>; + }; + + gpio1: gpio@e6050980 { + compatible = "renesas,gpio-r8a779h0", + "renesas,rcar-gen4-gpio"; + reg = <0 0xe6050980 0 0x54>; + interrupts = ; + #gpio-cells = <2>; + gpio-controller; + gpio-ranges = <&pfc 0 32 30>; + #interrupt-cells = <2>; + interrupt-controller; + clocks = <&cpg CPG_MOD 915>; + power-domains = <&sysc R8A779H0_PD_ALWAYS_ON>; + resets = <&cpg 915>; + }; + + gpio2: gpio@e6058180 { + compatible = "renesas,gpio-r8a779h0", + "renesas,rcar-gen4-gpio"; + reg = <0 0xe6058180 0 0x54>; + interrupts = ; + #gpio-cells = <2>; + gpio-controller; + gpio-ranges = <&pfc 0 64 20>; + #interrupt-cells = <2>; + interrupt-controller; + clocks = <&cpg CPG_MOD 916>; + power-domains = <&sysc R8A779H0_PD_ALWAYS_ON>; + resets = <&cpg 916>; + }; + + gpio3: gpio@e6058980 { + compatible = "renesas,gpio-r8a779h0", + "renesas,rcar-gen4-gpio"; + reg = <0 0xe6058980 0 0x54>; + interrupts = ; + #gpio-cells = <2>; + gpio-controller; + gpio-ranges = <&pfc 0 96 32>; + #interrupt-cells = <2>; + interrupt-controller; + clocks = <&cpg CPG_MOD 916>; + power-domains = <&sysc R8A779H0_PD_ALWAYS_ON>; + resets = <&cpg 916>; + }; + + gpio4: gpio@e6060180 { + compatible = "renesas,gpio-r8a779h0", + "renesas,rcar-gen4-gpio"; + reg = <0 0xe6060180 0 0x54>; + interrupts = ; + #gpio-cells = <2>; + gpio-controller; + gpio-ranges = <&pfc 0 128 25>; + #interrupt-cells = <2>; + interrupt-controller; + clocks = <&cpg CPG_MOD 917>; + power-domains = <&sysc R8A779H0_PD_ALWAYS_ON>; + resets = <&cpg 917>; + }; + + gpio5: gpio@e6060980 { + compatible = "renesas,gpio-r8a779h0", + "renesas,rcar-gen4-gpio"; + reg = <0 0xe6060980 0 0x54>; + interrupts = ; + #gpio-cells = <2>; + gpio-controller; + gpio-ranges = <&pfc 0 160 21>; + #interrupt-cells = <2>; + interrupt-controller; + clocks = <&cpg CPG_MOD 917>; + power-domains = <&sysc R8A779H0_PD_ALWAYS_ON>; + resets = <&cpg 917>; + }; + + gpio6: gpio@e6061180 { + compatible = "renesas,gpio-r8a779h0", + "renesas,rcar-gen4-gpio"; + reg = <0 0xe6061180 0 0x54>; + interrupts = ; + #gpio-cells = <2>; + gpio-controller; + gpio-ranges = <&pfc 0 192 21>; + #interrupt-cells = <2>; + interrupt-controller; + clocks = <&cpg CPG_MOD 917>; + power-domains = <&sysc R8A779H0_PD_ALWAYS_ON>; + resets = <&cpg 917>; + }; + + gpio7: gpio@e6061980 { + compatible = "renesas,gpio-r8a779h0", + "renesas,rcar-gen4-gpio"; + reg = <0 0xe6061980 0 0x54>; + interrupts = ; + #gpio-cells = <2>; + gpio-controller; + gpio-ranges = <&pfc 0 224 21>; + #interrupt-cells = <2>; + interrupt-controller; + clocks = <&cpg CPG_MOD 917>; + power-domains = <&sysc R8A779H0_PD_ALWAYS_ON>; + 
resets = <&cpg 917>; + }; + cpg: clock-controller@e6150000 { compatible = "renesas,r8a779h0-cpg-mssr"; reg = <0 0xe6150000 0 0x4000>; From b9236e6161860f93ab7b86d50a136303ed855658 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 11 Feb 2024 15:21:30 +0100 Subject: [PATCH 0831/1406] arm64: dts: renesas: r8a779a0: Correct avb[01] reg sizes All Ethernet AVB instances on R-Car V3U have registers related to UDP/IP support, but the declared register blocks for the first two instances are too small to cover them. Fix this by extending the register block sizes. Fixes: 5a633320f08b8c9b ("arm64: dts: renesas: r8a779a0: Add Ethernet-AVB support") Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/ce6ce3c4b1495e02e7c1803fca810a7178a84500.1707660323.git.geert+renesas@glider.be --- arch/arm64/boot/dts/renesas/r8a779a0.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/renesas/r8a779a0.dtsi b/arch/arm64/boot/dts/renesas/r8a779a0.dtsi index 2f8f2ccab8c237..cfa70b441e329a 100644 --- a/arch/arm64/boot/dts/renesas/r8a779a0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a779a0.dtsi @@ -667,7 +667,7 @@ avb0: ethernet@e6800000 { compatible = "renesas,etheravb-r8a779a0", "renesas,etheravb-rcar-gen4"; - reg = <0 0xe6800000 0 0x800>; + reg = <0 0xe6800000 0 0x1000>; interrupts = , , , @@ -715,7 +715,7 @@ avb1: ethernet@e6810000 { compatible = "renesas,etheravb-r8a779a0", "renesas,etheravb-rcar-gen4"; - reg = <0 0xe6810000 0 0x800>; + reg = <0 0xe6810000 0 0x1000>; interrupts = , , , From 61dab16ddfb20ba00cda4e50b729d425359bb3cb Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 11 Feb 2024 15:21:31 +0100 Subject: [PATCH 0832/1406] arm64: dts: renesas: r8a779g0: Correct avb[01] reg sizes All Ethernet AVB instances on R-Car V4H have registers related to UDP/IP support, but the declared register blocks for the first two instances are too small to cover them. Fix this by extending the register block sizes. Fixes: 848c82db56923a8b ("arm64: dts: renesas: r8a779g0: Add RAVB nodes") Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/83437778614a7c96f4d8f1be98dffeee29bb4a0b.1707660323.git.geert+renesas@glider.be --- arch/arm64/boot/dts/renesas/r8a779g0.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/renesas/r8a779g0.dtsi b/arch/arm64/boot/dts/renesas/r8a779g0.dtsi index 7b05b9b3e0a85f..9bc542bc616909 100644 --- a/arch/arm64/boot/dts/renesas/r8a779g0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a779g0.dtsi @@ -776,7 +776,7 @@ avb0: ethernet@e6800000 { compatible = "renesas,etheravb-r8a779g0", "renesas,etheravb-rcar-gen4"; - reg = <0 0xe6800000 0 0x800>; + reg = <0 0xe6800000 0 0x1000>; interrupts = , , , @@ -823,7 +823,7 @@ avb1: ethernet@e6810000 { compatible = "renesas,etheravb-r8a779g0", "renesas,etheravb-rcar-gen4"; - reg = <0 0xe6810000 0 0x800>; + reg = <0 0xe6810000 0 0x1000>; interrupts = , , , From 34086c3406601d86462fadd7be946d254870e1e1 Mon Sep 17 00:00:00 2001 From: Thanh Quan Date: Sun, 11 Feb 2024 15:30:45 +0100 Subject: [PATCH 0833/1406] arm64: dts: renesas: r8a779h0: Add Ethernet-AVB support Add device nodes for the Renesas Ethernet AVB (EtherAVB-IF) blocks on the Renesas R-Car V4M (R8A779H0) SoC. 
Signed-off-by: Thanh Quan Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/ae7488c0065b455b45f23253c997a94f4850064f.1707661382.git.geert+renesas@glider.be --- arch/arm64/boot/dts/renesas/r8a779h0.dtsi | 144 ++++++++++++++++++++++ 1 file changed, 144 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi index 4e9e487ec51661..66a93c88f87424 100644 --- a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi @@ -388,6 +388,150 @@ status = "disabled"; }; + avb0: ethernet@e6800000 { + compatible = "renesas,etheravb-r8a779h0", + "renesas,etheravb-rcar-gen4"; + reg = <0 0xe6800000 0 0x1000>; + interrupts = , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + ; + interrupt-names = "ch0", "ch1", "ch2", "ch3", + "ch4", "ch5", "ch6", "ch7", + "ch8", "ch9", "ch10", "ch11", + "ch12", "ch13", "ch14", "ch15", + "ch16", "ch17", "ch18", "ch19", + "ch20", "ch21", "ch22", "ch23", + "ch24"; + clocks = <&cpg CPG_MOD 211>; + clock-names = "fck"; + power-domains = <&sysc R8A779H0_PD_C4>; + resets = <&cpg 211>; + phy-mode = "rgmii"; + rx-internal-delay-ps = <0>; + tx-internal-delay-ps = <0>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + + avb1: ethernet@e6810000 { + compatible = "renesas,etheravb-r8a779h0", + "renesas,etheravb-rcar-gen4"; + reg = <0 0xe6810000 0 0x1000>; + interrupts = , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + ; + interrupt-names = "ch0", "ch1", "ch2", "ch3", + "ch4", "ch5", "ch6", "ch7", + "ch8", "ch9", "ch10", "ch11", + "ch12", "ch13", "ch14", "ch15", + "ch16", "ch17", "ch18", "ch19", + "ch20", "ch21", "ch22", "ch23", + "ch24"; + clocks = <&cpg CPG_MOD 212>; + clock-names = "fck"; + power-domains = <&sysc R8A779H0_PD_C4>; + resets = <&cpg 212>; + phy-mode = "rgmii"; + rx-internal-delay-ps = <0>; + tx-internal-delay-ps = <0>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + + avb2: ethernet@e6820000 { + compatible = "renesas,etheravb-r8a779h0", + "renesas,etheravb-rcar-gen4"; + reg = <0 0xe6820000 0 0x1000>; + interrupts = , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + ; + interrupt-names = "ch0", "ch1", "ch2", "ch3", + "ch4", "ch5", "ch6", "ch7", + "ch8", "ch9", "ch10", "ch11", + "ch12", "ch13", "ch14", "ch15", + "ch16", "ch17", "ch18", "ch19", + "ch20", "ch21", "ch22", "ch23", + "ch24"; + clocks = <&cpg CPG_MOD 213>; + clock-names = "fck"; + power-domains = <&sysc R8A779H0_PD_C4>; + resets = <&cpg 213>; + phy-mode = "rgmii"; + rx-internal-delay-ps = <0>; + tx-internal-delay-ps = <0>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + gic: interrupt-controller@f1000000 { compatible = "arm,gic-v3"; #interrupt-cells = <3>; From 2e45b42f1bb81921f0de7ab7de92939a293c7934 Mon Sep 17 00:00:00 2001 From: Thanh Quan Date: Sun, 11 Feb 2024 15:30:46 +0100 Subject: [PATCH 0834/1406] arm64: dts: renesas: gray-hawk-single: Add Ethernet support Describe the wiring of the first Ethernet AVB instance to the Micrel KSZ9031RNXVB PHY. 
Signed-off-by: Thanh Quan Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/b83b21b6477a0e31f99eaedbd36c03014b72ec8a.1707661382.git.geert+renesas@glider.be --- .../dts/renesas/r8a779h0-gray-hawk-single.dts | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts b/arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts index 5a7e1bea9f6601..c900ccd18573a6 100644 --- a/arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts +++ b/arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts @@ -7,6 +7,9 @@ */ /dts-v1/; + +#include + #include "r8a779h0.dtsi" / { @@ -15,6 +18,7 @@ aliases { serial0 = &hscif0; + ethernet0 = &avb0; }; chosen { @@ -34,6 +38,24 @@ }; }; +&avb0 { + pinctrl-0 = <&avb0_pins>; + pinctrl-names = "default"; + phy-handle = <&phy0>; + tx-internal-delay-ps = <2000>; + status = "okay"; + + phy0: ethernet-phy@0 { + compatible = "ethernet-phy-id0022.1622", + "ethernet-phy-ieee802.3-c22"; + rxc-skew-ps = <1500>; + reg = <0>; + interrupt-parent = <&gpio7>; + interrupts = <5 IRQ_TYPE_LEVEL_LOW>; + reset-gpios = <&gpio7 10 GPIO_ACTIVE_LOW>; + }; +}; + &extal_clk { clock-frequency = <16666666>; }; @@ -90,6 +112,24 @@ pinctrl-0 = <&scif_clk_pins>; pinctrl-names = "default"; + avb0_pins: avb0 { + mux { + groups = "avb0_link", "avb0_mdio", "avb0_rgmii", + "avb0_txcrefclk"; + function = "avb0"; + }; + + pins_mdio { + groups = "avb0_mdio"; + drive-strength = <21>; + }; + + pins_mii { + groups = "avb0_rgmii"; + drive-strength = <21>; + }; + }; + hscif0_pins: hscif0 { groups = "hscif0_data", "hscif0_ctrl"; function = "hscif0"; From ebf3b77a75bf72fa54938ce0bbd4c1934a573d8a Mon Sep 17 00:00:00 2001 From: Cong Dang Date: Wed, 14 Feb 2024 14:18:02 +0100 Subject: [PATCH 0835/1406] arm64: dts: renesas: r8a779h0: Add SD/MMC node Add a device node for SD/MMC on Renesas R-Car V4M (R8A779H0) SoC. Signed-off-by: Cong Dang Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/4433cd73049ae517ea163bc703555ee8d9a4dd82.1707915763.git.geert+renesas@glider.be --- arch/arm64/boot/dts/renesas/r8a779h0.dtsi | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi index 66a93c88f87424..d50ed1989ef52d 100644 --- a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi @@ -532,6 +532,20 @@ status = "disabled"; }; + mmc0: mmc@ee140000 { + compatible = "renesas,sdhi-r8a779h0", + "renesas,rcar-gen4-sdhi"; + reg = <0 0xee140000 0 0x2000>; + interrupts = ; + clocks = <&cpg CPG_MOD 706>, + <&cpg CPG_CORE R8A779H0_CLK_SD0H>; + clock-names = "core", "clkh"; + power-domains = <&sysc R8A779H0_PD_ALWAYS_ON>; + resets = <&cpg 706>; + max-frequency = <200000000>; + status = "disabled"; + }; + gic: interrupt-controller@f1000000 { compatible = "arm,gic-v3"; #interrupt-cells = <3>; From 13dd267358c2e63c8d737b6cf6ff9294ccdb5fe4 Mon Sep 17 00:00:00 2001 From: Cong Dang Date: Wed, 14 Feb 2024 14:18:03 +0100 Subject: [PATCH 0836/1406] arm64: dts: renesas: gray-hawk-single: Add eMMC support Describe the eMMC on the Gray Hawk Single board. 
Signed-off-by: Cong Dang Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/208bfc781b1488e253e19626ad80876c34c86758.1707915763.git.geert+renesas@glider.be --- .../dts/renesas/r8a779h0-gray-hawk-single.dts | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts b/arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts index c900ccd18573a6..1163ac5e292755 100644 --- a/arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts +++ b/arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts @@ -36,6 +36,24 @@ device_type = "memory"; reg = <0x4 0x80000000 0x1 0x80000000>; }; + + reg_1p8v: regulator-1p8v { + compatible = "regulator-fixed"; + regulator-name = "fixed-1.8V"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + regulator-boot-on; + regulator-always-on; + }; + + reg_3p3v: regulator-3p3v { + compatible = "regulator-fixed"; + regulator-name = "fixed-3.3V"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + }; }; &avb0 { @@ -108,6 +126,23 @@ }; }; +&mmc0 { + pinctrl-0 = <&mmc_pins>; + pinctrl-1 = <&mmc_pins>; + pinctrl-names = "default", "state_uhs"; + + vmmc-supply = <®_3p3v>; + vqmmc-supply = <®_1p8v>; + mmc-hs200-1_8v; + mmc-hs400-1_8v; + bus-width = <8>; + no-sd; + no-sdio; + non-removable; + full-pwr-cycle-in-suspend; + status = "okay"; +}; + &pfc { pinctrl-0 = <&scif_clk_pins>; pinctrl-names = "default"; @@ -140,6 +175,12 @@ function = "i2c0"; }; + mmc_pins: mmc { + groups = "mmc_data8", "mmc_ctrl", "mmc_ds"; + function = "mmc"; + power-source = <1800>; + }; + scif_clk_pins: scif-clk { groups = "scif_clk"; function = "scif_clk"; From b5cbe1bfb009ea6bf6fcc2d5fa01d8e2ae6ae4e1 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 14 Feb 2024 14:19:27 +0100 Subject: [PATCH 0837/1406] arm64: dts: renesas: r8a779h0: Add DMA support Add device nodes for the Direct Memory Access Controllers for System (SYS-DMAC) on the Renesas R-Car V4M (R8A779H0) SoC. Link all DMA consumers to the corresponding DMA controller channels. Based on patches in the BSP by Thanh Le and Minh Le. 
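On the consumer side, these dmas/dma-names links are what a driver resolves
at probe time through the dmaengine API; a hypothetical example (the "tx"
name matches a dma-names entry of the consumer node):

  struct dma_chan *chan;

  /* Looks up the "tx" entry in the node's dma-names property and
   * returns the corresponding SYS-DMAC channel. */
  chan = dma_request_chan(dev, "tx");
  if (IS_ERR(chan))
          return PTR_ERR(chan);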
Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/44de886cd83f41659d2f6962c0e277f548fb0adb.1707916707.git.geert+renesas@glider.be --- arch/arm64/boot/dts/renesas/r8a779h0.dtsi | 75 +++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi index d50ed1989ef52d..46c586f4e32695 100644 --- a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi @@ -326,6 +326,9 @@ clocks = <&cpg CPG_MOD 518>; power-domains = <&sysc R8A779H0_PD_ALWAYS_ON>; resets = <&cpg 518>; + dmas = <&dmac1 0x91>, <&dmac1 0x90>, + <&dmac2 0x91>, <&dmac2 0x90>; + dma-names = "tx", "rx", "tx", "rx"; i2c-scl-internal-delay-ns = <110>; #address-cells = <1>; #size-cells = <0>; @@ -340,6 +343,9 @@ clocks = <&cpg CPG_MOD 519>; power-domains = <&sysc R8A779H0_PD_ALWAYS_ON>; resets = <&cpg 519>; + dmas = <&dmac1 0x93>, <&dmac1 0x92>, + <&dmac2 0x93>, <&dmac2 0x92>; + dma-names = "tx", "rx", "tx", "rx"; i2c-scl-internal-delay-ns = <110>; #address-cells = <1>; #size-cells = <0>; @@ -354,6 +360,9 @@ clocks = <&cpg CPG_MOD 520>; power-domains = <&sysc R8A779H0_PD_ALWAYS_ON>; resets = <&cpg 520>; + dmas = <&dmac1 0x95>, <&dmac1 0x94>, + <&dmac2 0x95>, <&dmac2 0x94>; + dma-names = "tx", "rx", "tx", "rx"; i2c-scl-internal-delay-ns = <110>; #address-cells = <1>; #size-cells = <0>; @@ -368,6 +377,9 @@ clocks = <&cpg CPG_MOD 521>; power-domains = <&sysc R8A779H0_PD_ALWAYS_ON>; resets = <&cpg 521>; + dmas = <&dmac1 0x97>, <&dmac1 0x96>, + <&dmac2 0x97>, <&dmac2 0x96>; + dma-names = "tx", "rx", "tx", "rx"; i2c-scl-internal-delay-ns = <110>; #address-cells = <1>; #size-cells = <0>; @@ -385,6 +397,9 @@ clock-names = "fck", "brg_int", "scif_clk"; power-domains = <&sysc R8A779H0_PD_ALWAYS_ON>; resets = <&cpg 514>; + dmas = <&dmac1 0x31>, <&dmac1 0x30>, + <&dmac2 0x31>, <&dmac2 0x30>; + dma-names = "tx", "rx", "tx", "rx"; status = "disabled"; }; @@ -532,6 +547,66 @@ status = "disabled"; }; + dmac1: dma-controller@e7350000 { + compatible = "renesas,dmac-r8a779h0", + "renesas,rcar-gen4-dmac"; + reg = <0 0xe7350000 0 0x1000>, + <0 0xe7300000 0 0x10000>; + interrupts = , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + ; + interrupt-names = "error", + "ch0", "ch1", "ch2", "ch3", "ch4", + "ch5", "ch6", "ch7", "ch8", "ch9", + "ch10", "ch11", "ch12", "ch13", + "ch14", "ch15"; + clocks = <&cpg CPG_MOD 709>; + clock-names = "fck"; + power-domains = <&sysc R8A779H0_PD_ALWAYS_ON>; + resets = <&cpg 709>; + #dma-cells = <1>; + dma-channels = <16>; + }; + + dmac2: dma-controller@e7351000 { + compatible = "renesas,dmac-r8a779h0", + "renesas,rcar-gen4-dmac"; + reg = <0 0xe7351000 0 0x1000>, + <0 0xe7310000 0 0x10000>; + interrupts = , + , + , + , + , + , + , + , + ; + interrupt-names = "error", + "ch0", "ch1", "ch2", "ch3", "ch4", + "ch5", "ch6", "ch7"; + clocks = <&cpg CPG_MOD 710>; + clock-names = "fck"; + power-domains = <&sysc R8A779H0_PD_ALWAYS_ON>; + resets = <&cpg 710>; + #dma-cells = <1>; + dma-channels = <8>; + }; + mmc0: mmc@ee140000 { compatible = "renesas,sdhi-r8a779h0", "renesas,rcar-gen4-sdhi"; From 1e0e81a0f6d65974c07f034241cb6a11a83af96f Mon Sep 17 00:00:00 2001 From: Cong Dang Date: Mon, 19 Feb 2024 16:00:40 +0100 Subject: [PATCH 0838/1406] arm64: dts: renesas: r8a779h0: Add RPC node Add a device node for the SPI Multi I/O Bus Controller (RPC-IF) on the Renesas R-Car V4M (R8A779H0) SoC. 
Signed-off-by: Cong Dang Signed-off-by: Geert Uytterhoeven Reviewed-by: Wolfram Sang Link: https://lore.kernel.org/r/26f9c5d5a7944db3813ed53459aa9c5767c1bdc0.1708354463.git.geert+renesas@glider.be --- arch/arm64/boot/dts/renesas/r8a779h0.dtsi | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi index 46c586f4e32695..11885729181bc9 100644 --- a/arch/arm64/boot/dts/renesas/r8a779h0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a779h0.dtsi @@ -621,6 +621,22 @@ status = "disabled"; }; + rpc: spi@ee200000 { + compatible = "renesas,r8a779h0-rpc-if", + "renesas,rcar-gen4-rpc-if"; + reg = <0 0xee200000 0 0x200>, + <0 0x08000000 0 0x04000000>, + <0 0xee208000 0 0x100>; + reg-names = "regs", "dirmap", "wbuf"; + interrupts = ; + clocks = <&cpg CPG_MOD 629>; + power-domains = <&sysc R8A779H0_PD_ALWAYS_ON>; + resets = <&cpg 629>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + gic: interrupt-controller@f1000000 { compatible = "arm,gic-v3"; #interrupt-cells = <3>; From 2be6d3e14d85b3aa15965e7840a7a6f0f3ad1fe8 Mon Sep 17 00:00:00 2001 From: Cong Dang Date: Mon, 19 Feb 2024 16:00:41 +0100 Subject: [PATCH 0839/1406] arm64: dts: renesas: gray-hawk-single: Add QSPI FLASH support Describe the QSPI FLASH on the Gray Hawk Single board. Signed-off-by: Cong Dang Signed-off-by: Geert Uytterhoeven Reviewed-by: Wolfram Sang Link: https://lore.kernel.org/r/96d6567024cef9fcd6b04f92a697301c1c8d1d8e.1708354463.git.geert+renesas@glider.be --- .../dts/renesas/r8a779h0-gray-hawk-single.dts | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts b/arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts index 1163ac5e292755..bc8616a56c039b 100644 --- a/arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts +++ b/arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts @@ -181,12 +181,45 @@ power-source = <1800>; }; + qspi0_pins: qspi0 { + groups = "qspi0_ctrl", "qspi0_data4"; + function = "qspi0"; + }; + scif_clk_pins: scif-clk { groups = "scif_clk"; function = "scif_clk"; }; }; +&rpc { + pinctrl-0 = <&qspi0_pins>; + pinctrl-names = "default"; + + status = "okay"; + + flash@0 { + compatible = "spansion,s25fs512s", "jedec,spi-nor"; + reg = <0>; + spi-max-frequency = <40000000>; + spi-rx-bus-width = <4>; + + partitions { + compatible = "fixed-partitions"; + #address-cells = <1>; + #size-cells = <1>; + + boot@0 { + reg = <0x0 0x1200000>; + read-only; + }; + user@1200000 { + reg = <0x1200000 0x2e00000>; + }; + }; + }; +}; + &rwdt { timeout-sec = <60>; status = "okay"; From 4a7b0850fc7a27ece15698b331267aa3d5dc6226 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 19 Feb 2024 12:23:23 -0800 Subject: [PATCH 0840/1406] dt-bindings: cros-ec: Add properties for GPIO controller The ChromeOS embedded controller (EC) supports setting the state of GPIOs when the system is unlocked, and getting the state of GPIOs in all cases. The GPIOs are on the EC itself, so the EC acts similar to a GPIO expander. Add the #gpio-cells and gpio-controller properties to the cros-ec binding so that other devices described in DT can get the GPIOs on the EC. 
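With these two properties in place, a consumer device described in DT can
request an EC pin through the ordinary gpiod API. A hypothetical sketch
(the "enable" con_id and the consumer device are made up for illustration;
EC accesses go over a slow transport, so only the _cansleep accessors are
valid):

  struct gpio_desc *desc;

  /* Resolves "enable-gpios" in the consumer's DT node; the phandle
   * points at the cros-ec node acting as the GPIO controller. */
  desc = devm_gpiod_get(dev, "enable", GPIOD_OUT_LOW);
  if (IS_ERR(desc))
          return PTR_ERR(desc);

  gpiod_set_value_cansleep(desc, 1);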
Signed-off-by: Stephen Boyd Signed-off-by: Bartosz Golaszewski --- Documentation/devicetree/bindings/mfd/google,cros-ec.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml b/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml index e1ca4f297c6d81..aac8819bd00ba4 100644 --- a/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml +++ b/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml @@ -93,6 +93,11 @@ properties: '#size-cells': const: 0 + '#gpio-cells': + const: 2 + + gpio-controller: true + typec: $ref: /schemas/chrome/google,cros-ec-typec.yaml# @@ -275,6 +280,8 @@ examples: interrupts = <99 0>; interrupt-parent = <&gpio7>; spi-max-frequency = <5000000>; + #gpio-cells = <2>; + gpio-controller; proximity { compatible = "google,cros-ec-mkbp-proximity"; From 56c608c9e773a9d9827643eec352d831f7da6220 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 19 Feb 2024 20:52:27 -0800 Subject: [PATCH 0841/1406] gpio: Add ChromeOS EC GPIO driver The ChromeOS embedded controller (EC) supports setting the state of GPIOs when the system is unlocked, and getting the state of GPIOs in all cases. The GPIOs are on the EC itself, so the EC acts similar to a GPIO expander. Add a driver to get and set the GPIOs on the EC through the host command interface. Signed-off-by: Stephen Boyd Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 10 ++ drivers/gpio/Makefile | 1 + drivers/gpio/gpio-cros-ec.c | 209 ++++++++++++++++++++++++++++++++++++ 3 files changed, 220 insertions(+) create mode 100644 drivers/gpio/gpio-cros-ec.c diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 3081406ff57a91..3fbb0bdb15c142 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -1241,6 +1241,16 @@ config GPIO_BD9571MWV This driver can also be built as a module. If so, the module will be called gpio-bd9571mwv. +config GPIO_CROS_EC + tristate "ChromeOS EC GPIO support" + depends on CROS_EC + help + GPIO driver for the ChromeOS Embedded Controller (EC). GPIOs + cannot be set unless the system is unlocked. + + This driver can also be built as a module. If so, the module + will be called gpio-cros-ec. + config GPIO_CRYSTAL_COVE tristate "GPIO support for Crystal Cove PMIC" depends on (X86 || COMPILE_TEST) && INTEL_SOC_PMIC diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile index 9e40af196aae67..7ae4d81de1df70 100644 --- a/drivers/gpio/Makefile +++ b/drivers/gpio/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_GPIO_BT8XX) += gpio-bt8xx.o obj-$(CONFIG_GPIO_CADENCE) += gpio-cadence.o obj-$(CONFIG_GPIO_CLPS711X) += gpio-clps711x.o obj-$(CONFIG_GPIO_SNPS_CREG) += gpio-creg-snps.o +obj-$(CONFIG_GPIO_CROS_EC) += gpio-cros-ec.o obj-$(CONFIG_GPIO_CRYSTAL_COVE) += gpio-crystalcove.o obj-$(CONFIG_GPIO_CS5535) += gpio-cs5535.o obj-$(CONFIG_GPIO_DA9052) += gpio-da9052.o diff --git a/drivers/gpio/gpio-cros-ec.c b/drivers/gpio/gpio-cros-ec.c new file mode 100644 index 00000000000000..842e1c06041442 --- /dev/null +++ b/drivers/gpio/gpio-cros-ec.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2024 Google LLC + * + * This driver provides the ability to control GPIOs on the Chrome OS EC. + * There isn't any direction control, and setting values on GPIOs is only + * possible when the system is unlocked. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Prefix all names to avoid collisions with EC <-> AP nets */ +static const char cros_ec_gpio_prefix[] = "EC:"; + +/* Setting gpios is only supported when the system is unlocked */ +static void cros_ec_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val) +{ + const char *name = gc->names[gpio] + strlen(cros_ec_gpio_prefix); + struct cros_ec_device *cros_ec = gpiochip_get_data(gc); + struct ec_params_gpio_set params = { + .val = val, + }; + int ret; + ssize_t copied; + + copied = strscpy(params.name, name, sizeof(params.name)); + if (copied < 0) + return; + + ret = cros_ec_cmd(cros_ec, 0, EC_CMD_GPIO_SET, ¶ms, + sizeof(params), NULL, 0); + if (ret < 0) + dev_err(gc->parent, "error setting gpio%d (%s) on EC: %d\n", gpio, name, ret); +} + +static int cros_ec_gpio_get(struct gpio_chip *gc, unsigned int gpio) +{ + const char *name = gc->names[gpio] + strlen(cros_ec_gpio_prefix); + struct cros_ec_device *cros_ec = gpiochip_get_data(gc); + struct ec_params_gpio_get params; + struct ec_response_gpio_get response; + int ret; + ssize_t copied; + + copied = strscpy(params.name, name, sizeof(params.name)); + if (copied < 0) + return -EINVAL; + + ret = cros_ec_cmd(cros_ec, 0, EC_CMD_GPIO_GET, ¶ms, + sizeof(params), &response, sizeof(response)); + if (ret < 0) { + dev_err(gc->parent, "error getting gpio%d (%s) on EC: %d\n", gpio, name, ret); + return ret; + } + + return response.val; +} + +#define CROS_EC_GPIO_INPUT BIT(8) +#define CROS_EC_GPIO_OUTPUT BIT(9) + +static int cros_ec_gpio_get_direction(struct gpio_chip *gc, unsigned int gpio) +{ + const char *name = gc->names[gpio] + strlen(cros_ec_gpio_prefix); + struct cros_ec_device *cros_ec = gpiochip_get_data(gc); + struct ec_params_gpio_get_v1 params = { + .subcmd = EC_GPIO_GET_INFO, + .get_info.index = gpio, + }; + struct ec_response_gpio_get_v1 response; + int ret; + + ret = cros_ec_cmd(cros_ec, 1, EC_CMD_GPIO_GET, ¶ms, + sizeof(params), &response, sizeof(response)); + if (ret < 0) { + dev_err(gc->parent, "error getting direction of gpio%d (%s) on EC: %d\n", gpio, name, ret); + return ret; + } + + if (response.get_info.flags & CROS_EC_GPIO_INPUT) + return GPIO_LINE_DIRECTION_IN; + + if (response.get_info.flags & CROS_EC_GPIO_OUTPUT) + return GPIO_LINE_DIRECTION_OUT; + + return -EINVAL; +} + +/* Query EC for all gpio line names */ +static int cros_ec_gpio_init_names(struct cros_ec_device *cros_ec, struct gpio_chip *gc) +{ + struct ec_params_gpio_get_v1 params = { + .subcmd = EC_GPIO_GET_INFO, + }; + struct ec_response_gpio_get_v1 response; + int ret, i; + /* EC may not NUL terminate */ + size_t name_len = strlen(cros_ec_gpio_prefix) + sizeof(response.get_info.name) + 1; + ssize_t copied; + const char **names; + char *str; + + names = devm_kcalloc(gc->parent, gc->ngpio, sizeof(*names), GFP_KERNEL); + if (!names) + return -ENOMEM; + gc->names = names; + + str = devm_kcalloc(gc->parent, gc->ngpio, name_len, GFP_KERNEL); + if (!str) + return -ENOMEM; + + /* Get gpio line names one at a time */ + for (i = 0; i < gc->ngpio; i++) { + params.get_info.index = i; + ret = cros_ec_cmd(cros_ec, 1, EC_CMD_GPIO_GET, ¶ms, + sizeof(params), &response, sizeof(response)); + if (ret < 0) { + dev_err_probe(gc->parent, ret, "error getting gpio%d info\n", i); + return ret; + } + + names[i] = str; + copied = scnprintf(str, name_len, "%s%s", cros_ec_gpio_prefix, + response.get_info.name); + if (copied < 0) + return copied; + + str += copied + 1; + } 
+
+	return 0;
+}
+
+/* Query EC for number of gpios */
+static int cros_ec_gpio_ngpios(struct cros_ec_device *cros_ec)
+{
+	struct ec_params_gpio_get_v1 params = {
+		.subcmd = EC_GPIO_GET_COUNT,
+	};
+	struct ec_response_gpio_get_v1 response;
+	int ret;
+
+	ret = cros_ec_cmd(cros_ec, 1, EC_CMD_GPIO_GET, &params,
+			  sizeof(params), &response, sizeof(response));
+	if (ret < 0)
+		return ret;
+
+	return response.get_count.val;
+}
+
+static int cros_ec_gpio_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct device *parent = dev->parent;
+	struct cros_ec_dev *ec_dev = dev_get_drvdata(parent);
+	struct cros_ec_device *cros_ec = ec_dev->ec_dev;
+	struct gpio_chip *gc;
+	int ngpios;
+	int ret;
+
+	/* Use the fwnode from the protocol device, e.g. cros-ec-spi */
+	device_set_node(dev, dev_fwnode(cros_ec->dev));
+
+	ngpios = cros_ec_gpio_ngpios(cros_ec);
+	if (ngpios < 0) {
+		dev_err_probe(dev, ngpios, "error getting gpio count\n");
+		return ngpios;
+	}
+
+	gc = devm_kzalloc(dev, sizeof(*gc), GFP_KERNEL);
+	if (!gc)
+		return -ENOMEM;
+
+	gc->ngpio = ngpios;
+	gc->parent = dev;
+	ret = cros_ec_gpio_init_names(cros_ec, gc);
+	if (ret)
+		return ret;
+
+	gc->can_sleep = true;
+	gc->label = dev_name(dev);
+	gc->base = -1;
+	gc->set = cros_ec_gpio_set;
+	gc->get = cros_ec_gpio_get;
+	gc->get_direction = cros_ec_gpio_get_direction;
+
+	return devm_gpiochip_add_data(dev, gc, cros_ec);
+}
+
+static struct platform_driver cros_ec_gpio_driver = {
+	.probe = cros_ec_gpio_probe,
+	.driver = {
+		.name = "cros-ec-gpio",
+	},
+};
+module_platform_driver(cros_ec_gpio_driver);
+
+MODULE_DESCRIPTION("ChromeOS EC GPIO Driver");
+MODULE_LICENSE("GPL");

From 5b3c6bc8003df24b43edb14e07992f83ff21b490 Mon Sep 17 00:00:00 2001
From: Cristian Marussi
Date: Wed, 14 Feb 2024 18:30:01 +0000
Subject: [PATCH 0842/1406] firmware: arm_scmi: Add support for v3.2
 NEGOTIATE_PROTOCOL_VERSION

The freshly introduced NEGOTIATE_PROTOCOL_VERSION command allows the
agent to ascertain upfront if a specific protocol (usually older)
version is supported by the platform. It is used by the agent in case
the platform has advertised the support of a newer protocol version
than the latest version supported by the agent, since backward
compatibility cannot be automatically assumed.

Emit a warning about possible incompatibility when negotiation was not
possible, or just print the successfully negotiated protocol version.

Signed-off-by: Cristian Marussi
Link: https://lore.kernel.org/r/20240214183006.3403207-3-cristian.marussi@arm.com
Signed-off-by: Sudeep Holla
---
 drivers/firmware/arm_scmi/driver.c    | 65 ++++++++++++++++++++++++---
 drivers/firmware/arm_scmi/protocols.h |  1 +
 2 files changed, 61 insertions(+), 5 deletions(-)

diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c
index 4a64ad5c21ee5c..34d77802c990ac 100644
--- a/drivers/firmware/arm_scmi/driver.c
+++ b/drivers/firmware/arm_scmi/driver.c
@@ -86,6 +86,12 @@ struct scmi_xfers_info {
  * @users: A refcount to track effective users of this protocol.
  * @priv: Reference for optional protocol private data.
  * @version: Protocol version supported by the platform as detected at runtime.
+ * @negotiated_version: When the platform supports a newer protocol version,
+ *			the agent will try to negotiate with the platform the
+ *			usage of the newest version known to it, since
+ *			backward compatibility is NOT automatically assured.
+ *			This field is NON-zero when a successful negotiation
+ *			has completed.
 * @ph: An embedded protocol handle that will be passed down to protocol
 *	initialization code to identify this instance.
 *
@@ -99,6 +105,7 @@ struct scmi_protocol_instance {
 	refcount_t users;
 	void *priv;
 	unsigned int version;
+	unsigned int negotiated_version;
 	struct scmi_protocol_handle ph;
 };
 
@@ -1815,6 +1822,44 @@ scmi_revision_area_get(const struct scmi_protocol_handle *ph)
 	return pi->handle->version;
 }
 
+/**
+ * scmi_protocol_version_negotiate - Negotiate protocol version
+ *
+ * @ph: A reference to the protocol handle.
+ *
+ * A helper to negotiate a protocol version different from the latest
+ * advertised as supported by the platform: on success, backward
+ * compatibility is assured by the platform.
+ *
+ * Return: 0 on Success
+ */
+static int scmi_protocol_version_negotiate(struct scmi_protocol_handle *ph)
+{
+	int ret;
+	struct scmi_xfer *t;
+	struct scmi_protocol_instance *pi = ph_to_pi(ph);
+
+	/* At first check if NEGOTIATE_PROTOCOL_VERSION is supported ... */
+	ret = scmi_protocol_msg_check(ph, NEGOTIATE_PROTOCOL_VERSION, NULL);
+	if (ret)
+		return ret;
+
+	/* ... then attempt protocol version negotiation */
+	ret = xfer_get_init(ph, NEGOTIATE_PROTOCOL_VERSION,
+			    sizeof(__le32), 0, &t);
+	if (ret)
+		return ret;
+
+	put_unaligned_le32(pi->proto->supported_version, t->tx.buf);
+	ret = do_xfer(ph, t);
+	if (!ret)
+		pi->negotiated_version = pi->proto->supported_version;
+
+	xfer_put(ph, t);
+
+	return ret;
+}
+
 /**
  * scmi_alloc_init_protocol_instance - Allocate and initialize a protocol
  * instance descriptor.
@@ -1887,11 +1932,21 @@ scmi_alloc_init_protocol_instance(struct scmi_info *info,
 	devres_close_group(handle->dev, pi->gid);
 	dev_dbg(handle->dev, "Initialized protocol: 0x%X\n", pi->proto->id);
 
-	if (pi->version > proto->supported_version)
-		dev_warn(handle->dev,
-			 "Detected UNSUPPORTED higher version 0x%X for protocol 0x%X."
-			 "Backward compatibility is NOT assured.\n",
-			 pi->version, pi->proto->id);
+	if (pi->version > proto->supported_version) {
+		ret = scmi_protocol_version_negotiate(&pi->ph);
+		if (!ret) {
+			dev_info(handle->dev,
+				 "Protocol 0x%X successfully negotiated version 0x%X\n",
+				 proto->id, pi->negotiated_version);
+		} else {
+			dev_warn(handle->dev,
+				 "Detected UNSUPPORTED higher version 0x%X for protocol 0x%X.\n",
+				 pi->version, pi->proto->id);
+			dev_warn(handle->dev,
+				 "Trying version 0x%X. Backward compatibility is NOT assured.\n",
+				 pi->proto->supported_version);
+		}
+	}
 
 	return pi;
 
diff --git a/drivers/firmware/arm_scmi/protocols.h b/drivers/firmware/arm_scmi/protocols.h
index 26a3edd49fea79..693019fff0f67e 100644
--- a/drivers/firmware/arm_scmi/protocols.h
+++ b/drivers/firmware/arm_scmi/protocols.h
@@ -33,6 +33,7 @@ enum scmi_common_cmd {
 	PROTOCOL_VERSION = 0x0,
 	PROTOCOL_ATTRIBUTES = 0x1,
 	PROTOCOL_MESSAGE_ATTRIBUTES = 0x2,
+	NEGOTIATE_PROTOCOL_VERSION = 0x10,
 };
 
 /**

From de57bf58ca3ccfe304885fb4b4443f5aaf3a6989 Mon Sep 17 00:00:00 2001
From: Cristian Marussi
Date: Wed, 14 Feb 2024 18:30:02 +0000
Subject: [PATCH 0843/1406] firmware: arm_scmi: Add clock check for extended
 config support

SCMI v3.2 added support to set/get custom clock OEM types; such support
is conditionally present, though, depending on an extended config
attribute bit possibly advertised by the platform server on a
per-domain basis.

Add a check to verify if OEM types are supported before allowing any
kind of OEM-specific get/set operation. Also add a check around all the
new v3.2 clock features.
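For illustration only, a minimal sketch of how a protocol user could
probe for this support through the existing operations; the wrapper name
and the use of the first OEM type (0x80) are hypothetical, and clk_ops
stands for a struct scmi_clk_proto_ops handle obtained from the SCMI
core:

	static bool scmi_clk_has_oem_cfg(const struct scmi_clk_proto_ops *clk_ops,
					 const struct scmi_protocol_handle *ph,
					 u32 clk_id)
	{
		u32 val, attrs;

		/* Domains without the extended config attribute bit now
		 * fail fast with -EOPNOTSUPP instead of issuing a
		 * CLOCK_CONFIG_GET that the platform may reject.
		 */
		return clk_ops->config_oem_get(ph, clk_id, 0x80, &val,
					       &attrs, false) != -EOPNOTSUPP;
	}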
Signed-off-by: Cristian Marussi Link: https://lore.kernel.org/r/20240214183006.3403207-4-cristian.marussi@arm.com Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/clock.c | 33 +++++++++++++++++++++++++------ include/linux/scmi_protocol.h | 1 + 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/drivers/firmware/arm_scmi/clock.c b/drivers/firmware/arm_scmi/clock.c index 85eda5db40bab4..add350bf2a7f2d 100644 --- a/drivers/firmware/arm_scmi/clock.c +++ b/drivers/firmware/arm_scmi/clock.c @@ -54,6 +54,7 @@ struct scmi_msg_resp_clock_attributes { #define SUPPORTS_RATE_CHANGE_REQUESTED_NOTIF(x) ((x) & BIT(30)) #define SUPPORTS_EXTENDED_NAMES(x) ((x) & BIT(29)) #define SUPPORTS_PARENT_CLOCK(x) ((x) & BIT(28)) +#define SUPPORTS_EXTENDED_CONFIG(x) ((x) & BIT(27)) #define SUPPORTS_GET_PERMISSIONS(x) ((x) & BIT(1)) u8 name[SCMI_SHORT_NAME_MAX_SIZE]; __le32 clock_enable_latency; @@ -388,10 +389,14 @@ static int scmi_clock_attributes_get(const struct scmi_protocol_handle *ph, if (cinfo->notify_rate_change_requested_cmd && SUPPORTS_RATE_CHANGE_REQUESTED_NOTIF(attributes)) clk->rate_change_requested_notifications = true; - if (SUPPORTS_PARENT_CLOCK(attributes)) - scmi_clock_possible_parents(ph, clk_id, clk); - if (SUPPORTS_GET_PERMISSIONS(attributes)) - scmi_clock_get_permissions(ph, clk_id, clk); + if (PROTOCOL_REV_MAJOR(version) >= 0x3) { + if (SUPPORTS_PARENT_CLOCK(attributes)) + scmi_clock_possible_parents(ph, clk_id, clk); + if (SUPPORTS_GET_PERMISSIONS(attributes)) + scmi_clock_get_permissions(ph, clk_id, clk); + if (SUPPORTS_EXTENDED_CONFIG(attributes)) + clk->extended_config = true; + } } return ret; @@ -700,7 +705,7 @@ scmi_clock_get_parent(const struct scmi_protocol_handle *ph, u32 clk_id, return ret; } -/* For SCMI clock v2.1 and onwards */ +/* For SCMI clock v3.0 and onwards */ static int scmi_clock_config_set_v2(const struct scmi_protocol_handle *ph, u32 clk_id, enum clk_state state, u8 oem_type, u32 oem_val, @@ -773,7 +778,7 @@ static int scmi_clock_disable(const struct scmi_protocol_handle *ph, u32 clk_id, NULL_OEM_TYPE, 0, atomic); } -/* For SCMI clock v2.1 and onwards */ +/* For SCMI clock v3.0 and onwards */ static int scmi_clock_config_get_v2(const struct scmi_protocol_handle *ph, u32 clk_id, u8 oem_type, u32 *attributes, bool *enabled, @@ -860,6 +865,14 @@ static int scmi_clock_config_oem_set(const struct scmi_protocol_handle *ph, bool atomic) { struct clock_info *ci = ph->get_priv(ph); + struct scmi_clock_info *clk; + + clk = scmi_clock_domain_lookup(ci, clk_id); + if (IS_ERR(clk)) + return PTR_ERR(clk); + + if (!clk->extended_config) + return -EOPNOTSUPP; return ci->clock_config_set(ph, clk_id, CLK_STATE_UNCHANGED, oem_type, oem_val, atomic); @@ -870,6 +883,14 @@ static int scmi_clock_config_oem_get(const struct scmi_protocol_handle *ph, u32 *attributes, bool atomic) { struct clock_info *ci = ph->get_priv(ph); + struct scmi_clock_info *clk; + + clk = scmi_clock_domain_lookup(ci, clk_id); + if (IS_ERR(clk)) + return PTR_ERR(clk); + + if (!clk->extended_config) + return -EOPNOTSUPP; return ci->clock_config_get(ph, clk_id, oem_type, attributes, NULL, oem_val, atomic); diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 9b9351e07a110d..46a61173c91ca8 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -50,6 +50,7 @@ struct scmi_clock_info { bool state_ctrl_forbidden; bool rate_ctrl_forbidden; bool parent_ctrl_forbidden; + bool extended_config; union { struct { int num_rates; From 
e37453587a52743bd2a3c81b06376b8924897968 Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Wed, 14 Feb 2024 18:30:03 +0000 Subject: [PATCH 0844/1406] firmware: arm_scmi: Add standard clock OEM definitions Add a common enum to define the standard clock OEM types defined by the SCMI specification, so as to enable the configuration of such extended configuration properties with the existent clock protocol operations. Signed-off-by: Cristian Marussi Link: https://lore.kernel.org/r/20240214183006.3403207-5-cristian.marussi@arm.com Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/clock.c | 32 ++++++++++++++++++------------- include/linux/scmi_protocol.h | 14 +++++++++++--- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/drivers/firmware/arm_scmi/clock.c b/drivers/firmware/arm_scmi/clock.c index add350bf2a7f2d..95393af655edd8 100644 --- a/drivers/firmware/arm_scmi/clock.c +++ b/drivers/firmware/arm_scmi/clock.c @@ -165,10 +165,12 @@ struct clock_info { struct scmi_clock_info *clk; int (*clock_config_set)(const struct scmi_protocol_handle *ph, u32 clk_id, enum clk_state state, - u8 oem_type, u32 oem_val, bool atomic); + enum scmi_clock_oem_config oem_type, + u32 oem_val, bool atomic); int (*clock_config_get)(const struct scmi_protocol_handle *ph, - u32 clk_id, u8 oem_type, u32 *attributes, - bool *enabled, u32 *oem_val, bool atomic); + u32 clk_id, enum scmi_clock_oem_config oem_type, + u32 *attributes, bool *enabled, u32 *oem_val, + bool atomic); }; static enum scmi_clock_protocol_cmd evt_2_cmd[] = { @@ -618,7 +620,8 @@ static int scmi_clock_rate_set(const struct scmi_protocol_handle *ph, static int scmi_clock_config_set(const struct scmi_protocol_handle *ph, u32 clk_id, - enum clk_state state, u8 __unused0, u32 __unused1, + enum clk_state state, + enum scmi_clock_oem_config __unused0, u32 __unused1, bool atomic) { int ret; @@ -708,7 +711,8 @@ scmi_clock_get_parent(const struct scmi_protocol_handle *ph, u32 clk_id, /* For SCMI clock v3.0 and onwards */ static int scmi_clock_config_set_v2(const struct scmi_protocol_handle *ph, u32 clk_id, - enum clk_state state, u8 oem_type, u32 oem_val, + enum clk_state state, + enum scmi_clock_oem_config oem_type, u32 oem_val, bool atomic) { int ret; @@ -781,8 +785,8 @@ static int scmi_clock_disable(const struct scmi_protocol_handle *ph, u32 clk_id, /* For SCMI clock v3.0 and onwards */ static int scmi_clock_config_get_v2(const struct scmi_protocol_handle *ph, u32 clk_id, - u8 oem_type, u32 *attributes, bool *enabled, - u32 *oem_val, bool atomic) + enum scmi_clock_oem_config oem_type, u32 *attributes, + bool *enabled, u32 *oem_val, bool atomic) { int ret; u32 flags; @@ -823,8 +827,8 @@ scmi_clock_config_get_v2(const struct scmi_protocol_handle *ph, u32 clk_id, static int scmi_clock_config_get(const struct scmi_protocol_handle *ph, u32 clk_id, - u8 oem_type, u32 *attributes, bool *enabled, - u32 *oem_val, bool atomic) + enum scmi_clock_oem_config oem_type, u32 *attributes, + bool *enabled, u32 *oem_val, bool atomic) { int ret; struct scmi_xfer *t; @@ -861,8 +865,9 @@ static int scmi_clock_state_get(const struct scmi_protocol_handle *ph, } static int scmi_clock_config_oem_set(const struct scmi_protocol_handle *ph, - u32 clk_id, u8 oem_type, u32 oem_val, - bool atomic) + u32 clk_id, + enum scmi_clock_oem_config oem_type, + u32 oem_val, bool atomic) { struct clock_info *ci = ph->get_priv(ph); struct scmi_clock_info *clk; @@ -879,8 +884,9 @@ static int scmi_clock_config_oem_set(const struct scmi_protocol_handle *ph, } static int 
scmi_clock_config_oem_get(const struct scmi_protocol_handle *ph,
-			  u32 clk_id, u8 oem_type, u32 *oem_val,
-			  u32 *attributes, bool atomic)
+			  u32 clk_id,
+			  enum scmi_clock_oem_config oem_type,
+			  u32 *oem_val, u32 *attributes, bool atomic)
 {
 	struct clock_info *ci = ph->get_priv(ph);
 	struct scmi_clock_info *clk;
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 46a61173c91ca8..2ee94ff0320c32 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -76,6 +76,13 @@ struct scmi_handle;
 struct scmi_device;
 struct scmi_protocol_handle;
 
+enum scmi_clock_oem_config {
+	SCMI_CLOCK_CFG_DUTY_CYCLE = 0x1,
+	SCMI_CLOCK_CFG_PHASE,
+	SCMI_CLOCK_CFG_OEM_START = 0x80,
+	SCMI_CLOCK_CFG_OEM_END = 0xFF,
+};
+
 /**
  * struct scmi_clk_proto_ops - represents the various operations provided
  * by SCMI Clock Protocol
@@ -108,10 +115,11 @@ struct scmi_clk_proto_ops {
 	int (*state_get)(const struct scmi_protocol_handle *ph, u32 clk_id,
 			 bool *enabled, bool atomic);
 	int (*config_oem_get)(const struct scmi_protocol_handle *ph, u32 clk_id,
-			      u8 oem_type, u32 *oem_val, u32 *attributes,
-			      bool atomic);
+			      enum scmi_clock_oem_config oem_type,
+			      u32 *oem_val, u32 *attributes, bool atomic);
 	int (*config_oem_set)(const struct scmi_protocol_handle *ph, u32 clk_id,
-			      u8 oem_type, u32 oem_val, bool atomic);
+			      enum scmi_clock_oem_config oem_type,
+			      u32 oem_val, bool atomic);
 	int (*parent_get)(const struct scmi_protocol_handle *ph, u32 clk_id, u32 *parent_id);
 	int (*parent_set)(const struct scmi_protocol_handle *ph, u32 clk_id, u32 parent_id);
 };

From 666d09dd5e8fa82c6ed2e382d7623bbcb7f7a68a Mon Sep 17 00:00:00 2001
From: Cristian Marussi
Date: Wed, 14 Feb 2024 18:30:04 +0000
Subject: [PATCH 0845/1406] firmware: arm_scmi: Update the supported clock
 protocol version

And finally update the supported clock protocol version to v3.2
(0x30000).

Signed-off-by: Cristian Marussi
Link: https://lore.kernel.org/r/20240214183006.3403207-6-cristian.marussi@arm.com
Signed-off-by: Sudeep Holla
---
 drivers/firmware/arm_scmi/clock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/firmware/arm_scmi/clock.c b/drivers/firmware/arm_scmi/clock.c
index 95393af655edd8..134019297d08b1 100644
--- a/drivers/firmware/arm_scmi/clock.c
+++ b/drivers/firmware/arm_scmi/clock.c
@@ -13,7 +13,7 @@
 #include "notify.h"
 
 /* Updated only after ALL the mandatory features for that version are merged */
-#define SCMI_PROTOCOL_SUPPORTED_VERSION		0x20000
+#define SCMI_PROTOCOL_SUPPORTED_VERSION		0x30000
 
 enum scmi_clock_protocol_cmd {
 	CLOCK_ATTRIBUTES = 0x3,

From f20d4a130208b6858bf7441d39834a2c915937ae Mon Sep 17 00:00:00 2001
From: Peng Fan
Date: Sun, 21 Jan 2024 19:09:01 +0800
Subject: [PATCH 0846/1406] clk: scmi: Add support for forbidden clock state
 controls

Some clocks may be exported to the OS agent, while certain
configuration/access operations on those clocks are restricted for the
OS agent by the SCMI platform firmware. For example:

SYS_CLK1-----
             \
              --MUX--->MMC1_CLK
             /
SYS_CLK2-----

MMC1_CLK needs to set its parent as part of its initialisation and
enabling. SYS_CLK1 and SYS_CLK2 are exported to the OS agent. The clk
propagation will access SYS_CLK1 or SYS_CLK2 based on the
configuration. However, we need to bypass the failure to access
SYS_CLK1 or SYS_CLK2 when MMC1_CLK is accessed and enabled.

Add a separate scmi_no_state_ctrl clk_ops where the calls to the SCMI
platform firmware are avoided if the access is not permitted.
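For reference, a hedged sketch of the consumer-visible effect (the
device pointer and the "mmc1" clock name are hypothetical, following the
example above):

	struct clk *mmc = devm_clk_get(dev, "mmc1");

	if (IS_ERR(mmc))
		return PTR_ERR(mmc);

	/* Enabling MMC1_CLK propagates to its SYS_CLK parent; with
	 * scmi_no_state_ctrl_clk_ops the parent has no .prepare/.enable
	 * callbacks, so the clock framework skips them as no-ops instead
	 * of sending a CLOCK_CONFIG_SET that the firmware would deny.
	 */
	return clk_prepare_enable(mmc);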
Signed-off-by: Peng Fan
Link: https://lore.kernel.org/r/20240121110901.1414856-2-peng.fan@oss.nxp.com
Suggested-by: Cristian Marussi
Reviewed-by: Cristian Marussi
Tested-by: Cristian Marussi
Signed-off-by: Sudeep Holla
---
 drivers/clk/clk-scmi.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/clk/clk-scmi.c b/drivers/clk/clk-scmi.c
index 8cbe24789c24bb..5747b6d651f04f 100644
--- a/drivers/clk/clk-scmi.c
+++ b/drivers/clk/clk-scmi.c
@@ -194,6 +194,15 @@ static const struct clk_ops scmi_atomic_clk_ops = {
 	.determine_rate = scmi_clk_determine_rate,
 };
 
+static const struct clk_ops scmi_no_state_ctrl_clk_ops = {
+	.recalc_rate = scmi_clk_recalc_rate,
+	.round_rate = scmi_clk_round_rate,
+	.set_rate = scmi_clk_set_rate,
+	.set_parent = scmi_clk_set_parent,
+	.get_parent = scmi_clk_get_parent,
+	.determine_rate = scmi_clk_determine_rate,
+};
+
 static int scmi_clk_ops_init(struct device *dev, struct scmi_clk *sclk,
 			     const struct clk_ops *scmi_ops)
 {
@@ -290,8 +299,10 @@ static int scmi_clocks_probe(struct scmi_device *sdev)
 		 * specify (or support) an enable_latency associated with a
 		 * clock, we default to use atomic operations mode.
 		 */
-		if (is_atomic &&
-		    sclk->info->enable_latency <= atomic_threshold)
+		if (sclk->info->state_ctrl_forbidden)
+			scmi_ops = &scmi_no_state_ctrl_clk_ops;
+		else if (is_atomic &&
+			 sclk->info->enable_latency <= atomic_threshold)
 			scmi_ops = &scmi_atomic_clk_ops;
 		else
 			scmi_ops = &scmi_clk_ops;

From a99ec6a1d9c12885cc4c41ff6c1d71ae2738f3e0 Mon Sep 17 00:00:00 2001
From: Cristian Marussi
Date: Wed, 14 Feb 2024 18:30:05 +0000
Subject: [PATCH 0847/1406] clk: scmi: Allocate CLK operations dynamically

SCMI clock descriptors expose an increasing number of properties which
in turn lead to different sets of supported CLK operations being
associated dynamically with a clock.

Providing statically pre-defined CLK operations structs for all the
possible combinations of allowed properties is cumbersome and
error-prone.

Allocate a per-clock operations descriptor dynamically and populate it
with the strictly needed set of operations, depending on the advertised
clock properties.

CC: Michael Turquette
CC: Stephen Boyd
CC:
Signed-off-by: Cristian Marussi
Link: https://lore.kernel.org/r/20240214183006.3403207-7-cristian.marussi@arm.com
Signed-off-by: Sudeep Holla
---
 drivers/clk/clk-scmi.c | 131 ++++++++++++++++++++++-------------------
 1 file changed, 71 insertions(+), 60 deletions(-)

diff --git a/drivers/clk/clk-scmi.c b/drivers/clk/clk-scmi.c
index 5747b6d651f04f..e35f4ff0e12652 100644
--- a/drivers/clk/clk-scmi.c
+++ b/drivers/clk/clk-scmi.c
@@ -158,51 +158,6 @@ static int scmi_clk_atomic_is_enabled(struct clk_hw *hw)
 	return !!enabled;
 }
 
-/*
- * We can provide enable/disable/is_enabled atomic callbacks only if the
- * underlying SCMI transport for an SCMI instance is configured to handle
- * SCMI commands in an atomic manner.
- *
- * When no SCMI atomic transport support is available we instead provide only
- * the prepare/unprepare API, as allowed by the clock framework when atomic
- * calls are not available.
- *
- * Two distinct sets of clk_ops are provided since we could have multiple SCMI
- * instances with different underlying transport quality, so they cannot be
- * shared.
- */ -static const struct clk_ops scmi_clk_ops = { - .recalc_rate = scmi_clk_recalc_rate, - .round_rate = scmi_clk_round_rate, - .set_rate = scmi_clk_set_rate, - .prepare = scmi_clk_enable, - .unprepare = scmi_clk_disable, - .set_parent = scmi_clk_set_parent, - .get_parent = scmi_clk_get_parent, - .determine_rate = scmi_clk_determine_rate, -}; - -static const struct clk_ops scmi_atomic_clk_ops = { - .recalc_rate = scmi_clk_recalc_rate, - .round_rate = scmi_clk_round_rate, - .set_rate = scmi_clk_set_rate, - .enable = scmi_clk_atomic_enable, - .disable = scmi_clk_atomic_disable, - .is_enabled = scmi_clk_atomic_is_enabled, - .set_parent = scmi_clk_set_parent, - .get_parent = scmi_clk_get_parent, - .determine_rate = scmi_clk_determine_rate, -}; - -static const struct clk_ops scmi_no_state_ctrl_clk_ops = { - .recalc_rate = scmi_clk_recalc_rate, - .round_rate = scmi_clk_round_rate, - .set_rate = scmi_clk_set_rate, - .set_parent = scmi_clk_set_parent, - .get_parent = scmi_clk_get_parent, - .determine_rate = scmi_clk_determine_rate, -}; - static int scmi_clk_ops_init(struct device *dev, struct scmi_clk *sclk, const struct clk_ops *scmi_ops) { @@ -239,10 +194,75 @@ static int scmi_clk_ops_init(struct device *dev, struct scmi_clk *sclk, return ret; } +/** + * scmi_clk_ops_alloc() - Alloc and configure CLK ops + * @sclk: A reference to an SCMI clock descriptor + * @atomic_capable: A flag to indicate if atomic mode is supported by the + * transport + * @atomic_threshold: Platform atomic threshold value + * + * Allocate and configure a proper set of CLK operations depending on the + * specific SCMI clock characteristics and platform atomic operation capability. + * + * We can provide enable/disable/is_enabled atomic callbacks only if the + * underlying SCMI transport for an SCMI instance is configured to handle + * SCMI commands in an atomic manner. + * + * When no SCMI atomic transport support is available we instead provide only + * the prepare/unprepare API, as allowed by the clock framework when atomic + * calls are not available. + * + * Return: A pointer to the allocated and configured clk_ops on Success, + * NULL otherwise. + */ +static const struct clk_ops * +scmi_clk_ops_alloc(struct scmi_clk *sclk, bool atomic_capable, + unsigned int atomic_threshold) +{ + const struct scmi_clock_info *ci = sclk->info; + struct clk_ops *ops; + + ops = devm_kzalloc(sclk->dev, sizeof(*ops), GFP_KERNEL); + if (!ops) + return NULL; + + /* + * Note that when transport is atomic but SCMI protocol did not + * specify (or support) an enable_latency associated with a + * clock, we default to use atomic operations mode. 
+	 */
+	if (!ci->state_ctrl_forbidden) {
+		if (atomic_capable && ci->enable_latency <= atomic_threshold) {
+			ops->enable = scmi_clk_atomic_enable;
+			ops->disable = scmi_clk_atomic_disable;
+		} else {
+			ops->prepare = scmi_clk_enable;
+			ops->unprepare = scmi_clk_disable;
+		}
+	}
+
+	if (atomic_capable)
+		ops->is_enabled = scmi_clk_atomic_is_enabled;
+
+	/* Rate ops */
+	ops->recalc_rate = scmi_clk_recalc_rate;
+	ops->round_rate = scmi_clk_round_rate;
+	ops->determine_rate = scmi_clk_determine_rate;
+	if (!ci->rate_ctrl_forbidden)
+		ops->set_rate = scmi_clk_set_rate;
+
+	/* Parent ops */
+	ops->get_parent = scmi_clk_get_parent;
+	if (!ci->parent_ctrl_forbidden)
+		ops->set_parent = scmi_clk_set_parent;
+
+	return ops;
+}
+
 static int scmi_clocks_probe(struct scmi_device *sdev)
 {
 	int idx, count, err;
-	unsigned int atomic_threshold;
+	unsigned int atomic_threshold = 0;
 	bool is_atomic;
 	struct clk_hw **hws;
 	struct clk_hw_onecell_data *clk_data;
@@ -294,18 +314,9 @@ static int scmi_clocks_probe(struct scmi_device *sdev)
 		sclk->ph = ph;
 		sclk->dev = dev;
 
-		/*
-		 * Note that when transport is atomic but SCMI protocol did not
-		 * specify (or support) an enable_latency associated with a
-		 * clock, we default to use atomic operations mode.
-		 */
-		if (sclk->info->state_ctrl_forbidden)
-			scmi_ops = &scmi_no_state_ctrl_clk_ops;
-		else if (is_atomic &&
-			 sclk->info->enable_latency <= atomic_threshold)
-			scmi_ops = &scmi_atomic_clk_ops;
-		else
-			scmi_ops = &scmi_clk_ops;
+		scmi_ops = scmi_clk_ops_alloc(sclk, is_atomic, atomic_threshold);
+		if (!scmi_ops)
+			return -ENOMEM;
 
 		/* Initialize clock parent data. */
 		if (sclk->info->num_parents > 0) {
@@ -324,13 +335,13 @@ static int scmi_clocks_probe(struct scmi_device *sdev)
 		if (err) {
 			dev_err(dev, "failed to register clock %d\n", idx);
 			devm_kfree(dev, sclk->parent_data);
+			devm_kfree(dev, scmi_ops);
 			devm_kfree(dev, sclk);
 			hws[idx] = NULL;
 		} else {
 			dev_dbg(dev, "Registered clock:%s%s\n",
 				sclk->info->name,
-				scmi_ops == &scmi_atomic_clk_ops ?
-				" (atomic ops)" : "");
+				scmi_ops->enable ? " (atomic ops)" : "");
 			hws[idx] = &sclk->hw;
 		}
 	}

From 1c2c88cfcb2bfb14245cf27661ade415fbb7ea9a Mon Sep 17 00:00:00 2001
From: Cristian Marussi
Date: Wed, 14 Feb 2024 18:30:06 +0000
Subject: [PATCH 0848/1406] clk: scmi: Support get/set duty_cycle operations

Provide the CLK framework callbacks related to get/set clock duty cycle
if the related SCMI clock supports OEM extended configurations.
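Since the SCMI OEM duty-cycle value is a percentage while the clock
framework passes a num/den ratio, the new callbacks convert between the
two. A minimal consumer-side sketch (the device pointer and "pwm" clock
name are hypothetical):

	struct clk *pwm_clk = devm_clk_get(dev, "pwm");

	if (IS_ERR(pwm_clk))
		return PTR_ERR(pwm_clk);

	/* clk-scmi converts the 1/4 ratio to the percentage 25 and sends
	 * it as SCMI_CLOCK_CFG_DUTY_CYCLE (OEM type 0x1); reads come back
	 * with den fixed at 100.
	 */
	return clk_set_duty_cycle(pwm_clk, 1, 4);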
CC: Michael Turquette CC: Stephen Boyd CC: Signed-off-by: Cristian Marussi Link: https://lore.kernel.org/r/20240214183006.3403207-8-cristian.marussi@arm.com Signed-off-by: Sudeep Holla --- drivers/clk/clk-scmi.c | 45 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/drivers/clk/clk-scmi.c b/drivers/clk/clk-scmi.c index e35f4ff0e12652..30696b39382b5b 100644 --- a/drivers/clk/clk-scmi.c +++ b/drivers/clk/clk-scmi.c @@ -158,6 +158,45 @@ static int scmi_clk_atomic_is_enabled(struct clk_hw *hw) return !!enabled; } +static int scmi_clk_get_duty_cycle(struct clk_hw *hw, struct clk_duty *duty) +{ + int ret; + u32 val; + struct scmi_clk *clk = to_scmi_clk(hw); + + ret = scmi_proto_clk_ops->config_oem_get(clk->ph, clk->id, + SCMI_CLOCK_CFG_DUTY_CYCLE, + &val, NULL, false); + if (!ret) { + duty->num = val; + duty->den = 100; + } else { + dev_warn(clk->dev, + "Failed to get duty cycle for clock ID %d\n", clk->id); + } + + return ret; +} + +static int scmi_clk_set_duty_cycle(struct clk_hw *hw, struct clk_duty *duty) +{ + int ret; + u32 val; + struct scmi_clk *clk = to_scmi_clk(hw); + + /* SCMI OEM Duty Cycle is expressed as a percentage */ + val = (duty->num * 100) / duty->den; + ret = scmi_proto_clk_ops->config_oem_set(clk->ph, clk->id, + SCMI_CLOCK_CFG_DUTY_CYCLE, + val, false); + if (ret) + dev_warn(clk->dev, + "Failed to set duty cycle(%u/%u) for clock ID %d\n", + duty->num, duty->den, clk->id); + + return ret; +} + static int scmi_clk_ops_init(struct device *dev, struct scmi_clk *sclk, const struct clk_ops *scmi_ops) { @@ -256,6 +295,12 @@ scmi_clk_ops_alloc(struct scmi_clk *sclk, bool atomic_capable, if (!ci->parent_ctrl_forbidden) ops->set_parent = scmi_clk_set_parent; + /* Duty cycle */ + if (ci->extended_config) { + ops->get_duty_cycle = scmi_clk_get_duty_cycle; + ops->set_duty_cycle = scmi_clk_set_duty_cycle; + } + return ops; } From f5c43d3ec893dd3066228393666a8ed575e071e8 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Thu, 21 Sep 2023 17:02:33 +0300 Subject: [PATCH 0849/1406] accel/habanalabs/gaudi2: add interrupt affinity for user interrupts User interrupts are MSIx interrupts coming from Gaudi2, that have specific range of IDs and are assigned to the sole use of the user process that opened the Gaudi2 device (reminder: there can be only a single user process running on Gaudi2 at any given time). The interrupts are allocated and managed by the driver and therefore, the user expects the driver to initialize them properly, which also includes setting the affinity to the related CPU cores of the device's NUMA node to get maximum performance. 
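In outline, the affinity mask is derived once at MSI-X setup time and
then applied to every user interrupt; a condensed sketch of the flow
added below, where node_mask and cpu are local illustration variables:

	/* Online CPUs of the device's NUMA node, one thread per core */
	cpumask_and(&node_mask, cpumask_of_node(hdev->pdev->dev.numa_node),
		    cpu_online_mask);
	for_each_cpu(cpu, &node_mask)
		cpumask_set_cpu(cpumask_first(topology_sibling_cpumask(cpu)),
				&hdev->irq_affinity_mask);

	/* Later, for each user MSI-X vector */
	irq_set_affinity_hint(irq, &hdev->irq_affinity_mask);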
Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/device.c | 32 ++++++++++++++++++++ drivers/accel/habanalabs/common/habanalabs.h | 5 +++ drivers/accel/habanalabs/gaudi2/gaudi2.c | 5 +++ 3 files changed, 42 insertions(+) diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index a73bd4be94b156..5eacbc73f1bb98 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -2801,3 +2801,35 @@ void hl_enable_err_info_capture(struct hl_error_info *captured_err_info) atomic_set(&captured_err_info->cs_timeout.write_enable, 1); captured_err_info->undef_opcode.write_enable = true; } + +void hl_init_cpu_for_irq(struct hl_device *hdev) +{ +#ifdef CONFIG_NUMA + struct cpumask *available_mask = &hdev->irq_affinity_mask; + int numa_node = hdev->pdev->dev.numa_node, i; + static struct cpumask cpu_mask; + + if (numa_node < 0) + return; + + if (!cpumask_and(&cpu_mask, cpumask_of_node(numa_node), cpu_online_mask)) { + dev_err(hdev->dev, "No available affinities in current numa node\n"); + return; + } + + /* Remove HT siblings */ + for_each_cpu(i, &cpu_mask) + cpumask_set_cpu(cpumask_first(topology_sibling_cpumask(i)), available_mask); +#endif +} + +void hl_set_irq_affinity(struct hl_device *hdev, int irq) +{ + if (cpumask_empty(&hdev->irq_affinity_mask)) { + dev_dbg(hdev->dev, "affinity mask is empty\n"); + return; + } + + if (irq_set_affinity_hint(irq, &hdev->irq_affinity_mask)) + dev_err(hdev->dev, "Failed setting irq %d affinity\n", irq); +} diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 2a900c9941fee6..b1a7b229e16160 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -3257,6 +3257,7 @@ struct hl_reset_info { * @clk_throttling: holds information about current/previous clock throttling events * @captured_err_info: holds information about errors. * @reset_info: holds current device reset information. + * @irq_affinity_mask: mask of available CPU cores for user and decoder interrupt handling. * @stream_master_qid_arr: pointer to array with QIDs of master streams. * @fw_inner_major_ver: the major of current loaded preboot inner version. * @fw_inner_minor_ver: the minor of current loaded preboot inner version. 
@@ -3446,6 +3447,8 @@ struct hl_device {
 
 	struct hl_reset_info		reset_info;
 
+	cpumask_t			irq_affinity_mask;
+
 	u32				*stream_master_qid_arr;
 	u32				fw_inner_major_ver;
 	u32				fw_inner_minor_ver;
@@ -4032,6 +4035,8 @@ void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_
 void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info);
 void hl_capture_engine_err(struct hl_device *hdev, u16 engine_id, u16 error_count);
 void hl_enable_err_info_capture(struct hl_error_info *captured_err_info);
+void hl_init_cpu_for_irq(struct hl_device *hdev);
+void hl_set_irq_affinity(struct hl_device *hdev, int irq);
 
 #ifdef CONFIG_DEBUG_FS
 
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index e0e5615ef9b0f6..fd01525b1ea204 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -4254,6 +4254,8 @@ static int gaudi2_enable_msix(struct hl_device *hdev)
 	if (gaudi2->hw_cap_initialized & HW_CAP_MSIX)
 		return 0;
 
+	hl_init_cpu_for_irq(hdev);
+
 	rc = pci_alloc_irq_vectors(hdev->pdev, GAUDI2_MSIX_ENTRIES, GAUDI2_MSIX_ENTRIES,
 					PCI_IRQ_MSIX);
 	if (rc < 0) {
@@ -4307,6 +4309,7 @@ static int gaudi2_enable_msix(struct hl_device *hdev)
 			i++, j++, user_irq_init_cnt++) {
 
 		irq = pci_irq_vector(hdev->pdev, i);
+		hl_set_irq_affinity(hdev, irq);
 		rc = request_irq(irq, hl_irq_user_interrupt_handler, 0, gaudi2_irq_name(i),
 				&hdev->user_interrupt[j]);
 		if (rc) {
@@ -4333,6 +4336,7 @@ static int gaudi2_enable_msix(struct hl_device *hdev)
 			i < GAUDI2_IRQ_NUM_USER_FIRST + user_irq_init_cnt ; i++, j++) {
 
 		irq = pci_irq_vector(hdev->pdev, i);
+		irq_set_affinity_hint(irq, NULL);
 		free_irq(irq, &hdev->user_interrupt[j]);
 	}
 
 	irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_UNEXPECTED_ERROR);
@@ -4413,6 +4417,7 @@ static void gaudi2_disable_msix(struct hl_device *hdev)
 			k < hdev->asic_prop.user_interrupt_count ; i++, j++, k++) {
 
 		irq = pci_irq_vector(hdev->pdev, i);
+		irq_set_affinity_hint(irq, NULL);
 		free_irq(irq, &hdev->user_interrupt[j]);
 	}

From 6901be9deddaebb2fca5097180c0fea349662599 Mon Sep 17 00:00:00 2001
From: Koby Elbaz
Date: Mon, 11 Dec 2023 10:03:29 +0200
Subject: [PATCH 0850/1406] accel/habanalabs: increase HL_STR_MAX to 64 bytes
 to avoid warnings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix a warning of a buffer overflow:
‘snprintf’ output between 38 and 47 bytes into a destination of size 32

Signed-off-by: Koby Elbaz
Reviewed-by: Oded Gabbay
Signed-off-by: Oded Gabbay
---
 drivers/accel/habanalabs/common/habanalabs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index b1a7b229e16160..253873315888e1 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -2547,7 +2547,7 @@ struct hl_state_dump_specs {
  * DEVICES
  */
 
-#define HL_STR_MAX	32
+#define HL_STR_MAX	64
 
 #define HL_DEV_STS_MAX	(HL_DEVICE_STATUS_LAST + 1)

From 2aade196909f49a5d521fb1f395b0fc8ffc62dbf Mon Sep 17 00:00:00 2001
From: Tomer Tayar
Date: Thu, 14 Dec 2023 10:38:06 +0200
Subject: [PATCH 0851/1406] accel/habanalabs: fix DRAM BAR base address
 calculation

When the DRAM region size in the BAR is not a power of 2, calculating
the corresponding BAR base address should be done using the offset from
the DRAM start address, and not directly using the DRAM address.
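A worked example with illustrative (non-hardware) numbers: with
region_base = 0x1000, dram_pci_bar_size = 0x300 and addr = 0x17a0, the
old DIV_ROUND_DOWN_ULL(addr, bar_size) * bar_size yields 0x1500, which
is not region_base plus a multiple of the BAR size (0x1500 - 0x1000 =
0x500), so the window is misaligned with respect to the DRAM start. The
offset-based calculation below instead yields 0x1000 + 2 * 0x300 =
0x1600:

	bar_base_addr = region->region_base +
			div64_u64(addr - region->region_base,
				  prop->dram_pci_bar_size) *
			prop->dram_pci_bar_size;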
Signed-off-by: Tomer Tayar
Reviewed-by: Oded Gabbay
Signed-off-by: Oded Gabbay
---
 drivers/accel/habanalabs/common/device.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 5eacbc73f1bb98..5c46826e365929 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -55,7 +55,8 @@ static u64 hl_set_dram_bar(struct hl_device *hdev, u64 addr, struct pci_mem_regi
 	if (is_power_of_2(prop->dram_pci_bar_size))
 		bar_base_addr = addr & ~(prop->dram_pci_bar_size - 0x1ull);
 	else
-		bar_base_addr = DIV_ROUND_DOWN_ULL(addr, prop->dram_pci_bar_size) *
+		bar_base_addr = region->region_base +
+				div64_u64((addr - region->region_base), prop->dram_pci_bar_size) *
 				prop->dram_pci_bar_size;
 
 	old_base = hdev->asic_funcs->set_dram_bar_base(hdev, bar_base_addr);

From 97d0052323ba27151d2b55dfd2c37255cf4d3f7c Mon Sep 17 00:00:00 2001
From: Tomer Tayar
Date: Mon, 25 Dec 2023 00:28:36 +0200
Subject: [PATCH 0852/1406] accel/habanalabs: abort device reset for
 consecutive heartbeat failures

The mechanism of aborting device reset for consecutive fatal errors
currently covers only fatal errors that are reported by the FW.

A non-responsive FW with consecutive heartbeat failures is also
considered fatal, so add it as well to this mechanism to avoid a
recurring device reset in such a case.

Signed-off-by: Tomer Tayar
Reviewed-by: Oded Gabbay
Signed-off-by: Oded Gabbay
---
 drivers/accel/habanalabs/common/device.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 5c46826e365929..cf004baf5e6213 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1769,14 +1769,16 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 		hdev->device_cpu_disabled = false;
 		hdev->reset_info.hard_reset_pending = false;
 
+		/*
+		 * Put the device in an unusable state if there are 2 back to back resets due to
+		 * fatal errors.
+		 */
 		if (hdev->reset_info.reset_trigger_repeated &&
-				(hdev->reset_info.prev_reset_trigger ==
-						HL_DRV_RESET_FW_FATAL_ERR)) {
-			/* if there 2 back to back resets from FW,
-			 * ensure driver puts the driver in a unusable state
-			 */
+				(hdev->reset_info.prev_reset_trigger == HL_DRV_RESET_FW_FATAL_ERR ||
+						hdev->reset_info.prev_reset_trigger ==
+								HL_DRV_RESET_HEARTBEAT)) {
 			dev_crit(hdev->dev,
-				"%s Consecutive FW fatal errors received, stopping hard reset\n",
+				"%s Consecutive fatal errors, stopping hard reset\n",
 				dev_name(&(hdev)->pdev->dev));
 			rc = -EIO;
 			goto out_err;

From 8a23b0554726662df785dc8601c87dfdb3115da4 Mon Sep 17 00:00:00 2001
From: Farah Kassabri
Date: Thu, 2 Nov 2023 11:53:29 +0200
Subject: [PATCH 0853/1406] accel/habanalabs/gaudi2: move HMMU page tables to
 device memory

Currently the HMMU page tables reside in host memory, which causes a
host access from the device for every page walk. This can affect PCIe
bandwidth in certain scenarios.

To prevent that problem, the HMMU page tables are moved to the device
memory, so the miss transaction reads the hops from there instead of
going to the host.
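At its core, a reduced sketch of the device-resident flow: the real PTE
is written to device DRAM through the ASIC access functions while a
host-side shadow copy is kept for the driver's own page walks (per the
hl_mmu_dr_* helpers introduced below):

	/* Real PTE lives in device DRAM, shadow stays in host memory */
	ctx->hdev->asic_funcs->write_pte(ctx->hdev,
					 hl_mmu_dr_get_phys_addr(ctx, shadow_pte_addr),
					 val);
	*(u64 *)(uintptr_t)shadow_pte_addr = val;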
Signed-off-by: Farah Kassabri
Reviewed-by: Oded Gabbay
Signed-off-by: Oded Gabbay
---
 drivers/accel/habanalabs/common/habanalabs.h  |  26 ++
 drivers/accel/habanalabs/common/hw_queue.c    |  17 +
 drivers/accel/habanalabs/common/mmu/Makefile  |   2 +-
 drivers/accel/habanalabs/common/mmu/mmu.c     | 223 ++++++++++-
 drivers/accel/habanalabs/common/mmu/mmu_v1.c  | 352 +++++-------------
 drivers/accel/habanalabs/common/mmu/mmu_v2.c  | 338 +++++++++++++++++
 drivers/accel/habanalabs/gaudi/gaudi.c        |   1 +
 drivers/accel/habanalabs/gaudi2/gaudi2.c      | 245 ++++++++----
 drivers/accel/habanalabs/gaudi2/gaudi2P.h     |  12 +-
 .../include/hw_ip/mmu/mmu_general.h           |   2 +
 10 files changed, 836 insertions(+), 382 deletions(-)
 create mode 100644 drivers/accel/habanalabs/common/mmu/mmu_v2.c

diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index 253873315888e1..7397ce86b7f03a 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -443,18 +443,22 @@ enum hl_collective_mode {
  *                   a CB handle can be provided for jobs on this queue.
  *                   Otherwise, a CB address must be provided.
  * @collective_mode: collective mode of current queue
+ * @q_dram_bd_address: PQ dram address, used when the PQ needs to reside in DRAM.
  * @driver_only: true if only the driver is allowed to send a job to this queue,
  *               false otherwise.
  * @binned: True if the queue is binned out and should not be used
  * @supports_sync_stream: True if queue supports sync stream
+ * @dram_bd: True if the bd should be copied to dram, needed for PQ which has been allocated on dram
  */
 struct hw_queue_properties {
 	enum hl_queue_type type;
 	enum queue_cb_alloc_flags cb_alloc_flags;
 	enum hl_collective_mode collective_mode;
+	u64 q_dram_bd_address;
 	u8 driver_only;
 	u8 binned;
 	u8 supports_sync_stream;
+	u8 dram_bd;
 };
 
 /**
@@ -1052,6 +1056,8 @@ struct hl_encaps_signals_mgr {
  * @collective_mode: collective mode of current queue
  * @kernel_address: holds the queue's kernel virtual address.
  * @bus_address: holds the queue's DMA address.
+ * @pq_dram_address: holds the dram address when the PQ is allocated, used when dram_bd is true in
+ *		     queue properties.
  * @pi: holds the queue's pi value.
  * @ci: holds the queue's ci value, AS CALCULATED BY THE DRIVER (not real ci).
  * @hw_queue_id: the id of the H/W queue.
 * @cq_id: the id for the corresponding CQ for this H/W queue.
 * @int_queue_len: length of internal queue (number of entries).
 * @valid: is the queue valid (we have array of 32 queues, not all of them
 *         exist).
* @supports_sync_stream: True if queue supports sync stream + * @dram_bd: True if the bd should be copied to dram, needed for PQ which has been allocated on dram */ struct hl_hw_queue { struct hl_cs_job **shadow_queue; @@ -1069,6 +1076,7 @@ struct hl_hw_queue { enum hl_collective_mode collective_mode; void *kernel_address; dma_addr_t bus_address; + u64 pq_dram_address; u32 pi; atomic_t ci; u32 hw_queue_id; @@ -1077,6 +1085,7 @@ struct hl_hw_queue { u16 int_queue_len; u8 valid; u8 supports_sync_stream; + u8 dram_bd; }; /** @@ -3889,6 +3898,7 @@ int hl_mmu_hr_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_ struct hl_hr_mmu_funcs *hr_func); int hl_mmu_if_set_funcs(struct hl_device *hdev); void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu); +void hl_mmu_v2_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu); void hl_mmu_v2_hr_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu); int hl_mmu_va_to_pa(struct hl_ctx *ctx, u64 virt_addr, u64 *phys_addr); int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, @@ -3896,6 +3906,22 @@ int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, u64 hl_mmu_scramble_addr(struct hl_device *hdev, u64 addr); u64 hl_mmu_descramble_addr(struct hl_device *hdev, u64 addr); bool hl_is_dram_va(struct hl_device *hdev, u64 virt_addr); +struct pgt_info *hl_mmu_dr_get_pgt_info(struct hl_ctx *ctx, u64 hop_addr); +void hl_mmu_dr_free_hop(struct hl_ctx *ctx, u64 hop_addr); +void hl_mmu_dr_free_pgt_node(struct hl_ctx *ctx, struct pgt_info *pgt_info); +u64 hl_mmu_dr_get_phys_hop0_addr(struct hl_ctx *ctx); +u64 hl_mmu_dr_get_hop0_addr(struct hl_ctx *ctx); +void hl_mmu_dr_write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val); +void hl_mmu_dr_write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val); +void hl_mmu_dr_clear_pte(struct hl_ctx *ctx, u64 pte_addr); +u64 hl_mmu_dr_get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr); +void hl_mmu_dr_get_pte(struct hl_ctx *ctx, u64 hop_addr); +int hl_mmu_dr_put_pte(struct hl_ctx *ctx, u64 hop_addr); +u64 hl_mmu_dr_get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte, bool *is_new_hop); +u64 hl_mmu_dr_alloc_hop(struct hl_ctx *ctx); +void hl_mmu_dr_flush(struct hl_ctx *ctx); +int hl_mmu_dr_init(struct hl_device *hdev); +void hl_mmu_dr_fini(struct hl_device *hdev); int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name, void __iomem *dst, u32 src_offset, u32 size); diff --git a/drivers/accel/habanalabs/common/hw_queue.c b/drivers/accel/habanalabs/common/hw_queue.c index d0087c0ec48c9f..3d04a7507cce3c 100644 --- a/drivers/accel/habanalabs/common/hw_queue.c +++ b/drivers/accel/habanalabs/common/hw_queue.c @@ -84,6 +84,8 @@ void hl_hw_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q, u32 ctl, u32 len, u64 ptr) { struct hl_bd *bd; + u64 addr; + int i; bd = q->kernel_address; bd += hl_pi_2_offset(q->pi); @@ -91,7 +93,16 @@ void hl_hw_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q, bd->len = cpu_to_le32(len); bd->ptr = cpu_to_le64(ptr); + if (q->dram_bd) + for (i = 0 ; i < 2 ; i++) { + addr = q->pq_dram_address + + ((hl_pi_2_offset(q->pi) * sizeof(struct hl_bd)) + (i * sizeof(u64))); + hdev->asic_funcs->access_dev_mem(hdev, PCI_REGION_DRAM, addr, + (u64 *)(bd) + i, DEBUGFS_WRITE64); + } + q->pi = hl_queue_inc_ptr(q->pi); + hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi); } @@ -1087,12 +1098,18 @@ int hl_hw_queues_create(struct hl_device *hdev) q->supports_sync_stream = 
asic->hw_queues_props[i].supports_sync_stream; q->collective_mode = asic->hw_queues_props[i].collective_mode; + q->dram_bd = asic->hw_queues_props[i].dram_bd; + rc = queue_init(hdev, q, i); if (rc) { dev_err(hdev->dev, "failed to initialize queue %d\n", i); goto release_queues; } + + /* Set DRAM PQ address for the queue if it should be at DRAM */ + if (q->dram_bd) + q->pq_dram_address = asic->hw_queues_props[i].q_dram_bd_address; } return 0; diff --git a/drivers/accel/habanalabs/common/mmu/Makefile b/drivers/accel/habanalabs/common/mmu/Makefile index 1806c524e04aca..f4b815bf4f7d63 100644 --- a/drivers/accel/habanalabs/common/mmu/Makefile +++ b/drivers/accel/habanalabs/common/mmu/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only HL_COMMON_MMU_FILES := common/mmu/mmu.o common/mmu/mmu_v1.o \ - common/mmu/mmu_v2_hr.o + common/mmu/mmu_v2.o common/mmu/mmu_v2_hr.o diff --git a/drivers/accel/habanalabs/common/mmu/mmu.c b/drivers/accel/habanalabs/common/mmu/mmu.c index b654302a68fc08..fa7919dba783c4 100644 --- a/drivers/accel/habanalabs/common/mmu/mmu.c +++ b/drivers/accel/habanalabs/common/mmu/mmu.c @@ -585,6 +585,8 @@ int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, int hl_mmu_if_set_funcs(struct hl_device *hdev) { + struct asic_fixed_properties *prop = &hdev->asic_prop; + if (hdev->mmu_disable) return 0; @@ -597,8 +599,9 @@ int hl_mmu_if_set_funcs(struct hl_device *hdev) case ASIC_GAUDI2: case ASIC_GAUDI2B: case ASIC_GAUDI2C: - /* MMUs in Gaudi2 are always host resident */ - hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]); + hl_mmu_v2_set_funcs(hdev, &hdev->mmu_func[MMU_DR_PGT]); + if (prop->pmmu.host_resident) + hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]); break; default: dev_err(hdev->dev, "Unrecognized ASIC type %d\n", @@ -1209,3 +1212,219 @@ int hl_mmu_hr_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_ return 0; } +struct pgt_info *hl_mmu_dr_get_pgt_info(struct hl_ctx *ctx, u64 hop_addr) +{ + struct pgt_info *pgt_info = NULL; + + hash_for_each_possible(ctx->mmu_shadow_hash, pgt_info, node, + (unsigned long) hop_addr) + if (hop_addr == pgt_info->shadow_addr) + break; + + return pgt_info; +} + +void hl_mmu_dr_free_hop(struct hl_ctx *ctx, u64 hop_addr) +{ + struct pgt_info *pgt_info = hl_mmu_dr_get_pgt_info(ctx, hop_addr); + + hl_mmu_dr_free_pgt_node(ctx, pgt_info); +} + +void hl_mmu_dr_free_pgt_node(struct hl_ctx *ctx, struct pgt_info *pgt_info) +{ + struct hl_device *hdev = ctx->hdev; + + gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, pgt_info->phys_addr, + hdev->asic_prop.mmu_hop_table_size); + hash_del(&pgt_info->node); + kfree((u64 *) (uintptr_t) pgt_info->shadow_addr); + kfree(pgt_info); +} + +u64 hl_mmu_dr_get_phys_hop0_addr(struct hl_ctx *ctx) +{ + return ctx->hdev->asic_prop.mmu_pgt_addr + + (ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size); +} + +u64 hl_mmu_dr_get_hop0_addr(struct hl_ctx *ctx) +{ + return (u64) (uintptr_t) ctx->hdev->mmu_priv.dr.mmu_shadow_hop0 + + (ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size); +} + +u64 hl_mmu_dr_get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr) +{ + u64 page_mask = ctx->hdev->asic_prop.mmu_hop_table_size - 1; + u64 shadow_hop_addr = shadow_addr & (~page_mask); + u64 pte_offset = shadow_addr & page_mask; + u64 phys_hop_addr; + + if (shadow_hop_addr != hl_mmu_dr_get_hop0_addr(ctx)) + phys_hop_addr = hl_mmu_dr_get_pgt_info(ctx, shadow_hop_addr)->phys_addr; + else + phys_hop_addr = hl_mmu_dr_get_phys_hop0_addr(ctx); + + return phys_hop_addr + pte_offset; +} + +void 
hl_mmu_dr_write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val) +{ + u64 phys_val = hl_mmu_dr_get_phys_addr(ctx, val); + + ctx->hdev->asic_funcs->write_pte(ctx->hdev, hl_mmu_dr_get_phys_addr(ctx, shadow_pte_addr), + phys_val); + + *(u64 *) (uintptr_t) shadow_pte_addr = val; +} + +void hl_mmu_dr_write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val) +{ + ctx->hdev->asic_funcs->write_pte(ctx->hdev, + hl_mmu_dr_get_phys_addr(ctx, shadow_pte_addr), val); + *(u64 *) (uintptr_t) shadow_pte_addr = val; +} + +void hl_mmu_dr_clear_pte(struct hl_ctx *ctx, u64 pte_addr) +{ + hl_mmu_dr_write_final_pte(ctx, pte_addr, 0); +} + +void hl_mmu_dr_get_pte(struct hl_ctx *ctx, u64 hop_addr) +{ + hl_mmu_dr_get_pgt_info(ctx, hop_addr)->num_of_ptes++; +} + +int hl_mmu_dr_put_pte(struct hl_ctx *ctx, u64 hop_addr) +{ + struct pgt_info *pgt_info = hl_mmu_dr_get_pgt_info(ctx, hop_addr); + int num_of_ptes_left; + + pgt_info->num_of_ptes--; + + /* + * Need to save the number of ptes left because hl_mmu_free_hop might free + * the pgt_info + */ + num_of_ptes_left = pgt_info->num_of_ptes; + if (!num_of_ptes_left) + hl_mmu_dr_free_pgt_node(ctx, pgt_info); + + return num_of_ptes_left; +} + +u64 hl_mmu_dr_alloc_hop(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct pgt_info *pgt_info; + u64 phys_addr, shadow_addr; + + pgt_info = kmalloc(sizeof(*pgt_info), GFP_KERNEL); + if (!pgt_info) + return ULLONG_MAX; + + phys_addr = (u64) gen_pool_alloc(hdev->mmu_priv.dr.mmu_pgt_pool, + prop->mmu_hop_table_size); + if (!phys_addr) { + dev_err(hdev->dev, "failed to allocate page\n"); + goto pool_add_err; + } + + shadow_addr = (u64) (uintptr_t) kzalloc(prop->mmu_hop_table_size, + GFP_KERNEL); + if (!shadow_addr) + goto shadow_err; + + pgt_info->phys_addr = phys_addr; + pgt_info->shadow_addr = shadow_addr; + pgt_info->ctx = ctx; + pgt_info->num_of_ptes = 0; + hash_add(ctx->mmu_shadow_hash, &pgt_info->node, shadow_addr); + + return shadow_addr; + +shadow_err: + gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, + phys_addr, prop->mmu_hop_table_size); +pool_add_err: + kfree(pgt_info); + + return ULLONG_MAX; +} + +u64 hl_mmu_dr_get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte, bool *is_new_hop) +{ + u64 hop_addr = hl_mmu_get_next_hop_addr(ctx, curr_pte); + + if (hop_addr == ULLONG_MAX) { + hop_addr = hl_mmu_dr_alloc_hop(ctx); + *is_new_hop = (hop_addr != ULLONG_MAX); + } + + return hop_addr; +} + +void hl_mmu_dr_flush(struct hl_ctx *ctx) +{ + /* flush all writes from all cores to reach PCI */ + mb(); + ctx->hdev->asic_funcs->read_pte(ctx->hdev, hl_mmu_dr_get_phys_hop0_addr(ctx)); +} + +int hl_mmu_dr_init(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + int rc; + + hdev->mmu_priv.dr.mmu_pgt_pool = + gen_pool_create(__ffs(prop->mmu_hop_table_size), -1); + + if (!hdev->mmu_priv.dr.mmu_pgt_pool) { + dev_err(hdev->dev, "Failed to create page gen pool\n"); + return -ENOMEM; + } + + rc = gen_pool_add(hdev->mmu_priv.dr.mmu_pgt_pool, prop->mmu_pgt_addr + + prop->mmu_hop0_tables_total_size, + prop->dmmu.pgt_size - prop->mmu_hop0_tables_total_size, + -1); + if (rc) { + dev_err(hdev->dev, "Failed to add memory to page gen pool\n"); + goto err_pool_add; + } + + hdev->mmu_priv.dr.mmu_shadow_hop0 = kvcalloc(prop->max_asid, + prop->mmu_hop_table_size, GFP_KERNEL); + if (ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) { + rc = -ENOMEM; + goto err_pool_add; + } + + /* MMU H/W init will be done in device hw_init() */ + + 
return 0; + +err_pool_add: + gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool); + + return rc; +} + +void hl_mmu_dr_fini(struct hl_device *hdev) +{ + /* MMU H/W fini was already done in device hw_fini() */ + + if (ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) + return; + + kvfree(hdev->mmu_priv.dr.mmu_shadow_hop0); + gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool); + + /* Make sure that if we arrive here again without init was + * called we won't cause kernel panic. This can happen for + * example if we fail during hard reset code at certain points + */ + hdev->mmu_priv.dr.mmu_shadow_hop0 = NULL; +} diff --git a/drivers/accel/habanalabs/common/mmu/mmu_v1.c b/drivers/accel/habanalabs/common/mmu/mmu_v1.c index d925dc4dd09725..64b5c8fbb166d9 100644 --- a/drivers/accel/habanalabs/common/mmu/mmu_v1.c +++ b/drivers/accel/habanalabs/common/mmu/mmu_v1.c @@ -12,166 +12,6 @@ #define MMU_V1_MAX_HOPS (MMU_HOP4 + 1) -static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr); - -static struct pgt_info *get_pgt_info(struct hl_ctx *ctx, u64 hop_addr) -{ - struct pgt_info *pgt_info = NULL; - - hash_for_each_possible(ctx->mmu_shadow_hash, pgt_info, node, - (unsigned long) hop_addr) - if (hop_addr == pgt_info->shadow_addr) - break; - - return pgt_info; -} - -static void _free_hop(struct hl_ctx *ctx, struct pgt_info *pgt_info) -{ - struct hl_device *hdev = ctx->hdev; - - gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, pgt_info->phys_addr, - hdev->asic_prop.mmu_hop_table_size); - hash_del(&pgt_info->node); - kfree((u64 *) (uintptr_t) pgt_info->shadow_addr); - kfree(pgt_info); -} - -static void free_hop(struct hl_ctx *ctx, u64 hop_addr) -{ - struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr); - - _free_hop(ctx, pgt_info); -} - -static u64 alloc_hop(struct hl_ctx *ctx) -{ - struct hl_device *hdev = ctx->hdev; - struct asic_fixed_properties *prop = &hdev->asic_prop; - struct pgt_info *pgt_info; - u64 phys_addr, shadow_addr; - - pgt_info = kmalloc(sizeof(*pgt_info), GFP_KERNEL); - if (!pgt_info) - return ULLONG_MAX; - - phys_addr = (u64) gen_pool_alloc(hdev->mmu_priv.dr.mmu_pgt_pool, - prop->mmu_hop_table_size); - if (!phys_addr) { - dev_err(hdev->dev, "failed to allocate page\n"); - goto pool_add_err; - } - - shadow_addr = (u64) (uintptr_t) kzalloc(prop->mmu_hop_table_size, - GFP_KERNEL); - if (!shadow_addr) - goto shadow_err; - - pgt_info->phys_addr = phys_addr; - pgt_info->shadow_addr = shadow_addr; - pgt_info->ctx = ctx; - pgt_info->num_of_ptes = 0; - hash_add(ctx->mmu_shadow_hash, &pgt_info->node, shadow_addr); - - return shadow_addr; - -shadow_err: - gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, phys_addr, - prop->mmu_hop_table_size); -pool_add_err: - kfree(pgt_info); - - return ULLONG_MAX; -} - -static inline u64 get_phys_hop0_addr(struct hl_ctx *ctx) -{ - return ctx->hdev->asic_prop.mmu_pgt_addr + - (ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size); -} - -static inline u64 get_hop0_addr(struct hl_ctx *ctx) -{ - return (u64) (uintptr_t) ctx->hdev->mmu_priv.dr.mmu_shadow_hop0 + - (ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size); -} - -static void flush(struct hl_ctx *ctx) -{ - /* flush all writes from all cores to reach PCI */ - mb(); - ctx->hdev->asic_funcs->read_pte(ctx->hdev, get_phys_hop0_addr(ctx)); -} - -/* transform the value to physical address when writing to H/W */ -static inline void write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val) -{ - /* - * The value to write is actually the address of the next shadow hop + - * flags at the 12 LSBs. 
- * Hence in order to get the value to write to the physical PTE, we - * clear the 12 LSBs and translate the shadow hop to its associated - * physical hop, and add back the original 12 LSBs. - */ - u64 phys_val = get_phys_addr(ctx, val & HOP_PHYS_ADDR_MASK) | - (val & FLAGS_MASK); - - ctx->hdev->asic_funcs->write_pte(ctx->hdev, - get_phys_addr(ctx, shadow_pte_addr), - phys_val); - - *(u64 *) (uintptr_t) shadow_pte_addr = val; -} - -/* do not transform the value to physical address when writing to H/W */ -static inline void write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, - u64 val) -{ - ctx->hdev->asic_funcs->write_pte(ctx->hdev, - get_phys_addr(ctx, shadow_pte_addr), - val); - *(u64 *) (uintptr_t) shadow_pte_addr = val; -} - -/* clear the last and present bits */ -static inline void clear_pte(struct hl_ctx *ctx, u64 pte_addr) -{ - /* no need to transform the value to physical address */ - write_final_pte(ctx, pte_addr, 0); -} - -static inline void get_pte(struct hl_ctx *ctx, u64 hop_addr) -{ - get_pgt_info(ctx, hop_addr)->num_of_ptes++; -} - -/* - * put_pte - decrement the num of ptes and free the hop if possible - * - * @ctx: pointer to the context structure - * @hop_addr: addr of the hop - * - * This function returns the number of ptes left on this hop. If the number is - * 0, it means the pte was freed. - */ -static inline int put_pte(struct hl_ctx *ctx, u64 hop_addr) -{ - struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr); - int num_of_ptes_left; - - pgt_info->num_of_ptes--; - - /* - * Need to save the number of ptes left because free_hop might free - * the pgt_info - */ - num_of_ptes_left = pgt_info->num_of_ptes; - if (!num_of_ptes_left) - _free_hop(ctx, pgt_info); - - return num_of_ptes_left; -} - static inline u64 get_hop_pte_addr(struct hl_ctx *ctx, struct hl_mmu_properties *mmu_prop, u64 *hop_addr_arr, u64 virt_addr, enum mmu_hop_num hop_idx) { @@ -183,35 +23,6 @@ static inline u64 get_hop_pte_addr(struct hl_ctx *ctx, struct hl_mmu_properties ctx->hdev->asic_prop.mmu_pte_size * ((virt_addr & mask) >> shift); } -static inline u64 get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte, - bool *is_new_hop) -{ - u64 hop_addr = hl_mmu_get_next_hop_addr(ctx, curr_pte); - - if (hop_addr == ULLONG_MAX) { - hop_addr = alloc_hop(ctx); - *is_new_hop = (hop_addr != ULLONG_MAX); - } - - return hop_addr; -} - -/* translates shadow address inside hop to a physical address */ -static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr) -{ - u64 page_mask = (ctx->hdev->asic_prop.mmu_hop_table_size - 1); - u64 shadow_hop_addr = shadow_addr & ~page_mask; - u64 pte_offset = shadow_addr & page_mask; - u64 phys_hop_addr; - - if (shadow_hop_addr != get_hop0_addr(ctx)) - phys_hop_addr = get_pgt_info(ctx, shadow_hop_addr)->phys_addr; - else - phys_hop_addr = get_phys_hop0_addr(ctx); - - return phys_hop_addr + pte_offset; -} - static int dram_default_mapping_init(struct hl_ctx *ctx) { struct hl_device *hdev = ctx->hdev; @@ -236,9 +47,9 @@ static int dram_default_mapping_init(struct hl_ctx *ctx) if (!ctx->dram_default_hops) return -ENOMEM; - hop0_addr = get_hop0_addr(ctx); + hop0_addr = hl_mmu_dr_get_hop0_addr(ctx); - hop1_addr = alloc_hop(ctx); + hop1_addr = hl_mmu_dr_alloc_hop(ctx); if (hop1_addr == ULLONG_MAX) { dev_err(hdev->dev, "failed to alloc hop 1\n"); rc = -ENOMEM; @@ -247,7 +58,7 @@ static int dram_default_mapping_init(struct hl_ctx *ctx) ctx->dram_default_hops[total_hops - 1] = hop1_addr; - hop2_addr = alloc_hop(ctx); + hop2_addr = hl_mmu_dr_alloc_hop(ctx); if (hop2_addr 
== ULLONG_MAX) { dev_err(hdev->dev, "failed to alloc hop 2\n"); rc = -ENOMEM; @@ -257,7 +68,7 @@ static int dram_default_mapping_init(struct hl_ctx *ctx) ctx->dram_default_hops[total_hops - 2] = hop2_addr; for (i = 0 ; i < num_of_hop3 ; i++) { - ctx->dram_default_hops[i] = alloc_hop(ctx); + ctx->dram_default_hops[i] = hl_mmu_dr_alloc_hop(ctx); if (ctx->dram_default_hops[i] == ULLONG_MAX) { dev_err(hdev->dev, "failed to alloc hop 3, i: %d\n", i); rc = -ENOMEM; @@ -268,18 +79,18 @@ static int dram_default_mapping_init(struct hl_ctx *ctx) /* need only pte 0 in hops 0 and 1 */ pte_val = (hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; - write_pte(ctx, hop0_addr, pte_val); + hl_mmu_dr_write_pte(ctx, hop0_addr, pte_val); pte_val = (hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; - write_pte(ctx, hop1_addr, pte_val); - get_pte(ctx, hop1_addr); + hl_mmu_dr_write_pte(ctx, hop1_addr, pte_val); + hl_mmu_dr_get_pte(ctx, hop1_addr); hop2_pte_addr = hop2_addr; for (i = 0 ; i < num_of_hop3 ; i++) { pte_val = (ctx->dram_default_hops[i] & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; - write_pte(ctx, hop2_pte_addr, pte_val); - get_pte(ctx, hop2_addr); + hl_mmu_dr_write_pte(ctx, hop2_pte_addr, pte_val); + hl_mmu_dr_get_pte(ctx, hop2_addr); hop2_pte_addr += HL_PTE_SIZE; } @@ -289,23 +100,23 @@ static int dram_default_mapping_init(struct hl_ctx *ctx) for (i = 0 ; i < num_of_hop3 ; i++) { hop3_pte_addr = ctx->dram_default_hops[i]; for (j = 0 ; j < HOP_PTE_ENTRIES_512 ; j++) { - write_final_pte(ctx, hop3_pte_addr, pte_val); - get_pte(ctx, ctx->dram_default_hops[i]); + hl_mmu_dr_write_final_pte(ctx, hop3_pte_addr, pte_val); + hl_mmu_dr_get_pte(ctx, ctx->dram_default_hops[i]); hop3_pte_addr += HL_PTE_SIZE; } } - flush(ctx); + hl_mmu_dr_flush(ctx); return 0; hop3_err: for (i = 0 ; i < hop3_allocated ; i++) - free_hop(ctx, ctx->dram_default_hops[i]); + hl_mmu_dr_free_hop(ctx, ctx->dram_default_hops[i]); - free_hop(ctx, hop2_addr); + hl_mmu_dr_free_hop(ctx, hop2_addr); hop2_err: - free_hop(ctx, hop1_addr); + hl_mmu_dr_free_hop(ctx, hop1_addr); hop1_err: kfree(ctx->dram_default_hops); @@ -329,7 +140,7 @@ static void dram_default_mapping_fini(struct hl_ctx *ctx) do_div(num_of_hop3, prop->dram_page_size); do_div(num_of_hop3, HOP_PTE_ENTRIES_512); - hop0_addr = get_hop0_addr(ctx); + hop0_addr = hl_mmu_dr_get_hop0_addr(ctx); /* add hop1 and hop2 */ total_hops = num_of_hop3 + 2; hop1_addr = ctx->dram_default_hops[total_hops - 1]; @@ -338,101 +149,26 @@ static void dram_default_mapping_fini(struct hl_ctx *ctx) for (i = 0 ; i < num_of_hop3 ; i++) { hop3_pte_addr = ctx->dram_default_hops[i]; for (j = 0 ; j < HOP_PTE_ENTRIES_512 ; j++) { - clear_pte(ctx, hop3_pte_addr); - put_pte(ctx, ctx->dram_default_hops[i]); + hl_mmu_dr_clear_pte(ctx, hop3_pte_addr); + hl_mmu_dr_put_pte(ctx, ctx->dram_default_hops[i]); hop3_pte_addr += HL_PTE_SIZE; } } hop2_pte_addr = hop2_addr; for (i = 0 ; i < num_of_hop3 ; i++) { - clear_pte(ctx, hop2_pte_addr); - put_pte(ctx, hop2_addr); + hl_mmu_dr_clear_pte(ctx, hop2_pte_addr); + hl_mmu_dr_put_pte(ctx, hop2_addr); hop2_pte_addr += HL_PTE_SIZE; } - clear_pte(ctx, hop1_addr); - put_pte(ctx, hop1_addr); - clear_pte(ctx, hop0_addr); + hl_mmu_dr_clear_pte(ctx, hop1_addr); + hl_mmu_dr_put_pte(ctx, hop1_addr); + hl_mmu_dr_clear_pte(ctx, hop0_addr); kfree(ctx->dram_default_hops); - flush(ctx); -} - -/** - * hl_mmu_v1_init() - initialize the MMU module. - * @hdev: habanalabs device structure. - * - * This function does the following: - * - Create a pool of pages for pgt_infos. 
- * - Create a shadow table for pgt - * - * Return: 0 for success, non-zero for failure. - */ -static int hl_mmu_v1_init(struct hl_device *hdev) -{ - struct asic_fixed_properties *prop = &hdev->asic_prop; - int rc; - - hdev->mmu_priv.dr.mmu_pgt_pool = - gen_pool_create(__ffs(prop->mmu_hop_table_size), -1); - - if (!hdev->mmu_priv.dr.mmu_pgt_pool) { - dev_err(hdev->dev, "Failed to create page gen pool\n"); - return -ENOMEM; - } - - rc = gen_pool_add(hdev->mmu_priv.dr.mmu_pgt_pool, prop->mmu_pgt_addr + - prop->mmu_hop0_tables_total_size, - prop->mmu_pgt_size - prop->mmu_hop0_tables_total_size, - -1); - if (rc) { - dev_err(hdev->dev, "Failed to add memory to page gen pool\n"); - goto err_pool_add; - } - - hdev->mmu_priv.dr.mmu_shadow_hop0 = kvcalloc(prop->max_asid, prop->mmu_hop_table_size, - GFP_KERNEL); - if (ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) { - rc = -ENOMEM; - goto err_pool_add; - } - - /* MMU H/W init will be done in device hw_init() */ - - return 0; - -err_pool_add: - gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool); - - return rc; -} - -/** - * hl_mmu_v1_fini() - release the MMU module. - * @hdev: habanalabs device structure. - * - * This function does the following: - * - Disable MMU in H/W. - * - Free the pgt_infos pool. - * - * All contexts should be freed before calling this function. - */ -static void hl_mmu_v1_fini(struct hl_device *hdev) -{ - /* MMU H/W fini was already done in device hw_fini() */ - - if (!ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) { - kvfree(hdev->mmu_priv.dr.mmu_shadow_hop0); - gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool); - - /* Make sure that if we arrive here again without init was - * called we won't cause kernel panic. This can happen for - * example if we fail during hard reset code at certain points - */ - hdev->mmu_priv.dr.mmu_shadow_hop0 = NULL; - } + hl_mmu_dr_flush(ctx); } /** @@ -476,7 +212,7 @@ static void hl_mmu_v1_ctx_fini(struct hl_ctx *ctx) dev_err_ratelimited(hdev->dev, "pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n", pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes); - _free_hop(ctx, pgt_info); + hl_mmu_dr_free_pgt_node(ctx, pgt_info); } } @@ -495,7 +231,7 @@ static int hl_mmu_v1_unmap(struct hl_ctx *ctx, for (hop_idx = MMU_HOP0; hop_idx < MMU_HOP4; hop_idx++) { if (hop_idx == MMU_HOP0) { - hop_addr[hop_idx] = get_hop0_addr(ctx); + hop_addr[hop_idx] = hl_mmu_dr_get_hop0_addr(ctx); } else { hop_addr[hop_idx] = hl_mmu_get_next_hop_addr(ctx, curr_pte); if (hop_addr[hop_idx] == ULLONG_MAX) @@ -546,30 +282,30 @@ static int hl_mmu_v1_unmap(struct hl_ctx *ctx, } hop_idx = MMU_HOP3; - write_final_pte(ctx, hop_pte_addr[hop_idx], default_pte); - put_pte(ctx, hop_addr[hop_idx]); + hl_mmu_dr_write_final_pte(ctx, hop_pte_addr[hop_idx], default_pte); + hl_mmu_dr_put_pte(ctx, hop_addr[hop_idx]); } else { if (!(curr_pte & PAGE_PRESENT_MASK)) goto not_mapped; if (hop_addr[MMU_HOP4]) - clear_pte(ctx, hop_pte_addr[MMU_HOP4]); + hl_mmu_dr_clear_pte(ctx, hop_pte_addr[MMU_HOP4]); else - clear_pte(ctx, hop_pte_addr[MMU_HOP3]); + hl_mmu_dr_clear_pte(ctx, hop_pte_addr[MMU_HOP3]); - if (hop_addr[MMU_HOP4] && !put_pte(ctx, hop_addr[MMU_HOP4])) + if (hop_addr[MMU_HOP4] && !hl_mmu_dr_put_pte(ctx, hop_addr[MMU_HOP4])) clear_hop3 = true; if (!clear_hop3) goto mapped; for (hop_idx = MMU_HOP3; hop_idx >= 0; hop_idx--) { - clear_pte(ctx, hop_pte_addr[hop_idx]); + hl_mmu_dr_clear_pte(ctx, hop_pte_addr[hop_idx]); if (hop_idx == MMU_HOP0) break; - if (put_pte(ctx, hop_addr[hop_idx])) + if (hl_mmu_dr_put_pte(ctx, 
hop_addr[hop_idx])) goto mapped; } } @@ -616,10 +352,10 @@ static int hl_mmu_v1_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, for (hop_idx = MMU_HOP0; hop_idx < num_hops; hop_idx++) { if (hop_idx == MMU_HOP0) { - hop_addr[hop_idx] = get_hop0_addr(ctx); + hop_addr[hop_idx] = hl_mmu_dr_get_hop0_addr(ctx); } else { hop_addr[hop_idx] = - get_alloc_next_hop_addr(ctx, curr_pte, &hop_new[hop_idx]); + hl_mmu_dr_get_alloc_next_hop_addr(ctx, curr_pte, &hop_new[hop_idx]); if (hop_addr[hop_idx] == ULLONG_MAX) goto err; } @@ -666,27 +402,27 @@ static int hl_mmu_v1_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, curr_pte = (phys_addr & HOP_PHYS_ADDR_MASK) | mmu_prop->last_mask | PAGE_PRESENT_MASK; - write_final_pte(ctx, hop_pte_addr[num_hops - 1], curr_pte); + hl_mmu_dr_write_final_pte(ctx, hop_pte_addr[num_hops - 1], curr_pte); for (hop_idx = MMU_HOP1; hop_idx < num_hops; hop_idx++) { prev_hop = hop_idx - 1; if (hop_new[hop_idx]) { curr_pte = (hop_addr[hop_idx] & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; - write_pte(ctx, hop_pte_addr[prev_hop], curr_pte); + hl_mmu_dr_write_pte(ctx, hop_pte_addr[prev_hop], curr_pte); if (hop_idx != MMU_HOP1) - get_pte(ctx, hop_addr[prev_hop]); + hl_mmu_dr_get_pte(ctx, hop_addr[prev_hop]); } } - get_pte(ctx, hop_addr[num_hops - 1]); + hl_mmu_dr_get_pte(ctx, hop_addr[num_hops - 1]); return 0; err: for (hop_idx = num_hops; hop_idx > MMU_HOP0; hop_idx--) { if (hop_new[hop_idx]) - free_hop(ctx, hop_addr[hop_idx]); + hl_mmu_dr_free_hop(ctx, hop_addr[hop_idx]); } return rc; @@ -752,7 +488,7 @@ static int hl_mmu_v1_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, if (is_huge) used_hops--; - hops->hop_info[0].hop_addr = get_phys_hop0_addr(ctx); + hops->hop_info[0].hop_addr = hl_mmu_dr_get_phys_hop0_addr(ctx); hops->hop_info[0].hop_pte_addr = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, 0, hops->hop_info[0].hop_addr, virt_addr); @@ -801,13 +537,13 @@ static int hl_mmu_v1_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, */ void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu) { - mmu->init = hl_mmu_v1_init; - mmu->fini = hl_mmu_v1_fini; + mmu->init = hl_mmu_dr_init; + mmu->fini = hl_mmu_dr_fini; mmu->ctx_init = hl_mmu_v1_ctx_init; mmu->ctx_fini = hl_mmu_v1_ctx_fini; mmu->map = hl_mmu_v1_map; mmu->unmap = hl_mmu_v1_unmap; - mmu->flush = flush; + mmu->flush = hl_mmu_dr_flush; mmu->swap_out = hl_mmu_v1_swap_out; mmu->swap_in = hl_mmu_v1_swap_in; mmu->get_tlb_info = hl_mmu_v1_get_tlb_info; diff --git a/drivers/accel/habanalabs/common/mmu/mmu_v2.c b/drivers/accel/habanalabs/common/mmu/mmu_v2.c new file mode 100644 index 00000000000000..4bc0268fff1cf0 --- /dev/null +++ b/drivers/accel/habanalabs/common/mmu/mmu_v2.c @@ -0,0 +1,338 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2020 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "../habanalabs.h" +#include "../../include/hw_ip/mmu/mmu_general.h" +#include "../../include/hw_ip/mmu/mmu_v2_0.h" + +#include + +/** + * hl_mmu_v2_ctx_init() - initialize a context for using the MMU module. + * @ctx: pointer to the context structure to initialize. + * + * Initialize a hash to hold all page table hops related to this + * context. + * Return: 0 on success, non-zero otherwise.
+ */ +static int hl_mmu_v2_ctx_init(struct hl_ctx *ctx) +{ + hash_init(ctx->mmu_shadow_hash); + + return 0; +} + +/* + * hl_mmu_v2_ctx_fini - disable a ctx from using the mmu module + * + * @ctx: pointer to the context structure + * + * This function does the following: + * - Free any pgts which were not freed yet + */ +static void hl_mmu_v2_ctx_fini(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + struct pgt_info *pgt_info; + struct hlist_node *tmp; + int i; + + if (!hash_empty(ctx->mmu_shadow_hash)) + dev_err(hdev->dev, "ctx %d is freed while it has pgts in use\n", + ctx->asid); + + hash_for_each_safe(ctx->mmu_shadow_hash, i, tmp, pgt_info, node) { + dev_err_ratelimited(hdev->dev, + "pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n", + pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes); + hl_mmu_dr_free_pgt_node(ctx, pgt_info); + } +} + +static int hl_mmu_v2_unmap(struct hl_ctx *ctx, u64 virt_addr, bool is_dram_addr) +{ + u64 hop_addr[MMU_ARCH_6_HOPS] = { 0 }, hop_pte_addr[MMU_ARCH_6_HOPS] = { 0 }, curr_pte, + scrambled_virt_addr; + struct asic_fixed_properties *prop = &ctx->hdev->asic_prop; + struct hl_device *hdev = ctx->hdev; + struct hl_mmu_properties *mmu_prop; + bool is_huge = false; + int i, hop_last; + + /* device-resident mappings in MMU v2 are supported only for the HMMU */ + if (!is_dram_addr) + return -EINVAL; + + mmu_prop = &prop->dmmu; + + hop_last = mmu_prop->num_hops - 1; + + scrambled_virt_addr = hdev->asic_funcs->scramble_addr(hdev, virt_addr); + + hop_addr[0] = hl_mmu_dr_get_hop0_addr(ctx); + hop_pte_addr[0] = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, 0, + hop_addr[0], scrambled_virt_addr); + if (hop_pte_addr[0] == U64_MAX) + return -EFAULT; + + curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[0]; + + for (i = 1 ; i < mmu_prop->num_hops ; i++) { + hop_addr[i] = hl_mmu_get_next_hop_addr(ctx, curr_pte); + if (hop_addr[i] == ULLONG_MAX) + goto not_mapped; + + hop_pte_addr[i] = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, i, + hop_addr[i], scrambled_virt_addr); + if (hop_pte_addr[i] == U64_MAX) + return -EFAULT; + + curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[i]; + + if ((i <= hop_last) && (curr_pte & mmu_prop->last_mask)) { + hop_last = i; + is_huge = true; + break; + } + } + + if (is_dram_addr && !is_huge) { + dev_err(hdev->dev, "DRAM unmapping should use huge pages only\n"); + return -EFAULT; + } + + if (!(curr_pte & PAGE_PRESENT_MASK)) + goto not_mapped; + + for (i = hop_last ; i > 0 ; i--) { + hl_mmu_dr_clear_pte(ctx, hop_pte_addr[i]); + if (hl_mmu_dr_put_pte(ctx, hop_addr[i])) + goto mapped; + } + hl_mmu_dr_clear_pte(ctx, hop_pte_addr[0]); + +mapped: + return 0; + +not_mapped: + dev_err(hdev->dev, "virt addr 0x%llx is not mapped to phys addr\n", + virt_addr); + + return -EINVAL; +} + +static int hl_mmu_v2_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, + u32 page_size, bool is_dram_addr) +{ + u64 hop_addr[MMU_ARCH_6_HOPS] = { 0 }, hop_pte_addr[MMU_ARCH_6_HOPS] = { 0 }, + curr_pte = 0, scrambled_virt_addr, scrambled_phys_addr; + struct asic_fixed_properties *prop = &ctx->hdev->asic_prop; + bool hop_new[MMU_ARCH_6_HOPS] = { false }; + struct hl_device *hdev = ctx->hdev; + struct hl_mmu_properties *mmu_prop; + int rc, i, hop_last; + + /* device-resident mappings in MMU v2 are supported only for the HMMU */ + if (!is_dram_addr) + return -EINVAL; + + mmu_prop = &prop->dmmu; + + hop_last = mmu_prop->num_hops - 1; + + scrambled_virt_addr = hdev->asic_funcs->scramble_addr(hdev, virt_addr); +
scrambled_phys_addr = hdev->asic_funcs->scramble_addr(hdev, phys_addr); + + /* First hop is preallocated therefore it is treated differently */ + hop_addr[0] = hl_mmu_dr_get_hop0_addr(ctx); + hop_pte_addr[0] = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, 0, + hop_addr[0], scrambled_virt_addr); + curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[0]; + + /* Handle hop1 to hop_last */ + for (i = 1 ; i <= hop_last ; i++) { + hop_addr[i] = hl_mmu_dr_get_alloc_next_hop_addr(ctx, curr_pte, &hop_new[i]); + if (hop_addr[i] == ULLONG_MAX) { + rc = -ENOMEM; + goto err; + } + + hop_pte_addr[i] = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, i, + hop_addr[i], scrambled_virt_addr); + if (hop_pte_addr[i] == U64_MAX) { + rc = -EINVAL; + goto err; + } + + if (!hop_pte_addr[i]) { + rc = -EINVAL; + goto err; + } + + curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[i]; + } + + if (curr_pte & PAGE_PRESENT_MASK) { + dev_err(hdev->dev, + "mapping already exists for virt_addr 0x%llx\n", + virt_addr); + + for (i = 0 ; i <= hop_last ; i++) + dev_dbg(hdev->dev, "hop%d pte: 0x%llx (0x%llx)\n", + i, *(u64 *) (uintptr_t) hop_pte_addr[i], + hop_pte_addr[i]); + + rc = -EINVAL; + goto err; + } + + curr_pte = (scrambled_phys_addr & HOP_PHYS_ADDR_MASK) + | mmu_prop->last_mask | PAGE_PRESENT_MASK; + + /* Write the PTEs */ + hl_mmu_dr_write_final_pte(ctx, hop_pte_addr[hop_last], curr_pte); + + /* for each new hop, add its address to the table of previous-hop */ + for (i = 1 ; i <= hop_last ; i++) { + if (hop_new[i]) { + curr_pte = (hop_addr[i] & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; + hl_mmu_dr_write_pte(ctx, hop_pte_addr[i - 1], curr_pte); + + if (i - 1) + hl_mmu_dr_get_pte(ctx, hop_addr[i - 1]); + } + } + hl_mmu_dr_get_pte(ctx, hop_addr[hop_last]); + + return 0; + +err: + for (i = 1 ; i <= hop_last ; i++) + if (hop_new[i] && (hop_addr[i] != U64_MAX)) + hl_mmu_dr_free_hop(ctx, hop_addr[i]); + + return rc; +} + +/* + * hl_mmu_v2_swap_out - marks all mapping of the given ctx as swapped out + * + * @ctx: pointer to the context structure + * + */ +static void hl_mmu_v2_swap_out(struct hl_ctx *ctx) +{ + +} + +/* + * hl_mmu_v2_swap_in - marks all mapping of the given ctx as swapped in + * + * @ctx: pointer to the context structure + * + */ +static void hl_mmu_v2_swap_in(struct hl_ctx *ctx) +{ + +} + +static int hl_mmu_v2_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_info *hops) +{ + struct asic_fixed_properties *prop = &ctx->hdev->asic_prop; + struct hl_device *hdev = ctx->hdev; + struct hl_mmu_properties *mmu_prop; + bool is_dram_addr; + int i; + + is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size, + prop->dmmu.start_addr, + prop->dmmu.end_addr); + + /* device resident in V2 are allowed only for HMMU */ + if (!is_dram_addr) + return -EINVAL; + + mmu_prop = &prop->dmmu; + hops->range_type = HL_VA_RANGE_TYPE_DRAM; + + hops->scrambled_vaddr = hdev->asic_funcs->scramble_addr(hdev, virt_addr); + + hops->hop_info[0].hop_addr = hl_mmu_dr_get_phys_hop0_addr(ctx); + hops->hop_info[0].hop_pte_addr = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, 0, + hops->hop_info[0].hop_addr, + hops->scrambled_vaddr); + if (hops->hop_info[0].hop_pte_addr == U64_MAX) + return -EFAULT; + + hops->hop_info[0].hop_pte_val = hdev->asic_funcs->read_pte(hdev, + hops->hop_info[0].hop_pte_addr); + if (hops->hop_info[0].hop_pte_val == U64_MAX) + return -EFAULT; + + for (i = 1 ; i < mmu_prop->num_hops ; i++) { + hops->hop_info[i].hop_addr = + hl_mmu_get_next_hop_addr(ctx, hops->hop_info[i - 1].hop_pte_val); + if 
(hops->hop_info[i].hop_addr == ULLONG_MAX) + return -EFAULT; + + hops->hop_info[i].hop_pte_addr = + hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, i, + hops->hop_info[i].hop_addr, + hops->scrambled_vaddr); + if (hops->hop_info[i].hop_pte_addr == U64_MAX) + return -EFAULT; + + hops->hop_info[i].hop_pte_val = + hdev->asic_funcs->read_pte(hdev, + hops->hop_info[i].hop_pte_addr); + + if (!(hops->hop_info[i].hop_pte_val & PAGE_PRESENT_MASK)) + return -EFAULT; + + if (hops->hop_info[i].hop_pte_val & mmu_prop->last_mask) + break; + } + + /* if passed over all hops then no last hop was found */ + if (i == mmu_prop->num_hops) + return -EFAULT; + + if (!(hops->hop_info[i].hop_pte_val & PAGE_PRESENT_MASK)) + return -EFAULT; + + if (hops->scrambled_vaddr != virt_addr) + hops->unscrambled_paddr = hdev->asic_funcs->descramble_addr + (hdev, hops->hop_info[i].hop_pte_val); + else + hops->unscrambled_paddr = hops->hop_info[i].hop_pte_val; + + hops->used_hops = i + 1; + + return 0; +} + +/* + * hl_mmu_v2_set_funcs - set the mmu functions for working with mmu v2 + * + * @hdev: pointer to the device structure + * @mmu: pointer to the mmu functions structure + */ +void hl_mmu_v2_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu) +{ + mmu->init = hl_mmu_dr_init; + mmu->fini = hl_mmu_dr_fini; + mmu->ctx_init = hl_mmu_v2_ctx_init; + mmu->ctx_fini = hl_mmu_v2_ctx_fini; + mmu->map = hl_mmu_v2_map; + mmu->unmap = hl_mmu_v2_unmap; + mmu->flush = hl_mmu_dr_flush; + mmu->swap_out = hl_mmu_v2_swap_out; + mmu->swap_in = hl_mmu_v2_swap_in; + mmu->get_tlb_info = hl_mmu_v2_get_tlb_info; +} diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c index 53292d4c15c865..dde3839fe0e070 100644 --- a/drivers/accel/habanalabs/gaudi/gaudi.c +++ b/drivers/accel/habanalabs/gaudi/gaudi.c @@ -649,6 +649,7 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev) prop->dmmu.start_addr = (VA_HOST_SPACE_START + VA_HOST_SPACE_SIZE / 2); prop->dmmu.end_addr = VA_HOST_SPACE_END; prop->dmmu.page_size = PAGE_SIZE_2MB; + prop->dmmu.pgt_size = prop->mmu_pgt_size; prop->cfg_size = CFG_SIZE; prop->max_asid = MAX_ASID; diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index fd01525b1ea204..5863c904913433 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -2308,11 +2308,26 @@ static int set_number_of_functional_hbms(struct hl_device *hdev) return 0; } +static bool gaudi2_is_edma_queue_id(u32 queue_id) +{ + + switch (queue_id) { + case GAUDI2_QUEUE_ID_DCORE0_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE0_EDMA_1_3: + case GAUDI2_QUEUE_ID_DCORE1_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE1_EDMA_1_3: + case GAUDI2_QUEUE_ID_DCORE2_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE2_EDMA_1_3: + case GAUDI2_QUEUE_ID_DCORE3_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE3_EDMA_1_3: + return true; + default: + return false; + } +} + static int gaudi2_set_dram_properties(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; - u32 basic_hbm_page_size; - int rc; + u64 hbm_drv_base_offset = 0, edma_pq_base_addr; + u32 basic_hbm_page_size, edma_idx = 0; + int rc, i; rc = set_number_of_functional_hbms(hdev); if (rc) @@ -2356,9 +2371,35 @@ static int gaudi2_set_dram_properties(struct hl_device *hdev) prop->dmmu.start_addr = prop->dram_base_address + (prop->dram_page_size * DIV_ROUND_UP_SECTOR_T(prop->dram_size, prop->dram_page_size)); - prop->dmmu.end_addr = prop->dmmu.start_addr + prop->dram_page_size * div_u64((VA_HBM_SPACE_END -
prop->dmmu.start_addr), prop->dmmu.page_size); + /* + * The driver can't share a (48MB) HBM page with the F/W, in order to prevent the F/W from + * blocking the driver part via a range register, so it must start at the next (48MB) page + */ + hbm_drv_base_offset = roundup(CPU_FW_IMAGE_SIZE, prop->num_functional_hbms * SZ_8M); + + /* + * The NIC driver section size and the HMMU page tables section in the HBM need + * to be the remaining size in the first dram page after taking into + * account the F/W image size + */ + + /* Reserve region in HBM for HMMU page tables */ + prop->mmu_pgt_addr = DRAM_PHYS_BASE + hbm_drv_base_offset + + ((prop->dram_page_size - hbm_drv_base_offset) - + (HMMU_PAGE_TABLES_SIZE + EDMA_PQS_SIZE + EDMA_SCRATCHPAD_SIZE)); + + /* Set EDMA PQs HBM addresses */ + edma_pq_base_addr = prop->mmu_pgt_addr + HMMU_PAGE_TABLES_SIZE; + + for (i = 0 ; i < GAUDI2_QUEUE_ID_CPU_PQ ; i++) { + if (gaudi2_is_edma_queue_id(i)) { + prop->hw_queues_props[i].q_dram_bd_address = edma_pq_base_addr + + (edma_idx * HL_QUEUE_SIZE_IN_BYTES); + edma_idx++; + } + } return 0; } @@ -2368,7 +2409,7 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) struct asic_fixed_properties *prop = &hdev->asic_prop; struct hw_queue_properties *q_props; u32 num_sync_stream_queues = 0; - int i; + int i, rc; prop->max_queues = GAUDI2_QUEUE_ID_SIZE; prop->hw_queues_props = kcalloc(prop->max_queues, sizeof(struct hw_queue_properties), @@ -2391,6 +2432,9 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) } q_props[i].cb_alloc_flags = CB_ALLOC_USER; + + if (gaudi2_is_edma_queue_id(i)) + q_props[i].dram_bd = 1; } q_props[GAUDI2_QUEUE_ID_CPU_PQ].type = QUEUE_TYPE_CPU; @@ -2419,40 +2463,39 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) prop->rotator_enabled_mask = BIT(NUM_OF_ROT) - 1; - if (hdev->pldm) - prop->mmu_pgt_size = 0x800000; /* 8MB */ - else - prop->mmu_pgt_size = MMU_PAGE_TABLES_INITIAL_SIZE; + prop->max_asid = 2; + prop->dmmu.pgt_size = HMMU_PAGE_TABLES_SIZE; prop->mmu_pte_size = HL_PTE_SIZE; prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE; - prop->mmu_hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE; + prop->mmu_hop0_tables_total_size = HOP_TABLE_SIZE_512_PTE * prop->max_asid; prop->dmmu.hop_shifts[MMU_HOP0] = DHOP0_SHIFT; prop->dmmu.hop_shifts[MMU_HOP1] = DHOP1_SHIFT; prop->dmmu.hop_shifts[MMU_HOP2] = DHOP2_SHIFT; prop->dmmu.hop_shifts[MMU_HOP3] = DHOP3_SHIFT; - prop->dmmu.hop_shifts[MMU_HOP4] = DHOP4_SHIFT; prop->dmmu.hop_masks[MMU_HOP0] = DHOP0_MASK; prop->dmmu.hop_masks[MMU_HOP1] = DHOP1_MASK; prop->dmmu.hop_masks[MMU_HOP2] = DHOP2_MASK; prop->dmmu.hop_masks[MMU_HOP3] = DHOP3_MASK; - prop->dmmu.hop_masks[MMU_HOP4] = DHOP4_MASK; prop->dmmu.page_size = PAGE_SIZE_1GB; - prop->dmmu.num_hops = MMU_ARCH_6_HOPS; + prop->dmmu.num_hops = MMU_ARCH_4_HOPS; prop->dmmu.last_mask = LAST_MASK; - prop->dmmu.host_resident = 1; + prop->dmmu.host_resident = 0; prop->dmmu.hop_table_size = prop->mmu_hop_table_size; prop->dmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size; - /* - * this is done in order to be able to validate FW descriptor (i.e. validating that - * the addresses and allocated space for FW image does not cross memory bounds).
- * for this reason we set the DRAM size to the minimum possible and later it will - * be modified according to what reported in the cpucp info packet + /* As we need to set the pgt address in dram for HMMU init, we cannot + * wait for the fw cpucp info to set the dram props, as mmu init comes before + * hw init */ - prop->dram_size = (GAUDI2_HBM_NUM - 1) * SZ_16G; + rc = hdev->asic_funcs->set_dram_properties(hdev); + if (rc) + goto free_qprops; + + prop->mmu_pgt_size = PMMU_PAGE_TABLES_SIZE; + prop->pmmu.pgt_size = prop->mmu_pgt_size; hdev->pmmu_huge_range = true; prop->pmmu.host_resident = 1; prop->pmmu.num_hops = MMU_ARCH_6_HOPS; @@ -2516,7 +2559,6 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) prop->max_num_of_engines = GAUDI2_ENGINE_ID_SIZE; prop->num_engine_cores = CPU_ID_MAX; prop->cfg_size = CFG_SIZE; - prop->max_asid = MAX_ASID; prop->num_of_events = GAUDI2_EVENT_SIZE; prop->supports_engine_modes = true; @@ -2560,6 +2602,10 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) prop->hbw_flush_reg = mmPCIE_WRAP_SPECIAL_GLBL_SPARE_0; return 0; + +free_qprops: + kfree(prop->hw_queues_props); + return rc; } static int gaudi2_pci_bars_map(struct hl_device *hdev) @@ -3033,6 +3079,25 @@ static int gaudi2_fetch_psoc_frequency(struct hl_device *hdev) return 0; } +static int gaudi2_mmu_clear_pgt_range(struct hl_device *hdev) +{ + struct gaudi2_device *gaudi2 = hdev->asic_specific; + struct asic_fixed_properties *prop = &hdev->asic_prop; + int rc; + + if (!(gaudi2->hw_cap_initialized & HW_CAP_MMU_MASK)) + return 0; + + if (prop->dmmu.host_resident) + return 0; + + rc = gaudi2_memset_device_memory(hdev, prop->mmu_pgt_addr, prop->dmmu.pgt_size, 0); + if (rc) + dev_err(hdev->dev, "Failed to clear mmu pgt\n"); + + return rc; +} + static int gaudi2_early_init(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; @@ -3258,6 +3323,12 @@ static int gaudi2_late_init(struct hl_device *hdev) goto disable_pci_access; } + rc = gaudi2_mmu_clear_pgt_range(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to clear MMU page tables range\n"); + goto disable_pci_access; + } + gaudi2_init_arcs(hdev); rc = gaudi2_scrub_arcs_dccm(hdev); @@ -3697,13 +3768,7 @@ static int gaudi2_sw_init(struct hl_device *hdev) spin_lock_init(&gaudi2->hw_queues_lock); - gaudi2->scratchpad_kernel_address = hl_asic_dma_alloc_coherent(hdev, PAGE_SIZE, - &gaudi2->scratchpad_bus_address, - GFP_KERNEL | __GFP_ZERO); - if (!gaudi2->scratchpad_kernel_address) { - rc = -ENOMEM; - goto free_virt_msix_db_mem; - } + gaudi2->scratchpad_bus_address = prop->mmu_pgt_addr + HMMU_PAGE_TABLES_SIZE + EDMA_PQS_SIZE; gaudi2_user_mapped_blocks_init(hdev); @@ -3727,7 +3792,7 @@ static int gaudi2_sw_init(struct hl_device *hdev) rc = gaudi2_special_blocks_iterator_config(hdev); if (rc) - goto free_scratchpad_mem; + goto free_virt_msix_db_mem; rc = gaudi2_test_queues_msgs_alloc(hdev); if (rc) @@ -3737,9 +3802,6 @@ static int gaudi2_sw_init(struct hl_device *hdev) special_blocks_free: gaudi2_special_blocks_iterator_free(hdev); -free_scratchpad_mem: - hl_asic_dma_free_coherent(hdev, PAGE_SIZE, gaudi2->scratchpad_kernel_address, - gaudi2->scratchpad_bus_address); free_virt_msix_db_mem: hl_cpu_accessible_dma_pool_free(hdev, prop->pmmu.page_size, gaudi2->virt_msix_db_cpu_addr); free_cpu_accessible_dma_pool: @@ -3770,9 +3832,6 @@ static int gaudi2_sw_fini(struct hl_device *hdev) hl_asic_dma_free_coherent(hdev, HL_CPU_ACCESSIBLE_MEM_SIZE, hdev->cpu_accessible_dma_mem, hdev->cpu_accessible_dma_address); -
hl_asic_dma_free_coherent(hdev, PAGE_SIZE, gaudi2->scratchpad_kernel_address, - gaudi2->scratchpad_bus_address); - dma_pool_destroy(hdev->dma_pool); kfree(gaudi2); @@ -4962,10 +5021,17 @@ static void gaudi2_init_qman_pq(struct hl_device *hdev, u32 reg_base, q = &hdev->kernel_queues[queue_id_base + pq_id]; pq_offset = pq_id * 4; - WREG32(reg_base + QM_PQ_BASE_LO_0_OFFSET + pq_offset, - lower_32_bits(q->bus_address)); - WREG32(reg_base + QM_PQ_BASE_HI_0_OFFSET + pq_offset, - upper_32_bits(q->bus_address)); + if (q->dram_bd) { + WREG32(reg_base + QM_PQ_BASE_LO_0_OFFSET + pq_offset, + lower_32_bits(q->pq_dram_address)); + WREG32(reg_base + QM_PQ_BASE_HI_0_OFFSET + pq_offset, + upper_32_bits(q->pq_dram_address)); + } else { + WREG32(reg_base + QM_PQ_BASE_LO_0_OFFSET + pq_offset, + lower_32_bits(q->bus_address)); + WREG32(reg_base + QM_PQ_BASE_HI_0_OFFSET + pq_offset, + upper_32_bits(q->bus_address)); + } WREG32(reg_base + QM_PQ_SIZE_0_OFFSET + pq_offset, ilog2(HL_QUEUE_LENGTH)); WREG32(reg_base + QM_PQ_PI_0_OFFSET + pq_offset, 0); WREG32(reg_base + QM_PQ_CI_0_OFFSET + pq_offset, 0); @@ -5852,7 +5918,8 @@ static int gaudi2_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_har return rc; } -static int gaudi2_mmu_update_hop0_addr(struct hl_device *hdev, u32 stlb_base) +static int gaudi2_mmu_update_hop0_addr(struct hl_device *hdev, u32 stlb_base, + bool host_resident_pgt) { struct asic_fixed_properties *prop = &hdev->asic_prop; u64 hop0_addr; @@ -5864,7 +5931,11 @@ static int gaudi2_mmu_update_hop0_addr(struct hl_device *hdev, u32 stlb_base) max_asid = min((u32) 8, max_asid); for (asid = 0 ; asid < max_asid ; asid++) { - hop0_addr = hdev->mmu_priv.hr.mmu_asid_hop0[asid].phys_addr; + if (host_resident_pgt) + hop0_addr = hdev->mmu_priv.hr.mmu_asid_hop0[asid].phys_addr; + else + hop0_addr = prop->mmu_pgt_addr + (asid * prop->mmu_hop_table_size); + rc = gaudi2_mmu_update_asid_hop0_addr(hdev, stlb_base, asid, hop0_addr); if (rc) { dev_err(hdev->dev, "failed to set hop0 addr for asid %d\n", asid); @@ -5875,7 +5946,8 @@ static int gaudi2_mmu_update_hop0_addr(struct hl_device *hdev, u32 stlb_base) return 0; } -static int gaudi2_mmu_init_common(struct hl_device *hdev, u32 mmu_base, u32 stlb_base) +static int gaudi2_mmu_init_common(struct hl_device *hdev, u32 mmu_base, u32 stlb_base, + bool host_resident_pgt) { u32 status, timeout_usec; int rc; @@ -5898,7 +5970,7 @@ static int gaudi2_mmu_init_common(struct hl_device *hdev, u32 mmu_base, u32 stlb if (rc) dev_notice_ratelimited(hdev->dev, "Timeout when waiting for MMU SRAM init\n"); - rc = gaudi2_mmu_update_hop0_addr(hdev, stlb_base); + rc = gaudi2_mmu_update_hop0_addr(hdev, stlb_base, host_resident_pgt); if (rc) return rc; @@ -5922,6 +5994,7 @@ static int gaudi2_mmu_init_common(struct hl_device *hdev, u32 mmu_base, u32 stlb static int gaudi2_pci_mmu_init(struct hl_device *hdev) { + struct asic_fixed_properties *prop = &hdev->asic_prop; struct gaudi2_device *gaudi2 = hdev->asic_specific; u32 mmu_base, stlb_base; int rc; @@ -5961,7 +6034,7 @@ static int gaudi2_pci_mmu_init(struct hl_device *hdev) WREG32(mmu_base + MMU_SPI_SEI_MASK_OFFSET, GAUDI2_PMMU_SPI_SEI_ENABLE_MASK); - rc = gaudi2_mmu_init_common(hdev, mmu_base, stlb_base); + rc = gaudi2_mmu_init_common(hdev, mmu_base, stlb_base, prop->pmmu.host_resident); if (rc) return rc; @@ -6013,7 +6086,7 @@ static int gaudi2_dcore_hmmu_init(struct hl_device *hdev, int dcore_id, WREG32(mmu_base + MMU_SPI_SEI_MASK_OFFSET, GAUDI2_HMMU_SPI_SEI_ENABLE_MASK); - rc = gaudi2_mmu_init_common(hdev, mmu_base, 
stlb_base); + rc = gaudi2_mmu_init_common(hdev, mmu_base, stlb_base, prop->dmmu.host_resident); if (rc) return rc; @@ -7051,7 +7124,7 @@ static int gaudi2_test_queues(struct hl_device *hdev) /* send test message on all enabled Qs */ for (i = GAUDI2_QUEUE_ID_PDMA_0_0 ; i < GAUDI2_QUEUE_ID_CPU_PQ; i++) { - if (!gaudi2_is_queue_enabled(hdev, i)) + if (!gaudi2_is_queue_enabled(hdev, i) || gaudi2_is_edma_queue_id(i)) continue; msg_info = &gaudi2->queues_test_info[i - GAUDI2_QUEUE_ID_PDMA_0_0]; @@ -7068,7 +7141,7 @@ static int gaudi2_test_queues(struct hl_device *hdev) /* verify that all messages were processed */ for (i = GAUDI2_QUEUE_ID_PDMA_0_0 ; i < GAUDI2_QUEUE_ID_CPU_PQ; i++) { - if (!gaudi2_is_queue_enabled(hdev, i)) + if (!gaudi2_is_queue_enabled(hdev, i) || gaudi2_is_edma_queue_id(i)) continue; rc = gaudi2_test_queue_wait_completion(hdev, i, sob_val); @@ -8988,7 +9061,6 @@ static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool if (is_pmmu) { dev_err_ratelimited(hdev->dev, "PMMU page fault on va 0x%llx\n", addr); } else { - addr = gaudi2_mmu_descramble_addr(hdev, addr); addr &= HW_UNSCRAMBLED_BITS_MASK; dev_err_ratelimited(hdev->dev, "HMMU page fault on va range 0x%llx - 0x%llx\n", @@ -10255,11 +10327,11 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent } static int gaudi2_memset_memory_chunk_using_edma_qm(struct hl_device *hdev, - struct packet_lin_dma *lin_dma_pkt, dma_addr_t pkt_dma_addr, - u32 hw_queue_id, u32 size, u64 addr, u32 val) + struct packet_lin_dma *lin_dma_pkt, + u64 phys_addr, u32 hw_queue_id, u32 size, u64 addr, u32 val) { u32 ctl, pkt_size; - int rc = 0; + int rc = 0, i; ctl = FIELD_PREP(GAUDI2_PKT_CTL_OPCODE_MASK, PACKET_LIN_DMA); ctl |= FIELD_PREP(GAUDI2_PKT_LIN_DMA_CTL_MEMSET_MASK, 1); @@ -10273,7 +10345,12 @@ static int gaudi2_memset_memory_chunk_using_edma_qm(struct hl_device *hdev, pkt_size = sizeof(struct packet_lin_dma); - rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, pkt_dma_addr); + for (i = 0; i < 3; i++) + rc = hdev->asic_funcs->access_dev_mem(hdev, PCI_REGION_DRAM, + phys_addr + (i * sizeof(u64)), + ((u64 *)(lin_dma_pkt)) + i, DEBUGFS_WRITE64); + + rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, phys_addr); if (rc) dev_err(hdev->dev, "Failed to send lin dma packet to H/W queue %d\n", hw_queue_id); @@ -10288,12 +10365,11 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz GAUDI2_QUEUE_ID_DCORE2_EDMA_0_0, GAUDI2_QUEUE_ID_DCORE3_EDMA_0_0}; u32 chunk_size, dcore, edma_idx, sob_offset, sob_addr, comp_val, - old_mmubp, mmubp, num_of_pkts, busy, pkt_size; + old_mmubp, mmubp, num_of_pkts, busy, pkt_size, cb_len; u64 comp_addr, cur_addr = addr, end_addr = addr + size; struct asic_fixed_properties *prop = &hdev->asic_prop; + int rc = 0, dma_num = 0, i; void *lin_dma_pkts_arr; - dma_addr_t pkt_dma_addr; - int rc = 0, dma_num = 0; if (prop->edma_enabled_mask == 0) { dev_info(hdev->dev, "non of the EDMA engines is enabled - skip dram scrubbing\n"); @@ -10311,9 +10387,19 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz /* Calculate how many lin dma pkts we'll need */ num_of_pkts = div64_u64(round_up(size, SZ_2G), SZ_2G); pkt_size = sizeof(struct packet_lin_dma); + cb_len = pkt_size * num_of_pkts; + + /* + * if we're not scrubbing HMMU or NIC reserved sections in hbm, + * then it is the scrubbing of the user section, as we use the start of the user section + * to store the CB of the EDMA QM, so shift the start address of
the scrubbing accordingly + * and scrub the CB section before leaving this function. + */ + if ((addr >= prop->dram_user_base_address) && + (addr < prop->dram_user_base_address + cb_len)) + cur_addr += (prop->dram_user_base_address + cb_len) - addr; - lin_dma_pkts_arr = hl_asic_dma_alloc_coherent(hdev, pkt_size * num_of_pkts, - &pkt_dma_addr, GFP_KERNEL); + lin_dma_pkts_arr = kvcalloc(num_of_pkts, pkt_size, GFP_KERNEL); if (!lin_dma_pkts_arr) return -ENOMEM; @@ -10359,7 +10445,7 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz rc = gaudi2_memset_memory_chunk_using_edma_qm(hdev, (struct packet_lin_dma *)lin_dma_pkts_arr + dma_num, - pkt_dma_addr + dma_num * pkt_size, + prop->dram_user_base_address + (dma_num * pkt_size), edma_queues_id[dcore] + edma_idx * 4, chunk_size, cur_addr, val); if (rc) @@ -10368,14 +10454,16 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz dma_num++; cur_addr += chunk_size; if (cur_addr == end_addr) - break; + goto edma_wait; } } } +edma_wait: rc = hl_poll_timeout(hdev, sob_addr, busy, (busy == dma_num), 1000, 1000000); if (rc) { - dev_err(hdev->dev, "DMA Timeout during HBM scrubbing\n"); + dev_err(hdev->dev, "DMA Timeout during HBM scrubbing(sob: 0x%x, dma_num: 0x%x)\n", + busy, dma_num); goto end; } end: @@ -10396,8 +10484,16 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz } } + memset(lin_dma_pkts_arr, 0, sizeof(u64)); + + /* Zero the HBM area where we copied the CB */ + for (i = 0; i < cb_len / sizeof(u64); i += sizeof(u64)) + rc = hdev->asic_funcs->access_dev_mem(hdev, PCI_REGION_DRAM, + prop->dram_user_base_address + i, + (u64 *)(lin_dma_pkts_arr), DEBUGFS_WRITE64); WREG32(sob_addr, 0); - hl_asic_dma_free_coherent(hdev, pkt_size * num_of_pkts, lin_dma_pkts_arr, pkt_dma_addr); + + kfree(lin_dma_pkts_arr); return rc; } @@ -11455,7 +11551,7 @@ static int gaudi2_mmu_get_real_page_size(struct hl_device *hdev, struct hl_mmu_p return 0; page_size_err: - dev_err(hdev->dev, "page size of %u is not %uKB aligned, can't map\n", + dev_err(hdev->dev, "page size of 0x%X is not 0x%X aligned, can't map\n", page_size, mmu_prop->page_size >> 10); return -EFAULT; } @@ -11475,6 +11571,29 @@ int gaudi2_send_device_activity(struct hl_device *hdev, bool open) return hl_fw_send_device_activity(hdev, open); } +static u64 gaudi2_read_pte(struct hl_device *hdev, u64 addr) +{ + struct gaudi2_device *gaudi2 = hdev->asic_specific; + u64 val; + + if (hdev->reset_info.hard_reset_pending) + return U64_MAX; + + val = readq(hdev->pcie_bar[DRAM_BAR_ID] + (addr - gaudi2->dram_bar_cur_addr)); + + return val; +} + +static void gaudi2_write_pte(struct hl_device *hdev, u64 addr, u64 val) +{ + struct gaudi2_device *gaudi2 = hdev->asic_specific; + + if (hdev->reset_info.hard_reset_pending) + return; + + writeq(val, hdev->pcie_bar[DRAM_BAR_ID] + (addr - gaudi2->dram_bar_cur_addr)); +} + static const struct hl_asic_funcs gaudi2_funcs = { .early_init = gaudi2_early_init, .early_fini = gaudi2_early_fini, @@ -11511,8 +11630,8 @@ static const struct hl_asic_funcs gaudi2_funcs = { .add_device_attr = gaudi2_add_device_attr, .handle_eqe = gaudi2_handle_eqe, .get_events_stat = gaudi2_get_events_stat, - .read_pte = NULL, - .write_pte = NULL, + .read_pte = gaudi2_read_pte, + .write_pte = gaudi2_write_pte, .mmu_invalidate_cache = gaudi2_mmu_invalidate_cache, .mmu_invalidate_cache_range = gaudi2_mmu_invalidate_cache_range, .mmu_prefetch_cache_range = NULL, diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2P.h 
b/drivers/accel/habanalabs/gaudi2/gaudi2P.h index 9b9eef0d97d6e8..bc508c9cee5c50 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2P.h +++ b/drivers/accel/habanalabs/gaudi2/gaudi2P.h @@ -19,8 +19,6 @@ #define GAUDI2_LINUX_FW_FILE "habanalabs/gaudi2/gaudi2-fit.itb" #define GAUDI2_BOOT_FIT_FILE "habanalabs/gaudi2/gaudi2-boot-fit.itb" -#define MMU_PAGE_TABLES_INITIAL_SIZE 0x10000000 /* 256MB */ - #define GAUDI2_CPU_TIMEOUT_USEC 30000000 /* 30s */ #define NUMBER_OF_PDMA_QUEUES 2 @@ -109,13 +107,11 @@ /* DRAM Memory Map */ #define CPU_FW_IMAGE_SIZE 0x10000000 /* 256MB */ - -/* This define should be used only when working in a debug mode without dram. - * When working with dram, the driver size will be calculated dynamically. - */ -#define NIC_DEFAULT_DRV_SIZE 0x20000000 /* 512MB */ - #define CPU_FW_IMAGE_ADDR DRAM_PHYS_BASE +#define PMMU_PAGE_TABLES_SIZE 0x10000000 /* 256MB */ +#define EDMA_PQS_SIZE SZ_2M +#define EDMA_SCRATCHPAD_SIZE SZ_1M +#define HMMU_PAGE_TABLES_SIZE SZ_1M #define NIC_NUMBER_OF_PORTS NIC_NUMBER_OF_ENGINES diff --git a/drivers/accel/habanalabs/include/hw_ip/mmu/mmu_general.h b/drivers/accel/habanalabs/include/hw_ip/mmu/mmu_general.h index d408feecd4834d..b4a5e95be35421 100644 --- a/drivers/accel/habanalabs/include/hw_ip/mmu/mmu_general.h +++ b/drivers/accel/habanalabs/include/hw_ip/mmu/mmu_general.h @@ -26,6 +26,8 @@ #define LAST_MASK 0x0000000000800ull #define FLAGS_MASK 0x0000000000FFFull +#define MMU_ARCH_3_HOPS 3 +#define MMU_ARCH_4_HOPS 4 #define MMU_ARCH_5_HOPS 5 #define MMU_ARCH_6_HOPS 6 From d6ad8a09696cb067d9f27f3e8356b14e69d6fe4b Mon Sep 17 00:00:00 2001 From: Malkoot Khan Date: Thu, 28 Dec 2023 21:08:58 +0000 Subject: [PATCH 0854/1406] accel/habanalabs: Remove unnecessary braces from if statement The coding style in the Linux kernel prefers not to use braces for single-statement if conditions. This patch removes the unnecessary braces from an if statement in the file drivers/accel/habanalabs/common/command_submission.c, which also resolves a coding style warning. Signed-off-by: Malkoot Khan Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/command_submission.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c index 3aa6eeef443b41..39e23d625a3cbb 100644 --- a/drivers/accel/habanalabs/common/command_submission.c +++ b/drivers/accel/habanalabs/common/command_submission.c @@ -1360,9 +1360,8 @@ static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args) return -EINVAL; } - if (!hl_device_operational(hdev, &status)) { + if (!hl_device_operational(hdev, &status)) return -EBUSY; - } if ((args->in.cs_flags & HL_CS_FLAGS_STAGED_SUBMISSION) && !hdev->supports_staged_submission) { From 1931161f3feeb0ae11f0832ca58ed811a054c2c8 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Mon, 1 Jan 2024 22:37:43 +0200 Subject: [PATCH 0855/1406] accel/habanalabs: remove call to deprecated function In newer kernel versions, irq_set_affinity_hint() is deprecated. Instead, use the newer version which is irq_set_affinity_and_hint(). 
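For illustration, the migration pattern is sketched below with hypothetical example_* names; only irq_set_affinity_and_hint() and free_irq() are real kernel APIs, and this is a sketch of the pattern, not the driver code itself:

#include <linux/interrupt.h>

/* Apply the affinity and publish the /proc irq affinity hint in one call. */
static int example_apply_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
	/* Deprecated form was: irq_set_affinity_hint(irq, mask); */
	return irq_set_affinity_and_hint(irq, mask);
}

/* On teardown, clear the hint with a NULL mask before releasing the IRQ. */
static void example_release_irq(unsigned int irq, void *dev_id)
{
	irq_set_affinity_and_hint(irq, NULL);
	free_irq(irq, dev_id);
}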
Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/device.c | 2 +- drivers/accel/habanalabs/gaudi2/gaudi2.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index cf004baf5e6213..3b9e8a21d7df8b 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -2833,6 +2833,6 @@ void hl_set_irq_affinity(struct hl_device *hdev, int irq) return; } - if (irq_set_affinity_hint(irq, &hdev->irq_affinity_mask)) + if (irq_set_affinity_and_hint(irq, &hdev->irq_affinity_mask)) dev_err(hdev->dev, "Failed setting irq %d affinity\n", irq); } diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 5863c904913433..05e2170c815e6b 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -4395,7 +4395,7 @@ static int gaudi2_enable_msix(struct hl_device *hdev) i < GAUDI2_IRQ_NUM_USER_FIRST + user_irq_init_cnt ; i++, j++) { irq = pci_irq_vector(hdev->pdev, i); - irq_set_affinity_hint(irq, NULL); + irq_set_affinity_and_hint(irq, NULL); free_irq(irq, &hdev->user_interrupt[j]); } irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_UNEXPECTED_ERROR); @@ -4476,7 +4476,7 @@ static void gaudi2_disable_msix(struct hl_device *hdev) k < hdev->asic_prop.user_interrupt_count ; i++, j++, k++) { irq = pci_irq_vector(hdev->pdev, i); - irq_set_affinity_hint(irq, NULL); + irq_set_affinity_and_hint(irq, NULL); free_irq(irq, &hdev->user_interrupt[j]); } From 036509b3083b657d692237900e5d89832f1a70d1 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Tue, 2 Jan 2024 16:51:09 +0200 Subject: [PATCH 0856/1406] accel/habanalabs/gaudi2: fail memory memset when failing to copy QM packet to device gaudi2_memset_memory_chunk_using_edma_qm() calls the access_dev_mem() ASIC function, but ignores its return value. Add this missing check. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 05e2170c815e6b..1f061209ae2157 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -10345,14 +10345,20 @@ static int gaudi2_memset_memory_chunk_using_edma_qm(struct hl_device *hdev, pkt_size = sizeof(struct packet_lin_dma); - for (i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { rc = hdev->asic_funcs->access_dev_mem(hdev, PCI_REGION_DRAM, phys_addr + (i * sizeof(u64)), ((u64 *)(lin_dma_pkt)) + i, DEBUGFS_WRITE64); + if (rc) { + dev_err(hdev->dev, "Failed to copy lin_dma packet to HBM (%#llx)\n", + phys_addr); + return rc; + } + } rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, phys_addr); if (rc) - dev_err(hdev->dev, "Failed to send lin dma packet to H/W queue %d\n", + dev_err(hdev->dev, "Failed to send lin_dma packet to H/W queue %d\n", hw_queue_id); return rc; From c7a1ff61ca055f14359bdfbe220ed4e6b3a6c9fe Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 6 Jan 2024 12:42:13 +0000 Subject: [PATCH 0857/1406] accel/habanalabs/goya: remove redundant assignment to pointer 'input' The pointer 'input' is assigned a value that is never read; it is re-assigned later with the same value. Resolve this by moving the declaration of 'input' into the if block, as sketched below.
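A minimal sketch of the pattern, using hypothetical example_* names rather than the actual goya coresight code:

#include <linux/errno.h>
#include <linux/types.h>

struct example_params {
	void *input;
	bool enable;
};

static int example_config(struct example_params *params)
{
	if (params->enable) {
		/* Declared at its only point of use; no dead store at function scope. */
		void *input = params->input;

		if (!input)
			return -EINVAL;
		/* ... use input to program the hardware ... */
	}

	return 0;
}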
Cleans up clang scan build warning: warning: Value stored to 'input' during its initialization is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/goya/goya_coresight.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/accel/habanalabs/goya/goya_coresight.c b/drivers/accel/habanalabs/goya/goya_coresight.c index 41cae5fd843b88..3827ea4c02f740 100644 --- a/drivers/accel/habanalabs/goya/goya_coresight.c +++ b/drivers/accel/habanalabs/goya/goya_coresight.c @@ -576,7 +576,6 @@ static int goya_config_spmu(struct hl_device *hdev, struct hl_debug_params *params) { u64 base_reg; - struct hl_debug_params_spmu *input = params->input; u64 *output; u32 output_arr_len; u32 events_num; @@ -592,7 +591,7 @@ static int goya_config_spmu(struct hl_device *hdev, base_reg = debug_spmu_regs[params->reg_idx] - CFG_BASE; if (params->enable) { - input = params->input; + struct hl_debug_params_spmu *input = params->input; if (!input) return -EINVAL; From f60043de72ac6c8bf500e948b4cb7cd0f3551d47 Mon Sep 17 00:00:00 2001 From: Erick Archer Date: Sat, 20 Jan 2024 16:10:28 +0100 Subject: [PATCH 0858/1406] accel/habanalabs: use kcalloc() instead of kzalloc() As noted in the "Deprecated Interfaces, Language Features, Attributes, and Conventions" documentation [1], size calculations (especially multiplication) should not be performed in memory allocator (or similar) function arguments due to the risk of them overflowing. This could lead to values wrapping around and a smaller allocation being made than the caller was expecting. Using those allocations could lead to linear overflows of heap memory and other misbehaviors. So, use the purpose specific kcalloc() function instead of the argument size * count in the kzalloc() function. Link: https://www.kernel.org/doc/html/next/process/deprecated.html#open-coded-arithmetic-in-allocator-arguments [1] Link: https://github.com/KSPP/linux/issues/162 Signed-off-by: Erick Archer Reviewed-by: Gustavo A. R. Silva Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/mmu/mmu_v1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/habanalabs/common/mmu/mmu_v1.c b/drivers/accel/habanalabs/common/mmu/mmu_v1.c index 64b5c8fbb166d9..845d16aaa63741 100644 --- a/drivers/accel/habanalabs/common/mmu/mmu_v1.c +++ b/drivers/accel/habanalabs/common/mmu/mmu_v1.c @@ -43,7 +43,7 @@ static int dram_default_mapping_init(struct hl_ctx *ctx) /* add hop1 and hop2 */ total_hops = num_of_hop3 + 2; - ctx->dram_default_hops = kzalloc(HL_PTE_SIZE * total_hops, GFP_KERNEL); + ctx->dram_default_hops = kcalloc(total_hops, HL_PTE_SIZE, GFP_KERNEL); if (!ctx->dram_default_hops) return -ENOMEM; From 67a1a77630c00f457a46e1164caf0d32c0edc127 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 20 Feb 2024 13:53:00 +0100 Subject: [PATCH 0859/1406] signal: adjust si_code restriction in pidfd_send_signal() Since we now allow specifying PIDFD_SEND_PROCESS_GROUP for pidfd_send_signal() to send signals to process groups we need to adjust the check restricting si_code emulation by userspace to account for PIDTYPE_PGID. 
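The adjusted rule can be restated as a predicate (an illustrative sketch; this helper does not exist in the kernel): forging an arbitrary si_code is allowed only when the caller signals itself, and a process group or session target is never "itself". Since enum pid_type is ordered PIDTYPE_PID < PIDTYPE_TGID < PIDTYPE_PGID < PIDTYPE_SID, any type above PIDTYPE_TGID must be rejected:

#include <linux/pid.h>
#include <linux/sched.h>

/* Hypothetical helper, for illustration only. */
static bool example_may_forge_siginfo(struct pid *target_pid, enum pid_type type)
{
	/* A group/session target is never "yourself", even if the pid matches. */
	return task_pid(current) == target_pid && type <= PIDTYPE_TGID;
}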
Reported-by: Oleg Nesterov Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240214123655.GB16265@redhat.com Signed-off-by: Christian Brauner --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/signal.c b/kernel/signal.c index cf6539a6b1cb36..5f5620c81d3a17 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3956,7 +3956,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, /* Only allow sending arbitrary signals to yourself. */ ret = -EPERM; - if ((task_pid(current) != pid) && + if (((task_pid(current) != pid) || type > PIDTYPE_TGID) && (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) goto err; } else { From 8a466ef97ef11339221d75212a66970d2b1cae88 Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Wed, 10 Jan 2024 14:27:44 +0200 Subject: [PATCH 0860/1406] accel/habanalabs/gaudi2: use single function to compare FW versions Currently, the code contains 2 types of FW version comparison functions: - hl_is_fw_sw_ver_[below/equal_or_greater]() - gaudi2-specific functions of the type gaudi2_is_fw_ver_[below/above]x_y_z() Moreover, some functions use the inner FW version, which should serve only as a staging number during development, not as a basis for version dependencies. Finally, some checks are done against deprecated FW versions with which the LKD should no longer keep compatibility. This commit aligns all APIs to a single function that just compares the versions and returns an integer indicator (similar in spirit to strcmp()). In addition, this generic function now also considers the sub-minor FW version, and dead code that only kept compatibility with deprecated FW versions is removed. Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/firmware_if.c | 25 ++++++++ drivers/accel/habanalabs/common/habanalabs.h | 20 +------ drivers/accel/habanalabs/gaudi2/gaudi2.c | 57 +++---------------- 3 files changed, 34 insertions(+), 68 deletions(-) diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index 3558a6a8e1925a..e7dcf2fe6552aa 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -40,6 +40,31 @@ static char *comms_sts_str_arr[COMMS_STS_INVLD_LAST] = { [COMMS_STS_TIMEOUT_ERR] = __stringify(COMMS_STS_TIMEOUT_ERR), }; +/** + * hl_fw_version_cmp() - compares the FW version to a specific version + * + * @hdev: pointer to hl_device structure + * @major: major number of a reference version + * @minor: minor number of a reference version + * @subminor: sub-minor number of a reference version + * + * Return 1 if the FW version is greater than the reference version, -1 if it + * is smaller and 0 if the versions are identical. + */ +int hl_fw_version_cmp(struct hl_device *hdev, u32 major, u32 minor, u32 subminor) +{ + if (hdev->fw_sw_major_ver != major) + return (hdev->fw_sw_major_ver > major) ? 1 : -1; + + if (hdev->fw_sw_minor_ver != minor) + return (hdev->fw_sw_minor_ver > minor) ? 1 : -1; + + if (hdev->fw_sw_sub_minor_ver != subminor) + return (hdev->fw_sw_sub_minor_ver > subminor) ?
1 : -1; + + return 0; +} + static char *extract_fw_ver_from_str(const char *fw_str) { char *str, *fw_ver, *whitespace; diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 7397ce86b7f03a..634a470efe2705 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -3600,25 +3600,6 @@ struct hl_ioctl_desc { hl_ioctl_t *func; }; -static inline bool hl_is_fw_sw_ver_below(struct hl_device *hdev, u32 fw_sw_major, u32 fw_sw_minor) -{ - if (hdev->fw_sw_major_ver < fw_sw_major) - return true; - if (hdev->fw_sw_major_ver > fw_sw_major) - return false; - if (hdev->fw_sw_minor_ver < fw_sw_minor) - return true; - return false; -} - -static inline bool hl_is_fw_sw_ver_equal_or_greater(struct hl_device *hdev, u32 fw_sw_major, - u32 fw_sw_minor) -{ - return (hdev->fw_sw_major_ver > fw_sw_major || - (hdev->fw_sw_major_ver == fw_sw_major && - hdev->fw_sw_minor_ver >= fw_sw_minor)); -} - /* * Kernel module functions that can be accessed by entire module */ @@ -3923,6 +3904,7 @@ void hl_mmu_dr_flush(struct hl_ctx *ctx); int hl_mmu_dr_init(struct hl_device *hdev); void hl_mmu_dr_fini(struct hl_device *hdev); +int hl_fw_version_cmp(struct hl_device *hdev, u32 major, u32 minor, u32 subminor); int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name, void __iomem *dst, u32 src_offset, u32 size); int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode, u64 value); diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 1f061209ae2157..4a0917aa4dd741 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -2601,6 +2601,8 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) prop->hbw_flush_reg = mmPCIE_WRAP_SPECIAL_GLBL_SPARE_0; + prop->supports_advanced_cpucp_rc = true; + return 0; free_qprops: @@ -3308,8 +3310,6 @@ static int gaudi2_late_init(struct hl_device *hdev) struct gaudi2_device *gaudi2 = hdev->asic_specific; int rc; - hdev->asic_prop.supports_advanced_cpucp_rc = true; - rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_ENABLE_PCI_ACCESS, gaudi2->virt_msix_db_dma_addr); if (rc) { @@ -3783,7 +3783,7 @@ static int gaudi2_sw_init(struct hl_device *hdev) prop->supports_compute_reset = true; /* Event queue sanity check added in FW version 1.11 */ - if (hl_is_fw_sw_ver_below(hdev, 1, 11)) + if (hl_fw_version_cmp(hdev, 1, 11, 0) < 0) hdev->event_queue.check_eqe_index = false; else hdev->event_queue.check_eqe_index = true; @@ -6314,26 +6314,6 @@ static void gaudi2_execute_hard_reset(struct hl_device *hdev) WREG32(mmPSOC_RESET_CONF_SW_ALL_RST, 1); } -static int gaudi2_get_soft_rst_done_indication(struct hl_device *hdev, u32 poll_timeout_us) -{ - int i, rc = 0; - u32 reg_val; - - for (i = 0 ; i < GAUDI2_RESET_POLL_CNT ; i++) - rc = hl_poll_timeout( - hdev, - mmCPU_RST_STATUS_TO_HOST, - reg_val, - reg_val == CPU_RST_STATUS_SOFT_RST_DONE, - 1000, - poll_timeout_us); - - if (rc) - dev_err(hdev->dev, "Timeout while waiting for FW to complete soft reset (0x%x)\n", - reg_val); - return rc; -} - /** * gaudi2_execute_soft_reset - execute soft reset by driver/FW * @@ -6346,23 +6326,8 @@ static int gaudi2_get_soft_rst_done_indication(struct hl_device *hdev, u32 poll_ static int gaudi2_execute_soft_reset(struct hl_device *hdev, bool driver_performs_reset, u32 poll_timeout_us) { - int rc; - - if (!driver_performs_reset) { - if (hl_is_fw_sw_ver_below(hdev, 1, 10)) { - /* set SP to indicate 
reset request sent to FW */ - WREG32(mmCPU_RST_STATUS_TO_HOST, CPU_RST_STATUS_NA); - - WREG32(mmGIC_HOST_SOFT_RST_IRQ_POLL_REG, - gaudi2_irq_map_table[GAUDI2_EVENT_CPU_SOFT_RESET].cpu_id); - - /* wait for f/w response */ - rc = gaudi2_get_soft_rst_done_indication(hdev, poll_timeout_us); - } else { - rc = hl_fw_send_soft_reset(hdev); - } - return rc; - } + if (!driver_performs_reset) + return hl_fw_send_soft_reset(hdev); /* Block access to engines, QMANs and SM during reset, these * RRs will be reconfigured after soft reset. @@ -7914,7 +7879,7 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type, bool has_block_id = false; u16 block_id; - if (!hl_is_fw_sw_ver_below(hdev, 1, 12)) + if (hl_fw_version_cmp(hdev, 1, 12, 0) >= 0) has_block_id = true; ecc_address = le64_to_cpu(ecc_data->ecc_address); @@ -8165,13 +8130,7 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev, } hbw_rtr_id = gaudi2_tpc_initiator_hbw_rtr_id[module_idx]; - - if (hl_is_fw_sw_ver_below(hdev, 1, 9) && - !hdev->asic_prop.fw_security_enabled && - ((module_idx == 0) || (module_idx == 1))) - lbw_rtr_id = DCORE0_RTR0; - else - lbw_rtr_id = gaudi2_tpc_initiator_lbw_rtr_id[module_idx]; + lbw_rtr_id = gaudi2_tpc_initiator_lbw_rtr_id[module_idx]; break; case RAZWI_MME: sprintf(initiator_name, "MME_%u", module_idx); @@ -10080,7 +10039,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent error_count = gaudi2_handle_pcie_drain(hdev, &eq_entry->pcie_drain_ind_data); reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; - if (hl_is_fw_sw_ver_equal_or_greater(hdev, 1, 13)) + if (hl_fw_version_cmp(hdev, 1, 13, 0) >= 0) is_critical = true; break; From 80f249252cc494f75470e48125d98b7fc9a2e92f Mon Sep 17 00:00:00 2001 From: Farah Kassabri Date: Mon, 15 Jan 2024 12:49:24 +0200 Subject: [PATCH 0861/1406] accel/habanalabs: remove hop size from asic properties The hop size related properties are MMU properties and not asic properties, as the PMMU and HMMU could have different sizes. Signed-off-by: Farah Kassabri Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/habanalabs.h | 4 ---- drivers/accel/habanalabs/common/mmu/mmu.c | 22 ++++++++--------- .../accel/habanalabs/common/mmu/mmu_v2_hr.c | 24 +++++++++---------- drivers/accel/habanalabs/gaudi/gaudi.c | 8 +++---- drivers/accel/habanalabs/gaudi2/gaudi2.c | 12 ++++------ drivers/accel/habanalabs/goya/goya.c | 12 ++++------ 6 files changed, 36 insertions(+), 46 deletions(-) diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 634a470efe2705..c85849aefba6e2 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -594,8 +594,6 @@ struct hl_hints_range { * we display to the user * @mmu_pgt_size: MMU page tables total size. * @mmu_pte_size: PTE size in MMU page tables. - * @mmu_hop_table_size: MMU hop table size. - * @mmu_hop0_tables_total_size: total size of MMU hop0 tables. * @dram_page_size: The DRAM physical page size. * @cfg_size: configuration space size on SRAM. * @sram_size: total size of SRAM.
@@ -747,8 +745,6 @@ struct asic_fixed_properties { u32 clk_pll_index; u32 mmu_pgt_size; u32 mmu_pte_size; - u32 mmu_hop_table_size; - u32 mmu_hop0_tables_total_size; u32 dram_page_size; u32 cfg_size; u32 sram_size; diff --git a/drivers/accel/habanalabs/common/mmu/mmu.c b/drivers/accel/habanalabs/common/mmu/mmu.c index fa7919dba783c4..d3eaab9084572b 100644 --- a/drivers/accel/habanalabs/common/mmu/mmu.c +++ b/drivers/accel/habanalabs/common/mmu/mmu.c @@ -1236,7 +1236,7 @@ void hl_mmu_dr_free_pgt_node(struct hl_ctx *ctx, struct pgt_info *pgt_info) struct hl_device *hdev = ctx->hdev; gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, pgt_info->phys_addr, - hdev->asic_prop.mmu_hop_table_size); + hdev->asic_prop.dmmu.hop_table_size); hash_del(&pgt_info->node); kfree((u64 *) (uintptr_t) pgt_info->shadow_addr); kfree(pgt_info); @@ -1245,18 +1245,18 @@ void hl_mmu_dr_free_pgt_node(struct hl_ctx *ctx, struct pgt_info *pgt_info) u64 hl_mmu_dr_get_phys_hop0_addr(struct hl_ctx *ctx) { return ctx->hdev->asic_prop.mmu_pgt_addr + - (ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size); + (ctx->asid * ctx->hdev->asic_prop.dmmu.hop_table_size); } u64 hl_mmu_dr_get_hop0_addr(struct hl_ctx *ctx) { return (u64) (uintptr_t) ctx->hdev->mmu_priv.dr.mmu_shadow_hop0 + - (ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size); + (ctx->asid * ctx->hdev->asic_prop.dmmu.hop_table_size); } u64 hl_mmu_dr_get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr) { - u64 page_mask = ctx->hdev->asic_prop.mmu_hop_table_size - 1; + u64 page_mask = ctx->hdev->asic_prop.dmmu.hop_table_size - 1; u64 shadow_hop_addr = shadow_addr & (~page_mask); u64 pte_offset = shadow_addr & page_mask; u64 phys_hop_addr; @@ -1326,13 +1326,13 @@ u64 hl_mmu_dr_alloc_hop(struct hl_ctx *ctx) return ULLONG_MAX; phys_addr = (u64) gen_pool_alloc(hdev->mmu_priv.dr.mmu_pgt_pool, - prop->mmu_hop_table_size); + prop->dmmu.hop_table_size); if (!phys_addr) { dev_err(hdev->dev, "failed to allocate page\n"); goto pool_add_err; } - shadow_addr = (u64) (uintptr_t) kzalloc(prop->mmu_hop_table_size, + shadow_addr = (u64) (uintptr_t) kzalloc(prop->dmmu.hop_table_size, GFP_KERNEL); if (!shadow_addr) goto shadow_err; @@ -1347,7 +1347,7 @@ u64 hl_mmu_dr_alloc_hop(struct hl_ctx *ctx) shadow_err: gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, - phys_addr, prop->mmu_hop_table_size); + phys_addr, prop->dmmu.hop_table_size); pool_add_err: kfree(pgt_info); @@ -1379,7 +1379,7 @@ int hl_mmu_dr_init(struct hl_device *hdev) int rc; hdev->mmu_priv.dr.mmu_pgt_pool = - gen_pool_create(__ffs(prop->mmu_hop_table_size), -1); + gen_pool_create(__ffs(prop->dmmu.hop_table_size), -1); if (!hdev->mmu_priv.dr.mmu_pgt_pool) { dev_err(hdev->dev, "Failed to create page gen pool\n"); @@ -1387,8 +1387,8 @@ int hl_mmu_dr_init(struct hl_device *hdev) } rc = gen_pool_add(hdev->mmu_priv.dr.mmu_pgt_pool, prop->mmu_pgt_addr + - prop->mmu_hop0_tables_total_size, - prop->dmmu.pgt_size - prop->mmu_hop0_tables_total_size, + prop->dmmu.hop0_tables_total_size, + prop->dmmu.pgt_size - prop->dmmu.hop0_tables_total_size, -1); if (rc) { dev_err(hdev->dev, "Failed to add memory to page gen pool\n"); @@ -1396,7 +1396,7 @@ int hl_mmu_dr_init(struct hl_device *hdev) } hdev->mmu_priv.dr.mmu_shadow_hop0 = kvcalloc(prop->max_asid, - prop->mmu_hop_table_size, GFP_KERNEL); + prop->dmmu.hop_table_size, GFP_KERNEL); if (ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) { rc = -ENOMEM; goto err_pool_add; diff --git a/drivers/accel/habanalabs/common/mmu/mmu_v2_hr.c b/drivers/accel/habanalabs/common/mmu/mmu_v2_hr.c index 
afe7ef964f82e0..31507b2a431be3 100644 --- a/drivers/accel/habanalabs/common/mmu/mmu_v2_hr.c +++ b/drivers/accel/habanalabs/common/mmu/mmu_v2_hr.c @@ -47,7 +47,7 @@ static inline int hl_mmu_v2_hr_init(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; - return hl_mmu_hr_init(hdev, &hdev->mmu_priv.hr, prop->mmu_hop_table_size, + return hl_mmu_hr_init(hdev, &hdev->mmu_priv.hr, prop->pmmu.hop_table_size, prop->mmu_pgt_size); } @@ -65,7 +65,7 @@ static inline void hl_mmu_v2_hr_fini(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; - hl_mmu_hr_fini(hdev, &hdev->mmu_priv.hr, prop->mmu_hop_table_size); + hl_mmu_hr_fini(hdev, &hdev->mmu_priv.hr, prop->pmmu.hop_table_size); } /** @@ -108,7 +108,7 @@ static void hl_mmu_v2_hr_ctx_fini(struct hl_ctx *ctx) "pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n", pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes); hl_mmu_hr_free_hop_remove_pgt(pgt_info, &ctx->hdev->mmu_priv.hr, - ctx->hdev->asic_prop.mmu_hop_table_size); + ctx->hdev->asic_prop.pmmu.hop_table_size); } } @@ -150,7 +150,7 @@ static int _hl_mmu_v2_hr_unmap(struct hl_ctx *ctx, curr_pte = *(u64 *) (uintptr_t) hl_mmu_hr_pte_phys_to_virt(ctx, hops_pgt_info[i], hop_pte_phys_addr[i], - ctx->hdev->asic_prop.mmu_hop_table_size); + ctx->hdev->asic_prop.pmmu.hop_table_size); if ((i < hop_last) && (curr_pte & mmu_prop->last_mask)) { hop_last = i; @@ -169,14 +169,14 @@ static int _hl_mmu_v2_hr_unmap(struct hl_ctx *ctx, for (i = hop_last ; i > 0 ; i--) { hl_mmu_hr_clear_pte(ctx, hops_pgt_info[i], hop_pte_phys_addr[i], - ctx->hdev->asic_prop.mmu_hop_table_size); + ctx->hdev->asic_prop.pmmu.hop_table_size); if (hl_mmu_hr_put_pte(ctx, hops_pgt_info[i], &ctx->hdev->mmu_priv.hr, - ctx->hdev->asic_prop.mmu_hop_table_size)) + ctx->hdev->asic_prop.pmmu.hop_table_size)) goto mapped; } hl_mmu_hr_clear_pte(ctx, hops_pgt_info[0], hop_pte_phys_addr[0], - ctx->hdev->asic_prop.mmu_hop_table_size); + ctx->hdev->asic_prop.pmmu.hop_table_size); mapped: return 0; @@ -255,7 +255,7 @@ static int _hl_mmu_v2_hr_map(struct hl_ctx *ctx, scrambled_virt_addr); curr_pte = *(u64 *) (uintptr_t) hl_mmu_hr_pte_phys_to_virt(ctx, hops_pgt_info[i], hop_pte_phys_addr[i], - ctx->hdev->asic_prop.mmu_hop_table_size); + ctx->hdev->asic_prop.pmmu.hop_table_size); } if (curr_pte & PAGE_PRESENT_MASK) { @@ -268,7 +268,7 @@ static int _hl_mmu_v2_hr_map(struct hl_ctx *ctx, *(u64 *) (uintptr_t) hl_mmu_hr_pte_phys_to_virt(ctx, hops_pgt_info[i], hop_pte_phys_addr[i], - ctx->hdev->asic_prop.mmu_hop_table_size), + ctx->hdev->asic_prop.pmmu.hop_table_size), hop_pte_phys_addr[i]); rc = -EINVAL; goto err; @@ -279,7 +279,7 @@ static int _hl_mmu_v2_hr_map(struct hl_ctx *ctx, /* Write the PTEs */ hl_mmu_hr_write_pte(ctx, hops_pgt_info[hop_last], hop_pte_phys_addr[hop_last], curr_pte, - ctx->hdev->asic_prop.mmu_hop_table_size); + ctx->hdev->asic_prop.pmmu.hop_table_size); /* for each new hop, add its address to the table of previous-hop */ for (i = 1 ; i <= hop_last ; i++) { @@ -287,7 +287,7 @@ static int _hl_mmu_v2_hr_map(struct hl_ctx *ctx, curr_pte = (hops_pgt_info[i]->phys_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; hl_mmu_hr_write_pte(ctx, hops_pgt_info[i - 1], hop_pte_phys_addr[i - 1], - curr_pte, ctx->hdev->asic_prop.mmu_hop_table_size); + curr_pte, ctx->hdev->asic_prop.pmmu.hop_table_size); if (i - 1) hl_mmu_hr_get_pte(ctx, &ctx->hdev->mmu_func[MMU_HR_PGT].hr_funcs, hops_pgt_info[i - 1]->phys_addr); @@ -303,7 +303,7 @@ static int _hl_mmu_v2_hr_map(struct hl_ctx *ctx, for 
(i = 1 ; i <= hop_last ; i++) if (hop_new[i] && hops_pgt_info[i]) hl_mmu_hr_free_hop_remove_pgt(hops_pgt_info[i], &ctx->hdev->mmu_priv.hr, - ctx->hdev->asic_prop.mmu_hop_table_size); + ctx->hdev->asic_prop.pmmu.hop_table_size); return rc; } diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c index dde3839fe0e070..f2b04ffb0ecb28 100644 --- a/drivers/accel/habanalabs/gaudi/gaudi.c +++ b/drivers/accel/habanalabs/gaudi/gaudi.c @@ -614,8 +614,6 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev) else prop->mmu_pgt_size = MMU_PAGE_TABLES_SIZE; prop->mmu_pte_size = HL_PTE_SIZE; - prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE; - prop->mmu_hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE; prop->dram_page_size = PAGE_SIZE_2MB; prop->device_mem_alloc_default_page_size = prop->dram_page_size; prop->dram_supports_virtual_memory = false; @@ -637,8 +635,8 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev) prop->pmmu.num_hops = MMU_ARCH_5_HOPS; prop->pmmu.last_mask = LAST_MASK; /* TODO: will be duplicated until implementing per-MMU props */ - prop->pmmu.hop_table_size = prop->mmu_hop_table_size; - prop->pmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size; + prop->pmmu.hop_table_size = HOP_TABLE_SIZE_512_PTE; + prop->pmmu.hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE; /* PMMU and HPMMU are the same except of page size */ memcpy(&prop->pmmu_huge, &prop->pmmu, sizeof(prop->pmmu)); @@ -3653,7 +3651,7 @@ static int gaudi_mmu_init(struct hl_device *hdev) for (i = 0 ; i < prop->max_asid ; i++) { hop0_addr = prop->mmu_pgt_addr + - (i * prop->mmu_hop_table_size); + (i * prop->dmmu.hop_table_size); rc = gaudi_mmu_update_asid_hop0_addr(hdev, i, hop0_addr); if (rc) { diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 4a0917aa4dd741..26975179763ac0 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -2467,8 +2467,6 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) prop->dmmu.pgt_size = HMMU_PAGE_TABLES_SIZE; prop->mmu_pte_size = HL_PTE_SIZE; - prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE; - prop->mmu_hop0_tables_total_size = HOP_TABLE_SIZE_512_PTE * prop->max_asid; prop->dmmu.hop_shifts[MMU_HOP0] = DHOP0_SHIFT; prop->dmmu.hop_shifts[MMU_HOP1] = DHOP1_SHIFT; @@ -2482,8 +2480,8 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) prop->dmmu.num_hops = MMU_ARCH_4_HOPS; prop->dmmu.last_mask = LAST_MASK; prop->dmmu.host_resident = 0; - prop->dmmu.hop_table_size = prop->mmu_hop_table_size; - prop->dmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size; + prop->dmmu.hop_table_size = HOP_TABLE_SIZE_512_PTE; + prop->dmmu.hop0_tables_total_size = HOP_TABLE_SIZE_512_PTE * prop->max_asid; /* As we need to set the pgt address in dram for HMMU init so we cannot * wait to the fw cpucp info to set the dram props as mmu init comes before @@ -2500,8 +2498,8 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) prop->pmmu.host_resident = 1; prop->pmmu.num_hops = MMU_ARCH_6_HOPS; prop->pmmu.last_mask = LAST_MASK; - prop->pmmu.hop_table_size = prop->mmu_hop_table_size; - prop->pmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size; + prop->pmmu.hop_table_size = HOP_TABLE_SIZE_512_PTE; + prop->pmmu.hop0_tables_total_size = HOP_TABLE_SIZE_512_PTE * prop->max_asid; prop->hints_host_reserved_va_range.start_addr = RESERVED_VA_FOR_VIRTUAL_MSIX_DOORBELL_START; 
prop->hints_host_reserved_va_range.end_addr = RESERVED_VA_RANGE_FOR_ARC_ON_HOST_END; @@ -5934,7 +5932,7 @@ static int gaudi2_mmu_update_hop0_addr(struct hl_device *hdev, u32 stlb_base, if (host_resident_pgt) hop0_addr = hdev->mmu_priv.hr.mmu_asid_hop0[asid].phys_addr; else - hop0_addr = prop->mmu_pgt_addr + (asid * prop->mmu_hop_table_size); + hop0_addr = prop->mmu_pgt_addr + (asid * prop->dmmu.hop_table_size); rc = gaudi2_mmu_update_asid_hop0_addr(hdev, stlb_base, asid, hop0_addr); if (rc) { diff --git a/drivers/accel/habanalabs/goya/goya.c b/drivers/accel/habanalabs/goya/goya.c index 1322cb330c5774..5a359c3bdc782e 100644 --- a/drivers/accel/habanalabs/goya/goya.c +++ b/drivers/accel/habanalabs/goya/goya.c @@ -413,8 +413,6 @@ int goya_set_fixed_properties(struct hl_device *hdev) else prop->mmu_pgt_size = MMU_PAGE_TABLES_SIZE; prop->mmu_pte_size = HL_PTE_SIZE; - prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE; - prop->mmu_hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE; prop->dram_page_size = PAGE_SIZE_2MB; prop->device_mem_alloc_default_page_size = prop->dram_page_size; prop->dram_supports_virtual_memory = true; @@ -435,8 +433,8 @@ int goya_set_fixed_properties(struct hl_device *hdev) prop->dmmu.num_hops = MMU_ARCH_5_HOPS; prop->dmmu.last_mask = LAST_MASK; /* TODO: will be duplicated until implementing per-MMU props */ - prop->dmmu.hop_table_size = prop->mmu_hop_table_size; - prop->dmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size; + prop->dmmu.hop_table_size = HOP_TABLE_SIZE_512_PTE; + prop->dmmu.hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE; /* shifts and masks are the same in PMMU and DMMU */ memcpy(&prop->pmmu, &prop->dmmu, sizeof(prop->dmmu)); @@ -446,8 +444,8 @@ int goya_set_fixed_properties(struct hl_device *hdev) prop->pmmu.num_hops = MMU_ARCH_5_HOPS; prop->pmmu.last_mask = LAST_MASK; /* TODO: will be duplicated until implementing per-MMU props */ - prop->pmmu.hop_table_size = prop->mmu_hop_table_size; - prop->pmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size; + prop->pmmu.hop_table_size = HOP_TABLE_SIZE_512_PTE; + prop->pmmu.hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE; /* PMMU and HPMMU are the same except of page size */ memcpy(&prop->pmmu_huge, &prop->pmmu, sizeof(prop->pmmu)); @@ -2678,7 +2676,7 @@ int goya_mmu_init(struct hl_device *hdev) for (i = 0 ; i < prop->max_asid ; i++) { hop0_addr = prop->mmu_pgt_addr + - (i * prop->mmu_hop_table_size); + (i * prop->dmmu.hop_table_size); rc = goya_mmu_update_asid_hop0_addr(hdev, i, hop0_addr); if (rc) { From 386bd16b666a9ce7ccfe4005390c3052f4dd9743 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Thu, 18 Jan 2024 19:18:43 +0200 Subject: [PATCH 0862/1406] accel/habanalabs: modify print for skip loading linux FW to debug log Skipping the load of a Linux FW image into the device is done for test purposes only with the currently supported ASICs. Moreover, for future supported ASICs it is possible that there won't be a need to load such an image at all. The print in such a case is therefore rarely needed, so replace the dev_info() used there with dev_dbg().
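As an illustration, a minimal sketch of the resulting code path (abridged from the diff below):

	/* Test-only path: keep the message, but demote it to debug level. */
	if (!(hdev->fw_components & FW_TYPE_LINUX)) {
		dev_dbg(hdev->dev, "Skip loading Linux F/W\n");
		return 0;
	}

With CONFIG_DYNAMIC_DEBUG enabled, the demoted print can still be switched back on at runtime for test setups by writing "file firmware_if.c +p" to /sys/kernel/debug/dynamic_debug/control.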
Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/firmware_if.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index e7dcf2fe6552aa..364d292c76fa38 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -2820,7 +2820,7 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, hdev->asic_funcs->init_cpu_scrambler_dram(hdev); if (!(hdev->fw_components & FW_TYPE_LINUX)) { - dev_info(hdev->dev, "Skip loading Linux F/W\n"); + dev_dbg(hdev->dev, "Skip loading Linux F/W\n"); return 0; } From bb2ad94e4c1f6ccf76716a300cde490d3b9b9b54 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Thu, 18 Jan 2024 14:29:02 +0200 Subject: [PATCH 0863/1406] accel/habanalabs/gaudi2: check extended errors according to PCIe addr_dec interrupt info The FW interrupt info for a PCIe addr_dec event is set correctly, so check for either global errors or razwi according to the indications there. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 26975179763ac0..671241735a6e98 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -8942,9 +8942,6 @@ static int gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u16 event_typ u32 error_count = 0; int i; - gaudi2_print_event(hdev, event_type, true, - "intr_cause_data: %#llx", intr_cause_data); - for (i = 0 ; i < GAUDI2_NUM_OF_PCIE_ADDR_DEC_ERR_CAUSE ; i++) { if (!(intr_cause_data & BIT_ULL(i))) continue; @@ -8953,15 +8950,16 @@ static int gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u16 event_typ "err cause: %s", gaudi2_pcie_addr_dec_error_cause[i]); error_count++; - /* - * Always check for LBW and HBW additional info as the indication itself is - * sometimes missing - */ + switch (intr_cause_data & BIT_ULL(i)) { + case PCIE_WRAP_PCIE_IC_SEI_INTR_IND_AXI_LBW_ERR_INTR_MASK: + hl_check_for_glbl_errors(hdev); + break; + case PCIE_WRAP_PCIE_IC_SEI_INTR_IND_BAD_ACCESS_INTR_MASK: + gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(hdev, event_mask); + break; + } } - hl_check_for_glbl_errors(hdev); - gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(hdev, event_mask); - return error_count; } From 56dda4aae9563cd513a8b7e9f460e149c7c83293 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Thu, 25 Jan 2024 22:59:02 +0200 Subject: [PATCH 0864/1406] accel/habanalabs: fix glbl error cause handling The glbl error cause handling has a wrong assumption that all error bits are consecutive. Fix the handling to check all relevant error bits per ASIC. 
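In sketch form (simplified from the diff below; the faulting-address argument of the print is omitted here for brevity), the loop now runs up to a per-ASIC maximum cause bit instead of a fixed table size, and the cause table names the unpopulated bits explicitly:

	/* Gaps between valid cause bits are listed as "N/A" in the table. */
	for (i = 0 ; i <= prop->glbl_err_max_cause_num ; i++) {
		if (cause_val & BIT(i))
			dev_err_ratelimited(hdev->dev, "%s\n",
					    hl_glbl_error_cause[i]);
	}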
Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/habanalabs.h | 4 +-- drivers/accel/habanalabs/common/security.c | 33 +++++++++++++++----- drivers/accel/habanalabs/common/security.h | 3 +- drivers/accel/habanalabs/gaudi2/gaudi2.c | 10 +++--- drivers/accel/habanalabs/gaudi2/gaudi2P.h | 3 +- 5 files changed, 35 insertions(+), 18 deletions(-) diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index c85849aefba6e2..40107a4eba93d3 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -647,7 +647,7 @@ struct hl_hints_range { * @num_engine_cores: number of engine cpu cores. * @max_num_of_engines: maximum number of all engines in the ASIC. * @num_of_special_blocks: special_blocks array size. - * @glbl_err_cause_num: global err cause number. + * @glbl_err_max_cause_num: global err max cause number. * @hbw_flush_reg: register to read to generate HBW flush. value of 0 means HBW flush is * not supported. * @reserved_fw_mem_size: size in MB of dram memory reserved for FW. @@ -779,7 +779,7 @@ struct asic_fixed_properties { u32 num_engine_cores; u32 max_num_of_engines; u32 num_of_special_blocks; - u32 glbl_err_cause_num; + u32 glbl_err_max_cause_num; u32 hbw_flush_reg; u32 reserved_fw_mem_size; u16 collective_first_sob; diff --git a/drivers/accel/habanalabs/common/security.c b/drivers/accel/habanalabs/common/security.c index fe913965dbad7b..5402a3cd0491e2 100644 --- a/drivers/accel/habanalabs/common/security.c +++ b/drivers/accel/habanalabs/common/security.c @@ -7,15 +7,31 @@ #include "habanalabs.h" -static const char * const hl_glbl_error_cause[HL_MAX_NUM_OF_GLBL_ERR_CAUSE] = { +static const char * const hl_glbl_error_cause[] = { "Error due to un-priv read", "Error due to un-secure read", "Error due to read from unmapped reg", "Error due to un-priv write", "Error due to un-secure write", "Error due to write to unmapped reg", + "N/A", + "N/A", + "N/A", + "N/A", + "N/A", + "N/A", + "N/A", + "N/A", + "N/A", + "N/A", "External I/F write sec violation", "External I/F write to un-mapped reg", + "N/A", + "N/A", + "N/A", + "N/A", + "N/A", + "N/A", "Read to write only", "Write to read only" }; @@ -671,10 +687,11 @@ static bool hl_check_block_range_exclusion(struct hl_device *hdev, static int hl_read_glbl_errors(struct hl_device *hdev, u32 blk_idx, u32 major, u32 minor, u32 sub_minor, void *data) { - struct hl_special_block_info *special_blocks = hdev->asic_prop.special_blocks; + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct hl_special_block_info *special_blocks = prop->special_blocks; struct hl_special_block_info *current_block = &special_blocks[blk_idx]; u32 glbl_err_addr, glbl_err_cause, addr_val, cause_val, block_base, - base = current_block->base_addr - lower_32_bits(hdev->asic_prop.cfg_base_address); + base = current_block->base_addr - lower_32_bits(prop->cfg_base_address); int i; block_base = base + major * current_block->major_offset + @@ -689,13 +706,13 @@ static int hl_read_glbl_errors(struct hl_device *hdev, glbl_err_addr = block_base + HL_GLBL_ERR_ADDR_OFFSET; addr_val = RREG32(glbl_err_addr); - for (i = 0 ; i < hdev->asic_prop.glbl_err_cause_num ; i++) { + for (i = 0 ; i <= prop->glbl_err_max_cause_num ; i++) { if (cause_val & BIT(i)) dev_err_ratelimited(hdev->dev, - "%s, addr %#llx\n", - hl_glbl_error_cause[i], - hdev->asic_prop.cfg_base_address + block_base + - FIELD_GET(HL_GLBL_ERR_ADDRESS_MASK, 
addr_val)); + "%s, addr %#llx\n", + hl_glbl_error_cause[i], + prop->cfg_base_address + block_base + + FIELD_GET(HL_GLBL_ERR_ADDRESS_MASK, addr_val)); } WREG32(glbl_err_cause, cause_val); diff --git a/drivers/accel/habanalabs/common/security.h b/drivers/accel/habanalabs/common/security.h index d7a3b3e82ea4b0..476f70687c0997 100644 --- a/drivers/accel/habanalabs/common/security.h +++ b/drivers/accel/habanalabs/common/security.h @@ -13,8 +13,7 @@ struct hl_device; /* special blocks */ -#define HL_MAX_NUM_OF_GLBL_ERR_CAUSE 10 -#define HL_GLBL_ERR_ADDRESS_MASK GENMASK(11, 0) +#define HL_GLBL_ERR_ADDRESS_MASK GENMASK(11, 0) /* GLBL_ERR_ADDR register offset from the start of the block */ #define HL_GLBL_ERR_ADDR_OFFSET 0xF44 /* GLBL_ERR_CAUSE register offset from the start of the block */ diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 671241735a6e98..189d8da6a624c6 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -158,11 +158,13 @@ #define RAZWI_INITIATOR_ID_X_Y(xl, yl, xh) \ (RAZWI_INITIATOR_ID_X_Y_LOW(xl, yl) | RAZWI_INITIATOR_ID_X_HIGH(xh)) -#define PSOC_RAZWI_ENG_STR_SIZE 128 -#define PSOC_RAZWI_MAX_ENG_PER_RTR 5 +#define PSOC_RAZWI_ENG_STR_SIZE 128 +#define PSOC_RAZWI_MAX_ENG_PER_RTR 5 /* HW scrambles only bits 0-25 */ -#define HW_UNSCRAMBLED_BITS_MASK GENMASK_ULL(63, 26) +#define HW_UNSCRAMBLED_BITS_MASK GENMASK_ULL(63, 26) + +#define GAUDI2_GLBL_ERR_MAX_CAUSE_NUM 17 struct gaudi2_razwi_info { u32 axuser_xy; @@ -3587,7 +3589,7 @@ static int gaudi2_special_blocks_config(struct hl_device *hdev) int i, rc; /* Configure Special blocks */ - prop->glbl_err_cause_num = GAUDI2_NUM_OF_GLBL_ERR_CAUSE; + prop->glbl_err_max_cause_num = GAUDI2_GLBL_ERR_MAX_CAUSE_NUM; prop->num_of_special_blocks = ARRAY_SIZE(gaudi2_special_blocks); prop->special_blocks = kmalloc_array(prop->num_of_special_blocks, sizeof(*prop->special_blocks), GFP_KERNEL); diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2P.h b/drivers/accel/habanalabs/gaudi2/gaudi2P.h index bc508c9cee5c50..eee41387b269c3 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2P.h +++ b/drivers/accel/habanalabs/gaudi2/gaudi2P.h @@ -237,9 +237,8 @@ #define GAUDI2_SOB_INCREMENT_BY_ONE (FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_VAL_MASK, 1) | \ FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_INC_MASK, 1)) -#define GAUDI2_NUM_TESTED_QS (GAUDI2_QUEUE_ID_CPU_PQ - GAUDI2_QUEUE_ID_PDMA_0_0) +#define GAUDI2_NUM_TESTED_QS (GAUDI2_QUEUE_ID_CPU_PQ - GAUDI2_QUEUE_ID_PDMA_0_0) -#define GAUDI2_NUM_OF_GLBL_ERR_CAUSE 8 enum gaudi2_reserved_sob_id { GAUDI2_RESERVED_SOB_CS_COMPLETION_FIRST, From 212a08ec2dcd959ab2ee0ece988a96360792a1e4 Mon Sep 17 00:00:00 2001 From: Avri Kehat Date: Tue, 16 Jan 2024 17:54:36 +0200 Subject: [PATCH 0865/1406] accel/habanalabs: fix debugfs files permissions debugfs files are created with permissions that don't align with the access requirements. 
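The mode argument of debugfs_create_file() takes ordinary octal permission bits, so the intended access policy reads directly off the value; two entries from the diff below as examples:

	/* 0644: owner (root) may write, everyone may read - a control knob. */
	debugfs_create_file("device", 0644, root, dev_entry, &hl_device_fops);

	/* 0400: owner-only, read-only - a dump that must not be writable. */
	debugfs_create_file("dump_razwi_events", 0400, root, dev_entry,
			    &hl_razwi_check_fops);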
Signed-off-by: Avri Kehat Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/debugfs.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/accel/habanalabs/common/debugfs.c b/drivers/accel/habanalabs/common/debugfs.c index 01f071d52570f8..ab0fe74b49d011 100644 --- a/drivers/accel/habanalabs/common/debugfs.c +++ b/drivers/accel/habanalabs/common/debugfs.c @@ -1643,19 +1643,19 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent &hl_data64b_fops); debugfs_create_file("set_power_state", - 0200, + 0644, root, dev_entry, &hl_power_fops); debugfs_create_file("device", - 0200, + 0644, root, dev_entry, &hl_device_fops); debugfs_create_file("clk_gate", - 0200, + 0644, root, dev_entry, &hl_clk_gate_fops); @@ -1667,13 +1667,13 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent &hl_stop_on_err_fops); debugfs_create_file("dump_security_violations", - 0644, + 0400, root, dev_entry, &hl_security_violations_fops); debugfs_create_file("dump_razwi_events", - 0644, + 0400, root, dev_entry, &hl_razwi_check_fops); @@ -1706,7 +1706,7 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent &hdev->reset_info.skip_reset_on_timeout); debugfs_create_file("state_dump", - 0600, + 0644, root, dev_entry, &hl_state_dump_fops); @@ -1724,7 +1724,7 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) { debugfs_create_file(hl_debugfs_list[i].name, - 0444, + 0644, root, entry, &hl_debugfs_fops); From 7d9048d174dc33634cec67d98d097fd768ccf292 Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Mon, 19 Feb 2024 09:08:33 +0530 Subject: [PATCH 0866/1406] pwm: dwc: drop redundant error check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pcim_iomap_table() fails only if pcim_iomap_regions() fails. No need to check for failure if the latter is already successful. Suggested-by: Andy Shevchenko Signed-off-by: Raag Jadav Tested-by: Jarkko Nikula Link: https://lore.kernel.org/r/20240219033835.11369-3-raag.jadav@intel.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-dwc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/pwm/pwm-dwc.c b/drivers/pwm/pwm-dwc.c index c0e586688e57fe..7dbb72c80ef597 100644 --- a/drivers/pwm/pwm-dwc.c +++ b/drivers/pwm/pwm-dwc.c @@ -51,11 +51,8 @@ static int dwc_pwm_probe(struct pci_dev *pci, const struct pci_device_id *id) return ret; } + /* No need to check for failure, pcim_iomap_regions() does it for us. */ dwc->base = pcim_iomap_table(pci)[0]; - if (!dwc->base) { - dev_err(dev, "Base address missing\n"); - return -ENOMEM; - } ret = devm_pwmchip_add(dev, chip); if (ret) From ec333072f84f097ca937a45f6935c4dd01931b49 Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Mon, 19 Feb 2024 09:08:34 +0530 Subject: [PATCH 0867/1406] pwm: dwc: Add 16 channel support for Intel Elkhart Lake MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Intel Elkhart Lake PSE includes two instances of PWM as a single PCI function with 8 channels each. Add support for the remaining channels. 
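Each instance is a fixed-size register window inside BAR 0, so instance i lives at offset i * size; with .size = 0x1000, instance 0 starts at offset 0x0 and instance 1 at 0x1000. A sketch of the resulting probe loop (abridged from the diff below):

	info = (const struct dwc_pwm_info *)id->driver_data;

	for (i = 0; i < info->nr; i++) {
		/* One pwm_chip per instance, offset into the shared BAR. */
		ret = dwc_pwm_init_one(dev, pcim_iomap_table(pci)[0],
				       i * info->size);
		if (ret)
			return ret;
	}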
Signed-off-by: Raag Jadav Tested-by: Jarkko Nikula Tested-by: Lakshmi Sowjanya D Link: https://lore.kernel.org/r/20240219033835.11369-4-raag.jadav@intel.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-dwc.c | 38 +++++++++++++++++++++++++++++--------- drivers/pwm/pwm-dwc.h | 5 +++++ 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/drivers/pwm/pwm-dwc.c b/drivers/pwm/pwm-dwc.c index 7dbb72c80ef597..de95352081fa8c 100644 --- a/drivers/pwm/pwm-dwc.c +++ b/drivers/pwm/pwm-dwc.c @@ -25,17 +25,32 @@ #include "pwm-dwc.h" -static int dwc_pwm_probe(struct pci_dev *pci, const struct pci_device_id *id) +/* Elkhart Lake */ +static const struct dwc_pwm_info ehl_pwm_info = { + .nr = 2, + .size = 0x1000, +}; + +static int dwc_pwm_init_one(struct device *dev, void __iomem *base, unsigned int offset) { - struct device *dev = &pci->dev; struct pwm_chip *chip; struct dwc_pwm *dwc; - int ret; chip = dwc_pwm_alloc(dev); if (IS_ERR(chip)) return PTR_ERR(chip); + dwc = to_dwc_pwm(chip); + dwc->base = base + offset; + + return devm_pwmchip_add(dev, chip); +} + +static int dwc_pwm_probe(struct pci_dev *pci, const struct pci_device_id *id) +{ + const struct dwc_pwm_info *info; + struct device *dev = &pci->dev; + int i, ret; ret = pcim_enable_device(pci); if (ret) { @@ -51,12 +66,17 @@ static int dwc_pwm_probe(struct pci_dev *pci, const struct pci_device_id *id) return ret; } - /* No need to check for failure, pcim_iomap_regions() does it for us. */ - dwc->base = pcim_iomap_table(pci)[0]; + info = (const struct dwc_pwm_info *)id->driver_data; - ret = devm_pwmchip_add(dev, chip); - if (ret) - return ret; + for (i = 0; i < info->nr; i++) { + /* + * No need to check for pcim_iomap_table() failure, + * pcim_iomap_regions() already does it for us. + */ + ret = dwc_pwm_init_one(dev, pcim_iomap_table(pci)[0], i * info->size); + if (ret) + return ret; + } pm_runtime_put(dev); pm_runtime_allow(dev); @@ -108,7 +128,7 @@ static int dwc_pwm_resume(struct device *dev) static DEFINE_SIMPLE_DEV_PM_OPS(dwc_pwm_pm_ops, dwc_pwm_suspend, dwc_pwm_resume); static const struct pci_device_id dwc_pwm_id_table[] = { - { PCI_VDEVICE(INTEL, 0x4bb7) }, /* Elkhart Lake */ + { PCI_VDEVICE(INTEL, 0x4bb7), (kernel_ulong_t)&ehl_pwm_info }, { } /* Terminating Entry */ }; MODULE_DEVICE_TABLE(pci, dwc_pwm_id_table); diff --git a/drivers/pwm/pwm-dwc.h b/drivers/pwm/pwm-dwc.h index 5887371803fd86..a8b074841ae805 100644 --- a/drivers/pwm/pwm-dwc.h +++ b/drivers/pwm/pwm-dwc.h @@ -33,6 +33,11 @@ MODULE_IMPORT_NS(dwc_pwm); #define DWC_TIM_CTRL_INT_MASK BIT(2) #define DWC_TIM_CTRL_PWM BIT(3) +struct dwc_pwm_info { + unsigned int nr; + unsigned int size; +}; + struct dwc_pwm_ctx { u32 cnt; u32 cnt2; From 801de0882d8a95aa1b1fe67df1696e037d785656 Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Mon, 19 Feb 2024 09:08:35 +0530 Subject: [PATCH 0868/1406] pwm: dwc: simplify error handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify error handling in ->probe() function using dev_err_probe() helper and while at it, drop error codes from the message to prevent duplication. 
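The duplication goes away because dev_err_probe() appends the error code to the message by itself; it also returns the error (so the call collapses to one line) and logs only at debug level for -EPROBE_DEFER. A sketch of the resulting shape (abridged from the diff below):

	ret = pcim_enable_device(pci);
	if (ret)
		return dev_err_probe(dev, ret, "Failed to enable device\n");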
Signed-off-by: Raag Jadav Tested-by: Jarkko Nikula Link: https://lore.kernel.org/r/20240219033835.11369-5-raag.jadav@intel.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-dwc.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/pwm/pwm-dwc.c b/drivers/pwm/pwm-dwc.c index de95352081fa8c..676eaf8d7a53f7 100644 --- a/drivers/pwm/pwm-dwc.c +++ b/drivers/pwm/pwm-dwc.c @@ -53,18 +53,14 @@ static int dwc_pwm_probe(struct pci_dev *pci, const struct pci_device_id *id) int i, ret; ret = pcim_enable_device(pci); - if (ret) { - dev_err(dev, "Failed to enable device (%pe)\n", ERR_PTR(ret)); - return ret; - } + if (ret) + return dev_err_probe(dev, ret, "Failed to enable device\n"); pci_set_master(pci); ret = pcim_iomap_regions(pci, BIT(0), pci_name(pci)); - if (ret) { - dev_err(dev, "Failed to iomap PCI BAR (%pe)\n", ERR_PTR(ret)); - return ret; - } + if (ret) + return dev_err_probe(dev, ret, "Failed to iomap PCI BAR\n"); info = (const struct dwc_pwm_info *)id->driver_data; From fe1e6701c2215f8b196682b748b73c93d1eef832 Mon Sep 17 00:00:00 2001 From: Tal Risin Date: Wed, 31 Jan 2024 11:08:33 +0200 Subject: [PATCH 0869/1406] accel/habanalabs: initialize maybe-uninitialized variables Prevent static analysis warnings. Signed-off-by: Tal Risin Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/debugfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/accel/habanalabs/common/debugfs.c b/drivers/accel/habanalabs/common/debugfs.c index ab0fe74b49d011..b1c88d1837d97c 100644 --- a/drivers/accel/habanalabs/common/debugfs.c +++ b/drivers/accel/habanalabs/common/debugfs.c @@ -484,7 +484,7 @@ static ssize_t mmu_asid_va_write(struct file *file, const char __user *buf, struct hl_debugfs_entry *entry = s->private; struct hl_dbg_device_entry *dev_entry = entry->dev_entry; struct hl_device *hdev = dev_entry->hdev; - char kbuf[MMU_KBUF_SIZE]; + char kbuf[MMU_KBUF_SIZE] = {0}; char *c; ssize_t rc; @@ -546,7 +546,7 @@ static ssize_t mmu_ack_error_value_write(struct file *file, struct hl_debugfs_entry *entry = s->private; struct hl_dbg_device_entry *dev_entry = entry->dev_entry; struct hl_device *hdev = dev_entry->hdev; - char kbuf[MMU_KBUF_SIZE]; + char kbuf[MMU_KBUF_SIZE] = {0}; ssize_t rc; if (count > sizeof(kbuf) - 1) From 481df5cb8f024fc2222dbb586de7b75ae9ec7131 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Mon, 5 Feb 2024 09:19:30 +0200 Subject: [PATCH 0870/1406] accel/habanalabs: fix error print The unmasking is done for an event, and it can be an event other than RAZWI.
Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/firmware_if.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index 364d292c76fa38..a3df7cf162d8c2 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -526,7 +526,7 @@ int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type) 0, &result); if (rc) - dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type); + dev_err(hdev->dev, "failed to unmask event %d", event_type); return rc; } @@ -565,7 +565,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr, total_pkt_size, 0, &result); if (rc) - dev_err(hdev->dev, "failed to unmask IRQ array\n"); + dev_err(hdev->dev, "failed to unmask event array\n"); kfree(pkt); From 404a9b299c644b130ae2f5bfe885813b3e4d98a5 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Mon, 5 Feb 2024 17:36:38 +0200 Subject: [PATCH 0871/1406] accel/habanalabs/gaudi2: drain event lacks rd/wr indication Due to an H/W issue, the AXI drain event does not include a read/write indication, hence we remove this print. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 189d8da6a624c6..ba1518f2bf5c85 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -9548,25 +9548,17 @@ static int gaudi2_handle_pcie_p2p_msix(struct hl_device *hdev, u16 event_type) static int gaudi2_handle_pcie_drain(struct hl_device *hdev, struct hl_eq_pcie_drain_ind_data *drain_data) { - u64 lbw_rd, lbw_wr, hbw_rd, hbw_wr, cause, error_count = 0; + u64 cause, error_count = 0; cause = le64_to_cpu(drain_data->intr_cause.intr_cause_data); - lbw_rd = le64_to_cpu(drain_data->drain_rd_addr_lbw); - lbw_wr = le64_to_cpu(drain_data->drain_wr_addr_lbw); - hbw_rd = le64_to_cpu(drain_data->drain_rd_addr_hbw); - hbw_wr = le64_to_cpu(drain_data->drain_wr_addr_hbw); if (cause & BIT_ULL(0)) { - dev_err_ratelimited(hdev->dev, - "PCIE AXI drain LBW completed, read_err %u, write_err %u\n", - !!lbw_rd, !!lbw_wr); + dev_err_ratelimited(hdev->dev, "PCIE AXI drain LBW completed\n"); error_count++; } if (cause & BIT_ULL(1)) { - dev_err_ratelimited(hdev->dev, - "PCIE AXI drain HBW completed, raddr %#llx, waddr %#llx\n", - hbw_rd, hbw_wr); + dev_err_ratelimited(hdev->dev, "PCIE AXI drain HBW completed\n"); error_count++; } From e77c4ad9a6dad63fe8502d0d40ef09c6fad7bb98 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Tue, 6 Feb 2024 21:12:21 +0200 Subject: [PATCH 0872/1406] accel/habanalabs/hwmon: rate limit errors user can generate Fetching sensor data can fail for various reasons. In order not to pollute the kernel log, those error prints must be rate limited.
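A sketch of the pattern (abridged from the diff below): each dev_err_ratelimited() call site keeps its own ratelimit state, which by default allows a burst of about 10 messages per 5 seconds before suppressing further output:

	if (rc) {
		dev_err_ratelimited(hdev->dev,
			"Failed to get temperature from sensor %d, error %d\n",
			sensor_index, rc);
		*value = 0;
	}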
Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/hwmon.c | 29 +++++++++++++------------ 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/accel/habanalabs/common/hwmon.c b/drivers/accel/habanalabs/common/hwmon.c index 1ee2ee07e9ed51..36b951b5f5039d 100644 --- a/drivers/accel/habanalabs/common/hwmon.c +++ b/drivers/accel/habanalabs/common/hwmon.c @@ -46,7 +46,7 @@ static u32 fixup_flags_legacy_fw(struct hl_device *hdev, enum hwmon_sensor_types break; default: - dev_err(hdev->dev, "unsupported h/w sensor type %d\n", type); + dev_err_ratelimited(hdev->dev, "unsupported h/w sensor type %d\n", type); flags = cpucp_flags; break; } @@ -134,7 +134,7 @@ static u32 adjust_hwmon_flags(struct hl_device *hdev, enum hwmon_sensor_types ty break; default: - dev_err(hdev->dev, "unsupported h/w sensor type %d\n", type); + dev_err_ratelimited(hdev->dev, "unsupported h/w sensor type %d\n", type); flags = cpucp_flags; break; } @@ -162,7 +162,8 @@ int hl_build_hwmon_channel_info(struct hl_device *hdev, struct cpucp_sensor *sen break; if (type >= HWMON_NR_SENSOR_TYPES) { - dev_err(hdev->dev, "Got wrong sensor type %d from device\n", type); + dev_err_ratelimited(hdev->dev, + "Got wrong sensor type %d from device\n", type); return -EINVAL; } @@ -584,7 +585,7 @@ int hl_get_temperature(struct hl_device *hdev, *value = (long) result; if (rc) { - dev_err(hdev->dev, + dev_err_ratelimited(hdev->dev, "Failed to get temperature from sensor %d, error %d\n", sensor_index, rc); *value = 0; @@ -611,7 +612,7 @@ int hl_set_temperature(struct hl_device *hdev, 0, NULL); if (rc) - dev_err(hdev->dev, + dev_err_ratelimited(hdev->dev, "Failed to set temperature of sensor %d, error %d\n", sensor_index, rc); @@ -638,7 +639,7 @@ int hl_get_voltage(struct hl_device *hdev, *value = (long) result; if (rc) { - dev_err(hdev->dev, + dev_err_ratelimited(hdev->dev, "Failed to get voltage from sensor %d, error %d\n", sensor_index, rc); *value = 0; @@ -667,7 +668,7 @@ int hl_get_current(struct hl_device *hdev, *value = (long) result; if (rc) { - dev_err(hdev->dev, + dev_err_ratelimited(hdev->dev, "Failed to get current from sensor %d, error %d\n", sensor_index, rc); *value = 0; @@ -696,7 +697,7 @@ int hl_get_fan_speed(struct hl_device *hdev, *value = (long) result; if (rc) { - dev_err(hdev->dev, + dev_err_ratelimited(hdev->dev, "Failed to get fan speed from sensor %d, error %d\n", sensor_index, rc); *value = 0; @@ -725,7 +726,7 @@ int hl_get_pwm_info(struct hl_device *hdev, *value = (long) result; if (rc) { - dev_err(hdev->dev, + dev_err_ratelimited(hdev->dev, "Failed to get pwm info from sensor %d, error %d\n", sensor_index, rc); *value = 0; @@ -752,7 +753,7 @@ void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr, 0, NULL); if (rc) - dev_err(hdev->dev, + dev_err_ratelimited(hdev->dev, "Failed to set pwm info to sensor %d, error %d\n", sensor_index, rc); } @@ -775,7 +776,7 @@ int hl_set_voltage(struct hl_device *hdev, 0, NULL); if (rc) - dev_err(hdev->dev, + dev_err_ratelimited(hdev->dev, "Failed to set voltage of sensor %d, error %d\n", sensor_index, rc); @@ -800,7 +801,7 @@ int hl_set_current(struct hl_device *hdev, 0, NULL); if (rc) - dev_err(hdev->dev, + dev_err_ratelimited(hdev->dev, "Failed to set current of sensor %d, error %d\n", sensor_index, rc); @@ -831,7 +832,7 @@ int hl_set_power(struct hl_device *hdev, 0, NULL); if (rc) - dev_err(hdev->dev, + dev_err_ratelimited(hdev->dev, "Failed to set power of sensor %d, error %d\n", 
sensor_index, rc); @@ -858,7 +859,7 @@ int hl_get_power(struct hl_device *hdev, *value = (long) result; if (rc) { - dev_err(hdev->dev, + dev_err_ratelimited(hdev->dev, "Failed to get power of sensor %d, error %d\n", sensor_index, rc); *value = 0; From f7b8ed6d64463cc3ccfa589b5851149d3b313e1a Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Mon, 29 Jan 2024 17:26:17 +0200 Subject: [PATCH 0873/1406] accel/habanalabs: handle reserved memory request when working with full FW Currently the reserved memory request from FW is handled when running with preboot only, but this request is also relevant when running with full FW. Modify to always handle this reservation request. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/firmware_if.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index a3df7cf162d8c2..4246162b680768 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -2743,18 +2743,20 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; } + rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader, sizeof(struct lkd_msg_comms)); + if (rc) + goto protocol_err; + + if (hdev->asic_prop.support_dynamic_resereved_fw_size) + hdev->asic_prop.reserved_fw_mem_size = + le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb); + if (!(hdev->fw_components & FW_TYPE_BOOT_CPU)) { struct lkd_fw_binning_info *binning_info; - rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader, - sizeof(struct lkd_msg_comms)); - if (rc) - goto protocol_err; - /* read preboot version */ rc = hl_fw_dynamic_read_device_fw_version(hdev, FW_COMP_PREBOOT, fw_loader->dynamic_loader.comm_desc.cur_fw_ver); - if (rc) return rc; @@ -2781,11 +2783,6 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, hdev->decoder_binning, hdev->rotator_binning); } - if (hdev->asic_prop.support_dynamic_resereved_fw_size) { - hdev->asic_prop.reserved_fw_mem_size = - le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb); - } - return 0; } From 4b96e9b5d113e9a92f67fcd5fe5cd2105fe3f20d Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Tue, 30 Jan 2024 09:57:32 +0200 Subject: [PATCH 0874/1406] accel/habanalabs: keep explicit size of reserved memory for FW The reserved memory for FW is currently saved in an ASIC property in units of MB, just like the value that comes from FW. Besides the fact that this is not clear from the property's name, it also means that a conversion to the actual size is required everywhere the property is used. Modify the property to hold the size in bytes.
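The conversion now happens once at the single producer, using the kernel's SZ_1M constant from <linux/sizes.h>, so every consumer simply sees bytes; a sketch abridged from the diff below:

	hdev->asic_prop.reserved_fw_mem_size =
		le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb) * SZ_1M;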
Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/firmware_if.c | 2 +- drivers/accel/habanalabs/common/habanalabs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index 4246162b680768..348418643709dc 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -2749,7 +2749,7 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, if (hdev->asic_prop.support_dynamic_resereved_fw_size) hdev->asic_prop.reserved_fw_mem_size = - le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb); + le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb) * SZ_1M; if (!(hdev->fw_components & FW_TYPE_BOOT_CPU)) { struct lkd_fw_binning_info *binning_info; diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 40107a4eba93d3..55495861f43259 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -650,7 +650,7 @@ struct hl_hints_range { * @glbl_err_max_cause_num: global err max cause number. * @hbw_flush_reg: register to read to generate HBW flush. value of 0 means HBW flush is * not supported. - * @reserved_fw_mem_size: size in MB of dram memory reserved for FW. + * @reserved_fw_mem_size: size of dram memory reserved for FW. * @collective_first_sob: first sync object available for collective use * @collective_first_mon: first monitor available for collective use * @sync_stream_first_sob: first sync object available for sync stream use From 570a7f66cc7a1b3f3eae63c6c3639bb5b456a928 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Mon, 12 Feb 2024 14:35:24 +0200 Subject: [PATCH 0875/1406] accel/habanalabs: modify pci health check Today we read the PCI VENDOR-ID in order to make sure the PCI link is healthy. Apparently the VENDOR-ID might be cached on the host and hence, when we read it, we might not actually access the PCI bus. In order to make sure the PCI health check is reliable, we will start checking the DEVICE-ID instead. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/device.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index 3b9e8a21d7df8b..8f92445c5a9010 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -1035,14 +1035,14 @@ static void device_early_fini(struct hl_device *hdev) static bool is_pci_link_healthy(struct hl_device *hdev) { - u16 vendor_id; + u16 device_id; if (!hdev->pdev) return false; - pci_read_config_word(hdev->pdev, PCI_VENDOR_ID, &vendor_id); + pci_read_config_word(hdev->pdev, PCI_DEVICE_ID, &device_id); - return (vendor_id == PCI_VENDOR_ID_HABANALABS); + return (device_id == hdev->pdev->device); } static int hl_device_eq_heartbeat_check(struct hl_device *hdev) From fdc78ddb7889b0fbf25abf36c1cdce1f930ba60d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 15 Jan 2024 11:24:47 -0800 Subject: [PATCH 0876/1406] tpm/tpm_ftpm_tee: fix all kernel-doc warnings Change @pdev to @dev in 2 places to match the function parameters. Correct one function name in a kernel-doc comment to match the function implementation.
This prevents these warnings: tpm_ftpm_tee.c:217: warning: Function parameter or struct member 'dev' not described in 'ftpm_tee_probe' tpm_ftpm_tee.c:217: warning: Excess function parameter 'pdev' description in 'ftpm_tee_probe' tpm_ftpm_tee.c:313: warning: Function parameter or struct member 'dev' not described in 'ftpm_tee_remove' tpm_ftpm_tee.c:313: warning: Excess function parameter 'pdev' description in 'ftpm_tee_remove' tpm_ftpm_tee.c:348: warning: expecting prototype for ftpm_tee_shutdown(). Prototype was for ftpm_plat_tee_shutdown() instead Signed-off-by: Randy Dunlap Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_ftpm_tee.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/tpm/tpm_ftpm_tee.c b/drivers/char/tpm/tpm_ftpm_tee.c index 76adb108076cfa..2ea4882251cf7f 100644 --- a/drivers/char/tpm/tpm_ftpm_tee.c +++ b/drivers/char/tpm/tpm_ftpm_tee.c @@ -208,7 +208,7 @@ static int ftpm_tee_match(struct tee_ioctl_version_data *ver, const void *data) /** * ftpm_tee_probe() - initialize the fTPM - * @pdev: the platform_device description. + * @dev: the device description. * * Return: * On success, 0. On failure, -errno. @@ -304,7 +304,7 @@ static int ftpm_plat_tee_probe(struct platform_device *pdev) /** * ftpm_tee_remove() - remove the TPM device - * @pdev: the platform_device description. + * @dev: the device description. * * Return: * 0 always. @@ -341,7 +341,7 @@ static void ftpm_plat_tee_remove(struct platform_device *pdev) } /** - * ftpm_tee_shutdown() - shutdown the TPM device + * ftpm_plat_tee_shutdown() - shutdown the TPM device * @pdev: the platform_device description. */ static void ftpm_plat_tee_shutdown(struct platform_device *pdev) From 229327a4136f6d412ecc8ddf4ddb2b04724c1fac Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Thu, 1 Feb 2024 12:36:45 +0100 Subject: [PATCH 0877/1406] tpm,tpm_tis: Avoid warning splat at shutdown If interrupts are not activated the work struct 'free_irq_work' is not initialized. This results in a warning splat at module shutdown. Fix this by always initializing the work regardless of whether interrupts are activated or not. 
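A sketch of the fix (abridged from the diff below): INIT_WORK() moves from the interrupt-probing path into the unconditional core init, so a later cancel or flush of the work during shutdown always operates on an initialized structure:

	mutex_init(&priv->locality_count_mutex);
	/* Initialize even when the interrupt path is never taken. */
	INIT_WORK(&priv->free_irq_work, tpm_tis_free_irq_func);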
cc: stable@vger.kernel.org Fixes: 481c2d14627d ("tpm,tpm_tis: Disable interrupts after 1000 unhandled IRQs") Reported-by: Jarkko Sakkinen Closes: https://lore.kernel.org/all/CX32RFOMJUQ0.3R4YCL9MDCB96@kernel.org/ Signed-off-by: Lino Sanfilippo Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis_core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c index 1b350412d8a6be..64c875657687d2 100644 --- a/drivers/char/tpm/tpm_tis_core.c +++ b/drivers/char/tpm/tpm_tis_core.c @@ -919,8 +919,6 @@ static int tpm_tis_probe_irq_single(struct tpm_chip *chip, u32 intmask, int rc; u32 int_status; - INIT_WORK(&priv->free_irq_work, tpm_tis_free_irq_func); - rc = devm_request_threaded_irq(chip->dev.parent, irq, NULL, tis_int_handler, IRQF_ONESHOT | flags, dev_name(&chip->dev), chip); @@ -1132,6 +1130,7 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq, priv->phy_ops = phy_ops; priv->locality_count = 0; mutex_init(&priv->locality_count_mutex); + INIT_WORK(&priv->free_irq_work, tpm_tis_free_irq_func); dev_set_drvdata(&chip->dev, priv); From 22f2e655b1788d2dde01f0276546bc98cff6d6b3 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sat, 13 Jan 2024 18:10:51 +0100 Subject: [PATCH 0878/1406] dt-bindings: tpm: Add compatible string atmel,attpm20p Commit 4f2a348aa365 ("arm64: dts: imx8mm-venice-gw73xx: add TPM device") added a devicetree node for the Trusted Platform Module on certain Gateworks boards. The commit only used the generic "tcg,tpm_tis-spi" compatible string, but public documentation shows that the chip is an ATTPM20P from Atmel (nowadays Microchip): https://trac.gateworks.com/wiki/tpm Add the chip to the supported compatible strings of the TPM TIS SPI schema. For reference, a datasheet is available at: https://ww1.microchip.com/downloads/en/DeviceDoc/ATTPM20P-Trusted-Platform-Module-TPM-2.0-SPI-Interface-Summary-Data-Sheet-DS40002082A.pdf Signed-off-by: Lukas Wunner Reviewed-by: Jarkko Sakkinen Cc: Tim Harvey Acked-by: Rob Herring Signed-off-by: Jarkko Sakkinen --- Documentation/devicetree/bindings/tpm/tcg,tpm_tis-spi.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/tpm/tcg,tpm_tis-spi.yaml b/Documentation/devicetree/bindings/tpm/tcg,tpm_tis-spi.yaml index c3413b47ac3df9..6cb2de7cb5688e 100644 --- a/Documentation/devicetree/bindings/tpm/tcg,tpm_tis-spi.yaml +++ b/Documentation/devicetree/bindings/tpm/tcg,tpm_tis-spi.yaml @@ -20,6 +20,7 @@ properties: compatible: items: - enum: + - atmel,attpm20p - infineon,slb9670 - st,st33htpm-spi - st,st33zp24-spi From 4c36ad8d25ae92e0f59f84469b6dac5fffa0cfb0 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sat, 13 Jan 2024 18:10:52 +0100 Subject: [PATCH 0879/1406] tpm_tis_spi: Add compatible string atmel,attpm20p Commit 4f2a348aa365 ("arm64: dts: imx8mm-venice-gw73xx: add TPM device") added a devicetree node for the Trusted Platform Module on certain Gateworks boards. The commit only used the generic "tcg,tpm_tis-spi" compatible string, but public documentation shows that the chip is an ATTPM20P from Atmel (nowadays Microchip): https://trac.gateworks.com/wiki/tpm Add the chip to the supported compatible strings of the TPM TIS SPI driver. 
For reference, a datasheet is available at: https://ww1.microchip.com/downloads/en/DeviceDoc/ATTPM20P-Trusted-Platform-Module-TPM-2.0-SPI-Interface-Summary-Data-Sheet-DS40002082A.pdf Signed-off-by: Lukas Wunner Reviewed-by: Jarkko Sakkinen Cc: Tim Harvey Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis_spi_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/char/tpm/tpm_tis_spi_main.c b/drivers/char/tpm/tpm_tis_spi_main.c index c5c3197ee29f04..7d376a64280783 100644 --- a/drivers/char/tpm/tpm_tis_spi_main.c +++ b/drivers/char/tpm/tpm_tis_spi_main.c @@ -327,6 +327,7 @@ static const struct spi_device_id tpm_tis_spi_id[] = { MODULE_DEVICE_TABLE(spi, tpm_tis_spi_id); static const struct of_device_id of_tis_spi_match[] __maybe_unused = { + { .compatible = "atmel,attpm20p", .data = tpm_tis_spi_probe }, { .compatible = "st,st33htpm-spi", .data = tpm_tis_spi_probe }, { .compatible = "infineon,slb9670", .data = tpm_tis_spi_probe }, { .compatible = "tcg,tpm_tis-spi", .data = tpm_tis_spi_probe }, From 246275da5b7b9568fe0cd1fc4d7249a0a2b0d4a9 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sat, 13 Jan 2024 18:10:53 +0100 Subject: [PATCH 0880/1406] tpm_tis: Add compatible string atmel,at97sc3204 Commit 420d439849ca ("tpm_tis: Allow tpm_tis to be bound using DT") added the fallback compatible "tcg,tpm-tis-mmio" to the TPM TIS driver, but not the chip-specific "atmel,at97sc3204". However it did document it as a valid compatible string. Add it to tis_of_platform_match[] for consistency. Signed-off-by: Lukas Wunner Cc: Jason Gunthorpe Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index 2c52b7905b0706..14652aaf825468 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c @@ -347,6 +347,7 @@ static void tpm_tis_plat_remove(struct platform_device *pdev) #ifdef CONFIG_OF static const struct of_device_id tis_of_platform_match[] = { + {.compatible = "atmel,at97sc3204"}, {.compatible = "tcg,tpm-tis-mmio"}, {}, }; From 4a25541b236f5d8f98c1fd2f8848a290eafdb8a8 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sat, 13 Jan 2024 18:10:54 +0100 Subject: [PATCH 0881/1406] tpm: tis_i2c: Add compatible string nuvoton,npct75x MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add "nuvoton,npct75x" as well as the fallback compatible string "tcg,tpm-tis-i2c" to the TPM TIS I²C driver. 
They're used by: arch/arm/boot/dts/aspeed/aspeed-bmc-ibm-bonnell.dts arch/arm/boot/dts/aspeed/aspeed-bmc-ibm-everest.dts And by all accounts, NPCT75x is supported by the driver: https://lore.kernel.org/all/60e23fd0f0ff4d1f8954034237ae8865@NTILML02.nuvoton.com/ https://lore.kernel.org/all/20220808220839.1006341-8-peter@pjd.dev/ Signed-off-by: Lukas Wunner Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis_i2c.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/char/tpm/tpm_tis_i2c.c b/drivers/char/tpm/tpm_tis_i2c.c index a897402cc36a85..9511c0d501852b 100644 --- a/drivers/char/tpm/tpm_tis_i2c.c +++ b/drivers/char/tpm/tpm_tis_i2c.c @@ -383,6 +383,8 @@ MODULE_DEVICE_TABLE(i2c, tpm_tis_i2c_id); #ifdef CONFIG_OF static const struct of_device_id of_tis_i2c_match[] = { { .compatible = "infineon,slb9673", }, + { .compatible = "nuvoton,npct75x", }, + { .compatible = "tcg,tpm-tis-i2c", }, {} }; MODULE_DEVICE_TABLE(of, of_tis_i2c_match); From e685fa827c538c3035afd2aee841c044309a6e86 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 8 Feb 2024 15:55:29 -0600 Subject: [PATCH 0882/1406] dm vdo: move indexer files into sub-directory The goal is to assist high-level understanding of which code is conceptually specific to VDO's indexer. Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/Makefile | 32 +++++++++---------- drivers/md/dm-vdo/data-vio.c | 3 +- drivers/md/dm-vdo/data-vio.h | 2 +- drivers/md/dm-vdo/dedupe.c | 3 +- drivers/md/dm-vdo/dedupe.h | 2 +- drivers/md/dm-vdo/encodings.h | 2 +- drivers/md/dm-vdo/funnel-queue.c | 2 +- drivers/md/dm-vdo/funnel-requestqueue.h | 2 +- .../md/dm-vdo/{ => indexer}/chapter-index.c | 10 +++--- .../md/dm-vdo/{ => indexer}/chapter-index.h | 0 drivers/md/dm-vdo/{ => indexer}/config.c | 10 +++--- drivers/md/dm-vdo/{ => indexer}/config.h | 0 drivers/md/dm-vdo/{ => indexer}/delta-index.c | 17 +++++----- drivers/md/dm-vdo/{ => indexer}/delta-index.h | 4 +-- drivers/md/dm-vdo/{ => indexer}/geometry.c | 9 +++--- drivers/md/dm-vdo/{ => indexer}/geometry.h | 0 drivers/md/dm-vdo/{ => indexer}/hash-utils.h | 3 +- .../md/dm-vdo/{ => indexer}/index-layout.c | 9 +++--- .../md/dm-vdo/{ => indexer}/index-layout.h | 0 .../md/dm-vdo/{ => indexer}/index-page-map.c | 15 +++++---- .../md/dm-vdo/{ => indexer}/index-page-map.h | 0 .../md/dm-vdo/{ => indexer}/index-session.c | 9 +++--- .../md/dm-vdo/{ => indexer}/index-session.h | 3 +- drivers/md/dm-vdo/{ => indexer}/index.c | 9 +++--- drivers/md/dm-vdo/{ => indexer}/index.h | 0 drivers/md/dm-vdo/{ => indexer}/io-factory.c | 6 ++-- drivers/md/dm-vdo/{ => indexer}/io-factory.h | 0 drivers/md/dm-vdo/{ => indexer}/murmurhash3.c | 0 drivers/md/dm-vdo/{ => indexer}/murmurhash3.h | 0 .../md/dm-vdo/{ => indexer}/open-chapter.c | 9 +++--- .../md/dm-vdo/{ => indexer}/open-chapter.h | 0 drivers/md/dm-vdo/{ => indexer}/radix-sort.c | 4 +-- drivers/md/dm-vdo/{ => indexer}/radix-sort.h | 0 .../md/dm-vdo/{ => indexer}/sparse-cache.c | 9 +++--- .../md/dm-vdo/{ => indexer}/sparse-cache.h | 0 drivers/md/dm-vdo/{ => indexer}/uds.h | 2 +- .../md/dm-vdo/{ => indexer}/volume-index.c | 13 ++++---- .../md/dm-vdo/{ => indexer}/volume-index.h | 3 +- drivers/md/dm-vdo/{ => indexer}/volume.c | 13 ++++---- drivers/md/dm-vdo/{ => indexer}/volume.h | 6 ++-- drivers/md/dm-vdo/uds-sysfs.c | 2 +- drivers/md/dm-vdo/vdo.h | 2 +- 42 files changed, 115 insertions(+), 100 deletions(-) rename drivers/md/dm-vdo/{ => indexer}/chapter-index.c (98%) rename drivers/md/dm-vdo/{ => indexer}/chapter-index.h (100%) rename drivers/md/dm-vdo/{
=> indexer}/config.c (98%) rename drivers/md/dm-vdo/{ => indexer}/config.h (100%) rename drivers/md/dm-vdo/{ => indexer}/delta-index.c (99%) rename drivers/md/dm-vdo/{ => indexer}/delta-index.h (99%) rename drivers/md/dm-vdo/{ => indexer}/geometry.c (98%) rename drivers/md/dm-vdo/{ => indexer}/geometry.h (100%) rename drivers/md/dm-vdo/{ => indexer}/hash-utils.h (98%) rename drivers/md/dm-vdo/{ => indexer}/index-layout.c (99%) rename drivers/md/dm-vdo/{ => indexer}/index-layout.h (100%) rename drivers/md/dm-vdo/{ => indexer}/index-page-map.c (96%) rename drivers/md/dm-vdo/{ => indexer}/index-page-map.h (100%) rename drivers/md/dm-vdo/{ => indexer}/index-session.c (99%) rename drivers/md/dm-vdo/{ => indexer}/index-session.h (98%) rename drivers/md/dm-vdo/{ => indexer}/index.c (99%) rename drivers/md/dm-vdo/{ => indexer}/index.h (100%) rename drivers/md/dm-vdo/{ => indexer}/io-factory.c (99%) rename drivers/md/dm-vdo/{ => indexer}/io-factory.h (100%) rename drivers/md/dm-vdo/{ => indexer}/murmurhash3.c (100%) rename drivers/md/dm-vdo/{ => indexer}/murmurhash3.h (100%) rename drivers/md/dm-vdo/{ => indexer}/open-chapter.c (99%) rename drivers/md/dm-vdo/{ => indexer}/open-chapter.h (100%) rename drivers/md/dm-vdo/{ => indexer}/radix-sort.c (99%) rename drivers/md/dm-vdo/{ => indexer}/radix-sort.h (100%) rename drivers/md/dm-vdo/{ => indexer}/sparse-cache.c (99%) rename drivers/md/dm-vdo/{ => indexer}/sparse-cache.h (100%) rename drivers/md/dm-vdo/{ => indexer}/uds.h (99%) rename drivers/md/dm-vdo/{ => indexer}/volume-index.c (99%) rename drivers/md/dm-vdo/{ => indexer}/volume-index.h (99%) rename drivers/md/dm-vdo/{ => indexer}/volume.c (99%) rename drivers/md/dm-vdo/{ => indexer}/volume.h (98%) diff --git a/drivers/md/dm-vdo/Makefile b/drivers/md/dm-vdo/Makefile index 8c06c3b969e3ef..db9a7a16765ccd 100644 --- a/drivers/md/dm-vdo/Makefile +++ b/drivers/md/dm-vdo/Makefile @@ -6,12 +6,9 @@ dm-vdo-objs := \ action-manager.o \ admin-state.o \ block-map.o \ - chapter-index.o \ completion.o \ - config.o \ data-vio.o \ dedupe.o \ - delta-index.o \ dm-vdo-target.o \ dump.o \ encodings.o \ @@ -20,31 +17,21 @@ dm-vdo-objs := \ funnel-queue.o \ funnel-requestqueue.o \ funnel-workqueue.o \ - geometry.o \ - index-layout.o \ - index.o \ - index-page-map.o \ - index-session.o \ int-map.o \ - io-factory.o \ io-submitter.o \ logger.o \ logical-zone.o \ memory-alloc.o \ message-stats.o \ - murmurhash3.o \ - open-chapter.o \ packer.o \ permassert.o \ physical-zone.o \ pool-sysfs.o \ pool-sysfs-stats.o \ priority-table.o \ - radix-sort.o \ recovery-journal.o \ repair.o \ slab-depot.o \ - sparse-cache.o \ status-codes.o \ string-utils.o \ sysfs.o \ @@ -55,6 +42,19 @@ dm-vdo-objs := \ uds-threads.o \ vdo.o \ vio.o \ - volume-index.o \ - volume.o \ - wait-queue.o + wait-queue.o \ + indexer/chapter-index.o \ + indexer/config.o \ + indexer/delta-index.o \ + indexer/geometry.o \ + indexer/index.o \ + indexer/index-layout.o \ + indexer/index-page-map.o \ + indexer/index-session.o \ + indexer/io-factory.o \ + indexer/murmurhash3.o \ + indexer/open-chapter.o \ + indexer/radix-sort.o \ + indexer/sparse-cache.o \ + indexer/volume.o \ + indexer/volume-index.o diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c index d77adeb5006efe..26877c6bdc9e15 100644 --- a/drivers/md/dm-vdo/data-vio.c +++ b/drivers/md/dm-vdo/data-vio.c @@ -21,9 +21,10 @@ #include "logger.h" #include "memory-alloc.h" -#include "murmurhash3.h" #include "permassert.h" +#include "indexer/murmurhash3.h" + #include "block-map.h" #include 
"dump.h" #include "encodings.h" diff --git a/drivers/md/dm-vdo/data-vio.h b/drivers/md/dm-vdo/data-vio.h index 78744d064e9638..33d5753ee1075a 100644 --- a/drivers/md/dm-vdo/data-vio.h +++ b/drivers/md/dm-vdo/data-vio.h @@ -11,7 +11,7 @@ #include #include "permassert.h" -#include "uds.h" +#include "indexer/uds.h" #include "block-map.h" #include "completion.h" diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index 2a1902c4423c5a..b819d7228e13b4 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -131,7 +131,8 @@ #include "numeric.h" #include "permassert.h" #include "string-utils.h" -#include "uds.h" + +#include "indexer/uds.h" #include "action-manager.h" #include "admin-state.h" diff --git a/drivers/md/dm-vdo/dedupe.h b/drivers/md/dm-vdo/dedupe.h index 773dde5f93654b..1fb5740e5e9bc9 100644 --- a/drivers/md/dm-vdo/dedupe.h +++ b/drivers/md/dm-vdo/dedupe.h @@ -9,7 +9,7 @@ #include #include -#include "uds.h" +#include "indexer/uds.h" #include "admin-state.h" #include "constants.h" diff --git a/drivers/md/dm-vdo/encodings.h b/drivers/md/dm-vdo/encodings.h index ba3db9867f4a8e..4a36c36ef98390 100644 --- a/drivers/md/dm-vdo/encodings.h +++ b/drivers/md/dm-vdo/encodings.h @@ -12,7 +12,7 @@ #include #include "numeric.h" -#include "uds.h" +#include "indexer/uds.h" #include "constants.h" #include "types.h" diff --git a/drivers/md/dm-vdo/funnel-queue.c b/drivers/md/dm-vdo/funnel-queue.c index 6940b282086d68..7f5e4f2d1505ec 100644 --- a/drivers/md/dm-vdo/funnel-queue.c +++ b/drivers/md/dm-vdo/funnel-queue.c @@ -8,7 +8,7 @@ #include "cpu.h" #include "memory-alloc.h" #include "permassert.h" -#include "uds.h" +#include "indexer/uds.h" int uds_make_funnel_queue(struct funnel_queue **queue_ptr) { diff --git a/drivers/md/dm-vdo/funnel-requestqueue.h b/drivers/md/dm-vdo/funnel-requestqueue.h index e74c231fe2690c..88d90ed263e599 100644 --- a/drivers/md/dm-vdo/funnel-requestqueue.h +++ b/drivers/md/dm-vdo/funnel-requestqueue.h @@ -6,7 +6,7 @@ #ifndef UDS_REQUEST_QUEUE_H #define UDS_REQUEST_QUEUE_H -#include "uds.h" +#include "indexer/uds.h" /* * A simple request queue which will handle new requests in the order in which they are received, diff --git a/drivers/md/dm-vdo/chapter-index.c b/drivers/md/dm-vdo/indexer/chapter-index.c similarity index 98% rename from drivers/md/dm-vdo/chapter-index.c rename to drivers/md/dm-vdo/indexer/chapter-index.c index 363991d5621832..1e52004163ef15 100644 --- a/drivers/md/dm-vdo/chapter-index.c +++ b/drivers/md/dm-vdo/indexer/chapter-index.c @@ -4,14 +4,14 @@ */ #include "chapter-index.h" - -#include "errors.h" #include "hash-utils.h" -#include "logger.h" -#include "memory-alloc.h" -#include "permassert.h" #include "uds.h" +#include "../errors.h" +#include "../logger.h" +#include "../memory-alloc.h" +#include "../permassert.h" + int uds_make_open_chapter_index(struct open_chapter_index **chapter_index, const struct index_geometry *geometry, u64 volume_nonce) { diff --git a/drivers/md/dm-vdo/chapter-index.h b/drivers/md/dm-vdo/indexer/chapter-index.h similarity index 100% rename from drivers/md/dm-vdo/chapter-index.h rename to drivers/md/dm-vdo/indexer/chapter-index.h diff --git a/drivers/md/dm-vdo/config.c b/drivers/md/dm-vdo/indexer/config.c similarity index 98% rename from drivers/md/dm-vdo/config.c rename to drivers/md/dm-vdo/indexer/config.c index e9c7e9bdbce064..88d42897137141 100644 --- a/drivers/md/dm-vdo/config.c +++ b/drivers/md/dm-vdo/indexer/config.c @@ -5,11 +5,11 @@ #include "config.h" -#include "logger.h" -#include 
"memory-alloc.h" -#include "numeric.h" -#include "string-utils.h" -#include "uds-threads.h" +#include "../logger.h" +#include "../memory-alloc.h" +#include "../numeric.h" +#include "../string-utils.h" +#include "../uds-threads.h" static const u8 INDEX_CONFIG_MAGIC[] = "ALBIC"; static const u8 INDEX_CONFIG_VERSION_6_02[] = "06.02"; diff --git a/drivers/md/dm-vdo/config.h b/drivers/md/dm-vdo/indexer/config.h similarity index 100% rename from drivers/md/dm-vdo/config.h rename to drivers/md/dm-vdo/indexer/config.h diff --git a/drivers/md/dm-vdo/delta-index.c b/drivers/md/dm-vdo/indexer/delta-index.c similarity index 99% rename from drivers/md/dm-vdo/delta-index.c rename to drivers/md/dm-vdo/indexer/delta-index.c index 6306777bb20288..d8494bb03bd21b 100644 --- a/drivers/md/dm-vdo/delta-index.c +++ b/drivers/md/dm-vdo/indexer/delta-index.c @@ -10,15 +10,16 @@ #include #include +#include "../cpu.h" +#include "../errors.h" +#include "../logger.h" +#include "../memory-alloc.h" +#include "../numeric.h" +#include "../permassert.h" +#include "../string-utils.h" +#include "../time-utils.h" + #include "config.h" -#include "cpu.h" -#include "errors.h" -#include "logger.h" -#include "memory-alloc.h" -#include "numeric.h" -#include "permassert.h" -#include "string-utils.h" -#include "time-utils.h" #include "uds.h" /* diff --git a/drivers/md/dm-vdo/delta-index.h b/drivers/md/dm-vdo/indexer/delta-index.h similarity index 99% rename from drivers/md/dm-vdo/delta-index.h rename to drivers/md/dm-vdo/indexer/delta-index.h index b3b38fb440bfd2..2d990c215c2d8f 100644 --- a/drivers/md/dm-vdo/delta-index.h +++ b/drivers/md/dm-vdo/indexer/delta-index.h @@ -10,8 +10,8 @@ #include "config.h" #include "io-factory.h" -#include "numeric.h" -#include "time-utils.h" +#include "../numeric.h" +#include "../time-utils.h" /* * A delta index is a key-value store, where each entry maps an address (the key) to a payload (the diff --git a/drivers/md/dm-vdo/geometry.c b/drivers/md/dm-vdo/indexer/geometry.c similarity index 98% rename from drivers/md/dm-vdo/geometry.c rename to drivers/md/dm-vdo/indexer/geometry.c index 0e83bba4184ab2..11f055c20f6ef8 100644 --- a/drivers/md/dm-vdo/geometry.c +++ b/drivers/md/dm-vdo/indexer/geometry.c @@ -8,11 +8,12 @@ #include #include +#include "../errors.h" +#include "../logger.h" +#include "../memory-alloc.h" +#include "../permassert.h" + #include "delta-index.h" -#include "errors.h" -#include "logger.h" -#include "memory-alloc.h" -#include "permassert.h" #include "uds.h" /* diff --git a/drivers/md/dm-vdo/geometry.h b/drivers/md/dm-vdo/indexer/geometry.h similarity index 100% rename from drivers/md/dm-vdo/geometry.h rename to drivers/md/dm-vdo/indexer/geometry.h diff --git a/drivers/md/dm-vdo/hash-utils.h b/drivers/md/dm-vdo/indexer/hash-utils.h similarity index 98% rename from drivers/md/dm-vdo/hash-utils.h rename to drivers/md/dm-vdo/indexer/hash-utils.h index e22be69695beb4..bb679d58707762 100644 --- a/drivers/md/dm-vdo/hash-utils.h +++ b/drivers/md/dm-vdo/indexer/hash-utils.h @@ -7,9 +7,10 @@ #define UDS_HASH_UTILS_H #include "geometry.h" -#include "numeric.h" #include "uds.h" +#include "../numeric.h" + /* Utilities for extracting portions of a request name for various uses. */ /* How various portions of a record name are apportioned. 
*/ diff --git a/drivers/md/dm-vdo/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c similarity index 99% rename from drivers/md/dm-vdo/index-layout.c rename to drivers/md/dm-vdo/indexer/index-layout.c index 2da507b26fd5bd..cee36e6caf0e1d 100644 --- a/drivers/md/dm-vdo/index-layout.c +++ b/drivers/md/dm-vdo/indexer/index-layout.c @@ -8,14 +8,15 @@ #include #include "config.h" -#include "logger.h" -#include "memory-alloc.h" #include "murmurhash3.h" -#include "numeric.h" #include "open-chapter.h" -#include "time-utils.h" #include "volume-index.h" +#include "../logger.h" +#include "../memory-alloc.h" +#include "../numeric.h" +#include "../time-utils.h" + /* * The UDS layout on storage media is divided into a number of fixed-size regions, the sizes of * which are computed when the index is created. Every header and region begins on 4K block diff --git a/drivers/md/dm-vdo/index-layout.h b/drivers/md/dm-vdo/indexer/index-layout.h similarity index 100% rename from drivers/md/dm-vdo/index-layout.h rename to drivers/md/dm-vdo/indexer/index-layout.h diff --git a/drivers/md/dm-vdo/index-page-map.c b/drivers/md/dm-vdo/indexer/index-page-map.c similarity index 96% rename from drivers/md/dm-vdo/index-page-map.c rename to drivers/md/dm-vdo/indexer/index-page-map.c index f3748a915c03bd..14d2e9912e287d 100644 --- a/drivers/md/dm-vdo/index-page-map.c +++ b/drivers/md/dm-vdo/indexer/index-page-map.c @@ -5,14 +5,15 @@ #include "index-page-map.h" -#include "errors.h" +#include "../errors.h" +#include "../logger.h" +#include "../memory-alloc.h" +#include "../numeric.h" +#include "../permassert.h" +#include "../string-utils.h" +#include "../uds-threads.h" + #include "hash-utils.h" -#include "logger.h" -#include "memory-alloc.h" -#include "numeric.h" -#include "permassert.h" -#include "string-utils.h" -#include "uds-threads.h" #include "uds.h" /* diff --git a/drivers/md/dm-vdo/index-page-map.h b/drivers/md/dm-vdo/indexer/index-page-map.h similarity index 100% rename from drivers/md/dm-vdo/index-page-map.h rename to drivers/md/dm-vdo/indexer/index-page-map.h diff --git a/drivers/md/dm-vdo/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c similarity index 99% rename from drivers/md/dm-vdo/index-session.c rename to drivers/md/dm-vdo/indexer/index-session.c index 7afc197487121f..06ccfa79e00d57 100644 --- a/drivers/md/dm-vdo/index-session.c +++ b/drivers/md/dm-vdo/indexer/index-session.c @@ -7,12 +7,13 @@ #include -#include "funnel-requestqueue.h" +#include "../funnel-requestqueue.h" +#include "../logger.h" +#include "../memory-alloc.h" +#include "../time-utils.h" + #include "index.h" #include "index-layout.h" -#include "logger.h" -#include "memory-alloc.h" -#include "time-utils.h" /* * The index session contains a lock (the request_mutex) which ensures that only one thread can diff --git a/drivers/md/dm-vdo/index-session.h b/drivers/md/dm-vdo/indexer/index-session.h similarity index 98% rename from drivers/md/dm-vdo/index-session.h rename to drivers/md/dm-vdo/indexer/index-session.h index c77ee021d510df..60cf15ba7b8e53 100644 --- a/drivers/md/dm-vdo/index-session.h +++ b/drivers/md/dm-vdo/indexer/index-session.h @@ -9,8 +9,9 @@ #include #include +#include "../uds-threads.h" + #include "config.h" -#include "uds-threads.h" #include "uds.h" /* diff --git a/drivers/md/dm-vdo/index.c b/drivers/md/dm-vdo/indexer/index.c similarity index 99% rename from drivers/md/dm-vdo/index.c rename to drivers/md/dm-vdo/indexer/index.c index 1596f6ba43a5a2..bf25f0cf0ee031 100644 --- a/drivers/md/dm-vdo/index.c +++ 
b/drivers/md/dm-vdo/indexer/index.c @@ -3,15 +3,14 @@ * Copyright 2023 Red Hat */ - #include "index.h" - -#include "funnel-requestqueue.h" #include "hash-utils.h" -#include "logger.h" -#include "memory-alloc.h" #include "sparse-cache.h" +#include "../funnel-requestqueue.h" +#include "../logger.h" +#include "../memory-alloc.h" + static const u64 NO_LAST_SAVE = U64_MAX; /* diff --git a/drivers/md/dm-vdo/index.h b/drivers/md/dm-vdo/indexer/index.h similarity index 100% rename from drivers/md/dm-vdo/index.h rename to drivers/md/dm-vdo/indexer/index.h diff --git a/drivers/md/dm-vdo/io-factory.c b/drivers/md/dm-vdo/indexer/io-factory.c similarity index 99% rename from drivers/md/dm-vdo/io-factory.c rename to drivers/md/dm-vdo/indexer/io-factory.c index 02242df94e3735..a2d0f09b4b9f4c 100644 --- a/drivers/md/dm-vdo/io-factory.c +++ b/drivers/md/dm-vdo/indexer/io-factory.c @@ -10,9 +10,9 @@ #include #include -#include "logger.h" -#include "memory-alloc.h" -#include "numeric.h" +#include "../logger.h" +#include "../memory-alloc.h" +#include "../numeric.h" /* * The I/O factory object manages access to index storage, which is a contiguous range of blocks on diff --git a/drivers/md/dm-vdo/io-factory.h b/drivers/md/dm-vdo/indexer/io-factory.h similarity index 100% rename from drivers/md/dm-vdo/io-factory.h rename to drivers/md/dm-vdo/indexer/io-factory.h diff --git a/drivers/md/dm-vdo/murmurhash3.c b/drivers/md/dm-vdo/indexer/murmurhash3.c similarity index 100% rename from drivers/md/dm-vdo/murmurhash3.c rename to drivers/md/dm-vdo/indexer/murmurhash3.c diff --git a/drivers/md/dm-vdo/murmurhash3.h b/drivers/md/dm-vdo/indexer/murmurhash3.h similarity index 100% rename from drivers/md/dm-vdo/murmurhash3.h rename to drivers/md/dm-vdo/indexer/murmurhash3.h diff --git a/drivers/md/dm-vdo/open-chapter.c b/drivers/md/dm-vdo/indexer/open-chapter.c similarity index 99% rename from drivers/md/dm-vdo/open-chapter.c rename to drivers/md/dm-vdo/indexer/open-chapter.c index d9d6e5d45bfbde..28b2f472c29ee5 100644 --- a/drivers/md/dm-vdo/open-chapter.c +++ b/drivers/md/dm-vdo/indexer/open-chapter.c @@ -9,10 +9,11 @@ #include "config.h" #include "hash-utils.h" -#include "logger.h" -#include "memory-alloc.h" -#include "numeric.h" -#include "permassert.h" + +#include "../logger.h" +#include "../memory-alloc.h" +#include "../numeric.h" +#include "../permassert.h" /* * Each index zone has a dedicated open chapter zone structure which gets an equal share of the diff --git a/drivers/md/dm-vdo/open-chapter.h b/drivers/md/dm-vdo/indexer/open-chapter.h similarity index 100% rename from drivers/md/dm-vdo/open-chapter.h rename to drivers/md/dm-vdo/indexer/open-chapter.h diff --git a/drivers/md/dm-vdo/radix-sort.c b/drivers/md/dm-vdo/indexer/radix-sort.c similarity index 99% rename from drivers/md/dm-vdo/radix-sort.c rename to drivers/md/dm-vdo/indexer/radix-sort.c index 1f17c708a65266..e1c40521c7ee77 100644 --- a/drivers/md/dm-vdo/radix-sort.c +++ b/drivers/md/dm-vdo/indexer/radix-sort.c @@ -8,8 +8,8 @@ #include #include -#include "memory-alloc.h" -#include "string-utils.h" +#include "../memory-alloc.h" +#include "../string-utils.h" /* * This implementation allocates one large object to do the sorting, which can be reused as many diff --git a/drivers/md/dm-vdo/radix-sort.h b/drivers/md/dm-vdo/indexer/radix-sort.h similarity index 100% rename from drivers/md/dm-vdo/radix-sort.h rename to drivers/md/dm-vdo/indexer/radix-sort.h diff --git a/drivers/md/dm-vdo/sparse-cache.c b/drivers/md/dm-vdo/indexer/sparse-cache.c similarity index 
99% rename from drivers/md/dm-vdo/sparse-cache.c rename to drivers/md/dm-vdo/indexer/sparse-cache.c index 5b41c94f53faac..cc2a65f5fc0949 100644 --- a/drivers/md/dm-vdo/sparse-cache.c +++ b/drivers/md/dm-vdo/indexer/sparse-cache.c @@ -11,10 +11,11 @@ #include "chapter-index.h" #include "config.h" #include "index.h" -#include "logger.h" -#include "memory-alloc.h" -#include "permassert.h" -#include "uds-threads.h" + +#include "../logger.h" +#include "../memory-alloc.h" +#include "../permassert.h" +#include "../uds-threads.h" /* * Since the cache is small, it is implemented as a simple array of cache entries. Searching for a diff --git a/drivers/md/dm-vdo/sparse-cache.h b/drivers/md/dm-vdo/indexer/sparse-cache.h similarity index 100% rename from drivers/md/dm-vdo/sparse-cache.h rename to drivers/md/dm-vdo/indexer/sparse-cache.h diff --git a/drivers/md/dm-vdo/uds.h b/drivers/md/dm-vdo/indexer/uds.h similarity index 99% rename from drivers/md/dm-vdo/uds.h rename to drivers/md/dm-vdo/indexer/uds.h index 1264362f83725b..1c5ff0746a8647 100644 --- a/drivers/md/dm-vdo/uds.h +++ b/drivers/md/dm-vdo/indexer/uds.h @@ -8,7 +8,7 @@ #include -#include "funnel-queue.h" +#include "../funnel-queue.h" /* * UDS public API diff --git a/drivers/md/dm-vdo/volume-index.c b/drivers/md/dm-vdo/indexer/volume-index.c similarity index 99% rename from drivers/md/dm-vdo/volume-index.c rename to drivers/md/dm-vdo/indexer/volume-index.c index 8731ea1662b1fb..f7a516fbec5c21 100644 --- a/drivers/md/dm-vdo/volume-index.c +++ b/drivers/md/dm-vdo/indexer/volume-index.c @@ -10,16 +10,17 @@ #include #include +#include "../errors.h" +#include "../logger.h" +#include "../memory-alloc.h" +#include "../numeric.h" +#include "../permassert.h" +#include "../uds-threads.h" + #include "config.h" -#include "errors.h" #include "geometry.h" #include "hash-utils.h" -#include "logger.h" -#include "memory-alloc.h" -#include "numeric.h" -#include "permassert.h" #include "uds.h" -#include "uds-threads.h" /* * The volume index is a combination of two separate subindexes, one containing sparse hook entries diff --git a/drivers/md/dm-vdo/volume-index.h b/drivers/md/dm-vdo/indexer/volume-index.h similarity index 99% rename from drivers/md/dm-vdo/volume-index.h rename to drivers/md/dm-vdo/indexer/volume-index.h index 537e9947cf4a3a..d26c32a95e9ea0 100644 --- a/drivers/md/dm-vdo/volume-index.h +++ b/drivers/md/dm-vdo/indexer/volume-index.h @@ -8,10 +8,11 @@ #include +#include "../uds-threads.h" + #include "config.h" #include "delta-index.h" #include "uds.h" -#include "uds-threads.h" /* * The volume index is the primary top-level index for UDS. 
It contains records which map a record diff --git a/drivers/md/dm-vdo/volume.c b/drivers/md/dm-vdo/indexer/volume.c similarity index 99% rename from drivers/md/dm-vdo/volume.c rename to drivers/md/dm-vdo/indexer/volume.c index 8bd64057c2ca63..a501334429dc27 100644 --- a/drivers/md/dm-vdo/volume.c +++ b/drivers/md/dm-vdo/indexer/volume.c @@ -9,18 +9,19 @@ #include #include +#include "../errors.h" +#include "../logger.h" +#include "../memory-alloc.h" +#include "../permassert.h" +#include "../string-utils.h" +#include "../uds-threads.h" + #include "chapter-index.h" #include "config.h" -#include "errors.h" #include "geometry.h" #include "hash-utils.h" #include "index.h" -#include "logger.h" -#include "memory-alloc.h" -#include "permassert.h" #include "sparse-cache.h" -#include "string-utils.h" -#include "uds-threads.h" /* * The first block of the volume layout is reserved for the volume header, which is no longer used. diff --git a/drivers/md/dm-vdo/volume.h b/drivers/md/dm-vdo/indexer/volume.h similarity index 98% rename from drivers/md/dm-vdo/volume.h rename to drivers/md/dm-vdo/indexer/volume.h index 066680282340b9..53c85664845272 100644 --- a/drivers/md/dm-vdo/volume.h +++ b/drivers/md/dm-vdo/indexer/volume.h @@ -11,16 +11,18 @@ #include #include +#include "../permassert.h" +#include "../uds-threads.h" + #include "chapter-index.h" #include "config.h" #include "geometry.h" #include "index-layout.h" #include "index-page-map.h" -#include "permassert.h" #include "radix-sort.h" #include "sparse-cache.h" #include "uds.h" -#include "uds-threads.h" + /* * The volume manages deduplication records on permanent storage. The term "volume" can also refer diff --git a/drivers/md/dm-vdo/uds-sysfs.c b/drivers/md/dm-vdo/uds-sysfs.c index eee8a5b7d147d6..754732c12d4da5 100644 --- a/drivers/md/dm-vdo/uds-sysfs.c +++ b/drivers/md/dm-vdo/uds-sysfs.c @@ -12,7 +12,7 @@ #include "logger.h" #include "memory-alloc.h" #include "string-utils.h" -#include "uds.h" +#include "indexer/uds.h" #define UDS_SYSFS_NAME "uds" diff --git a/drivers/md/dm-vdo/vdo.h b/drivers/md/dm-vdo/vdo.h index 772317e6db52bd..61179476d44605 100644 --- a/drivers/md/dm-vdo/vdo.h +++ b/drivers/md/dm-vdo/vdo.h @@ -22,7 +22,7 @@ #include "statistics.h" #include "thread-registry.h" #include "types.h" -#include "uds.h" +#include "indexer/uds.h" enum notifier_state { /* Notifications are allowed but not in progress */ From 5b85590056f8b0d4b899e9d3822c6eca981abfae Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 9 Feb 2024 09:54:53 -0600 Subject: [PATCH 0883/1406] dm vdo: fold thread-cond-var.c into uds-threads Also make uds_*_semaphore() interface private to uds-threads.c Further cleanup is needed for uds-threads interfaces given many functions should return void or be removed entirely because they amount to obfuscation via wrappers. 
Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/Makefile | 1 - drivers/md/dm-vdo/thread-cond-var.c | 46 ----------------------- drivers/md/dm-vdo/uds-threads.c | 50 +++++++++++++++++++++++++ drivers/md/dm-vdo/uds-threads.h | 57 +++++++++-------------------- 4 files changed, 68 insertions(+), 86 deletions(-) delete mode 100644 drivers/md/dm-vdo/thread-cond-var.c diff --git a/drivers/md/dm-vdo/Makefile b/drivers/md/dm-vdo/Makefile index db9a7a16765ccd..bb57ce9e128585 100644 --- a/drivers/md/dm-vdo/Makefile +++ b/drivers/md/dm-vdo/Makefile @@ -35,7 +35,6 @@ dm-vdo-objs := \ status-codes.o \ string-utils.o \ sysfs.o \ - thread-cond-var.o \ thread-device.o \ thread-registry.o \ uds-sysfs.o \ diff --git a/drivers/md/dm-vdo/thread-cond-var.c b/drivers/md/dm-vdo/thread-cond-var.c deleted file mode 100644 index ed7f0b79ca0a89..00000000000000 --- a/drivers/md/dm-vdo/thread-cond-var.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2023 Red Hat - */ - -#include -#include - -#include "errors.h" -#include "time-utils.h" -#include "uds-threads.h" - -int uds_init_cond(struct cond_var *cv) -{ - init_waitqueue_head(&cv->wait_queue); - return UDS_SUCCESS; -} - -int uds_signal_cond(struct cond_var *cv) -{ - wake_up(&cv->wait_queue); - return UDS_SUCCESS; -} - -int uds_broadcast_cond(struct cond_var *cv) -{ - wake_up_all(&cv->wait_queue); - return UDS_SUCCESS; -} - -int uds_wait_cond(struct cond_var *cv, struct mutex *mutex) -{ - DEFINE_WAIT(__wait); - - prepare_to_wait(&cv->wait_queue, &__wait, TASK_IDLE); - uds_unlock_mutex(mutex); - schedule(); - finish_wait(&cv->wait_queue, &__wait); - uds_lock_mutex(mutex); - return UDS_SUCCESS; -} - -int uds_destroy_cond(struct cond_var *cv) -{ - return UDS_SUCCESS; -} diff --git a/drivers/md/dm-vdo/uds-threads.c b/drivers/md/dm-vdo/uds-threads.c index 769c783e342a2f..daf20796d3dc17 100644 --- a/drivers/md/dm-vdo/uds-threads.c +++ b/drivers/md/dm-vdo/uds-threads.c @@ -136,10 +136,49 @@ int uds_join_threads(struct thread *thread) return UDS_SUCCESS; } +static inline int __must_check uds_initialize_semaphore(struct semaphore *semaphore, + unsigned int value) +{ + sema_init(semaphore, value); + return UDS_SUCCESS; +} + +static inline int uds_destroy_semaphore(struct semaphore *semaphore) +{ + return UDS_SUCCESS; +} + +static inline void uds_acquire_semaphore(struct semaphore *semaphore) +{ + /* + * Do not use down(semaphore). Instead use down_interruptible so that + * we do not get 120 second stall messages in kern.log. + */ + while (down_interruptible(semaphore) != 0) { + /* + * If we're called from a user-mode process (e.g., "dmsetup + * remove") while waiting for an operation that may take a + * while (e.g., UDS index save), and a signal is sent (SIGINT, + * SIGUSR2), then down_interruptible will not block. If that + * happens, sleep briefly to avoid keeping the CPU locked up in + * this loop. We could just call cond_resched, but then we'd + * still keep consuming CPU time slices and swamp other threads + * trying to do computational work. [VDO-4980] + */ + fsleep(1000); + } +} + +static inline void uds_release_semaphore(struct semaphore *semaphore) +{ + up(semaphore); +} + int uds_initialize_barrier(struct barrier *barrier, unsigned int thread_count) { int result; + /* FIXME: must cleanup, uds_initialize_semaphore never fails! 
*/ result = uds_initialize_semaphore(&barrier->mutex, 1); if (result != UDS_SUCCESS) return result; @@ -181,3 +220,14 @@ int uds_enter_barrier(struct barrier *barrier) return UDS_SUCCESS; } + +void uds_wait_cond(struct cond_var *cv, struct mutex *mutex) +{ + DEFINE_WAIT(__wait); + + prepare_to_wait(&cv->wait_queue, &__wait, TASK_IDLE); + uds_unlock_mutex(mutex); + schedule(); + finish_wait(&cv->wait_queue, &__wait); + uds_lock_mutex(mutex); +} diff --git a/drivers/md/dm-vdo/uds-threads.h b/drivers/md/dm-vdo/uds-threads.h index 9f3bf799138394..c020413816b4f1 100644 --- a/drivers/md/dm-vdo/uds-threads.h +++ b/drivers/md/dm-vdo/uds-threads.h @@ -47,69 +47,48 @@ int __must_check uds_initialize_barrier(struct barrier *barrier, int uds_destroy_barrier(struct barrier *barrier); int uds_enter_barrier(struct barrier *barrier); -int __must_check uds_init_cond(struct cond_var *cond); -int uds_signal_cond(struct cond_var *cond); -int uds_broadcast_cond(struct cond_var *cond); -int uds_wait_cond(struct cond_var *cond, struct mutex *mutex); -int uds_destroy_cond(struct cond_var *cond); - -static inline int __must_check uds_init_mutex(struct mutex *mutex) +static inline int __must_check uds_init_cond(struct cond_var *cv) { - mutex_init(mutex); + init_waitqueue_head(&cv->wait_queue); return UDS_SUCCESS; } -static inline int uds_destroy_mutex(struct mutex *mutex) +static inline void uds_signal_cond(struct cond_var *cv) { - return UDS_SUCCESS; + wake_up(&cv->wait_queue); } -static inline void uds_lock_mutex(struct mutex *mutex) +static inline void uds_broadcast_cond(struct cond_var *cv) { - mutex_lock(mutex); + wake_up_all(&cv->wait_queue); } -static inline void uds_unlock_mutex(struct mutex *mutex) +void uds_wait_cond(struct cond_var *cond, struct mutex *mutex); + +/* FIXME: all below wrappers should be removed! */ + +static inline void uds_destroy_cond(struct cond_var *cv) { - mutex_unlock(mutex); } -static inline int __must_check uds_initialize_semaphore(struct semaphore *semaphore, - unsigned int value) +static inline int __must_check uds_init_mutex(struct mutex *mutex) { - sema_init(semaphore, value); + mutex_init(mutex); return UDS_SUCCESS; } -static inline int uds_destroy_semaphore(struct semaphore *semaphore) +static inline void uds_destroy_mutex(struct mutex *mutex) { - return UDS_SUCCESS; } -static inline void uds_acquire_semaphore(struct semaphore *semaphore) +static inline void uds_lock_mutex(struct mutex *mutex) { - /* - * Do not use down(semaphore). Instead use down_interruptible so that - * we do not get 120 second stall messages in kern.log. - */ - while (down_interruptible(semaphore) != 0) { - /* - * If we're called from a user-mode process (e.g., "dmsetup - * remove") while waiting for an operation that may take a - * while (e.g., UDS index save), and a signal is sent (SIGINT, - * SIGUSR2), then down_interruptible will not block. If that - * happens, sleep briefly to avoid keeping the CPU locked up in - * this loop. We could just call cond_resched, but then we'd - * still keep consuming CPU time slices and swamp other threads - * trying to do computational work. 
[VDO-4980] - */ - fsleep(1000); - } + mutex_lock(mutex); } -static inline void uds_release_semaphore(struct semaphore *semaphore) +static inline void uds_unlock_mutex(struct mutex *mutex) { - up(semaphore); + mutex_unlock(mutex); } #endif /* UDS_THREADS_H */ From ed67c91ebae31089c273027b862c87dd5bafebc6 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 9 Feb 2024 10:10:03 -0600 Subject: [PATCH 0884/1406] dm vdo: rename uds-threads.[ch] to thread-utils.[ch] Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/Makefile | 2 +- drivers/md/dm-vdo/funnel-requestqueue.c | 2 +- drivers/md/dm-vdo/indexer/config.c | 2 +- drivers/md/dm-vdo/indexer/index-page-map.c | 2 +- drivers/md/dm-vdo/indexer/index-session.h | 2 +- drivers/md/dm-vdo/indexer/sparse-cache.c | 2 +- drivers/md/dm-vdo/indexer/volume-index.c | 2 +- drivers/md/dm-vdo/indexer/volume-index.h | 2 +- drivers/md/dm-vdo/indexer/volume.c | 2 +- drivers/md/dm-vdo/indexer/volume.h | 2 +- drivers/md/dm-vdo/logger.c | 2 +- drivers/md/dm-vdo/status-codes.c | 2 +- drivers/md/dm-vdo/{uds-threads.c => thread-utils.c} | 2 +- drivers/md/dm-vdo/{uds-threads.h => thread-utils.h} | 7 +++---- 14 files changed, 16 insertions(+), 17 deletions(-) rename drivers/md/dm-vdo/{uds-threads.c => thread-utils.c} (99%) rename drivers/md/dm-vdo/{uds-threads.h => thread-utils.h} (95%) diff --git a/drivers/md/dm-vdo/Makefile b/drivers/md/dm-vdo/Makefile index bb57ce9e128585..199f5d564724ae 100644 --- a/drivers/md/dm-vdo/Makefile +++ b/drivers/md/dm-vdo/Makefile @@ -37,8 +37,8 @@ dm-vdo-objs := \ sysfs.o \ thread-device.o \ thread-registry.o \ + thread-utils.o \ uds-sysfs.o \ - uds-threads.o \ vdo.o \ vio.o \ wait-queue.o \ diff --git a/drivers/md/dm-vdo/funnel-requestqueue.c b/drivers/md/dm-vdo/funnel-requestqueue.c index c8ba04c1089c7c..e7a3a49622959b 100644 --- a/drivers/md/dm-vdo/funnel-requestqueue.c +++ b/drivers/md/dm-vdo/funnel-requestqueue.c @@ -12,7 +12,7 @@ #include "funnel-queue.h" #include "logger.h" #include "memory-alloc.h" -#include "uds-threads.h" +#include "thread-utils.h" /* * This queue will attempt to handle requests in reasonably sized batches instead of reacting diff --git a/drivers/md/dm-vdo/indexer/config.c b/drivers/md/dm-vdo/indexer/config.c index 88d42897137141..350075ba69b11e 100644 --- a/drivers/md/dm-vdo/indexer/config.c +++ b/drivers/md/dm-vdo/indexer/config.c @@ -9,7 +9,7 @@ #include "../memory-alloc.h" #include "../numeric.h" #include "../string-utils.h" -#include "../uds-threads.h" +#include "../thread-utils.h" static const u8 INDEX_CONFIG_MAGIC[] = "ALBIC"; static const u8 INDEX_CONFIG_VERSION_6_02[] = "06.02"; diff --git a/drivers/md/dm-vdo/indexer/index-page-map.c b/drivers/md/dm-vdo/indexer/index-page-map.c index 14d2e9912e287d..7857fd80b2c99f 100644 --- a/drivers/md/dm-vdo/indexer/index-page-map.c +++ b/drivers/md/dm-vdo/indexer/index-page-map.c @@ -11,7 +11,7 @@ #include "../numeric.h" #include "../permassert.h" #include "../string-utils.h" -#include "../uds-threads.h" +#include "../thread-utils.h" #include "hash-utils.h" #include "uds.h" diff --git a/drivers/md/dm-vdo/indexer/index-session.h b/drivers/md/dm-vdo/indexer/index-session.h index 60cf15ba7b8e53..d6a56b46a8cd70 100644 --- a/drivers/md/dm-vdo/indexer/index-session.h +++ b/drivers/md/dm-vdo/indexer/index-session.h @@ -9,7 +9,7 @@ #include #include -#include "../uds-threads.h" +#include "../thread-utils.h" #include "config.h" #include "uds.h" diff --git a/drivers/md/dm-vdo/indexer/sparse-cache.c b/drivers/md/dm-vdo/indexer/sparse-cache.c index 
cc2a65f5fc0949..1b6aa3e903ed81 100644 --- a/drivers/md/dm-vdo/indexer/sparse-cache.c +++ b/drivers/md/dm-vdo/indexer/sparse-cache.c @@ -15,7 +15,7 @@ #include "../logger.h" #include "../memory-alloc.h" #include "../permassert.h" -#include "../uds-threads.h" +#include "../thread-utils.h" /* * Since the cache is small, it is implemented as a simple array of cache entries. Searching for a diff --git a/drivers/md/dm-vdo/indexer/volume-index.c b/drivers/md/dm-vdo/indexer/volume-index.c index f7a516fbec5c21..ea309c530c8bf7 100644 --- a/drivers/md/dm-vdo/indexer/volume-index.c +++ b/drivers/md/dm-vdo/indexer/volume-index.c @@ -15,7 +15,7 @@ #include "../memory-alloc.h" #include "../numeric.h" #include "../permassert.h" -#include "../uds-threads.h" +#include "../thread-utils.h" #include "config.h" #include "geometry.h" diff --git a/drivers/md/dm-vdo/indexer/volume-index.h b/drivers/md/dm-vdo/indexer/volume-index.h index d26c32a95e9ea0..631d3d99ac36a7 100644 --- a/drivers/md/dm-vdo/indexer/volume-index.h +++ b/drivers/md/dm-vdo/indexer/volume-index.h @@ -8,7 +8,7 @@ #include -#include "../uds-threads.h" +#include "../thread-utils.h" #include "config.h" #include "delta-index.h" diff --git a/drivers/md/dm-vdo/indexer/volume.c b/drivers/md/dm-vdo/indexer/volume.c index a501334429dc27..c57a33a250b7da 100644 --- a/drivers/md/dm-vdo/indexer/volume.c +++ b/drivers/md/dm-vdo/indexer/volume.c @@ -14,7 +14,7 @@ #include "../memory-alloc.h" #include "../permassert.h" #include "../string-utils.h" -#include "../uds-threads.h" +#include "../thread-utils.h" #include "chapter-index.h" #include "config.h" diff --git a/drivers/md/dm-vdo/indexer/volume.h b/drivers/md/dm-vdo/indexer/volume.h index 53c85664845272..e3241c9554471e 100644 --- a/drivers/md/dm-vdo/indexer/volume.h +++ b/drivers/md/dm-vdo/indexer/volume.h @@ -12,7 +12,7 @@ #include #include "../permassert.h" -#include "../uds-threads.h" +#include "../thread-utils.h" #include "chapter-index.h" #include "config.h" diff --git a/drivers/md/dm-vdo/logger.c b/drivers/md/dm-vdo/logger.c index 1efbf8d52f2cfd..ff1c570f81bf5d 100644 --- a/drivers/md/dm-vdo/logger.c +++ b/drivers/md/dm-vdo/logger.c @@ -12,7 +12,7 @@ #include #include "thread-device.h" -#include "uds-threads.h" +#include "thread-utils.h" struct priority_name { const char *name; diff --git a/drivers/md/dm-vdo/status-codes.c b/drivers/md/dm-vdo/status-codes.c index b4d7eb7f94ff06..d77bc5e4a99a3f 100644 --- a/drivers/md/dm-vdo/status-codes.c +++ b/drivers/md/dm-vdo/status-codes.c @@ -8,7 +8,7 @@ #include "errors.h" #include "logger.h" #include "permassert.h" -#include "uds-threads.h" +#include "thread-utils.h" const struct error_info vdo_status_list[] = { { "VDO_NOT_IMPLEMENTED", "Not implemented" }, diff --git a/drivers/md/dm-vdo/uds-threads.c b/drivers/md/dm-vdo/thread-utils.c similarity index 99% rename from drivers/md/dm-vdo/uds-threads.c rename to drivers/md/dm-vdo/thread-utils.c index daf20796d3dc17..ba2606ad172195 100644 --- a/drivers/md/dm-vdo/uds-threads.c +++ b/drivers/md/dm-vdo/thread-utils.c @@ -3,7 +3,7 @@ * Copyright 2023 Red Hat */ -#include "uds-threads.h" +#include "thread-utils.h" #include #include diff --git a/drivers/md/dm-vdo/uds-threads.h b/drivers/md/dm-vdo/thread-utils.h similarity index 95% rename from drivers/md/dm-vdo/uds-threads.h rename to drivers/md/dm-vdo/thread-utils.h index c020413816b4f1..9c607a51251133 100644 --- a/drivers/md/dm-vdo/uds-threads.h +++ b/drivers/md/dm-vdo/thread-utils.h @@ -3,8 +3,8 @@ * Copyright 2023 Red Hat */ -#ifndef UDS_THREADS_H -#define 
UDS_THREADS_H
+#ifndef THREAD_UTILS_H
+#define THREAD_UTILS_H
 
 #include
 #include
@@ -14,7 +14,6 @@
 #include
 
 #include "errors.h"
-#include "time-utils.h"
 
 /* Thread and synchronization utilities for UDS */
 
@@ -91,4 +90,4 @@ static inline void uds_unlock_mutex(struct mutex *mutex)
 	mutex_unlock(mutex);
 }
 
-#endif /* UDS_THREADS_H */
+#endif /* THREAD_UTILS_H */

From dc1bca550701ebf1cbf06bf9086ab83829896b5d Mon Sep 17 00:00:00 2001
From: Mike Snitzer
Date: Fri, 9 Feb 2024 10:46:04 -0600
Subject: [PATCH 0885/1406] dm vdo thread-utils: eliminate uds_*_semaphore interfaces

The implementation of the thread 'barrier' data structure does not
require overdone private semaphore wrappers. Also rename the barrier
structure's 'mutex' member (a semaphore) to 'lock'.

Signed-off-by: Mike Snitzer
---
 drivers/md/dm-vdo/thread-utils.c | 55 ++++++++------------------------
 drivers/md/dm-vdo/thread-utils.h |  4 +--
 2 files changed, 15 insertions(+), 44 deletions(-)

diff --git a/drivers/md/dm-vdo/thread-utils.c b/drivers/md/dm-vdo/thread-utils.c
index ba2606ad172195..e63d1a0497b088 100644
--- a/drivers/md/dm-vdo/thread-utils.c
+++ b/drivers/md/dm-vdo/thread-utils.c
@@ -136,19 +136,7 @@ int uds_join_threads(struct thread *thread)
 	return UDS_SUCCESS;
 }
 
-static inline int __must_check uds_initialize_semaphore(struct semaphore *semaphore,
-							unsigned int value)
-{
-	sema_init(semaphore, value);
-	return UDS_SUCCESS;
-}
-
-static inline int uds_destroy_semaphore(struct semaphore *semaphore)
-{
-	return UDS_SUCCESS;
-}
-
-static inline void uds_acquire_semaphore(struct semaphore *semaphore)
+static inline void __down(struct semaphore *semaphore)
 {
 	/*
 	 * Do not use down(semaphore). Instead use down_interruptible so that
@@ -169,53 +157,36 @@ int uds_join_threads(struct thread *thread)
 	return UDS_SUCCESS;
 }
 
-static inline void uds_release_semaphore(struct semaphore *semaphore)
-{
-	up(semaphore);
-}
-
 int uds_initialize_barrier(struct barrier *barrier, unsigned int thread_count)
 {
-	int result;
-
-	/* FIXME: must cleanup, uds_initialize_semaphore never fails! */
-	result = uds_initialize_semaphore(&barrier->mutex, 1);
-	if (result != UDS_SUCCESS)
-		return result;
-
+	sema_init(&barrier->lock, 1);
 	barrier->arrived = 0;
 	barrier->thread_count = thread_count;
-	return uds_initialize_semaphore(&barrier->wait, 0);
+	sema_init(&barrier->wait, 0);
+
+	return UDS_SUCCESS;
 }
 
 int uds_destroy_barrier(struct barrier *barrier)
 {
-	int result;
-
-	result = uds_destroy_semaphore(&barrier->mutex);
-	if (result != UDS_SUCCESS)
-		return result;
-
-	return uds_destroy_semaphore(&barrier->wait);
+	return UDS_SUCCESS;
 }
 
 int uds_enter_barrier(struct barrier *barrier)
 {
-	bool last_thread;
-
-	uds_acquire_semaphore(&barrier->mutex);
-	last_thread = (++barrier->arrived == barrier->thread_count);
-	if (last_thread) {
+	__down(&barrier->lock);
+	if (++barrier->arrived == barrier->thread_count) {
+		/* last thread */
 		int i;
 
 		for (i = 1; i < barrier->thread_count; i++)
-			uds_release_semaphore(&barrier->wait);
+			up(&barrier->wait);
 
 		barrier->arrived = 0;
-		uds_release_semaphore(&barrier->mutex);
+		up(&barrier->lock);
 	} else {
-		uds_release_semaphore(&barrier->mutex);
-		uds_acquire_semaphore(&barrier->wait);
+		up(&barrier->lock);
+		__down(&barrier->wait);
 	}
 
 	return UDS_SUCCESS;
diff --git a/drivers/md/dm-vdo/thread-utils.h b/drivers/md/dm-vdo/thread-utils.h
index 9c607a51251133..b64435ac5bd991 100644
--- a/drivers/md/dm-vdo/thread-utils.h
+++ b/drivers/md/dm-vdo/thread-utils.h
@@ -24,8 +24,8 @@ struct cond_var {
 struct thread;
 
 struct barrier {
-	/* Mutex for this barrier object */
-	struct semaphore mutex;
+	/* Lock for this barrier object */
+	struct semaphore lock;
 	/* Semaphore for threads waiting at the barrier */
 	struct semaphore wait;
 	/* Number of threads which have arrived */

From e367bc659821d802201b5bfa594ba523fc06f6aa Mon Sep 17 00:00:00 2001
From: Mike Snitzer
Date: Fri, 9 Feb 2024 11:09:14 -0600
Subject: [PATCH 0886/1406] dm vdo thread-utils: push 'barrier' down to indexer's sparse-cache

The VDO indexer's sparse-cache is the only user of the 'barrier' data
structure, so make it private to the sparse-cache.

Signed-off-by: Mike Snitzer
---
 drivers/md/dm-vdo/indexer/sparse-cache.c | 69 +++++++++++++++++++++++-
 drivers/md/dm-vdo/thread-utils.c         | 56 -------------------
 drivers/md/dm-vdo/thread-utils.h         | 16 ------
 3 files changed, 68 insertions(+), 73 deletions(-)

diff --git a/drivers/md/dm-vdo/indexer/sparse-cache.c b/drivers/md/dm-vdo/indexer/sparse-cache.c
index 1b6aa3e903ed81..e3a61717cc8621 100644
--- a/drivers/md/dm-vdo/indexer/sparse-cache.c
+++ b/drivers/md/dm-vdo/indexer/sparse-cache.c
@@ -6,6 +6,7 @@
 #include "sparse-cache.h"
 
 #include
+#include
 
 #include "chapter-index.h"
@@ -15,7 +16,6 @@
 #include "../logger.h"
 #include "../memory-alloc.h"
 #include "../permassert.h"
-#include "../thread-utils.h"
 
 /*
  * Since the cache is small, it is implemented as a simple array of cache entries. Searching for a
@@ -142,6 +142,17 @@ struct search_list {
 	struct cached_chapter_index *entries[];
 };
 
+struct barrier {
+	/* Lock for this barrier object */
+	struct semaphore lock;
+	/* Semaphore for threads waiting at this barrier */
+	struct semaphore wait;
+	/* Number of threads which have arrived */
+	int arrived;
+	/* Total number of threads using this barrier */
+	int thread_count;
+};
+
 struct sparse_cache {
 	const struct index_geometry *geometry;
 	unsigned int capacity;
@@ -157,6 +168,62 @@ struct sparse_cache {
 	struct cached_chapter_index chapters[];
 };
 
+static int uds_initialize_barrier(struct barrier *barrier, unsigned int thread_count)
+{
+	sema_init(&barrier->lock, 1);
+	barrier->arrived = 0;
+	barrier->thread_count = thread_count;
+	sema_init(&barrier->wait, 0);
+
+	return UDS_SUCCESS;
+}
+
+static int uds_destroy_barrier(struct barrier *barrier)
+{
+	return UDS_SUCCESS;
+}
+
+static inline void __down(struct semaphore *semaphore)
+{
+	/*
+	 * Do not use down(semaphore). Instead use down_interruptible so that
+	 * we do not get 120 second stall messages in kern.log.
+	 */
+	while (down_interruptible(semaphore) != 0) {
+		/*
+		 * If we're called from a user-mode process (e.g., "dmsetup
+		 * remove") while waiting for an operation that may take a
+		 * while (e.g., UDS index save), and a signal is sent (SIGINT,
+		 * SIGUSR2), then down_interruptible will not block. If that
+		 * happens, sleep briefly to avoid keeping the CPU locked up in
+		 * this loop. We could just call cond_resched, but then we'd
+		 * still keep consuming CPU time slices and swamp other threads
+		 * trying to do computational work. [VDO-4980]
+		 */
+		fsleep(1000);
+	}
+}
+
+static int uds_enter_barrier(struct barrier *barrier)
+{
+	__down(&barrier->lock);
+	if (++barrier->arrived == barrier->thread_count) {
+		/* last thread */
+		int i;
+
+		for (i = 1; i < barrier->thread_count; i++)
+			up(&barrier->wait);
+
+		barrier->arrived = 0;
+		up(&barrier->lock);
+	} else {
+		up(&barrier->lock);
+		__down(&barrier->wait);
+	}
+
+	return UDS_SUCCESS;
+}
+
 static int __must_check initialize_cached_chapter_index(struct cached_chapter_index *chapter,
 							 const struct index_geometry *geometry)
 {
diff --git a/drivers/md/dm-vdo/thread-utils.c b/drivers/md/dm-vdo/thread-utils.c
index e63d1a0497b088..5d371bfba8ff4e 100644
--- a/drivers/md/dm-vdo/thread-utils.c
+++ b/drivers/md/dm-vdo/thread-utils.c
@@ -136,62 +136,6 @@ int uds_join_threads(struct thread *thread)
 	return UDS_SUCCESS;
 }
 
-static inline void __down(struct semaphore *semaphore)
-{
-	/*
-	 * Do not use down(semaphore). Instead use down_interruptible so that
-	 * we do not get 120 second stall messages in kern.log.
-	 */
-	while (down_interruptible(semaphore) != 0) {
-		/*
-		 * If we're called from a user-mode process (e.g., "dmsetup
-		 * remove") while waiting for an operation that may take a
-		 * while (e.g., UDS index save), and a signal is sent (SIGINT,
-		 * SIGUSR2), then down_interruptible will not block. If that
-		 * happens, sleep briefly to avoid keeping the CPU locked up in
-		 * this loop. We could just call cond_resched, but then we'd
-		 * still keep consuming CPU time slices and swamp other threads
-		 * trying to do computational work. [VDO-4980]
-		 */
-		fsleep(1000);
-	}
-}
-
-int uds_initialize_barrier(struct barrier *barrier, unsigned int thread_count)
-{
-	sema_init(&barrier->lock, 1);
-	barrier->arrived = 0;
-	barrier->thread_count = thread_count;
-	sema_init(&barrier->wait, 0);
-
-	return UDS_SUCCESS;
-}
-
-int uds_destroy_barrier(struct barrier *barrier)
-{
-	return UDS_SUCCESS;
-}
-
-int uds_enter_barrier(struct barrier *barrier)
-{
-	__down(&barrier->lock);
-	if (++barrier->arrived == barrier->thread_count) {
-		/* last thread */
-		int i;
-
-		for (i = 1; i < barrier->thread_count; i++)
-			up(&barrier->wait);
-
-		barrier->arrived = 0;
-		up(&barrier->lock);
-	} else {
-		up(&barrier->lock);
-		__down(&barrier->wait);
-	}
-
-	return UDS_SUCCESS;
-}
-
 void uds_wait_cond(struct cond_var *cv, struct mutex *mutex)
 {
 	DEFINE_WAIT(__wait);
diff --git a/drivers/md/dm-vdo/thread-utils.h b/drivers/md/dm-vdo/thread-utils.h
index b64435ac5bd991..191572b777bf46 100644
--- a/drivers/md/dm-vdo/thread-utils.h
+++ b/drivers/md/dm-vdo/thread-utils.h
@@ -23,17 +23,6 @@ struct cond_var {
 
 struct thread;
 
-struct barrier {
-	/* Lock for this barrier object */
-	struct semaphore lock;
-	/* Semaphore for threads waiting at the barrier */
-	struct semaphore wait;
-	/* Number of threads which have arrived */
-	int arrived;
-	/* Total number of threads using this barrier */
-	int thread_count;
-};
-
 int __must_check uds_create_thread(void (*thread_function)(void *), void *thread_data,
 				   const char *name, struct thread **new_thread);
 
@@ -41,11 +30,6 @@ void uds_perform_once(atomic_t *once_state, void (*function) (void));
 
 int uds_join_threads(struct thread *thread);
 
-int __must_check uds_initialize_barrier(struct barrier *barrier,
-					unsigned int thread_count);
-int uds_destroy_barrier(struct barrier *barrier);
-int uds_enter_barrier(struct barrier *barrier);
-
 static inline int __must_check uds_init_cond(struct cond_var *cv)
 {
 	init_waitqueue_head(&cv->wait_queue);
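The two-semaphore barrier that this patch makes private to sparse-cache.c is easier to follow in isolation. Below is a hypothetical userspace sketch of the same algorithm, using POSIX semaphores and pthreads in place of the kernel's struct semaphore; it illustrates the technique only and is not code from the driver (the kernel version additionally replaces the plain wait with the down_interruptible()/fsleep() loop shown above to avoid hung-task stalls).

/* Hypothetical userspace analogue of the sparse-cache barrier; build with -pthread. */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

struct threads_barrier {
	sem_t lock;       /* protects 'arrived', like barrier->lock in the driver */
	sem_t wait;       /* holds back all but the last arriving thread */
	int arrived;      /* threads which have reached the barrier */
	int thread_count; /* total threads participating */
};

static void initialize_threads_barrier(struct threads_barrier *barrier,
				       int thread_count)
{
	sem_init(&barrier->lock, 0, 1);
	sem_init(&barrier->wait, 0, 0);
	barrier->arrived = 0;
	barrier->thread_count = thread_count;
}

static void enter_threads_barrier(struct threads_barrier *barrier)
{
	sem_wait(&barrier->lock);
	if (++barrier->arrived == barrier->thread_count) {
		/* Last thread: release the other waiters and reset. */
		for (int i = 1; i < barrier->thread_count; i++)
			sem_post(&barrier->wait);
		barrier->arrived = 0;
		sem_post(&barrier->lock);
	} else {
		sem_post(&barrier->lock);
		sem_wait(&barrier->wait);
	}
}

static struct threads_barrier demo_barrier;

static void *zone_thread(void *arg)
{
	long id = (long)arg;

	printf("zone %ld: before barrier\n", id);
	enter_threads_barrier(&demo_barrier);
	printf("zone %ld: after barrier\n", id); /* prints only after all threads arrive */
	return NULL;
}

int main(void)
{
	enum { ZONES = 4 };
	pthread_t threads[ZONES];

	initialize_threads_barrier(&demo_barrier, ZONES);
	for (long i = 0; i < ZONES; i++)
		pthread_create(&threads[i], NULL, zone_thread, (void *)i);
	for (int i = 0; i < ZONES; i++)
		pthread_join(threads[i], NULL);
	return 0;
}

In the driver, uds_update_sparse_cache() enters begin_update_barrier before the zone-zero thread modifies the cache and end_update_barrier afterwards, so every other zone thread is parked between the two barriers for the duration of the update.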
From 5da00cc83b1c420789fa88bbad086f85f044e1c5 Mon Sep 17 00:00:00 2001
From: Mike Snitzer
Date: Fri, 9 Feb 2024 11:30:00 -0600
Subject: [PATCH 0887/1406] dm vdo indexer sparse-cache: cleanup threads_barrier code

Rename 'barrier' to 'threads_barrier', remove the useless
uds_destroy_barrier(), return void from the remaining methods, and
clean up uds_make_sparse_cache() accordingly. Also remove the uds_
prefix from the two remaining threads_barrier functions.

Signed-off-by: Mike Snitzer
---
 drivers/md/dm-vdo/indexer/sparse-cache.c | 60 ++++++++----------------
 1 file changed, 19 insertions(+), 41 deletions(-)

diff --git a/drivers/md/dm-vdo/indexer/sparse-cache.c b/drivers/md/dm-vdo/indexer/sparse-cache.c
index e3a61717cc8621..dded013426e3ff 100644
--- a/drivers/md/dm-vdo/indexer/sparse-cache.c
+++ b/drivers/md/dm-vdo/indexer/sparse-cache.c
@@ -142,7 +142,7 @@ struct search_list {
 	struct cached_chapter_index *entries[];
 };
 
-struct barrier {
+struct threads_barrier {
 	/* Lock for this barrier object */
 	struct semaphore lock;
 	/* Semaphore for threads waiting at this barrier */
@@ -162,25 +162,19 @@ struct sparse_cache {
 	struct search_list *search_lists[MAX_ZONES];
 	struct cached_chapter_index **scratch_entries;
 
-	struct barrier begin_update_barrier;
-	struct barrier end_update_barrier;
+	struct threads_barrier begin_update_barrier;
+	struct threads_barrier end_update_barrier;
 
 	struct cached_chapter_index chapters[];
 };
 
-static int uds_initialize_barrier(struct barrier *barrier, unsigned int thread_count)
+static void initialize_threads_barrier(struct threads_barrier *barrier,
+				       unsigned int thread_count)
 {
 	sema_init(&barrier->lock, 1);
 	barrier->arrived = 0;
 	barrier->thread_count = thread_count;
 	sema_init(&barrier->wait, 0);
-
-	return UDS_SUCCESS;
-}
-
-static int uds_destroy_barrier(struct barrier *barrier)
-{
-	return UDS_SUCCESS;
 }
 
 static inline void __down(struct semaphore *semaphore)
@@ -204,7 +198,7 @@ static inline void __down(struct semaphore *semaphore)
 	}
 }
 
-static int uds_enter_barrier(struct barrier *barrier)
+static void enter_threads_barrier(struct threads_barrier *barrier)
 {
 	__down(&barrier->lock);
 	if (++barrier->arrived == barrier->thread_count) {
@@ -220,8 +214,6 @@ static int uds_enter_barrier(struct barrier *barrier)
 		up(&barrier->lock);
 		__down(&barrier->wait);
 	}
-
-	return UDS_SUCCESS;
 }
 
 static int __must_check initialize_cached_chapter_index(struct cached_chapter_index *chapter,
@@ -288,44 +280,32 @@ int uds_make_sparse_cache(const struct index_geometry *geometry, unsigned int ca
 	 */
 	cache->skip_threshold = (SKIP_SEARCH_THRESHOLD / zone_count);
 
-	result = uds_initialize_barrier(&cache->begin_update_barrier, zone_count);
-	if (result != UDS_SUCCESS) {
-		uds_free_sparse_cache(cache);
-		return result;
-	}
-
-	result = uds_initialize_barrier(&cache->end_update_barrier, zone_count);
-	if (result != UDS_SUCCESS) {
-		uds_free_sparse_cache(cache);
-		return result;
-	}
+	initialize_threads_barrier(&cache->begin_update_barrier, zone_count);
+	initialize_threads_barrier(&cache->end_update_barrier, zone_count);
 
 	for (i = 0; i < capacity; i++) {
 		result = initialize_cached_chapter_index(&cache->chapters[i], geometry);
-		if (result != UDS_SUCCESS) {
-			uds_free_sparse_cache(cache);
-			return result;
-		}
+		if (result != UDS_SUCCESS)
+			goto out;
 	}
 
	for (i = 0; i < zone_count; i++) {
 		result = make_search_list(cache, &cache->search_lists[i]);
-		if (result != UDS_SUCCESS) {
-			uds_free_sparse_cache(cache);
-			return result;
-		}
+		if (result != UDS_SUCCESS)
+			goto out;
 	}
 
 	/* purge_search_list() needs some temporary lists for sorting. 
*/ result = uds_allocate(capacity * 2, struct cached_chapter_index *, "scratch entries", &cache->scratch_entries); - if (result != UDS_SUCCESS) { - uds_free_sparse_cache(cache); - return result; - } + if (result != UDS_SUCCESS) + goto out; *cache_ptr = cache; return UDS_SUCCESS; +out: + uds_free_sparse_cache(cache); + return result; } static inline void set_skip_search(struct cached_chapter_index *chapter, @@ -382,8 +362,6 @@ void uds_free_sparse_cache(struct sparse_cache *cache) uds_free(cache->chapters[i].page_buffers); } - uds_destroy_barrier(&cache->begin_update_barrier); - uds_destroy_barrier(&cache->end_update_barrier); uds_free(cache); } @@ -526,7 +504,7 @@ int uds_update_sparse_cache(struct index_zone *zone, u64 virtual_chapter) * Wait for every zone thread to reach its corresponding barrier request and invoke this * function before starting to modify the cache. */ - uds_enter_barrier(&cache->begin_update_barrier); + enter_threads_barrier(&cache->begin_update_barrier); /* * This is the start of the critical section: the zone zero thread is captain, effectively @@ -554,7 +532,7 @@ int uds_update_sparse_cache(struct index_zone *zone, u64 virtual_chapter) /* * This is the end of the critical section. All cache invariants must have been restored. */ - uds_enter_barrier(&cache->end_update_barrier); + enter_threads_barrier(&cache->end_update_barrier); return result; } From 0706d4b6a95ee1c55cf38628c0a7c7bffa8706a4 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 9 Feb 2024 12:08:09 -0600 Subject: [PATCH 0888/1406] dm vdo thread-utils: further cleanup of thread functions Change thread function prefix from "uds_" to "vdo_" and fix vdo_join_threads() to return void. Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/funnel-requestqueue.c | 8 ++------ drivers/md/dm-vdo/indexer/index.c | 4 ++-- drivers/md/dm-vdo/indexer/volume.c | 4 ++-- drivers/md/dm-vdo/status-codes.c | 2 +- drivers/md/dm-vdo/thread-utils.c | 9 ++++----- drivers/md/dm-vdo/thread-utils.h | 15 +++++++-------- 6 files changed, 18 insertions(+), 24 deletions(-) diff --git a/drivers/md/dm-vdo/funnel-requestqueue.c b/drivers/md/dm-vdo/funnel-requestqueue.c index e7a3a49622959b..d2b49e39550c91 100644 --- a/drivers/md/dm-vdo/funnel-requestqueue.c +++ b/drivers/md/dm-vdo/funnel-requestqueue.c @@ -219,7 +219,7 @@ int uds_make_request_queue(const char *queue_name, return result; } - result = uds_create_thread(request_queue_worker, queue, queue_name, + result = vdo_create_thread(request_queue_worker, queue, queue_name, &queue->thread); if (result != UDS_SUCCESS) { uds_request_queue_finish(queue); @@ -256,8 +256,6 @@ void uds_request_queue_enqueue(struct uds_request_queue *queue, void uds_request_queue_finish(struct uds_request_queue *queue) { - int result; - if (queue == NULL) return; @@ -272,9 +270,7 @@ void uds_request_queue_finish(struct uds_request_queue *queue) if (queue->started) { wake_up_worker(queue); - result = uds_join_threads(queue->thread); - if (result != UDS_SUCCESS) - uds_log_warning_strerror(result, "Failed to join worker thread"); + vdo_join_threads(queue->thread); } uds_free_funnel_queue(queue->main_queue); diff --git a/drivers/md/dm-vdo/indexer/index.c b/drivers/md/dm-vdo/indexer/index.c index bf25f0cf0ee031..c33c123ffac489 100644 --- a/drivers/md/dm-vdo/indexer/index.c +++ b/drivers/md/dm-vdo/indexer/index.c @@ -743,7 +743,7 @@ static void stop_chapter_writer(struct chapter_writer *writer) uds_unlock_mutex(&writer->mutex); if (writer_thread != NULL) - uds_join_threads(writer_thread); + 
vdo_join_threads(writer_thread); } static void free_chapter_writer(struct chapter_writer *writer) @@ -807,7 +807,7 @@ static int make_chapter_writer(struct uds_index *index, collated_records_size + writer->open_chapter_index->memory_size); - result = uds_create_thread(close_chapters, writer, "writer", &writer->thread); + result = vdo_create_thread(close_chapters, writer, "writer", &writer->thread); if (result != UDS_SUCCESS) { free_chapter_writer(writer); return result; diff --git a/drivers/md/dm-vdo/indexer/volume.c b/drivers/md/dm-vdo/indexer/volume.c index c57a33a250b7da..a978bc51db6b6c 100644 --- a/drivers/md/dm-vdo/indexer/volume.c +++ b/drivers/md/dm-vdo/indexer/volume.c @@ -1648,7 +1648,7 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout } for (i = 0; i < config->read_threads; i++) { - result = uds_create_thread(read_thread_function, (void *) volume, + result = vdo_create_thread(read_thread_function, (void *) volume, "reader", &volume->reader_threads[i]); if (result != UDS_SUCCESS) { uds_free_volume(volume); @@ -1690,7 +1690,7 @@ void uds_free_volume(struct volume *volume) uds_broadcast_cond(&volume->read_threads_cond); uds_unlock_mutex(&volume->read_threads_mutex); for (i = 0; i < volume->read_thread_count; i++) - uds_join_threads(volume->reader_threads[i]); + vdo_join_threads(volume->reader_threads[i]); uds_free(volume->reader_threads); volume->reader_threads = NULL; } diff --git a/drivers/md/dm-vdo/status-codes.c b/drivers/md/dm-vdo/status-codes.c index d77bc5e4a99a3f..efba1ead0acaef 100644 --- a/drivers/md/dm-vdo/status-codes.c +++ b/drivers/md/dm-vdo/status-codes.c @@ -82,7 +82,7 @@ static void do_status_code_registration(void) */ int vdo_register_status_codes(void) { - uds_perform_once(&vdo_status_codes_registered, do_status_code_registration); + vdo_perform_once(&vdo_status_codes_registered, do_status_code_registration); return status_code_registration_result; } diff --git a/drivers/md/dm-vdo/thread-utils.c b/drivers/md/dm-vdo/thread-utils.c index 5d371bfba8ff4e..2b02f8d871869b 100644 --- a/drivers/md/dm-vdo/thread-utils.c +++ b/drivers/md/dm-vdo/thread-utils.c @@ -34,7 +34,7 @@ enum { }; /* Run a function once only, and record that fact in the atomic value. 
*/ -void uds_perform_once(atomic_t *once, void (*function)(void)) +void vdo_perform_once(atomic_t *once, void (*function)(void)) { for (;;) { switch (atomic_cmpxchg(once, ONCE_NOT_DONE, ONCE_IN_PROGRESS)) { @@ -64,7 +64,7 @@ static int thread_starter(void *arg) struct thread *thread = arg; thread->thread_task = current; - uds_perform_once(&thread_once, thread_init); + vdo_perform_once(&thread_once, thread_init); mutex_lock(&thread_mutex); hlist_add_head(&thread->thread_links, &thread_list); mutex_unlock(&thread_mutex); @@ -75,7 +75,7 @@ static int thread_starter(void *arg) return 0; } -int uds_create_thread(void (*thread_function)(void *), void *thread_data, +int vdo_create_thread(void (*thread_function)(void *), void *thread_data, const char *name, struct thread **new_thread) { char *name_colon = strchr(name, ':'); @@ -124,7 +124,7 @@ int uds_create_thread(void (*thread_function)(void *), void *thread_data, return UDS_SUCCESS; } -int uds_join_threads(struct thread *thread) +void vdo_join_threads(struct thread *thread) { while (wait_for_completion_interruptible(&thread->thread_done)) fsleep(1000); @@ -133,7 +133,6 @@ int uds_join_threads(struct thread *thread) hlist_del(&thread->thread_links); mutex_unlock(&thread_mutex); uds_free(thread); - return UDS_SUCCESS; } void uds_wait_cond(struct cond_var *cv, struct mutex *mutex) diff --git a/drivers/md/dm-vdo/thread-utils.h b/drivers/md/dm-vdo/thread-utils.h index 191572b777bf46..c13608ab07edc5 100644 --- a/drivers/md/dm-vdo/thread-utils.h +++ b/drivers/md/dm-vdo/thread-utils.h @@ -15,20 +15,19 @@ #include "errors.h" -/* Thread and synchronization utilities for UDS */ - -struct cond_var { - wait_queue_head_t wait_queue; -}; +/* Thread and synchronization utilities */ struct thread; -int __must_check uds_create_thread(void (*thread_function)(void *), void *thread_data, +int __must_check vdo_create_thread(void (*thread_function)(void *), void *thread_data, const char *name, struct thread **new_thread); +void vdo_join_threads(struct thread *thread); -void uds_perform_once(atomic_t *once_state, void (*function) (void)); +void vdo_perform_once(atomic_t *once_state, void (*function) (void)); -int uds_join_threads(struct thread *thread); +struct cond_var { + wait_queue_head_t wait_queue; +}; static inline int __must_check uds_init_cond(struct cond_var *cv) { From e8e16de05b19fd3130c60eac729d819bcc9b896e Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 9 Feb 2024 12:16:13 -0600 Subject: [PATCH 0889/1406] dm vdo indexer: rename uds.h to indexer.h Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/data-vio.h | 2 +- drivers/md/dm-vdo/dedupe.c | 2 +- drivers/md/dm-vdo/dedupe.h | 2 +- drivers/md/dm-vdo/encodings.h | 2 +- drivers/md/dm-vdo/funnel-queue.c | 2 +- drivers/md/dm-vdo/funnel-requestqueue.h | 2 +- drivers/md/dm-vdo/indexer/chapter-index.c | 2 +- drivers/md/dm-vdo/indexer/config.h | 2 +- drivers/md/dm-vdo/indexer/delta-index.c | 2 +- drivers/md/dm-vdo/indexer/geometry.c | 2 +- drivers/md/dm-vdo/indexer/geometry.h | 2 +- drivers/md/dm-vdo/indexer/hash-utils.h | 2 +- drivers/md/dm-vdo/indexer/index-layout.h | 2 +- drivers/md/dm-vdo/indexer/index-page-map.c | 2 +- drivers/md/dm-vdo/indexer/index-session.h | 2 +- drivers/md/dm-vdo/indexer/{uds.h => indexer.h} | 6 +++--- drivers/md/dm-vdo/indexer/sparse-cache.h | 2 +- drivers/md/dm-vdo/indexer/volume-index.c | 2 +- drivers/md/dm-vdo/indexer/volume-index.h | 2 +- drivers/md/dm-vdo/indexer/volume.h | 2 +- drivers/md/dm-vdo/uds-sysfs.c | 2 +- drivers/md/dm-vdo/vdo.h | 2 +- 22 files changed, 24 
insertions(+), 24 deletions(-) rename drivers/md/dm-vdo/indexer/{uds.h => indexer.h} (99%) diff --git a/drivers/md/dm-vdo/data-vio.h b/drivers/md/dm-vdo/data-vio.h index 33d5753ee1075a..51324153f622a5 100644 --- a/drivers/md/dm-vdo/data-vio.h +++ b/drivers/md/dm-vdo/data-vio.h @@ -11,7 +11,7 @@ #include #include "permassert.h" -#include "indexer/uds.h" +#include "indexer/indexer.h" #include "block-map.h" #include "completion.h" diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index b819d7228e13b4..e03bf6a0c09ef1 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -132,7 +132,7 @@ #include "permassert.h" #include "string-utils.h" -#include "indexer/uds.h" +#include "indexer/indexer.h" #include "action-manager.h" #include "admin-state.h" diff --git a/drivers/md/dm-vdo/dedupe.h b/drivers/md/dm-vdo/dedupe.h index 1fb5740e5e9bc9..19a6af7dd6e17d 100644 --- a/drivers/md/dm-vdo/dedupe.h +++ b/drivers/md/dm-vdo/dedupe.h @@ -9,7 +9,7 @@ #include #include -#include "indexer/uds.h" +#include "indexer/indexer.h" #include "admin-state.h" #include "constants.h" diff --git a/drivers/md/dm-vdo/encodings.h b/drivers/md/dm-vdo/encodings.h index 4a36c36ef98390..70927befd122fd 100644 --- a/drivers/md/dm-vdo/encodings.h +++ b/drivers/md/dm-vdo/encodings.h @@ -12,7 +12,7 @@ #include #include "numeric.h" -#include "indexer/uds.h" +#include "indexer/indexer.h" #include "constants.h" #include "types.h" diff --git a/drivers/md/dm-vdo/funnel-queue.c b/drivers/md/dm-vdo/funnel-queue.c index 7f5e4f2d1505ec..12dbf7d05adeea 100644 --- a/drivers/md/dm-vdo/funnel-queue.c +++ b/drivers/md/dm-vdo/funnel-queue.c @@ -8,7 +8,7 @@ #include "cpu.h" #include "memory-alloc.h" #include "permassert.h" -#include "indexer/uds.h" +#include "indexer/indexer.h" int uds_make_funnel_queue(struct funnel_queue **queue_ptr) { diff --git a/drivers/md/dm-vdo/funnel-requestqueue.h b/drivers/md/dm-vdo/funnel-requestqueue.h index 88d90ed263e599..6c6c5bf0d61bd5 100644 --- a/drivers/md/dm-vdo/funnel-requestqueue.h +++ b/drivers/md/dm-vdo/funnel-requestqueue.h @@ -6,7 +6,7 @@ #ifndef UDS_REQUEST_QUEUE_H #define UDS_REQUEST_QUEUE_H -#include "indexer/uds.h" +#include "indexer/indexer.h" /* * A simple request queue which will handle new requests in the order in which they are received, diff --git a/drivers/md/dm-vdo/indexer/chapter-index.c b/drivers/md/dm-vdo/indexer/chapter-index.c index 1e52004163ef15..ca1f3ea978b04d 100644 --- a/drivers/md/dm-vdo/indexer/chapter-index.c +++ b/drivers/md/dm-vdo/indexer/chapter-index.c @@ -5,7 +5,7 @@ #include "chapter-index.h" #include "hash-utils.h" -#include "uds.h" +#include "indexer.h" #include "../errors.h" #include "../logger.h" diff --git a/drivers/md/dm-vdo/indexer/config.h b/drivers/md/dm-vdo/indexer/config.h index 7d19863800d6b0..3cfa7a6c35aecd 100644 --- a/drivers/md/dm-vdo/indexer/config.h +++ b/drivers/md/dm-vdo/indexer/config.h @@ -8,7 +8,7 @@ #include "geometry.h" #include "io-factory.h" -#include "uds.h" +#include "indexer.h" /* * The uds_configuration records a variety of parameters used to configure a new UDS index. 
Some diff --git a/drivers/md/dm-vdo/indexer/delta-index.c b/drivers/md/dm-vdo/indexer/delta-index.c index d8494bb03bd21b..0381ab64b07c56 100644 --- a/drivers/md/dm-vdo/indexer/delta-index.c +++ b/drivers/md/dm-vdo/indexer/delta-index.c @@ -20,7 +20,7 @@ #include "../time-utils.h" #include "config.h" -#include "uds.h" +#include "indexer.h" /* * The entries in a delta index could be stored in a single delta list, but to reduce search times diff --git a/drivers/md/dm-vdo/indexer/geometry.c b/drivers/md/dm-vdo/indexer/geometry.c index 11f055c20f6ef8..e73d43de155be2 100644 --- a/drivers/md/dm-vdo/indexer/geometry.c +++ b/drivers/md/dm-vdo/indexer/geometry.c @@ -14,7 +14,7 @@ #include "../permassert.h" #include "delta-index.h" -#include "uds.h" +#include "indexer.h" /* * An index volume is divided into a fixed number of fixed-size chapters, each consisting of a diff --git a/drivers/md/dm-vdo/indexer/geometry.h b/drivers/md/dm-vdo/indexer/geometry.h index 9a4a66ac2e467b..a2ecdb238cf2df 100644 --- a/drivers/md/dm-vdo/indexer/geometry.h +++ b/drivers/md/dm-vdo/indexer/geometry.h @@ -6,7 +6,7 @@ #ifndef UDS_INDEX_GEOMETRY_H #define UDS_INDEX_GEOMETRY_H -#include "uds.h" +#include "indexer.h" /* * The index_geometry records parameters that define the layout of a UDS index volume, and the size and diff --git a/drivers/md/dm-vdo/indexer/hash-utils.h b/drivers/md/dm-vdo/indexer/hash-utils.h index bb679d58707762..e5f01390e37832 100644 --- a/drivers/md/dm-vdo/indexer/hash-utils.h +++ b/drivers/md/dm-vdo/indexer/hash-utils.h @@ -7,7 +7,7 @@ #define UDS_HASH_UTILS_H #include "geometry.h" -#include "uds.h" +#include "indexer.h" #include "../numeric.h" diff --git a/drivers/md/dm-vdo/indexer/index-layout.h b/drivers/md/dm-vdo/indexer/index-layout.h index 84a9eb43a49db6..edb5c73ab7705f 100644 --- a/drivers/md/dm-vdo/indexer/index-layout.h +++ b/drivers/md/dm-vdo/indexer/index-layout.h @@ -8,7 +8,7 @@ #include "config.h" #include "io-factory.h" -#include "uds.h" +#include "indexer.h" /* * The index layout describes the format of the index on the underlying storage, and is responsible diff --git a/drivers/md/dm-vdo/indexer/index-page-map.c b/drivers/md/dm-vdo/indexer/index-page-map.c index 7857fd80b2c99f..eb4bf5f9146a7d 100644 --- a/drivers/md/dm-vdo/indexer/index-page-map.c +++ b/drivers/md/dm-vdo/indexer/index-page-map.c @@ -14,7 +14,7 @@ #include "../thread-utils.h" #include "hash-utils.h" -#include "uds.h" +#include "indexer.h" /* * The index page map is conceptually a two-dimensional array indexed by chapter number and index diff --git a/drivers/md/dm-vdo/indexer/index-session.h b/drivers/md/dm-vdo/indexer/index-session.h index d6a56b46a8cd70..58e52084a7da72 100644 --- a/drivers/md/dm-vdo/indexer/index-session.h +++ b/drivers/md/dm-vdo/indexer/index-session.h @@ -12,7 +12,7 @@ #include "../thread-utils.h" #include "config.h" -#include "uds.h" +#include "indexer.h" /* * The index session mediates all interactions with a UDS index. 
Once the index session is created, diff --git a/drivers/md/dm-vdo/indexer/uds.h b/drivers/md/dm-vdo/indexer/indexer.h similarity index 99% rename from drivers/md/dm-vdo/indexer/uds.h rename to drivers/md/dm-vdo/indexer/indexer.h index 1c5ff0746a8647..c8e22bda381bd6 100644 --- a/drivers/md/dm-vdo/indexer/uds.h +++ b/drivers/md/dm-vdo/indexer/indexer.h @@ -3,8 +3,8 @@ * Copyright 2023 Red Hat */ -#ifndef UDS_H -#define UDS_H +#ifndef INDEXER_H +#define INDEXER_H #include @@ -326,4 +326,4 @@ int __must_check uds_get_index_session_stats(struct uds_index_session *session, /* This function will fail if any required field of the request is not set. */ int __must_check uds_launch_request(struct uds_request *request); -#endif /* UDS_H */ +#endif /* INDEXER_H */ diff --git a/drivers/md/dm-vdo/indexer/sparse-cache.h b/drivers/md/dm-vdo/indexer/sparse-cache.h index 90b0be15545396..45e2dcf165b51d 100644 --- a/drivers/md/dm-vdo/indexer/sparse-cache.h +++ b/drivers/md/dm-vdo/indexer/sparse-cache.h @@ -7,7 +7,7 @@ #define UDS_SPARSE_CACHE_H #include "geometry.h" -#include "uds.h" +#include "indexer.h" /* * The sparse cache is a cache of entire chapter indexes from sparse chapters used for searching diff --git a/drivers/md/dm-vdo/indexer/volume-index.c b/drivers/md/dm-vdo/indexer/volume-index.c index ea309c530c8bf7..aad1a2928cde78 100644 --- a/drivers/md/dm-vdo/indexer/volume-index.c +++ b/drivers/md/dm-vdo/indexer/volume-index.c @@ -20,7 +20,7 @@ #include "config.h" #include "geometry.h" #include "hash-utils.h" -#include "uds.h" +#include "indexer.h" /* * The volume index is a combination of two separate subindexes, one containing sparse hook entries diff --git a/drivers/md/dm-vdo/indexer/volume-index.h b/drivers/md/dm-vdo/indexer/volume-index.h index 631d3d99ac36a7..1fa34166b91abb 100644 --- a/drivers/md/dm-vdo/indexer/volume-index.h +++ b/drivers/md/dm-vdo/indexer/volume-index.h @@ -12,7 +12,7 @@ #include "config.h" #include "delta-index.h" -#include "uds.h" +#include "indexer.h" /* * The volume index is the primary top-level index for UDS. It contains records which map a record diff --git a/drivers/md/dm-vdo/indexer/volume.h b/drivers/md/dm-vdo/indexer/volume.h index e3241c9554471e..c260b22eaa9c01 100644 --- a/drivers/md/dm-vdo/indexer/volume.h +++ b/drivers/md/dm-vdo/indexer/volume.h @@ -21,7 +21,7 @@ #include "index-page-map.h" #include "radix-sort.h" #include "sparse-cache.h" -#include "uds.h" +#include "indexer.h" /* diff --git a/drivers/md/dm-vdo/uds-sysfs.c b/drivers/md/dm-vdo/uds-sysfs.c index 754732c12d4da5..101db86f476f21 100644 --- a/drivers/md/dm-vdo/uds-sysfs.c +++ b/drivers/md/dm-vdo/uds-sysfs.c @@ -12,7 +12,7 @@ #include "logger.h" #include "memory-alloc.h" #include "string-utils.h" -#include "indexer/uds.h" +#include "indexer/indexer.h" #define UDS_SYSFS_NAME "uds" diff --git a/drivers/md/dm-vdo/vdo.h b/drivers/md/dm-vdo/vdo.h index 61179476d44605..85536dc5835151 100644 --- a/drivers/md/dm-vdo/vdo.h +++ b/drivers/md/dm-vdo/vdo.h @@ -22,7 +22,7 @@ #include "statistics.h" #include "thread-registry.h" #include "types.h" -#include "indexer/uds.h" +#include "indexer/indexer.h" enum notifier_state { /* Notifications are allowed but not in progress */ From 60bc8cb18b5b6abaefaa619ef8f5a81f775d7472 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 9 Feb 2024 12:35:10 -0600 Subject: [PATCH 0890/1406] dm vdo thread-utils: remove all uds_*_mutex wrappers Just use mutex_init, mutex_lock and mutex_unlock. 
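For illustration, the conversion reduces to the following pattern; this is a minimal, hypothetical sketch (the example_* names are not from this series), not code being added anywhere:

  #include <linux/mutex.h>

  /* Hypothetical state guarded by a bare struct mutex. */
  static struct mutex example_mutex;
  static unsigned int example_count;

  static void example_setup(void)
  {
          /* mutex_init() returns void: nothing to check, nothing to unwind. */
          mutex_init(&example_mutex);
  }

  static void example_increment(void)
  {
          /* Direct calls replace uds_lock_mutex()/uds_unlock_mutex(). */
          mutex_lock(&example_mutex);
          example_count++;
          mutex_unlock(&example_mutex);
  }

Because mutex_init() cannot fail, every "if (result != UDS_SUCCESS)" branch that existed only to back out of uds_init_mutex() goes away with the wrapper.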
Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/indexer/index-session.c | 107 +++++++++------------- drivers/md/dm-vdo/indexer/index.c | 42 ++++----- drivers/md/dm-vdo/indexer/volume-index.c | 40 +++----- drivers/md/dm-vdo/indexer/volume.c | 37 ++++---- drivers/md/dm-vdo/thread-utils.c | 4 +- drivers/md/dm-vdo/thread-utils.h | 22 ----- 6 files changed, 96 insertions(+), 156 deletions(-) diff --git a/drivers/md/dm-vdo/indexer/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c index 06ccfa79e00d57..464daab1d86988 100644 --- a/drivers/md/dm-vdo/indexer/index-session.c +++ b/drivers/md/dm-vdo/indexer/index-session.c @@ -62,10 +62,10 @@ enum index_session_flag { /* Release a reference to an index session. */ static void release_index_session(struct uds_index_session *index_session) { - uds_lock_mutex(&index_session->request_mutex); + mutex_lock(&index_session->request_mutex); if (--index_session->request_count == 0) uds_broadcast_cond(&index_session->request_cond); - uds_unlock_mutex(&index_session->request_mutex); + mutex_unlock(&index_session->request_mutex); } /* @@ -77,10 +77,10 @@ static int get_index_session(struct uds_index_session *index_session) unsigned int state; int result = UDS_SUCCESS; - uds_lock_mutex(&index_session->request_mutex); + mutex_lock(&index_session->request_mutex); index_session->request_count++; state = index_session->state; - uds_unlock_mutex(&index_session->request_mutex); + mutex_unlock(&index_session->request_mutex); if (state == IS_FLAG_LOADED) { return UDS_SUCCESS; @@ -142,9 +142,9 @@ static void enter_callback_stage(struct uds_request *request) { if (request->status != UDS_SUCCESS) { /* All request errors are considered unrecoverable */ - uds_lock_mutex(&request->session->request_mutex); + mutex_lock(&request->session->request_mutex); request->session->state |= IS_FLAG_DISABLED; - uds_unlock_mutex(&request->session->request_mutex); + mutex_unlock(&request->session->request_mutex); } uds_request_queue_enqueue(request->session->callback_queue, request); @@ -225,32 +225,19 @@ static int __must_check make_empty_index_session(struct uds_index_session **inde if (result != UDS_SUCCESS) return result; - result = uds_init_mutex(&session->request_mutex); - if (result != UDS_SUCCESS) { - uds_free(session); - return result; - } + mutex_init(&session->request_mutex); result = uds_init_cond(&session->request_cond); if (result != UDS_SUCCESS) { - uds_destroy_mutex(&session->request_mutex); uds_free(session); return result; } - result = uds_init_mutex(&session->load_context.mutex); - if (result != UDS_SUCCESS) { - uds_destroy_cond(&session->request_cond); - uds_destroy_mutex(&session->request_mutex); - uds_free(session); - return result; - } + mutex_init(&session->load_context.mutex); result = uds_init_cond(&session->load_context.cond); if (result != UDS_SUCCESS) { - uds_destroy_mutex(&session->load_context.mutex); uds_destroy_cond(&session->request_cond); - uds_destroy_mutex(&session->request_mutex); uds_free(session); return result; } @@ -259,9 +246,7 @@ static int __must_check make_empty_index_session(struct uds_index_session **inde &session->callback_queue); if (result != UDS_SUCCESS) { uds_destroy_cond(&session->load_context.cond); - uds_destroy_mutex(&session->load_context.mutex); uds_destroy_cond(&session->request_cond); - uds_destroy_mutex(&session->request_mutex); uds_free(session); return result; } @@ -284,7 +269,7 @@ static int __must_check start_loading_index_session(struct uds_index_session *in { int result; - 
uds_lock_mutex(&index_session->request_mutex); + mutex_lock(&index_session->request_mutex); if (index_session->state & IS_FLAG_SUSPENDED) { uds_log_info("Index session is suspended"); result = -EBUSY; @@ -295,20 +280,20 @@ static int __must_check start_loading_index_session(struct uds_index_session *in index_session->state |= IS_FLAG_LOADING; result = UDS_SUCCESS; } - uds_unlock_mutex(&index_session->request_mutex); + mutex_unlock(&index_session->request_mutex); return result; } static void finish_loading_index_session(struct uds_index_session *index_session, int result) { - uds_lock_mutex(&index_session->request_mutex); + mutex_lock(&index_session->request_mutex); index_session->state &= ~IS_FLAG_LOADING; if (result == UDS_SUCCESS) index_session->state |= IS_FLAG_LOADED; uds_broadcast_cond(&index_session->request_cond); - uds_unlock_mutex(&index_session->request_mutex); + mutex_unlock(&index_session->request_mutex); } static int initialize_index_session(struct uds_index_session *index_session, @@ -392,12 +377,12 @@ int uds_open_index(enum uds_open_index_type open_type, static void wait_for_no_requests_in_progress(struct uds_index_session *index_session) { - uds_lock_mutex(&index_session->request_mutex); + mutex_lock(&index_session->request_mutex); while (index_session->request_count > 0) { uds_wait_cond(&index_session->request_cond, &index_session->request_mutex); } - uds_unlock_mutex(&index_session->request_mutex); + mutex_unlock(&index_session->request_mutex); } static int __must_check save_index(struct uds_index_session *index_session) @@ -408,7 +393,7 @@ static int __must_check save_index(struct uds_index_session *index_session) static void suspend_rebuild(struct uds_index_session *session) { - uds_lock_mutex(&session->load_context.mutex); + mutex_lock(&session->load_context.mutex); switch (session->load_context.status) { case INDEX_OPENING: session->load_context.status = INDEX_SUSPENDING; @@ -435,7 +420,7 @@ static void suspend_rebuild(struct uds_index_session *session) session->load_context.status); break; } - uds_unlock_mutex(&session->load_context.mutex); + mutex_unlock(&session->load_context.mutex); } /* @@ -449,7 +434,7 @@ int uds_suspend_index_session(struct uds_index_session *session, bool save) bool rebuilding = false; /* Wait for any current index state change to complete. 
*/ - uds_lock_mutex(&session->request_mutex); + mutex_lock(&session->request_mutex); while (session->state & IS_FLAG_CLOSING) uds_wait_cond(&session->request_cond, &session->request_mutex); @@ -469,7 +454,7 @@ int uds_suspend_index_session(struct uds_index_session *session, bool save) session->state |= IS_FLAG_SUSPENDED; uds_broadcast_cond(&session->request_cond); } - uds_unlock_mutex(&session->request_mutex); + mutex_unlock(&session->request_mutex); if (no_work) return uds_status_to_errno(result); @@ -481,11 +466,11 @@ int uds_suspend_index_session(struct uds_index_session *session, bool save) else result = uds_flush_index_session(session); - uds_lock_mutex(&session->request_mutex); + mutex_lock(&session->request_mutex); session->state &= ~IS_FLAG_WAITING; session->state |= IS_FLAG_SUSPENDED; uds_broadcast_cond(&session->request_cond); - uds_unlock_mutex(&session->request_mutex); + mutex_unlock(&session->request_mutex); return uds_status_to_errno(result); } @@ -512,7 +497,7 @@ int uds_resume_index_session(struct uds_index_session *session, bool no_work = false; bool resume_replay = false; - uds_lock_mutex(&session->request_mutex); + mutex_lock(&session->request_mutex); if (session->state & IS_FLAG_WAITING) { uds_log_info("Index session is already changing state"); no_work = true; @@ -526,7 +511,7 @@ int uds_resume_index_session(struct uds_index_session *session, if (session->state & IS_FLAG_LOADING) resume_replay = true; } - uds_unlock_mutex(&session->request_mutex); + mutex_unlock(&session->request_mutex); if (no_work) return result; @@ -534,16 +519,16 @@ int uds_resume_index_session(struct uds_index_session *session, if ((session->index != NULL) && (bdev != session->parameters.bdev)) { result = replace_device(session, bdev); if (result != UDS_SUCCESS) { - uds_lock_mutex(&session->request_mutex); + mutex_lock(&session->request_mutex); session->state &= ~IS_FLAG_WAITING; uds_broadcast_cond(&session->request_cond); - uds_unlock_mutex(&session->request_mutex); + mutex_unlock(&session->request_mutex); return uds_status_to_errno(result); } } if (resume_replay) { - uds_lock_mutex(&session->load_context.mutex); + mutex_lock(&session->load_context.mutex); switch (session->load_context.status) { case INDEX_SUSPENDED: session->load_context.status = INDEX_OPENING; @@ -564,14 +549,14 @@ int uds_resume_index_session(struct uds_index_session *session, session->load_context.status); break; } - uds_unlock_mutex(&session->load_context.mutex); + mutex_unlock(&session->load_context.mutex); } - uds_lock_mutex(&session->request_mutex); + mutex_lock(&session->request_mutex); session->state &= ~IS_FLAG_WAITING; session->state &= ~IS_FLAG_SUSPENDED; uds_broadcast_cond(&session->request_cond); - uds_unlock_mutex(&session->request_mutex); + mutex_unlock(&session->request_mutex); return UDS_SUCCESS; } @@ -584,9 +569,9 @@ static int save_and_free_index(struct uds_index_session *index_session) if (index == NULL) return UDS_SUCCESS; - uds_lock_mutex(&index_session->request_mutex); + mutex_lock(&index_session->request_mutex); suspended = (index_session->state & IS_FLAG_SUSPENDED); - uds_unlock_mutex(&index_session->request_mutex); + mutex_unlock(&index_session->request_mutex); if (!suspended) { result = uds_save_index(index); @@ -601,14 +586,14 @@ static int save_and_free_index(struct uds_index_session *index_session) * Reset all index state that happens to be in the index * session, so it doesn't affect any future index. 
*/ - uds_lock_mutex(&index_session->load_context.mutex); + mutex_lock(&index_session->load_context.mutex); index_session->load_context.status = INDEX_OPENING; - uds_unlock_mutex(&index_session->load_context.mutex); + mutex_unlock(&index_session->load_context.mutex); - uds_lock_mutex(&index_session->request_mutex); + mutex_lock(&index_session->request_mutex); /* Only the suspend bit will remain relevant. */ index_session->state &= IS_FLAG_SUSPENDED; - uds_unlock_mutex(&index_session->request_mutex); + mutex_unlock(&index_session->request_mutex); return result; } @@ -619,7 +604,7 @@ int uds_close_index(struct uds_index_session *index_session) int result = UDS_SUCCESS; /* Wait for any current index state change to complete. */ - uds_lock_mutex(&index_session->request_mutex); + mutex_lock(&index_session->request_mutex); while ((index_session->state & IS_FLAG_WAITING) || (index_session->state & IS_FLAG_CLOSING)) { uds_wait_cond(&index_session->request_cond, @@ -636,7 +621,7 @@ int uds_close_index(struct uds_index_session *index_session) } else { index_session->state |= IS_FLAG_CLOSING; } - uds_unlock_mutex(&index_session->request_mutex); + mutex_unlock(&index_session->request_mutex); if (result != UDS_SUCCESS) return uds_status_to_errno(result); @@ -645,10 +630,10 @@ int uds_close_index(struct uds_index_session *index_session) result = save_and_free_index(index_session); uds_log_debug("Closed index"); - uds_lock_mutex(&index_session->request_mutex); + mutex_lock(&index_session->request_mutex); index_session->state &= ~IS_FLAG_CLOSING; uds_broadcast_cond(&index_session->request_cond); - uds_unlock_mutex(&index_session->request_mutex); + mutex_unlock(&index_session->request_mutex); return uds_status_to_errno(result); } @@ -661,7 +646,7 @@ int uds_destroy_index_session(struct uds_index_session *index_session) uds_log_debug("Destroying index session"); /* Wait for any current index state change to complete. */ - uds_lock_mutex(&index_session->request_mutex); + mutex_lock(&index_session->request_mutex); while ((index_session->state & IS_FLAG_WAITING) || (index_session->state & IS_FLAG_CLOSING)) { uds_wait_cond(&index_session->request_cond, @@ -669,7 +654,7 @@ int uds_destroy_index_session(struct uds_index_session *index_session) } if (index_session->state & IS_FLAG_DESTROYING) { - uds_unlock_mutex(&index_session->request_mutex); + mutex_unlock(&index_session->request_mutex); uds_log_info("Index session is already closing"); return -EBUSY; } @@ -677,24 +662,24 @@ int uds_destroy_index_session(struct uds_index_session *index_session) index_session->state |= IS_FLAG_DESTROYING; load_pending = ((index_session->state & IS_FLAG_LOADING) && (index_session->state & IS_FLAG_SUSPENDED)); - uds_unlock_mutex(&index_session->request_mutex); + mutex_unlock(&index_session->request_mutex); if (load_pending) { /* Tell the index to terminate the rebuild. */ - uds_lock_mutex(&index_session->load_context.mutex); + mutex_lock(&index_session->load_context.mutex); if (index_session->load_context.status == INDEX_SUSPENDED) { index_session->load_context.status = INDEX_FREEING; uds_broadcast_cond(&index_session->load_context.cond); } - uds_unlock_mutex(&index_session->load_context.mutex); + mutex_unlock(&index_session->load_context.mutex); /* Wait until the load exits before proceeding. 
*/ - uds_lock_mutex(&index_session->request_mutex); + mutex_lock(&index_session->request_mutex); while (index_session->state & IS_FLAG_LOADING) { uds_wait_cond(&index_session->request_cond, &index_session->request_mutex); } - uds_unlock_mutex(&index_session->request_mutex); + mutex_unlock(&index_session->request_mutex); } wait_for_no_requests_in_progress(index_session); @@ -702,9 +687,7 @@ int uds_destroy_index_session(struct uds_index_session *index_session) uds_request_queue_finish(index_session->callback_queue); index_session->callback_queue = NULL; uds_destroy_cond(&index_session->load_context.cond); - uds_destroy_mutex(&index_session->load_context.mutex); uds_destroy_cond(&index_session->request_cond); - uds_destroy_mutex(&index_session->request_mutex); uds_log_debug("Destroyed index session"); uds_free(index_session); return uds_status_to_errno(result); diff --git a/drivers/md/dm-vdo/indexer/index.c b/drivers/md/dm-vdo/indexer/index.c index c33c123ffac489..dcdddfd2a74aee 100644 --- a/drivers/md/dm-vdo/indexer/index.c +++ b/drivers/md/dm-vdo/indexer/index.c @@ -179,11 +179,11 @@ static int finish_previous_chapter(struct uds_index *index, u64 current_chapter_ int result; struct chapter_writer *writer = index->chapter_writer; - uds_lock_mutex(&writer->mutex); + mutex_lock(&writer->mutex); while (index->newest_virtual_chapter < current_chapter_number) uds_wait_cond(&writer->cond, &writer->mutex); result = writer->result; - uds_unlock_mutex(&writer->mutex); + mutex_unlock(&writer->mutex); if (result != UDS_SUCCESS) return uds_log_error_strerror(result, @@ -218,11 +218,11 @@ static unsigned int start_closing_chapter(struct uds_index *index, unsigned int finished_zones; struct chapter_writer *writer = index->chapter_writer; - uds_lock_mutex(&writer->mutex); + mutex_lock(&writer->mutex); finished_zones = ++writer->zones_to_write; writer->chapters[zone_number] = chapter; uds_broadcast_cond(&writer->cond); - uds_unlock_mutex(&writer->mutex); + mutex_unlock(&writer->mutex); return finished_zones; } @@ -677,7 +677,7 @@ static void close_chapters(void *arg) struct uds_index *index = writer->index; uds_log_debug("chapter writer starting"); - uds_lock_mutex(&writer->mutex); + mutex_lock(&writer->mutex); for (;;) { while (writer->zones_to_write < index->zone_count) { if (writer->stop && (writer->zones_to_write == 0)) { @@ -685,7 +685,7 @@ static void close_chapters(void *arg) * We've been told to stop, and all of the zones are in the same * open chapter, so we can exit now. */ - uds_unlock_mutex(&writer->mutex); + mutex_unlock(&writer->mutex); uds_log_debug("chapter writer stopping"); return; } @@ -697,7 +697,7 @@ static void close_chapters(void *arg) * it seems safer in principle. It's OK to access the chapter and chapter_number * fields without the lock since those aren't allowed to change until we're done. 
*/ - uds_unlock_mutex(&writer->mutex); + mutex_unlock(&writer->mutex); if (index->has_saved_open_chapter) { /* @@ -718,7 +718,7 @@ static void close_chapters(void *arg) writer->collated_records, index->newest_virtual_chapter); - uds_lock_mutex(&writer->mutex); + mutex_lock(&writer->mutex); index->newest_virtual_chapter++; index->oldest_virtual_chapter += uds_chapters_to_expire(index->volume->geometry, @@ -733,14 +733,14 @@ static void stop_chapter_writer(struct chapter_writer *writer) { struct thread *writer_thread = NULL; - uds_lock_mutex(&writer->mutex); + mutex_lock(&writer->mutex); if (writer->thread != NULL) { writer_thread = writer->thread; writer->thread = NULL; writer->stop = true; uds_broadcast_cond(&writer->cond); } - uds_unlock_mutex(&writer->mutex); + mutex_unlock(&writer->mutex); if (writer_thread != NULL) vdo_join_threads(writer_thread); @@ -752,7 +752,6 @@ static void free_chapter_writer(struct chapter_writer *writer) return; stop_chapter_writer(writer); - uds_destroy_mutex(&writer->mutex); uds_destroy_cond(&writer->cond); uds_free_open_chapter_index(writer->open_chapter_index); uds_free(writer->collated_records); @@ -774,15 +773,10 @@ static int make_chapter_writer(struct uds_index *index, return result; writer->index = index; - result = uds_init_mutex(&writer->mutex); - if (result != UDS_SUCCESS) { - uds_free(writer); - return result; - } + mutex_init(&writer->mutex); result = uds_init_cond(&writer->cond); if (result != UDS_SUCCESS) { - uds_destroy_mutex(&writer->mutex); uds_free(writer); return result; } @@ -962,9 +956,9 @@ static bool check_for_suspend(struct uds_index *index) if (index->load_context == NULL) return false; - uds_lock_mutex(&index->load_context->mutex); + mutex_lock(&index->load_context->mutex); if (index->load_context->status != INDEX_SUSPENDING) { - uds_unlock_mutex(&index->load_context->mutex); + mutex_unlock(&index->load_context->mutex); return false; } @@ -977,7 +971,7 @@ static bool check_for_suspend(struct uds_index *index) uds_wait_cond(&index->load_context->cond, &index->load_context->mutex); closing = (index->load_context->status == INDEX_FREEING); - uds_unlock_mutex(&index->load_context->mutex); + mutex_unlock(&index->load_context->mutex); return closing; } @@ -1266,14 +1260,14 @@ int uds_make_index(struct uds_configuration *config, enum uds_open_index_type op } if (index->load_context != NULL) { - uds_lock_mutex(&index->load_context->mutex); + mutex_lock(&index->load_context->mutex); index->load_context->status = INDEX_READY; /* * If we get here, suspend is meaningless, but notify any thread trying to suspend * us so it doesn't hang. */ uds_broadcast_cond(&index->load_context->cond); - uds_unlock_mutex(&index->load_context->mutex); + mutex_unlock(&index->load_context->mutex); } index->has_saved_open_chapter = loaded; @@ -1312,10 +1306,10 @@ void uds_wait_for_idle_index(struct uds_index *index) { struct chapter_writer *writer = index->chapter_writer; - uds_lock_mutex(&writer->mutex); + mutex_lock(&writer->mutex); while (writer->zones_to_write > 0) uds_wait_cond(&writer->cond, &writer->mutex); - uds_unlock_mutex(&writer->mutex); + mutex_unlock(&writer->mutex); } /* This function assumes that all requests have been drained. 
*/ diff --git a/drivers/md/dm-vdo/indexer/volume-index.c b/drivers/md/dm-vdo/indexer/volume-index.c index aad1a2928cde78..5fe34e6c1d9b88 100644 --- a/drivers/md/dm-vdo/indexer/volume-index.c +++ b/drivers/md/dm-vdo/indexer/volume-index.c @@ -287,13 +287,8 @@ void uds_free_volume_index(struct volume_index *volume_index) if (volume_index == NULL) return; - if (volume_index->zones != NULL) { - unsigned int zone; - - for (zone = 0; zone < volume_index->zone_count; zone++) - uds_destroy_mutex(&volume_index->zones[zone].hook_mutex); + if (volume_index->zones != NULL) uds_free(uds_forget(volume_index->zones)); - } uninitialize_volume_sub_index(&volume_index->vi_non_hook); uninitialize_volume_sub_index(&volume_index->vi_hook); @@ -547,10 +542,10 @@ int uds_get_volume_index_record(struct volume_index *volume_index, get_volume_sub_index_zone(&volume_index->vi_hook, name); struct mutex *mutex = &volume_index->zones[zone].hook_mutex; - uds_lock_mutex(mutex); + mutex_lock(mutex); result = get_volume_sub_index_record(&volume_index->vi_hook, name, record); - uds_unlock_mutex(mutex); + mutex_unlock(mutex); /* Remember the mutex so that other operations on the index record can use it. */ record->mutex = mutex; } else { @@ -579,13 +574,13 @@ int uds_put_volume_index_record(struct volume_index_record *record, u64 virtual_ } address = extract_address(sub_index, record->name); if (unlikely(record->mutex != NULL)) - uds_lock_mutex(record->mutex); + mutex_lock(record->mutex); result = uds_put_delta_index_entry(&record->delta_entry, address, convert_virtual_to_index(sub_index, virtual_chapter), record->is_found ? record->name->name : NULL); if (unlikely(record->mutex != NULL)) - uds_unlock_mutex(record->mutex); + mutex_unlock(record->mutex); switch (result) { case UDS_SUCCESS: record->virtual_chapter = virtual_chapter; @@ -615,10 +610,10 @@ int uds_remove_volume_index_record(struct volume_index_record *record) /* Mark the record so that it cannot be used again */ record->is_found = false; if (unlikely(record->mutex != NULL)) - uds_lock_mutex(record->mutex); + mutex_lock(record->mutex); result = uds_remove_delta_index_entry(&record->delta_entry); if (unlikely(record->mutex != NULL)) - uds_unlock_mutex(record->mutex); + mutex_unlock(record->mutex); return result; } @@ -689,10 +684,10 @@ void uds_set_volume_index_zone_open_chapter(struct volume_index *volume_index, * chapter number is changing. 
*/ if (has_sparse(volume_index)) { - uds_lock_mutex(mutex); + mutex_lock(mutex); set_volume_sub_index_zone_open_chapter(&volume_index->vi_hook, zone_number, virtual_chapter); - uds_unlock_mutex(mutex); + mutex_unlock(mutex); } } @@ -731,12 +726,12 @@ int uds_set_volume_index_record_chapter(struct volume_index_record *record, } if (unlikely(record->mutex != NULL)) - uds_lock_mutex(record->mutex); + mutex_lock(record->mutex); result = uds_set_delta_entry_value(&record->delta_entry, convert_virtual_to_index(sub_index, virtual_chapter)); if (unlikely(record->mutex != NULL)) - uds_unlock_mutex(record->mutex); + mutex_unlock(record->mutex); if (result != UDS_SUCCESS) return result; @@ -786,9 +781,9 @@ u64 uds_lookup_volume_index_name(const struct volume_index *volume_index, if (!uds_is_volume_index_sample(volume_index, name)) return NO_CHAPTER; - uds_lock_mutex(mutex); + mutex_lock(mutex); virtual_chapter = lookup_volume_sub_index_name(&volume_index->vi_hook, name); - uds_unlock_mutex(mutex); + mutex_unlock(mutex); return virtual_chapter; } @@ -1259,13 +1254,8 @@ int uds_make_volume_index(const struct uds_configuration *config, u64 volume_non return result; } - for (zone = 0; zone < config->zone_count; zone++) { - result = uds_init_mutex(&volume_index->zones[zone].hook_mutex); - if (result != UDS_SUCCESS) { - uds_free_volume_index(volume_index); - return result; - } - } + for (zone = 0; zone < config->zone_count; zone++) + mutex_init(&volume_index->zones[zone].hook_mutex); split_configuration(config, &split); result = initialize_volume_sub_index(&split.non_hook_config, volume_nonce, 'd', diff --git a/drivers/md/dm-vdo/indexer/volume.c b/drivers/md/dm-vdo/indexer/volume.c index a978bc51db6b6c..41913a6cdbdc0a 100644 --- a/drivers/md/dm-vdo/indexer/volume.c +++ b/drivers/md/dm-vdo/indexer/volume.c @@ -555,7 +555,7 @@ static int process_entry(struct volume *volume, struct queued_read *entry) page = select_victim_in_cache(&volume->page_cache); - uds_unlock_mutex(&volume->read_threads_mutex); + mutex_unlock(&volume->read_threads_mutex); page_data = dm_bufio_read(volume->client, page_number, &page->buffer); if (IS_ERR(page_data)) { result = -PTR_ERR(page_data); @@ -565,7 +565,7 @@ static int process_entry(struct volume *volume, struct queued_read *entry) cancel_page_in_cache(&volume->page_cache, page_number, page); return result; } - uds_lock_mutex(&volume->read_threads_mutex); + mutex_lock(&volume->read_threads_mutex); if (entry->invalid) { uds_log_warning("Page %u invalidated after read", page_number); @@ -627,7 +627,7 @@ static void read_thread_function(void *arg) struct volume *volume = arg; uds_log_debug("reader starting"); - uds_lock_mutex(&volume->read_threads_mutex); + mutex_lock(&volume->read_threads_mutex); while (true) { struct queued_read *queue_entry; int result; @@ -639,7 +639,7 @@ static void read_thread_function(void *arg) result = process_entry(volume, queue_entry); release_queued_requests(volume, queue_entry, result); } - uds_unlock_mutex(&volume->read_threads_mutex); + mutex_unlock(&volume->read_threads_mutex); uds_log_debug("reader done"); } @@ -770,7 +770,7 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request * /* Prepare to enqueue a read for the page. 
*/ end_pending_search(&volume->page_cache, request->zone_number); - uds_lock_mutex(&volume->read_threads_mutex); + mutex_lock(&volume->read_threads_mutex); /* * Do the lookup again while holding the read mutex (no longer the fast case so this should @@ -788,7 +788,7 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request * * turns out to be significant in some cases. The page is not available yet so * the order does not matter for correctness as it does below. */ - uds_unlock_mutex(&volume->read_threads_mutex); + mutex_unlock(&volume->read_threads_mutex); begin_pending_search(&volume->page_cache, physical_page, request->zone_number); return UDS_QUEUED; @@ -800,7 +800,7 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request * * the caller gets to look at it. */ begin_pending_search(&volume->page_cache, physical_page, request->zone_number); - uds_unlock_mutex(&volume->read_threads_mutex); + mutex_unlock(&volume->read_threads_mutex); *page_ptr = page; return UDS_SUCCESS; } @@ -811,9 +811,9 @@ static int get_volume_page(struct volume *volume, u32 chapter, u32 page_number, int result; u32 physical_page = map_to_physical_page(volume->geometry, chapter, page_number); - uds_lock_mutex(&volume->read_threads_mutex); + mutex_lock(&volume->read_threads_mutex); result = get_volume_page_locked(volume, physical_page, page_ptr); - uds_unlock_mutex(&volume->read_threads_mutex); + mutex_unlock(&volume->read_threads_mutex); return result; } @@ -1054,10 +1054,10 @@ void uds_forget_chapter(struct volume *volume, u64 virtual_chapter) u32 i; uds_log_debug("forgetting chapter %llu", (unsigned long long) virtual_chapter); - uds_lock_mutex(&volume->read_threads_mutex); + mutex_lock(&volume->read_threads_mutex); for (i = 0; i < volume->geometry->pages_per_chapter; i++) invalidate_page(&volume->page_cache, first_page + i); - uds_unlock_mutex(&volume->read_threads_mutex); + mutex_unlock(&volume->read_threads_mutex); } /* @@ -1142,10 +1142,10 @@ static int write_index_pages(struct volume *volume, u32 physical_chapter_number, physical_chapter_number, index_page_number, delta_list_number - 1); - uds_lock_mutex(&volume->read_threads_mutex); + mutex_lock(&volume->read_threads_mutex); result = donate_index_page_locked(volume, physical_chapter_number, index_page_number, page_buffer); - uds_unlock_mutex(&volume->read_threads_mutex); + mutex_unlock(&volume->read_threads_mutex); if (result != UDS_SUCCESS) { dm_bufio_release(page_buffer); return result; @@ -1622,11 +1622,7 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout return result; } - result = uds_init_mutex(&volume->read_threads_mutex); - if (result != UDS_SUCCESS) { - uds_free_volume(volume); - return result; - } + mutex_init(&volume->read_threads_mutex); result = uds_init_cond(&volume->read_threads_read_done_cond); if (result != UDS_SUCCESS) { @@ -1685,10 +1681,10 @@ void uds_free_volume(struct volume *volume) unsigned int i; /* This works even if some threads weren't started. 
*/ - uds_lock_mutex(&volume->read_threads_mutex); + mutex_lock(&volume->read_threads_mutex); volume->read_threads_exiting = true; uds_broadcast_cond(&volume->read_threads_cond); - uds_unlock_mutex(&volume->read_threads_mutex); + mutex_unlock(&volume->read_threads_mutex); for (i = 0; i < volume->read_thread_count; i++) vdo_join_threads(volume->reader_threads[i]); uds_free(volume->reader_threads); @@ -1703,7 +1699,6 @@ void uds_free_volume(struct volume *volume) uds_destroy_cond(&volume->read_threads_cond); uds_destroy_cond(&volume->read_threads_read_done_cond); - uds_destroy_mutex(&volume->read_threads_mutex); uds_free_index_page_map(volume->index_page_map); uds_free_radix_sorter(volume->radix_sorter); uds_free(volume->geometry); diff --git a/drivers/md/dm-vdo/thread-utils.c b/drivers/md/dm-vdo/thread-utils.c index 2b02f8d871869b..492b5b1d156db4 100644 --- a/drivers/md/dm-vdo/thread-utils.c +++ b/drivers/md/dm-vdo/thread-utils.c @@ -140,8 +140,8 @@ void uds_wait_cond(struct cond_var *cv, struct mutex *mutex) DEFINE_WAIT(__wait); prepare_to_wait(&cv->wait_queue, &__wait, TASK_IDLE); - uds_unlock_mutex(mutex); + mutex_unlock(mutex); schedule(); finish_wait(&cv->wait_queue, &__wait); - uds_lock_mutex(mutex); + mutex_lock(mutex); } diff --git a/drivers/md/dm-vdo/thread-utils.h b/drivers/md/dm-vdo/thread-utils.h index c13608ab07edc5..1c94df09916989 100644 --- a/drivers/md/dm-vdo/thread-utils.h +++ b/drivers/md/dm-vdo/thread-utils.h @@ -47,30 +47,8 @@ static inline void uds_broadcast_cond(struct cond_var *cv) void uds_wait_cond(struct cond_var *cond, struct mutex *mutex); -/* FIXME: all below wrappers should be removed! */ - static inline void uds_destroy_cond(struct cond_var *cv) { } -static inline int __must_check uds_init_mutex(struct mutex *mutex) -{ - mutex_init(mutex); - return UDS_SUCCESS; -} - -static inline void uds_destroy_mutex(struct mutex *mutex) -{ -} - -static inline void uds_lock_mutex(struct mutex *mutex) -{ - mutex_lock(mutex); -} - -static inline void uds_unlock_mutex(struct mutex *mutex) -{ - mutex_unlock(mutex); -} - #endif /* THREAD_UTILS_H */ From 297443734a81d3f9f211f3b148c025814044049c Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 9 Feb 2024 13:06:00 -0600 Subject: [PATCH 0891/1406] dm vdo thread-utils: push uds_*_cond interface down to indexer Only used by indexer. Also, return void from uds_init_cond(), remove uds_destroy_cond() and fix up all callers. 
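As an illustration of the resulting interface, here is a minimal waiter/waker sketch built on the cond_var that this patch moves into indexer.h (the state_* and example_* names are hypothetical, not from this series):

  #include <linux/mutex.h>

  #include "indexer.h"    /* struct cond_var and the uds_*_cond() helpers */

  static struct mutex state_mutex;
  static struct cond_var state_cond;
  static bool state_ready;

  static void example_setup(void)
  {
          mutex_init(&state_mutex);
          uds_init_cond(&state_cond);    /* now void, so it cannot fail */
  }

  static void example_wait_until_ready(void)
  {
          mutex_lock(&state_mutex);
          /* Recheck the predicate in a loop: wakeups can be spurious. */
          while (!state_ready)
                  uds_wait_cond(&state_cond, &state_mutex);
          mutex_unlock(&state_mutex);
  }

  static void example_mark_ready(void)
  {
          mutex_lock(&state_mutex);
          state_ready = true;
          uds_broadcast_cond(&state_cond);    /* wake all waiters */
          mutex_unlock(&state_mutex);
  }

uds_wait_cond() drops the caller's mutex, sleeps in TASK_IDLE, and reacquires the mutex before returning, so the predicate is always evaluated under the lock.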
Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/indexer/index-session.c | 20 ++--------------- drivers/md/dm-vdo/indexer/index.c | 19 ++++++++++------ drivers/md/dm-vdo/indexer/indexer.h | 24 ++++++++++++++++++++ drivers/md/dm-vdo/indexer/volume.c | 16 ++------------ drivers/md/dm-vdo/thread-utils.c | 12 ---------- drivers/md/dm-vdo/thread-utils.h | 27 ----------------------- 6 files changed, 40 insertions(+), 78 deletions(-) diff --git a/drivers/md/dm-vdo/indexer/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c index 464daab1d86988..6aadba678f08eb 100644 --- a/drivers/md/dm-vdo/indexer/index-session.c +++ b/drivers/md/dm-vdo/indexer/index-session.c @@ -226,27 +226,13 @@ static int __must_check make_empty_index_session(struct uds_index_session **inde return result; mutex_init(&session->request_mutex); - - result = uds_init_cond(&session->request_cond); - if (result != UDS_SUCCESS) { - uds_free(session); - return result; - } - + uds_init_cond(&session->request_cond); mutex_init(&session->load_context.mutex); - - result = uds_init_cond(&session->load_context.cond); - if (result != UDS_SUCCESS) { - uds_destroy_cond(&session->request_cond); - uds_free(session); - return result; - } + uds_init_cond(&session->load_context.cond); result = uds_make_request_queue("callbackW", &handle_callbacks, &session->callback_queue); if (result != UDS_SUCCESS) { - uds_destroy_cond(&session->load_context.cond); - uds_destroy_cond(&session->request_cond); uds_free(session); return result; } @@ -686,8 +672,6 @@ int uds_destroy_index_session(struct uds_index_session *index_session) result = save_and_free_index(index_session); uds_request_queue_finish(index_session->callback_queue); index_session->callback_queue = NULL; - uds_destroy_cond(&index_session->load_context.cond); - uds_destroy_cond(&index_session->request_cond); uds_log_debug("Destroyed index session"); uds_free(index_session); return uds_status_to_errno(result); diff --git a/drivers/md/dm-vdo/indexer/index.c b/drivers/md/dm-vdo/indexer/index.c index dcdddfd2a74aee..6d5c30995d5f2e 100644 --- a/drivers/md/dm-vdo/indexer/index.c +++ b/drivers/md/dm-vdo/indexer/index.c @@ -752,7 +752,6 @@ static void free_chapter_writer(struct chapter_writer *writer) return; stop_chapter_writer(writer); - uds_destroy_cond(&writer->cond); uds_free_open_chapter_index(writer->open_chapter_index); uds_free(writer->collated_records); uds_free(writer); @@ -774,12 +773,7 @@ static int make_chapter_writer(struct uds_index *index, writer->index = index; mutex_init(&writer->mutex); - - result = uds_init_cond(&writer->cond); - if (result != UDS_SUCCESS) { - uds_free(writer); - return result; - } + uds_init_cond(&writer->cond); result = uds_allocate_cache_aligned(collated_records_size, "collated records", &writer->collated_records); @@ -1390,3 +1384,14 @@ void uds_enqueue_request(struct uds_request *request, enum request_stage stage) uds_request_queue_enqueue(queue, request); } + +void uds_wait_cond(struct cond_var *cv, struct mutex *mutex) +{ + DEFINE_WAIT(__wait); + + prepare_to_wait(&cv->wait_queue, &__wait, TASK_IDLE); + mutex_unlock(mutex); + schedule(); + finish_wait(&cv->wait_queue, &__wait); + mutex_lock(mutex); +} diff --git a/drivers/md/dm-vdo/indexer/indexer.h b/drivers/md/dm-vdo/indexer/indexer.h index c8e22bda381bd6..a832a34d943671 100644 --- a/drivers/md/dm-vdo/indexer/indexer.h +++ b/drivers/md/dm-vdo/indexer/indexer.h @@ -7,6 +7,9 @@ #define INDEXER_H #include +#include +#include +#include #include "../funnel-queue.h" @@ -326,4 +329,25 @@ int __must_check 
uds_get_index_session_stats(struct uds_index_session *session, /* This function will fail if any required field of the request is not set. */ int __must_check uds_launch_request(struct uds_request *request); +struct cond_var { + wait_queue_head_t wait_queue; +}; + +static inline void uds_init_cond(struct cond_var *cv) +{ + init_waitqueue_head(&cv->wait_queue); +} + +static inline void uds_signal_cond(struct cond_var *cv) +{ + wake_up(&cv->wait_queue); +} + +static inline void uds_broadcast_cond(struct cond_var *cv) +{ + wake_up_all(&cv->wait_queue); +} + +void uds_wait_cond(struct cond_var *cond, struct mutex *mutex); + #endif /* INDEXER_H */ diff --git a/drivers/md/dm-vdo/indexer/volume.c b/drivers/md/dm-vdo/indexer/volume.c index 41913a6cdbdc0a..8ce05a98d7e547 100644 --- a/drivers/md/dm-vdo/indexer/volume.c +++ b/drivers/md/dm-vdo/indexer/volume.c @@ -1623,18 +1623,8 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout } mutex_init(&volume->read_threads_mutex); - - result = uds_init_cond(&volume->read_threads_read_done_cond); - if (result != UDS_SUCCESS) { - uds_free_volume(volume); - return result; - } - - result = uds_init_cond(&volume->read_threads_cond); - if (result != UDS_SUCCESS) { - uds_free_volume(volume); - return result; - } + uds_init_cond(&volume->read_threads_read_done_cond); + uds_init_cond(&volume->read_threads_cond); result = uds_allocate(config->read_threads, struct thread *, "reader threads", &volume->reader_threads); @@ -1697,8 +1687,6 @@ void uds_free_volume(struct volume *volume) if (volume->client != NULL) dm_bufio_client_destroy(uds_forget(volume->client)); - uds_destroy_cond(&volume->read_threads_cond); - uds_destroy_cond(&volume->read_threads_read_done_cond); uds_free_index_page_map(volume->index_page_map); uds_free_radix_sorter(volume->radix_sorter); uds_free(volume->geometry); diff --git a/drivers/md/dm-vdo/thread-utils.c b/drivers/md/dm-vdo/thread-utils.c index 492b5b1d156db4..0b80247c7f1b02 100644 --- a/drivers/md/dm-vdo/thread-utils.c +++ b/drivers/md/dm-vdo/thread-utils.c @@ -9,7 +9,6 @@ #include #include #include -#include #include "errors.h" #include "logger.h" @@ -134,14 +133,3 @@ void vdo_join_threads(struct thread *thread) mutex_unlock(&thread_mutex); uds_free(thread); } - -void uds_wait_cond(struct cond_var *cv, struct mutex *mutex) -{ - DEFINE_WAIT(__wait); - - prepare_to_wait(&cv->wait_queue, &__wait, TASK_IDLE); - mutex_unlock(mutex); - schedule(); - finish_wait(&cv->wait_queue, &__wait); - mutex_lock(mutex); -} diff --git a/drivers/md/dm-vdo/thread-utils.h b/drivers/md/dm-vdo/thread-utils.h index 1c94df09916989..325f9bfa59706f 100644 --- a/drivers/md/dm-vdo/thread-utils.h +++ b/drivers/md/dm-vdo/thread-utils.h @@ -11,7 +11,6 @@ #include #include #include -#include #include "errors.h" @@ -25,30 +24,4 @@ void vdo_join_threads(struct thread *thread); void vdo_perform_once(atomic_t *once_state, void (*function) (void)); -struct cond_var { - wait_queue_head_t wait_queue; -}; - -static inline int __must_check uds_init_cond(struct cond_var *cv) -{ - init_waitqueue_head(&cv->wait_queue); - return UDS_SUCCESS; -} - -static inline void uds_signal_cond(struct cond_var *cv) -{ - wake_up(&cv->wait_queue); -} - -static inline void uds_broadcast_cond(struct cond_var *cv) -{ - wake_up_all(&cv->wait_queue); -} - -void uds_wait_cond(struct cond_var *cond, struct mutex *mutex); - -static inline void uds_destroy_cond(struct cond_var *cv) -{ -} - #endif /* THREAD_UTILS_H */ From 63fe8d09401f4a4f51c2806f6523daeb0bd88070 Mon Sep 17 
00:00:00 2001 From: Mike Snitzer Date: Fri, 9 Feb 2024 13:17:05 -0600 Subject: [PATCH 0892/1406] dm vdo thread-utils: cleanup included headers Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/logger.c | 1 + drivers/md/dm-vdo/thread-utils.c | 4 ++-- drivers/md/dm-vdo/thread-utils.h | 6 ------ 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-vdo/logger.c b/drivers/md/dm-vdo/logger.c index ff1c570f81bf5d..969f10771adae1 100644 --- a/drivers/md/dm-vdo/logger.c +++ b/drivers/md/dm-vdo/logger.c @@ -11,6 +11,7 @@ #include #include +#include "errors.h" #include "thread-device.h" #include "thread-utils.h" diff --git a/drivers/md/dm-vdo/thread-utils.c b/drivers/md/dm-vdo/thread-utils.c index 0b80247c7f1b02..160679984d72bd 100644 --- a/drivers/md/dm-vdo/thread-utils.c +++ b/drivers/md/dm-vdo/thread-utils.c @@ -5,10 +5,10 @@ #include "thread-utils.h" -#include #include -#include #include +#include +#include #include "errors.h" #include "logger.h" diff --git a/drivers/md/dm-vdo/thread-utils.h b/drivers/md/dm-vdo/thread-utils.h index 325f9bfa59706f..bc447743c38e1c 100644 --- a/drivers/md/dm-vdo/thread-utils.h +++ b/drivers/md/dm-vdo/thread-utils.h @@ -7,12 +7,6 @@ #define THREAD_UTILS_H #include -#include -#include -#include -#include - -#include "errors.h" /* Thread and synchronization utilities */ From 4c30e81611099856c2c3ac0ebc22fb14263052ef Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 9 Feb 2024 14:04:34 -0600 Subject: [PATCH 0893/1406] dm vdo thread-registry: rename all methods to reflect vdo-only use Otherwise, uds_ prefix is misleading (vdo_ is the new catch-all for code that is used by vdo-only or _both_ vdo and the indexer code). Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/memory-alloc.c | 12 ++++++------ drivers/md/dm-vdo/thread-device.c | 8 ++++---- drivers/md/dm-vdo/thread-registry.c | 8 ++++---- drivers/md/dm-vdo/thread-registry.h | 14 +++++++------- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/drivers/md/dm-vdo/memory-alloc.c b/drivers/md/dm-vdo/memory-alloc.c index 46dd5bda682544..3b2bda9248cbd7 100644 --- a/drivers/md/dm-vdo/memory-alloc.c +++ b/drivers/md/dm-vdo/memory-alloc.c @@ -15,14 +15,14 @@ /* * UDS and VDO keep track of which threads are allowed to allocate memory freely, and which threads - * must be careful to not do a memory allocation that does an I/O request. The allocating_threads - * threads_registry and its associated methods implement this tracking. + * must be careful to not do a memory allocation that does an I/O request. The 'allocating_threads' + * thread_registry and its associated methods implement this tracking. */ static struct thread_registry allocating_threads; static bool allocations_allowed(void) { - const bool *pointer = uds_lookup_thread(&allocating_threads); + const bool *pointer = vdo_lookup_thread(&allocating_threads); return (pointer != NULL) ? *pointer : false; } @@ -48,13 +48,13 @@ void uds_register_allocating_thread(struct registered_thread *new_thread, flag_ptr = &allocation_always_allowed; } - uds_register_thread(&allocating_threads, new_thread, flag_ptr); + vdo_register_thread(&allocating_threads, new_thread, flag_ptr); } /* Unregister the current thread as an allocating thread. 
*/ void uds_unregister_allocating_thread(void) { - uds_unregister_thread(&allocating_threads); + vdo_unregister_thread(&allocating_threads); } /* @@ -384,7 +384,7 @@ int uds_duplicate_string(const char *string, const char *what, char **new_string void uds_memory_init(void) { spin_lock_init(&memory_stats.lock); - uds_initialize_thread_registry(&allocating_threads); + vdo_initialize_thread_registry(&allocating_threads); } void uds_memory_exit(void) diff --git a/drivers/md/dm-vdo/thread-device.c b/drivers/md/dm-vdo/thread-device.c index b87de448a83b50..2bf14b9f67f8ad 100644 --- a/drivers/md/dm-vdo/thread-device.c +++ b/drivers/md/dm-vdo/thread-device.c @@ -14,23 +14,23 @@ static struct thread_registry device_id_thread_registry; void uds_register_thread_device_id(struct registered_thread *new_thread, unsigned int *id_ptr) { - uds_register_thread(&device_id_thread_registry, new_thread, id_ptr); + vdo_register_thread(&device_id_thread_registry, new_thread, id_ptr); } void uds_unregister_thread_device_id(void) { - uds_unregister_thread(&device_id_thread_registry); + vdo_unregister_thread(&device_id_thread_registry); } int uds_get_thread_device_id(void) { const unsigned int *pointer; - pointer = uds_lookup_thread(&device_id_thread_registry); + pointer = vdo_lookup_thread(&device_id_thread_registry); return (pointer != NULL) ? *pointer : -1; } void uds_initialize_thread_device_registry(void) { - uds_initialize_thread_registry(&device_id_thread_registry); + vdo_initialize_thread_registry(&device_id_thread_registry); } diff --git a/drivers/md/dm-vdo/thread-registry.c b/drivers/md/dm-vdo/thread-registry.c index 8c887158c22454..1314d2b6a26f5e 100644 --- a/drivers/md/dm-vdo/thread-registry.c +++ b/drivers/md/dm-vdo/thread-registry.c @@ -14,14 +14,14 @@ * their normal operation. For example, we do not want to invoke the logger while holding a lock. */ -void uds_initialize_thread_registry(struct thread_registry *registry) +void vdo_initialize_thread_registry(struct thread_registry *registry) { INIT_LIST_HEAD(®istry->links); spin_lock_init(®istry->lock); } /* Register the current thread and associate it with a data pointer. 
*/ -void uds_register_thread(struct thread_registry *registry, +void vdo_register_thread(struct thread_registry *registry, struct registered_thread *new_thread, const void *pointer) { struct registered_thread *thread; @@ -51,7 +51,7 @@ void uds_register_thread(struct thread_registry *registry, } } -void uds_unregister_thread(struct thread_registry *registry) +void vdo_unregister_thread(struct thread_registry *registry) { struct registered_thread *thread; bool found_it = false; @@ -74,7 +74,7 @@ void uds_unregister_thread(struct thread_registry *registry) } } -const void *uds_lookup_thread(struct thread_registry *registry) +const void *vdo_lookup_thread(struct thread_registry *registry) { struct registered_thread *thread; const void *result = NULL; diff --git a/drivers/md/dm-vdo/thread-registry.h b/drivers/md/dm-vdo/thread-registry.h index f70f755568a17a..cc6d78312b9eb2 100644 --- a/drivers/md/dm-vdo/thread-registry.h +++ b/drivers/md/dm-vdo/thread-registry.h @@ -3,8 +3,8 @@ * Copyright 2023 Red Hat */ -#ifndef UDS_THREAD_REGISTRY_H -#define UDS_THREAD_REGISTRY_H +#ifndef VDO_THREAD_REGISTRY_H +#define VDO_THREAD_REGISTRY_H #include #include @@ -20,13 +20,13 @@ struct registered_thread { struct task_struct *task; }; -void uds_initialize_thread_registry(struct thread_registry *registry); +void vdo_initialize_thread_registry(struct thread_registry *registry); -void uds_register_thread(struct thread_registry *registry, +void vdo_register_thread(struct thread_registry *registry, struct registered_thread *new_thread, const void *pointer); -void uds_unregister_thread(struct thread_registry *registry); +void vdo_unregister_thread(struct thread_registry *registry); -const void *uds_lookup_thread(struct thread_registry *registry); +const void *vdo_lookup_thread(struct thread_registry *registry); -#endif /* UDS_THREAD_REGISTRY_H */ +#endif /* VDO_THREAD_REGISTRY_H */ From 993f5ad757cb9d3fa43d703456860eb4f0704383 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 9 Feb 2024 14:14:21 -0600 Subject: [PATCH 0894/1406] dm vdo memory-alloc: simplify allocations_allowed() Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/memory-alloc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-vdo/memory-alloc.c b/drivers/md/dm-vdo/memory-alloc.c index 3b2bda9248cbd7..f1ba6f3bef61c7 100644 --- a/drivers/md/dm-vdo/memory-alloc.c +++ b/drivers/md/dm-vdo/memory-alloc.c @@ -20,11 +20,9 @@ */ static struct thread_registry allocating_threads; -static bool allocations_allowed(void) +static inline bool allocations_allowed(void) { - const bool *pointer = vdo_lookup_thread(&allocating_threads); - - return (pointer != NULL) ? *pointer : false; + return vdo_lookup_thread(&allocating_threads) != NULL; } /* From 7462a3ef8fc4a029a69d70d66020705889c6dd69 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 9 Feb 2024 14:53:05 -0600 Subject: [PATCH 0895/1406] dm vdo thread-device: rename all methods to reflect vdo-only use Also moved vdo_init()'s call to vdo_initialize_thread_device_registry until after "UDS memory initialization" because this isn't "UDS memory". 
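For illustration, the calling pattern (unchanged by this rename) looks like this minimal hypothetical sketch; example_operation() is not a function from this series:

  #include "thread-device.h"

  static void example_operation(unsigned int *instance_ptr)
  {
          struct registered_thread instance_thread;

          vdo_register_thread_device_id(&instance_thread, instance_ptr);

          /*
           * While registered, vdo_get_thread_device_id() returns
           * *instance_ptr on this thread (the logger uses it to tag
           * messages); on an unregistered thread it returns -1.
           */

          vdo_unregister_thread_device_id();
  }

The registered_thread is stack-allocated, so each registration is strictly bracketed within one function on one thread, which matches how vdo_message() and vdo_dtr() use it.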
Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/dm-vdo-target.c | 30 +++++++++++++++--------------- drivers/md/dm-vdo/logger.c | 2 +- drivers/md/dm-vdo/thread-device.c | 10 ++++------ drivers/md/dm-vdo/thread-device.h | 8 ++++---- 4 files changed, 24 insertions(+), 26 deletions(-) diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c index e754b9e30cab58..7afd1dfec64995 100644 --- a/drivers/md/dm-vdo/dm-vdo-target.c +++ b/drivers/md/dm-vdo/dm-vdo-target.c @@ -1107,7 +1107,7 @@ static int vdo_message(struct dm_target *ti, unsigned int argc, char **argv, vdo = get_vdo_for_target(ti); uds_register_allocating_thread(&allocating_thread, NULL); - uds_register_thread_device_id(&instance_thread, &vdo->instance); + vdo_register_thread_device_id(&instance_thread, &vdo->instance); /* * Must be done here so we don't map return codes. The code in dm-ioctl expects a 1 for a @@ -1120,7 +1120,7 @@ static int vdo_message(struct dm_target *ti, unsigned int argc, char **argv, result = vdo_status_to_errno(process_vdo_message(vdo, argc, argv)); } - uds_unregister_thread_device_id(); + vdo_unregister_thread_device_id(); uds_unregister_allocating_thread(); return result; } @@ -1632,9 +1632,9 @@ static int construct_new_vdo(struct dm_target *ti, unsigned int argc, char **arg if (result != VDO_SUCCESS) return -ENOMEM; - uds_register_thread_device_id(&instance_thread, &instance); + vdo_register_thread_device_id(&instance_thread, &instance); result = construct_new_vdo_registered(ti, argc, argv, instance); - uds_unregister_thread_device_id(); + vdo_unregister_thread_device_id(); return result; } @@ -1913,9 +1913,9 @@ static int vdo_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (vdo == NULL) { result = construct_new_vdo(ti, argc, argv); } else { - uds_register_thread_device_id(&instance_thread, &vdo->instance); + vdo_register_thread_device_id(&instance_thread, &vdo->instance); result = update_existing_vdo(device_name, ti, argc, argv, vdo); - uds_unregister_thread_device_id(); + vdo_unregister_thread_device_id(); } uds_unregister_allocating_thread(); @@ -1935,7 +1935,7 @@ static void vdo_dtr(struct dm_target *ti) unsigned int instance = vdo->instance; struct registered_thread allocating_thread, instance_thread; - uds_register_thread_device_id(&instance_thread, &instance); + vdo_register_thread_device_id(&instance_thread, &instance); uds_register_allocating_thread(&allocating_thread, NULL); device_name = vdo_get_device_name(ti); @@ -1945,7 +1945,7 @@ static void vdo_dtr(struct dm_target *ti) vdo_destroy(uds_forget(vdo)); uds_log_info("device '%s' stopped", device_name); - uds_unregister_thread_device_id(); + vdo_unregister_thread_device_id(); uds_unregister_allocating_thread(); release_instance(instance); } else if (config == vdo->device_config) { @@ -2104,7 +2104,7 @@ static void vdo_postsuspend(struct dm_target *ti) const char *device_name; int result; - uds_register_thread_device_id(&instance_thread, &vdo->instance); + vdo_register_thread_device_id(&instance_thread, &vdo->instance); device_name = vdo_get_device_name(vdo->device_config->owning_target); uds_log_info("suspending device '%s'", device_name); @@ -2129,7 +2129,7 @@ static void vdo_postsuspend(struct dm_target *ti) device_name); } - uds_unregister_thread_device_id(); + vdo_unregister_thread_device_id(); } /** @@ -2846,11 +2846,11 @@ static int vdo_preresume(struct dm_target *ti) struct vdo *vdo = get_vdo_for_target(ti); int result; - uds_register_thread_device_id(&instance_thread, &vdo->instance); + 
vdo_register_thread_device_id(&instance_thread, &vdo->instance); result = vdo_preresume_registered(ti, vdo); if ((result == VDO_PARAMETER_MISMATCH) || (result == VDO_INVALID_ADMIN_STATE)) result = -EINVAL; - uds_unregister_thread_device_id(); + vdo_unregister_thread_device_id(); return vdo_status_to_errno(result); } @@ -2858,10 +2858,10 @@ static void vdo_resume(struct dm_target *ti) { struct registered_thread instance_thread; - uds_register_thread_device_id(&instance_thread, + vdo_register_thread_device_id(&instance_thread, &get_vdo_for_target(ti)->instance); uds_log_info("device '%s' resumed", vdo_get_device_name(ti)); - uds_unregister_thread_device_id(); + vdo_unregister_thread_device_id(); } /* @@ -2912,10 +2912,10 @@ static int __init vdo_init(void) /* * UDS module level initialization must be done first, as VDO initialization depends on it */ - uds_initialize_thread_device_registry(); uds_memory_init(); uds_init_sysfs(); + vdo_initialize_thread_device_registry(); vdo_initialize_device_registry_once(); uds_log_info("loaded version %s", CURRENT_VERSION); diff --git a/drivers/md/dm-vdo/logger.c b/drivers/md/dm-vdo/logger.c index 969f10771adae1..6ba7e99ee8f910 100644 --- a/drivers/md/dm-vdo/logger.c +++ b/drivers/md/dm-vdo/logger.c @@ -176,7 +176,7 @@ static void emit_log_message(int priority, const char *module, const char *prefi } /* Not at interrupt level; we have a process we can look at, and might have a device ID. */ - device_instance = uds_get_thread_device_id(); + device_instance = vdo_get_thread_device_id(); if (device_instance >= 0) { emit_log_message_to_kernel(priority, "%s%u:%s: %s%pV%pV\n", module, device_instance, current->comm, prefix, vaf1, diff --git a/drivers/md/dm-vdo/thread-device.c b/drivers/md/dm-vdo/thread-device.c index 2bf14b9f67f8ad..df13ca914db811 100644 --- a/drivers/md/dm-vdo/thread-device.c +++ b/drivers/md/dm-vdo/thread-device.c @@ -5,24 +5,22 @@ #include "thread-device.h" -#include "thread-registry.h" - /* A registry of threads associated with device id numbers. */ static struct thread_registry device_id_thread_registry; /* Any registered thread must be unregistered. */ -void uds_register_thread_device_id(struct registered_thread *new_thread, +void vdo_register_thread_device_id(struct registered_thread *new_thread, unsigned int *id_ptr) { vdo_register_thread(&device_id_thread_registry, new_thread, id_ptr); } -void uds_unregister_thread_device_id(void) +void vdo_unregister_thread_device_id(void) { vdo_unregister_thread(&device_id_thread_registry); } -int uds_get_thread_device_id(void) +int vdo_get_thread_device_id(void) { const unsigned int *pointer; @@ -30,7 +28,7 @@ int uds_get_thread_device_id(void) return (pointer != NULL) ? 
*pointer : -1; } -void uds_initialize_thread_device_registry(void) +void vdo_initialize_thread_device_registry(void) { vdo_initialize_thread_registry(&device_id_thread_registry); } diff --git a/drivers/md/dm-vdo/thread-device.h b/drivers/md/dm-vdo/thread-device.h index 428b2908541d3a..04b62307057e5a 100644 --- a/drivers/md/dm-vdo/thread-device.h +++ b/drivers/md/dm-vdo/thread-device.h @@ -8,13 +8,13 @@ #include "thread-registry.h" -void uds_register_thread_device_id(struct registered_thread *new_thread, +void vdo_register_thread_device_id(struct registered_thread *new_thread, unsigned int *id_ptr); -void uds_unregister_thread_device_id(void); +void vdo_unregister_thread_device_id(void); -int uds_get_thread_device_id(void); +int vdo_get_thread_device_id(void); -void uds_initialize_thread_device_registry(void); +void vdo_initialize_thread_device_registry(void); #endif /* UDS_THREAD_DEVICE_H */ From 6bca66a650d0a84dffd773ca1ac0a691aef328eb Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Sat, 10 Feb 2024 11:05:15 -0600 Subject: [PATCH 0896/1406] dm vdo: remove all sysfs interfaces It has been decided that all info is (or will be) accessible through alternative interfaces (e.g. "dmsetup message", module params, etc). Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/Makefile | 4 - drivers/md/dm-vdo/dedupe.c | 66 - drivers/md/dm-vdo/dm-vdo-target.c | 36 - drivers/md/dm-vdo/logger.c | 5 - drivers/md/dm-vdo/logger.h | 2 - drivers/md/dm-vdo/pool-sysfs-stats.c | 2063 -------------------------- drivers/md/dm-vdo/pool-sysfs.c | 198 --- drivers/md/dm-vdo/pool-sysfs.h | 19 - drivers/md/dm-vdo/sysfs.c | 82 - drivers/md/dm-vdo/uds-sysfs.c | 186 --- drivers/md/dm-vdo/uds-sysfs.h | 12 - drivers/md/dm-vdo/vdo.c | 53 - drivers/md/dm-vdo/vdo.h | 10 - 13 files changed, 2736 deletions(-) delete mode 100644 drivers/md/dm-vdo/pool-sysfs-stats.c delete mode 100644 drivers/md/dm-vdo/pool-sysfs.c delete mode 100644 drivers/md/dm-vdo/pool-sysfs.h delete mode 100644 drivers/md/dm-vdo/sysfs.c delete mode 100644 drivers/md/dm-vdo/uds-sysfs.c delete mode 100644 drivers/md/dm-vdo/uds-sysfs.h diff --git a/drivers/md/dm-vdo/Makefile b/drivers/md/dm-vdo/Makefile index 199f5d564724ae..923088f29ef20a 100644 --- a/drivers/md/dm-vdo/Makefile +++ b/drivers/md/dm-vdo/Makefile @@ -26,19 +26,15 @@ dm-vdo-objs := \ packer.o \ permassert.o \ physical-zone.o \ - pool-sysfs.o \ - pool-sysfs-stats.o \ priority-table.o \ recovery-journal.o \ repair.o \ slab-depot.o \ status-codes.o \ string-utils.o \ - sysfs.o \ thread-device.o \ thread-registry.o \ thread-utils.o \ - uds-sysfs.o \ vdo.o \ vio.o \ wait-queue.o \ diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index e03bf6a0c09ef1..1b1edd50e75814 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -120,7 +120,6 @@ #include #include #include -#include #include #include #include @@ -285,7 +284,6 @@ enum { struct hash_zones { struct action_manager *manager; - struct kobject dedupe_directory; struct uds_parameters parameters; struct uds_index_session *index_session; struct ratelimit_state ratelimiter; @@ -2029,56 +2027,8 @@ void vdo_share_compressed_write_lock(struct data_vio *data_vio, ASSERT_LOG_ONLY(claimed, "impossible to fail to claim an initial increment"); } -static void dedupe_kobj_release(struct kobject *directory) -{ - uds_free(container_of(directory, struct hash_zones, dedupe_directory)); -} - -static ssize_t dedupe_status_show(struct kobject *directory, struct attribute *attr, - char *buf) -{ - struct uds_attribute *ua = 
container_of(attr, struct uds_attribute, attr); - struct hash_zones *zones = container_of(directory, struct hash_zones, - dedupe_directory); - - if (ua->show_string != NULL) - return sprintf(buf, "%s\n", ua->show_string(zones)); - else - return -EINVAL; -} - -static ssize_t dedupe_status_store(struct kobject *kobj __always_unused, - struct attribute *attr __always_unused, - const char *buf __always_unused, - size_t length __always_unused) -{ - return -EINVAL; -} - /*----------------------------------------------------------------------*/ -static const struct sysfs_ops dedupe_sysfs_ops = { - .show = dedupe_status_show, - .store = dedupe_status_store, -}; - -static struct uds_attribute dedupe_status_attribute = { - .attr = {.name = "status", .mode = 0444, }, - .show_string = vdo_get_dedupe_index_state_name, -}; - -static struct attribute *dedupe_attrs[] = { - &dedupe_status_attribute.attr, - NULL, -}; -ATTRIBUTE_GROUPS(dedupe); - -static const struct kobj_type dedupe_directory_type = { - .release = dedupe_kobj_release, - .sysfs_ops = &dedupe_sysfs_ops, - .default_groups = dedupe_groups, -}; - static void start_uds_queue(void *ptr) { /* @@ -2273,7 +2223,6 @@ static int initialize_index(struct vdo *vdo, struct hash_zones *zones) vdo_initialize_completion(&zones->completion, vdo, VDO_HASH_ZONES_COMPLETION); vdo_set_completion_callback(&zones->completion, change_dedupe_state, vdo->thread_config.dedupe_thread); - kobject_init(&zones->dedupe_directory, &dedupe_directory_type); return VDO_SUCCESS; } @@ -2546,8 +2495,6 @@ void vdo_free_hash_zones(struct hash_zones *zones) ratelimit_state_exit(&zones->ratelimiter); if (vdo_get_admin_state_code(&zones->state) == VDO_ADMIN_STATE_NEW) uds_free(zones); - else - kobject_put(&zones->dedupe_directory); } static void initiate_suspend_index(struct admin_state *state) @@ -3054,19 +3001,6 @@ int vdo_message_dedupe_index(struct hash_zones *zones, const char *name) return -EINVAL; } -int vdo_add_dedupe_index_sysfs(struct hash_zones *zones) -{ - int result = kobject_add(&zones->dedupe_directory, - &zones->completion.vdo->vdo_directory, "dedupe"); - - if (result == 0) { - vdo_set_admin_state_code(&zones->state, - VDO_ADMIN_STATE_NORMAL_OPERATION); - } - - return result; -} - /* If create_flag, create a new index without first attempting to load an existing index. 
*/ void vdo_start_dedupe_index(struct hash_zones *zones, bool create_flag) { diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c index 7afd1dfec64995..d253c7078a798e 100644 --- a/drivers/md/dm-vdo/dm-vdo-target.c +++ b/drivers/md/dm-vdo/dm-vdo-target.c @@ -27,7 +27,6 @@ #include "logger.h" #include "memory-alloc.h" #include "message-stats.h" -#include "pool-sysfs.h" #include "recovery-journal.h" #include "repair.h" #include "slab-depot.h" @@ -36,7 +35,6 @@ #include "thread-device.h" #include "thread-registry.h" #include "types.h" -#include "uds-sysfs.h" #include "vdo.h" #include "vio.h" @@ -54,7 +52,6 @@ enum { GROW_PHYSICAL_PHASE_END, GROW_PHYSICAL_PHASE_ERROR, LOAD_PHASE_START, - LOAD_PHASE_STATS, LOAD_PHASE_LOAD_DEPOT, LOAD_PHASE_MAKE_DIRTY, LOAD_PHASE_PREPARE_TO_ALLOCATE, @@ -104,7 +101,6 @@ static const char * const ADMIN_PHASE_NAMES[] = { "GROW_PHYSICAL_PHASE_END", "GROW_PHYSICAL_PHASE_ERROR", "LOAD_PHASE_START", - "LOAD_PHASE_STATS", "LOAD_PHASE_LOAD_DEPOT", "LOAD_PHASE_MAKE_DIRTY", "LOAD_PHASE_PREPARE_TO_ALLOCATE", @@ -2180,32 +2176,6 @@ static enum slab_depot_load_type get_load_type(struct vdo *vdo) return VDO_SLAB_DEPOT_NORMAL_LOAD; } -/** - * vdo_initialize_kobjects() - Initialize the vdo sysfs directory. - * @vdo: The vdo being initialized. - * - * Return: VDO_SUCCESS or an error code. - */ -static int vdo_initialize_kobjects(struct vdo *vdo) -{ - int result; - struct dm_target *target = vdo->device_config->owning_target; - struct mapped_device *md = dm_table_get_md(target->table); - - kobject_init(&vdo->vdo_directory, &vdo_directory_type); - vdo->sysfs_added = true; - result = kobject_add(&vdo->vdo_directory, &disk_to_dev(dm_disk(md))->kobj, - "vdo"); - if (result != 0) - return VDO_CANT_ADD_SYSFS_NODE; - - result = vdo_add_dedupe_index_sysfs(vdo->hash_zones); - if (result != 0) - return VDO_CANT_ADD_SYSFS_NODE; - - return vdo_add_sysfs_stats_dir(vdo); -} - /** * load_callback() - Callback to do the destructive parts of loading a VDO. * @completion: The sub-task completion. @@ -2231,10 +2201,6 @@ static void load_callback(struct vdo_completion *completion) vdo_allow_read_only_mode_entry(completion); return; - case LOAD_PHASE_STATS: - vdo_continue_completion(completion, vdo_initialize_kobjects(vdo)); - return; - case LOAD_PHASE_LOAD_DEPOT: if (vdo_is_read_only(vdo)) { /* @@ -2913,7 +2879,6 @@ static int __init vdo_init(void) * UDS module level initialization must be done first, as VDO initialization depends on it */ uds_memory_init(); - uds_init_sysfs(); vdo_initialize_thread_device_registry(); vdo_initialize_device_registry_once(); @@ -2945,7 +2910,6 @@ static void __exit vdo_exit(void) * UDS module level exit processing must be done after all VDO module exit processing is * complete. 
*/ - uds_put_sysfs(); uds_memory_exit(); } diff --git a/drivers/md/dm-vdo/logger.c b/drivers/md/dm-vdo/logger.c index 6ba7e99ee8f910..19a45f41d48f9c 100644 --- a/drivers/md/dm-vdo/logger.c +++ b/drivers/md/dm-vdo/logger.c @@ -55,11 +55,6 @@ int uds_get_log_level(void) return log_level; } -void uds_set_log_level(int new_log_level) -{ - log_level = new_log_level; -} - int uds_log_string_to_priority(const char *string) { int i; diff --git a/drivers/md/dm-vdo/logger.h b/drivers/md/dm-vdo/logger.h index 4e2f18042ba717..ceb07aa3231fb3 100644 --- a/drivers/md/dm-vdo/logger.h +++ b/drivers/md/dm-vdo/logger.h @@ -37,8 +37,6 @@ int uds_get_log_level(void); -void uds_set_log_level(int new_log_level); - int uds_log_string_to_priority(const char *string); const char *uds_log_priority_to_string(int priority); diff --git a/drivers/md/dm-vdo/pool-sysfs-stats.c b/drivers/md/dm-vdo/pool-sysfs-stats.c deleted file mode 100644 index ae3838894a1c8d..00000000000000 --- a/drivers/md/dm-vdo/pool-sysfs-stats.c +++ /dev/null @@ -1,2063 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2023 Red Hat - */ - -#include - -#include "logger.h" -#include "string-utils.h" - -#include "dedupe.h" -#include "pool-sysfs.h" -#include "statistics.h" -#include "vdo.h" - -struct pool_stats_attribute { - struct attribute attr; - ssize_t (*print)(struct vdo_statistics *stats, char *buf); -}; - -static ssize_t pool_stats_attr_show(struct kobject *directory, - struct attribute *attr, - char *buf) -{ - ssize_t size; - struct pool_stats_attribute *pool_stats_attr = - container_of(attr, struct pool_stats_attribute, attr); - struct vdo *vdo = container_of(directory, struct vdo, stats_directory); - - if (pool_stats_attr->print == NULL) - return -EINVAL; - - mutex_lock(&vdo->stats_mutex); - vdo_fetch_statistics(vdo, &vdo->stats_buffer); - size = pool_stats_attr->print(&vdo->stats_buffer, buf); - mutex_unlock(&vdo->stats_mutex); - - return size; -} - -const struct sysfs_ops vdo_pool_stats_sysfs_ops = { - .show = pool_stats_attr_show, - .store = NULL, -}; - -/* Number of blocks used for data */ -static ssize_t -pool_stats_print_data_blocks_used(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->data_blocks_used); -} - -static struct pool_stats_attribute pool_stats_attr_data_blocks_used = { - .attr = { .name = "data_blocks_used", .mode = 0444, }, - .print = pool_stats_print_data_blocks_used, -}; - -/* Number of blocks used for VDO metadata */ -static ssize_t -pool_stats_print_overhead_blocks_used(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->overhead_blocks_used); -} - -static struct pool_stats_attribute pool_stats_attr_overhead_blocks_used = { - .attr = { .name = "overhead_blocks_used", .mode = 0444, }, - .print = pool_stats_print_overhead_blocks_used, -}; - -/* Number of logical blocks that are currently mapped to physical blocks */ -static ssize_t -pool_stats_print_logical_blocks_used(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->logical_blocks_used); -} - -static struct pool_stats_attribute pool_stats_attr_logical_blocks_used = { - .attr = { .name = "logical_blocks_used", .mode = 0444, }, - .print = pool_stats_print_logical_blocks_used, -}; - -/* number of physical blocks */ -static ssize_t -pool_stats_print_physical_blocks(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->physical_blocks); -} - -static struct pool_stats_attribute pool_stats_attr_physical_blocks = { - .attr = { .name = 
"physical_blocks", .mode = 0444, }, - .print = pool_stats_print_physical_blocks, -}; - -/* number of logical blocks */ -static ssize_t -pool_stats_print_logical_blocks(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->logical_blocks); -} - -static struct pool_stats_attribute pool_stats_attr_logical_blocks = { - .attr = { .name = "logical_blocks", .mode = 0444, }, - .print = pool_stats_print_logical_blocks, -}; - -/* Size of the block map page cache, in bytes */ -static ssize_t -pool_stats_print_block_map_cache_size(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->block_map_cache_size); -} - -static struct pool_stats_attribute pool_stats_attr_block_map_cache_size = { - .attr = { .name = "block_map_cache_size", .mode = 0444, }, - .print = pool_stats_print_block_map_cache_size, -}; - -/* The physical block size */ -static ssize_t -pool_stats_print_block_size(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->block_size); -} - -static struct pool_stats_attribute pool_stats_attr_block_size = { - .attr = { .name = "block_size", .mode = 0444, }, - .print = pool_stats_print_block_size, -}; - -/* Number of times the VDO has successfully recovered */ -static ssize_t -pool_stats_print_complete_recoveries(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->complete_recoveries); -} - -static struct pool_stats_attribute pool_stats_attr_complete_recoveries = { - .attr = { .name = "complete_recoveries", .mode = 0444, }, - .print = pool_stats_print_complete_recoveries, -}; - -/* Number of times the VDO has recovered from read-only mode */ -static ssize_t -pool_stats_print_read_only_recoveries(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->read_only_recoveries); -} - -static struct pool_stats_attribute pool_stats_attr_read_only_recoveries = { - .attr = { .name = "read_only_recoveries", .mode = 0444, }, - .print = pool_stats_print_read_only_recoveries, -}; - -/* String describing the operating mode of the VDO */ -static ssize_t -pool_stats_print_mode(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%s\n", stats->mode); -} - -static struct pool_stats_attribute pool_stats_attr_mode = { - .attr = { .name = "mode", .mode = 0444, }, - .print = pool_stats_print_mode, -}; - -/* Whether the VDO is in recovery mode */ -static ssize_t -pool_stats_print_in_recovery_mode(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%d\n", stats->in_recovery_mode); -} - -static struct pool_stats_attribute pool_stats_attr_in_recovery_mode = { - .attr = { .name = "in_recovery_mode", .mode = 0444, }, - .print = pool_stats_print_in_recovery_mode, -}; - -/* What percentage of recovery mode work has been completed */ -static ssize_t -pool_stats_print_recovery_percentage(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%u\n", stats->recovery_percentage); -} - -static struct pool_stats_attribute pool_stats_attr_recovery_percentage = { - .attr = { .name = "recovery_percentage", .mode = 0444, }, - .print = pool_stats_print_recovery_percentage, -}; - -/* Number of compressed data items written since startup */ -static ssize_t -pool_stats_print_packer_compressed_fragments_written(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->packer.compressed_fragments_written); -} - -static struct pool_stats_attribute pool_stats_attr_packer_compressed_fragments_written = { - .attr = { .name = 
"packer_compressed_fragments_written", .mode = 0444, }, - .print = pool_stats_print_packer_compressed_fragments_written, -}; - -/* Number of blocks containing compressed items written since startup */ -static ssize_t -pool_stats_print_packer_compressed_blocks_written(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->packer.compressed_blocks_written); -} - -static struct pool_stats_attribute pool_stats_attr_packer_compressed_blocks_written = { - .attr = { .name = "packer_compressed_blocks_written", .mode = 0444, }, - .print = pool_stats_print_packer_compressed_blocks_written, -}; - -/* Number of VIOs that are pending in the packer */ -static ssize_t -pool_stats_print_packer_compressed_fragments_in_packer(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->packer.compressed_fragments_in_packer); -} - -static struct pool_stats_attribute pool_stats_attr_packer_compressed_fragments_in_packer = { - .attr = { .name = "packer_compressed_fragments_in_packer", .mode = 0444, }, - .print = pool_stats_print_packer_compressed_fragments_in_packer, -}; - -/* The total number of slabs from which blocks may be allocated */ -static ssize_t -pool_stats_print_allocator_slab_count(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->allocator.slab_count); -} - -static struct pool_stats_attribute pool_stats_attr_allocator_slab_count = { - .attr = { .name = "allocator_slab_count", .mode = 0444, }, - .print = pool_stats_print_allocator_slab_count, -}; - -/* The total number of slabs from which blocks have ever been allocated */ -static ssize_t -pool_stats_print_allocator_slabs_opened(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->allocator.slabs_opened); -} - -static struct pool_stats_attribute pool_stats_attr_allocator_slabs_opened = { - .attr = { .name = "allocator_slabs_opened", .mode = 0444, }, - .print = pool_stats_print_allocator_slabs_opened, -}; - -/* The number of times since loading that a slab has been re-opened */ -static ssize_t -pool_stats_print_allocator_slabs_reopened(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->allocator.slabs_reopened); -} - -static struct pool_stats_attribute pool_stats_attr_allocator_slabs_reopened = { - .attr = { .name = "allocator_slabs_reopened", .mode = 0444, }, - .print = pool_stats_print_allocator_slabs_reopened, -}; - -/* Number of times the on-disk journal was full */ -static ssize_t -pool_stats_print_journal_disk_full(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->journal.disk_full); -} - -static struct pool_stats_attribute pool_stats_attr_journal_disk_full = { - .attr = { .name = "journal_disk_full", .mode = 0444, }, - .print = pool_stats_print_journal_disk_full, -}; - -/* Number of times the recovery journal requested slab journal commits. 
*/ -static ssize_t -pool_stats_print_journal_slab_journal_commits_requested(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->journal.slab_journal_commits_requested); -} - -static struct pool_stats_attribute pool_stats_attr_journal_slab_journal_commits_requested = { - .attr = { .name = "journal_slab_journal_commits_requested", .mode = 0444, }, - .print = pool_stats_print_journal_slab_journal_commits_requested, -}; - -/* The total number of items on which processing has started */ -static ssize_t -pool_stats_print_journal_entries_started(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->journal.entries.started); -} - -static struct pool_stats_attribute pool_stats_attr_journal_entries_started = { - .attr = { .name = "journal_entries_started", .mode = 0444, }, - .print = pool_stats_print_journal_entries_started, -}; - -/* The total number of items for which a write operation has been issued */ -static ssize_t -pool_stats_print_journal_entries_written(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->journal.entries.written); -} - -static struct pool_stats_attribute pool_stats_attr_journal_entries_written = { - .attr = { .name = "journal_entries_written", .mode = 0444, }, - .print = pool_stats_print_journal_entries_written, -}; - -/* The total number of items for which a write operation has completed */ -static ssize_t -pool_stats_print_journal_entries_committed(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->journal.entries.committed); -} - -static struct pool_stats_attribute pool_stats_attr_journal_entries_committed = { - .attr = { .name = "journal_entries_committed", .mode = 0444, }, - .print = pool_stats_print_journal_entries_committed, -}; - -/* The total number of items on which processing has started */ -static ssize_t -pool_stats_print_journal_blocks_started(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->journal.blocks.started); -} - -static struct pool_stats_attribute pool_stats_attr_journal_blocks_started = { - .attr = { .name = "journal_blocks_started", .mode = 0444, }, - .print = pool_stats_print_journal_blocks_started, -}; - -/* The total number of items for which a write operation has been issued */ -static ssize_t -pool_stats_print_journal_blocks_written(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->journal.blocks.written); -} - -static struct pool_stats_attribute pool_stats_attr_journal_blocks_written = { - .attr = { .name = "journal_blocks_written", .mode = 0444, }, - .print = pool_stats_print_journal_blocks_written, -}; - -/* The total number of items for which a write operation has completed */ -static ssize_t -pool_stats_print_journal_blocks_committed(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->journal.blocks.committed); -} - -static struct pool_stats_attribute pool_stats_attr_journal_blocks_committed = { - .attr = { .name = "journal_blocks_committed", .mode = 0444, }, - .print = pool_stats_print_journal_blocks_committed, -}; - -/* Number of times the on-disk journal was full */ -static ssize_t -pool_stats_print_slab_journal_disk_full_count(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->slab_journal.disk_full_count); -} - -static struct pool_stats_attribute pool_stats_attr_slab_journal_disk_full_count = { - .attr = { .name = "slab_journal_disk_full_count", .mode = 0444, }, - .print = 
pool_stats_print_slab_journal_disk_full_count, -}; - -/* Number of times an entry was added over the flush threshold */ -static ssize_t -pool_stats_print_slab_journal_flush_count(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->slab_journal.flush_count); -} - -static struct pool_stats_attribute pool_stats_attr_slab_journal_flush_count = { - .attr = { .name = "slab_journal_flush_count", .mode = 0444, }, - .print = pool_stats_print_slab_journal_flush_count, -}; - -/* Number of times an entry was added over the block threshold */ -static ssize_t -pool_stats_print_slab_journal_blocked_count(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->slab_journal.blocked_count); -} - -static struct pool_stats_attribute pool_stats_attr_slab_journal_blocked_count = { - .attr = { .name = "slab_journal_blocked_count", .mode = 0444, }, - .print = pool_stats_print_slab_journal_blocked_count, -}; - -/* Number of times a tail block was written */ -static ssize_t -pool_stats_print_slab_journal_blocks_written(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->slab_journal.blocks_written); -} - -static struct pool_stats_attribute pool_stats_attr_slab_journal_blocks_written = { - .attr = { .name = "slab_journal_blocks_written", .mode = 0444, }, - .print = pool_stats_print_slab_journal_blocks_written, -}; - -/* Number of times we had to wait for the tail to write */ -static ssize_t -pool_stats_print_slab_journal_tail_busy_count(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->slab_journal.tail_busy_count); -} - -static struct pool_stats_attribute pool_stats_attr_slab_journal_tail_busy_count = { - .attr = { .name = "slab_journal_tail_busy_count", .mode = 0444, }, - .print = pool_stats_print_slab_journal_tail_busy_count, -}; - -/* Number of blocks written */ -static ssize_t -pool_stats_print_slab_summary_blocks_written(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->slab_summary.blocks_written); -} - -static struct pool_stats_attribute pool_stats_attr_slab_summary_blocks_written = { - .attr = { .name = "slab_summary_blocks_written", .mode = 0444, }, - .print = pool_stats_print_slab_summary_blocks_written, -}; - -/* Number of reference blocks written */ -static ssize_t -pool_stats_print_ref_counts_blocks_written(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->ref_counts.blocks_written); -} - -static struct pool_stats_attribute pool_stats_attr_ref_counts_blocks_written = { - .attr = { .name = "ref_counts_blocks_written", .mode = 0444, }, - .print = pool_stats_print_ref_counts_blocks_written, -}; - -/* number of dirty (resident) pages */ -static ssize_t -pool_stats_print_block_map_dirty_pages(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%u\n", stats->block_map.dirty_pages); -} - -static struct pool_stats_attribute pool_stats_attr_block_map_dirty_pages = { - .attr = { .name = "block_map_dirty_pages", .mode = 0444, }, - .print = pool_stats_print_block_map_dirty_pages, -}; - -/* number of clean (resident) pages */ -static ssize_t -pool_stats_print_block_map_clean_pages(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%u\n", stats->block_map.clean_pages); -} - -static struct pool_stats_attribute pool_stats_attr_block_map_clean_pages = { - .attr = { .name = "block_map_clean_pages", .mode = 0444, }, - .print = pool_stats_print_block_map_clean_pages, -}; - -/* number of free pages */ 
-static ssize_t -pool_stats_print_block_map_free_pages(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%u\n", stats->block_map.free_pages); -} - -static struct pool_stats_attribute pool_stats_attr_block_map_free_pages = { - .attr = { .name = "block_map_free_pages", .mode = 0444, }, - .print = pool_stats_print_block_map_free_pages, -}; - -/* number of pages in failed state */ -static ssize_t -pool_stats_print_block_map_failed_pages(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%u\n", stats->block_map.failed_pages); -} - -static struct pool_stats_attribute pool_stats_attr_block_map_failed_pages = { - .attr = { .name = "block_map_failed_pages", .mode = 0444, }, - .print = pool_stats_print_block_map_failed_pages, -}; - -/* number of pages incoming */ -static ssize_t -pool_stats_print_block_map_incoming_pages(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%u\n", stats->block_map.incoming_pages); -} - -static struct pool_stats_attribute pool_stats_attr_block_map_incoming_pages = { - .attr = { .name = "block_map_incoming_pages", .mode = 0444, }, - .print = pool_stats_print_block_map_incoming_pages, -}; - -/* number of pages outgoing */ -static ssize_t -pool_stats_print_block_map_outgoing_pages(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%u\n", stats->block_map.outgoing_pages); -} - -static struct pool_stats_attribute pool_stats_attr_block_map_outgoing_pages = { - .attr = { .name = "block_map_outgoing_pages", .mode = 0444, }, - .print = pool_stats_print_block_map_outgoing_pages, -}; - -/* how many times free page not avail */ -static ssize_t -pool_stats_print_block_map_cache_pressure(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%u\n", stats->block_map.cache_pressure); -} - -static struct pool_stats_attribute pool_stats_attr_block_map_cache_pressure = { - .attr = { .name = "block_map_cache_pressure", .mode = 0444, }, - .print = pool_stats_print_block_map_cache_pressure, -}; - -/* number of get_vdo_page() calls for read */ -static ssize_t -pool_stats_print_block_map_read_count(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->block_map.read_count); -} - -static struct pool_stats_attribute pool_stats_attr_block_map_read_count = { - .attr = { .name = "block_map_read_count", .mode = 0444, }, - .print = pool_stats_print_block_map_read_count, -}; - -/* number of get_vdo_page() calls for write */ -static ssize_t -pool_stats_print_block_map_write_count(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->block_map.write_count); -} - -static struct pool_stats_attribute pool_stats_attr_block_map_write_count = { - .attr = { .name = "block_map_write_count", .mode = 0444, }, - .print = pool_stats_print_block_map_write_count, -}; - -/* number of times pages failed to read */ -static ssize_t -pool_stats_print_block_map_failed_reads(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->block_map.failed_reads); -} - -static struct pool_stats_attribute pool_stats_attr_block_map_failed_reads = { - .attr = { .name = "block_map_failed_reads", .mode = 0444, }, - .print = pool_stats_print_block_map_failed_reads, -}; - -/* number of times pages failed to write */ -static ssize_t -pool_stats_print_block_map_failed_writes(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->block_map.failed_writes); -} - -static struct pool_stats_attribute pool_stats_attr_block_map_failed_writes = { - .attr = { .name 
= "block_map_failed_writes", .mode = 0444, }, - .print = pool_stats_print_block_map_failed_writes, -}; - -/* number of gets that are reclaimed */ -static ssize_t -pool_stats_print_block_map_reclaimed(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->block_map.reclaimed); -} - -static struct pool_stats_attribute pool_stats_attr_block_map_reclaimed = { - .attr = { .name = "block_map_reclaimed", .mode = 0444, }, - .print = pool_stats_print_block_map_reclaimed, -}; - -/* number of gets for outgoing pages */ -static ssize_t -pool_stats_print_block_map_read_outgoing(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->block_map.read_outgoing); -} - -static struct pool_stats_attribute pool_stats_attr_block_map_read_outgoing = { - .attr = { .name = "block_map_read_outgoing", .mode = 0444, }, - .print = pool_stats_print_block_map_read_outgoing, -}; - -/* number of gets that were already there */ -static ssize_t -pool_stats_print_block_map_found_in_cache(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->block_map.found_in_cache); -} - -static struct pool_stats_attribute pool_stats_attr_block_map_found_in_cache = { - .attr = { .name = "block_map_found_in_cache", .mode = 0444, }, - .print = pool_stats_print_block_map_found_in_cache, -}; - -/* number of gets requiring discard */ -static ssize_t -pool_stats_print_block_map_discard_required(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->block_map.discard_required); -} - -static struct pool_stats_attribute pool_stats_attr_block_map_discard_required = { - .attr = { .name = "block_map_discard_required", .mode = 0444, }, - .print = pool_stats_print_block_map_discard_required, -}; - -/* number of gets enqueued for their page */ -static ssize_t -pool_stats_print_block_map_wait_for_page(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->block_map.wait_for_page); -} - -static struct pool_stats_attribute pool_stats_attr_block_map_wait_for_page = { - .attr = { .name = "block_map_wait_for_page", .mode = 0444, }, - .print = pool_stats_print_block_map_wait_for_page, -}; - -/* number of gets that have to fetch */ -static ssize_t -pool_stats_print_block_map_fetch_required(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->block_map.fetch_required); -} - -static struct pool_stats_attribute pool_stats_attr_block_map_fetch_required = { - .attr = { .name = "block_map_fetch_required", .mode = 0444, }, - .print = pool_stats_print_block_map_fetch_required, -}; - -/* number of page fetches */ -static ssize_t -pool_stats_print_block_map_pages_loaded(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->block_map.pages_loaded); -} - -static struct pool_stats_attribute pool_stats_attr_block_map_pages_loaded = { - .attr = { .name = "block_map_pages_loaded", .mode = 0444, }, - .print = pool_stats_print_block_map_pages_loaded, -}; - -/* number of page saves */ -static ssize_t -pool_stats_print_block_map_pages_saved(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->block_map.pages_saved); -} - -static struct pool_stats_attribute pool_stats_attr_block_map_pages_saved = { - .attr = { .name = "block_map_pages_saved", .mode = 0444, }, - .print = pool_stats_print_block_map_pages_saved, -}; - -/* the number of flushes issued */ -static ssize_t -pool_stats_print_block_map_flush_count(struct vdo_statistics *stats, char *buf) -{ - return 
sprintf(buf, "%llu\n", stats->block_map.flush_count); -} - -static struct pool_stats_attribute pool_stats_attr_block_map_flush_count = { - .attr = { .name = "block_map_flush_count", .mode = 0444, }, - .print = pool_stats_print_block_map_flush_count, -}; - -/* Number of times the UDS advice proved correct */ -static ssize_t -pool_stats_print_hash_lock_dedupe_advice_valid(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->hash_lock.dedupe_advice_valid); -} - -static struct pool_stats_attribute pool_stats_attr_hash_lock_dedupe_advice_valid = { - .attr = { .name = "hash_lock_dedupe_advice_valid", .mode = 0444, }, - .print = pool_stats_print_hash_lock_dedupe_advice_valid, -}; - -/* Number of times the UDS advice proved incorrect */ -static ssize_t -pool_stats_print_hash_lock_dedupe_advice_stale(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->hash_lock.dedupe_advice_stale); -} - -static struct pool_stats_attribute pool_stats_attr_hash_lock_dedupe_advice_stale = { - .attr = { .name = "hash_lock_dedupe_advice_stale", .mode = 0444, }, - .print = pool_stats_print_hash_lock_dedupe_advice_stale, -}; - -/* Number of writes with the same data as another in-flight write */ -static ssize_t -pool_stats_print_hash_lock_concurrent_data_matches(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->hash_lock.concurrent_data_matches); -} - -static struct pool_stats_attribute pool_stats_attr_hash_lock_concurrent_data_matches = { - .attr = { .name = "hash_lock_concurrent_data_matches", .mode = 0444, }, - .print = pool_stats_print_hash_lock_concurrent_data_matches, -}; - -/* Number of writes whose hash collided with an in-flight write */ -static ssize_t -pool_stats_print_hash_lock_concurrent_hash_collisions(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->hash_lock.concurrent_hash_collisions); -} - -static struct pool_stats_attribute pool_stats_attr_hash_lock_concurrent_hash_collisions = { - .attr = { .name = "hash_lock_concurrent_hash_collisions", .mode = 0444, }, - .print = pool_stats_print_hash_lock_concurrent_hash_collisions, -}; - -/* Current number of dedupe queries that are in flight */ -static ssize_t -pool_stats_print_hash_lock_curr_dedupe_queries(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%u\n", stats->hash_lock.curr_dedupe_queries); -} - -static struct pool_stats_attribute pool_stats_attr_hash_lock_curr_dedupe_queries = { - .attr = { .name = "hash_lock_curr_dedupe_queries", .mode = 0444, }, - .print = pool_stats_print_hash_lock_curr_dedupe_queries, -}; - -/* number of times VDO got an invalid dedupe advice PBN from UDS */ -static ssize_t -pool_stats_print_errors_invalid_advice_pbn_count(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->errors.invalid_advice_pbn_count); -} - -static struct pool_stats_attribute pool_stats_attr_errors_invalid_advice_pbn_count = { - .attr = { .name = "errors_invalid_advice_pbn_count", .mode = 0444, }, - .print = pool_stats_print_errors_invalid_advice_pbn_count, -}; - -/* number of times a VIO completed with a VDO_NO_SPACE error */ -static ssize_t -pool_stats_print_errors_no_space_error_count(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->errors.no_space_error_count); -} - -static struct pool_stats_attribute pool_stats_attr_errors_no_space_error_count = { - .attr = { .name = "errors_no_space_error_count", .mode = 0444, }, - .print = 
pool_stats_print_errors_no_space_error_count, -}; - -/* number of times a VIO completed with a VDO_READ_ONLY error */ -static ssize_t -pool_stats_print_errors_read_only_error_count(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->errors.read_only_error_count); -} - -static struct pool_stats_attribute pool_stats_attr_errors_read_only_error_count = { - .attr = { .name = "errors_read_only_error_count", .mode = 0444, }, - .print = pool_stats_print_errors_read_only_error_count, -}; - -/* The VDO instance */ -static ssize_t -pool_stats_print_instance(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%u\n", stats->instance); -} - -static struct pool_stats_attribute pool_stats_attr_instance = { - .attr = { .name = "instance", .mode = 0444, }, - .print = pool_stats_print_instance, -}; - -/* Current number of active VIOs */ -static ssize_t -pool_stats_print_current_vios_in_progress(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%u\n", stats->current_vios_in_progress); -} - -static struct pool_stats_attribute pool_stats_attr_current_vios_in_progress = { - .attr = { .name = "current_vios_in_progress", .mode = 0444, }, - .print = pool_stats_print_current_vios_in_progress, -}; - -/* Maximum number of active VIOs */ -static ssize_t -pool_stats_print_max_vios(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%u\n", stats->max_vios); -} - -static struct pool_stats_attribute pool_stats_attr_max_vios = { - .attr = { .name = "max_vios", .mode = 0444, }, - .print = pool_stats_print_max_vios, -}; - -/* Number of times the UDS index was too slow in responding */ -static ssize_t -pool_stats_print_dedupe_advice_timeouts(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->dedupe_advice_timeouts); -} - -static struct pool_stats_attribute pool_stats_attr_dedupe_advice_timeouts = { - .attr = { .name = "dedupe_advice_timeouts", .mode = 0444, }, - .print = pool_stats_print_dedupe_advice_timeouts, -}; - -/* Number of flush requests submitted to the storage device */ -static ssize_t -pool_stats_print_flush_out(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->flush_out); -} - -static struct pool_stats_attribute pool_stats_attr_flush_out = { - .attr = { .name = "flush_out", .mode = 0444, }, - .print = pool_stats_print_flush_out, -}; - -/* Logical block size */ -static ssize_t -pool_stats_print_logical_block_size(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->logical_block_size); -} - -static struct pool_stats_attribute pool_stats_attr_logical_block_size = { - .attr = { .name = "logical_block_size", .mode = 0444, }, - .print = pool_stats_print_logical_block_size, -}; - -/* Number of REQ_OP_READ bios */ -static ssize_t -pool_stats_print_bios_in_read(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_in.read); -} - -static struct pool_stats_attribute pool_stats_attr_bios_in_read = { - .attr = { .name = "bios_in_read", .mode = 0444, }, - .print = pool_stats_print_bios_in_read, -}; - -/* Number of REQ_OP_WRITE bios with data */ -static ssize_t -pool_stats_print_bios_in_write(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_in.write); -} - -static struct pool_stats_attribute pool_stats_attr_bios_in_write = { - .attr = { .name = "bios_in_write", .mode = 0444, }, - .print = pool_stats_print_bios_in_write, -}; - -/* Number of bios tagged with REQ_PREFLUSH and containing no 
data */ -static ssize_t -pool_stats_print_bios_in_empty_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_in.empty_flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_in_empty_flush = { - .attr = { .name = "bios_in_empty_flush", .mode = 0444, }, - .print = pool_stats_print_bios_in_empty_flush, -}; - -/* Number of REQ_OP_DISCARD bios */ -static ssize_t -pool_stats_print_bios_in_discard(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_in.discard); -} - -static struct pool_stats_attribute pool_stats_attr_bios_in_discard = { - .attr = { .name = "bios_in_discard", .mode = 0444, }, - .print = pool_stats_print_bios_in_discard, -}; - -/* Number of bios tagged with REQ_PREFLUSH */ -static ssize_t -pool_stats_print_bios_in_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_in.flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_in_flush = { - .attr = { .name = "bios_in_flush", .mode = 0444, }, - .print = pool_stats_print_bios_in_flush, -}; - -/* Number of bios tagged with REQ_FUA */ -static ssize_t -pool_stats_print_bios_in_fua(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_in.fua); -} - -static struct pool_stats_attribute pool_stats_attr_bios_in_fua = { - .attr = { .name = "bios_in_fua", .mode = 0444, }, - .print = pool_stats_print_bios_in_fua, -}; - -/* Number of REQ_OP_READ bios */ -static ssize_t -pool_stats_print_bios_in_partial_read(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_in_partial.read); -} - -static struct pool_stats_attribute pool_stats_attr_bios_in_partial_read = { - .attr = { .name = "bios_in_partial_read", .mode = 0444, }, - .print = pool_stats_print_bios_in_partial_read, -}; - -/* Number of REQ_OP_WRITE bios with data */ -static ssize_t -pool_stats_print_bios_in_partial_write(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_in_partial.write); -} - -static struct pool_stats_attribute pool_stats_attr_bios_in_partial_write = { - .attr = { .name = "bios_in_partial_write", .mode = 0444, }, - .print = pool_stats_print_bios_in_partial_write, -}; - -/* Number of bios tagged with REQ_PREFLUSH and containing no data */ -static ssize_t -pool_stats_print_bios_in_partial_empty_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_in_partial.empty_flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_in_partial_empty_flush = { - .attr = { .name = "bios_in_partial_empty_flush", .mode = 0444, }, - .print = pool_stats_print_bios_in_partial_empty_flush, -}; - -/* Number of REQ_OP_DISCARD bios */ -static ssize_t -pool_stats_print_bios_in_partial_discard(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_in_partial.discard); -} - -static struct pool_stats_attribute pool_stats_attr_bios_in_partial_discard = { - .attr = { .name = "bios_in_partial_discard", .mode = 0444, }, - .print = pool_stats_print_bios_in_partial_discard, -}; - -/* Number of bios tagged with REQ_PREFLUSH */ -static ssize_t -pool_stats_print_bios_in_partial_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_in_partial.flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_in_partial_flush = { - .attr = { .name = "bios_in_partial_flush", .mode = 0444, }, - .print = pool_stats_print_bios_in_partial_flush, -}; 
- -/* Number of bios tagged with REQ_FUA */ -static ssize_t -pool_stats_print_bios_in_partial_fua(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_in_partial.fua); -} - -static struct pool_stats_attribute pool_stats_attr_bios_in_partial_fua = { - .attr = { .name = "bios_in_partial_fua", .mode = 0444, }, - .print = pool_stats_print_bios_in_partial_fua, -}; - -/* Number of REQ_OP_READ bios */ -static ssize_t -pool_stats_print_bios_out_read(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_out.read); -} - -static struct pool_stats_attribute pool_stats_attr_bios_out_read = { - .attr = { .name = "bios_out_read", .mode = 0444, }, - .print = pool_stats_print_bios_out_read, -}; - -/* Number of REQ_OP_WRITE bios with data */ -static ssize_t -pool_stats_print_bios_out_write(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_out.write); -} - -static struct pool_stats_attribute pool_stats_attr_bios_out_write = { - .attr = { .name = "bios_out_write", .mode = 0444, }, - .print = pool_stats_print_bios_out_write, -}; - -/* Number of bios tagged with REQ_PREFLUSH and containing no data */ -static ssize_t -pool_stats_print_bios_out_empty_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_out.empty_flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_out_empty_flush = { - .attr = { .name = "bios_out_empty_flush", .mode = 0444, }, - .print = pool_stats_print_bios_out_empty_flush, -}; - -/* Number of REQ_OP_DISCARD bios */ -static ssize_t -pool_stats_print_bios_out_discard(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_out.discard); -} - -static struct pool_stats_attribute pool_stats_attr_bios_out_discard = { - .attr = { .name = "bios_out_discard", .mode = 0444, }, - .print = pool_stats_print_bios_out_discard, -}; - -/* Number of bios tagged with REQ_PREFLUSH */ -static ssize_t -pool_stats_print_bios_out_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_out.flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_out_flush = { - .attr = { .name = "bios_out_flush", .mode = 0444, }, - .print = pool_stats_print_bios_out_flush, -}; - -/* Number of bios tagged with REQ_FUA */ -static ssize_t -pool_stats_print_bios_out_fua(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_out.fua); -} - -static struct pool_stats_attribute pool_stats_attr_bios_out_fua = { - .attr = { .name = "bios_out_fua", .mode = 0444, }, - .print = pool_stats_print_bios_out_fua, -}; - -/* Number of REQ_OP_READ bios */ -static ssize_t -pool_stats_print_bios_meta_read(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_meta.read); -} - -static struct pool_stats_attribute pool_stats_attr_bios_meta_read = { - .attr = { .name = "bios_meta_read", .mode = 0444, }, - .print = pool_stats_print_bios_meta_read, -}; - -/* Number of REQ_OP_WRITE bios with data */ -static ssize_t -pool_stats_print_bios_meta_write(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_meta.write); -} - -static struct pool_stats_attribute pool_stats_attr_bios_meta_write = { - .attr = { .name = "bios_meta_write", .mode = 0444, }, - .print = pool_stats_print_bios_meta_write, -}; - -/* Number of bios tagged with REQ_PREFLUSH and containing no data */ -static ssize_t 
-pool_stats_print_bios_meta_empty_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_meta.empty_flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_meta_empty_flush = { - .attr = { .name = "bios_meta_empty_flush", .mode = 0444, }, - .print = pool_stats_print_bios_meta_empty_flush, -}; - -/* Number of REQ_OP_DISCARD bios */ -static ssize_t -pool_stats_print_bios_meta_discard(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_meta.discard); -} - -static struct pool_stats_attribute pool_stats_attr_bios_meta_discard = { - .attr = { .name = "bios_meta_discard", .mode = 0444, }, - .print = pool_stats_print_bios_meta_discard, -}; - -/* Number of bios tagged with REQ_PREFLUSH */ -static ssize_t -pool_stats_print_bios_meta_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_meta.flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_meta_flush = { - .attr = { .name = "bios_meta_flush", .mode = 0444, }, - .print = pool_stats_print_bios_meta_flush, -}; - -/* Number of bios tagged with REQ_FUA */ -static ssize_t -pool_stats_print_bios_meta_fua(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_meta.fua); -} - -static struct pool_stats_attribute pool_stats_attr_bios_meta_fua = { - .attr = { .name = "bios_meta_fua", .mode = 0444, }, - .print = pool_stats_print_bios_meta_fua, -}; - -/* Number of REQ_OP_READ bios */ -static ssize_t -pool_stats_print_bios_journal_read(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_journal.read); -} - -static struct pool_stats_attribute pool_stats_attr_bios_journal_read = { - .attr = { .name = "bios_journal_read", .mode = 0444, }, - .print = pool_stats_print_bios_journal_read, -}; - -/* Number of REQ_OP_WRITE bios with data */ -static ssize_t -pool_stats_print_bios_journal_write(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_journal.write); -} - -static struct pool_stats_attribute pool_stats_attr_bios_journal_write = { - .attr = { .name = "bios_journal_write", .mode = 0444, }, - .print = pool_stats_print_bios_journal_write, -}; - -/* Number of bios tagged with REQ_PREFLUSH and containing no data */ -static ssize_t -pool_stats_print_bios_journal_empty_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_journal.empty_flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_journal_empty_flush = { - .attr = { .name = "bios_journal_empty_flush", .mode = 0444, }, - .print = pool_stats_print_bios_journal_empty_flush, -}; - -/* Number of REQ_OP_DISCARD bios */ -static ssize_t -pool_stats_print_bios_journal_discard(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_journal.discard); -} - -static struct pool_stats_attribute pool_stats_attr_bios_journal_discard = { - .attr = { .name = "bios_journal_discard", .mode = 0444, }, - .print = pool_stats_print_bios_journal_discard, -}; - -/* Number of bios tagged with REQ_PREFLUSH */ -static ssize_t -pool_stats_print_bios_journal_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_journal.flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_journal_flush = { - .attr = { .name = "bios_journal_flush", .mode = 0444, }, - .print = pool_stats_print_bios_journal_flush, -}; - -/* Number of bios tagged with REQ_FUA */ -static 
ssize_t -pool_stats_print_bios_journal_fua(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_journal.fua); -} - -static struct pool_stats_attribute pool_stats_attr_bios_journal_fua = { - .attr = { .name = "bios_journal_fua", .mode = 0444, }, - .print = pool_stats_print_bios_journal_fua, -}; - -/* Number of REQ_OP_READ bios */ -static ssize_t -pool_stats_print_bios_page_cache_read(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_page_cache.read); -} - -static struct pool_stats_attribute pool_stats_attr_bios_page_cache_read = { - .attr = { .name = "bios_page_cache_read", .mode = 0444, }, - .print = pool_stats_print_bios_page_cache_read, -}; - -/* Number of REQ_OP_WRITE bios with data */ -static ssize_t -pool_stats_print_bios_page_cache_write(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_page_cache.write); -} - -static struct pool_stats_attribute pool_stats_attr_bios_page_cache_write = { - .attr = { .name = "bios_page_cache_write", .mode = 0444, }, - .print = pool_stats_print_bios_page_cache_write, -}; - -/* Number of bios tagged with REQ_PREFLUSH and containing no data */ -static ssize_t -pool_stats_print_bios_page_cache_empty_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_page_cache.empty_flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_page_cache_empty_flush = { - .attr = { .name = "bios_page_cache_empty_flush", .mode = 0444, }, - .print = pool_stats_print_bios_page_cache_empty_flush, -}; - -/* Number of REQ_OP_DISCARD bios */ -static ssize_t -pool_stats_print_bios_page_cache_discard(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_page_cache.discard); -} - -static struct pool_stats_attribute pool_stats_attr_bios_page_cache_discard = { - .attr = { .name = "bios_page_cache_discard", .mode = 0444, }, - .print = pool_stats_print_bios_page_cache_discard, -}; - -/* Number of bios tagged with REQ_PREFLUSH */ -static ssize_t -pool_stats_print_bios_page_cache_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_page_cache.flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_page_cache_flush = { - .attr = { .name = "bios_page_cache_flush", .mode = 0444, }, - .print = pool_stats_print_bios_page_cache_flush, -}; - -/* Number of bios tagged with REQ_FUA */ -static ssize_t -pool_stats_print_bios_page_cache_fua(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_page_cache.fua); -} - -static struct pool_stats_attribute pool_stats_attr_bios_page_cache_fua = { - .attr = { .name = "bios_page_cache_fua", .mode = 0444, }, - .print = pool_stats_print_bios_page_cache_fua, -}; - -/* Number of REQ_OP_READ bios */ -static ssize_t -pool_stats_print_bios_out_completed_read(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_out_completed.read); -} - -static struct pool_stats_attribute pool_stats_attr_bios_out_completed_read = { - .attr = { .name = "bios_out_completed_read", .mode = 0444, }, - .print = pool_stats_print_bios_out_completed_read, -}; - -/* Number of REQ_OP_WRITE bios with data */ -static ssize_t -pool_stats_print_bios_out_completed_write(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_out_completed.write); -} - -static struct pool_stats_attribute pool_stats_attr_bios_out_completed_write = { - .attr = { .name = 
"bios_out_completed_write", .mode = 0444, }, - .print = pool_stats_print_bios_out_completed_write, -}; - -/* Number of bios tagged with REQ_PREFLUSH and containing no data */ -static ssize_t -pool_stats_print_bios_out_completed_empty_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_out_completed.empty_flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_out_completed_empty_flush = { - .attr = { .name = "bios_out_completed_empty_flush", .mode = 0444, }, - .print = pool_stats_print_bios_out_completed_empty_flush, -}; - -/* Number of REQ_OP_DISCARD bios */ -static ssize_t -pool_stats_print_bios_out_completed_discard(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_out_completed.discard); -} - -static struct pool_stats_attribute pool_stats_attr_bios_out_completed_discard = { - .attr = { .name = "bios_out_completed_discard", .mode = 0444, }, - .print = pool_stats_print_bios_out_completed_discard, -}; - -/* Number of bios tagged with REQ_PREFLUSH */ -static ssize_t -pool_stats_print_bios_out_completed_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_out_completed.flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_out_completed_flush = { - .attr = { .name = "bios_out_completed_flush", .mode = 0444, }, - .print = pool_stats_print_bios_out_completed_flush, -}; - -/* Number of bios tagged with REQ_FUA */ -static ssize_t -pool_stats_print_bios_out_completed_fua(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_out_completed.fua); -} - -static struct pool_stats_attribute pool_stats_attr_bios_out_completed_fua = { - .attr = { .name = "bios_out_completed_fua", .mode = 0444, }, - .print = pool_stats_print_bios_out_completed_fua, -}; - -/* Number of REQ_OP_READ bios */ -static ssize_t -pool_stats_print_bios_meta_completed_read(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_meta_completed.read); -} - -static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_read = { - .attr = { .name = "bios_meta_completed_read", .mode = 0444, }, - .print = pool_stats_print_bios_meta_completed_read, -}; - -/* Number of REQ_OP_WRITE bios with data */ -static ssize_t -pool_stats_print_bios_meta_completed_write(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_meta_completed.write); -} - -static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_write = { - .attr = { .name = "bios_meta_completed_write", .mode = 0444, }, - .print = pool_stats_print_bios_meta_completed_write, -}; - -/* Number of bios tagged with REQ_PREFLUSH and containing no data */ -static ssize_t -pool_stats_print_bios_meta_completed_empty_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_meta_completed.empty_flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_empty_flush = { - .attr = { .name = "bios_meta_completed_empty_flush", .mode = 0444, }, - .print = pool_stats_print_bios_meta_completed_empty_flush, -}; - -/* Number of REQ_OP_DISCARD bios */ -static ssize_t -pool_stats_print_bios_meta_completed_discard(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_meta_completed.discard); -} - -static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_discard = { - .attr = { .name = "bios_meta_completed_discard", .mode = 0444, }, - 
.print = pool_stats_print_bios_meta_completed_discard, -}; - -/* Number of bios tagged with REQ_PREFLUSH */ -static ssize_t -pool_stats_print_bios_meta_completed_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_meta_completed.flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_flush = { - .attr = { .name = "bios_meta_completed_flush", .mode = 0444, }, - .print = pool_stats_print_bios_meta_completed_flush, -}; - -/* Number of bios tagged with REQ_FUA */ -static ssize_t -pool_stats_print_bios_meta_completed_fua(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_meta_completed.fua); -} - -static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_fua = { - .attr = { .name = "bios_meta_completed_fua", .mode = 0444, }, - .print = pool_stats_print_bios_meta_completed_fua, -}; - -/* Number of REQ_OP_READ bios */ -static ssize_t -pool_stats_print_bios_journal_completed_read(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_journal_completed.read); -} - -static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_read = { - .attr = { .name = "bios_journal_completed_read", .mode = 0444, }, - .print = pool_stats_print_bios_journal_completed_read, -}; - -/* Number of REQ_OP_WRITE bios with data */ -static ssize_t -pool_stats_print_bios_journal_completed_write(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_journal_completed.write); -} - -static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_write = { - .attr = { .name = "bios_journal_completed_write", .mode = 0444, }, - .print = pool_stats_print_bios_journal_completed_write, -}; - -/* Number of bios tagged with REQ_PREFLUSH and containing no data */ -static ssize_t -pool_stats_print_bios_journal_completed_empty_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_journal_completed.empty_flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_empty_flush = { - .attr = { .name = "bios_journal_completed_empty_flush", .mode = 0444, }, - .print = pool_stats_print_bios_journal_completed_empty_flush, -}; - -/* Number of REQ_OP_DISCARD bios */ -static ssize_t -pool_stats_print_bios_journal_completed_discard(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_journal_completed.discard); -} - -static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_discard = { - .attr = { .name = "bios_journal_completed_discard", .mode = 0444, }, - .print = pool_stats_print_bios_journal_completed_discard, -}; - -/* Number of bios tagged with REQ_PREFLUSH */ -static ssize_t -pool_stats_print_bios_journal_completed_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_journal_completed.flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_flush = { - .attr = { .name = "bios_journal_completed_flush", .mode = 0444, }, - .print = pool_stats_print_bios_journal_completed_flush, -}; - -/* Number of bios tagged with REQ_FUA */ -static ssize_t -pool_stats_print_bios_journal_completed_fua(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_journal_completed.fua); -} - -static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_fua = { - .attr = { .name = "bios_journal_completed_fua", .mode = 0444, }, - 
.print = pool_stats_print_bios_journal_completed_fua, -}; - -/* Number of REQ_OP_READ bios */ -static ssize_t -pool_stats_print_bios_page_cache_completed_read(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_page_cache_completed.read); -} - -static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_read = { - .attr = { .name = "bios_page_cache_completed_read", .mode = 0444, }, - .print = pool_stats_print_bios_page_cache_completed_read, -}; - -/* Number of REQ_OP_WRITE bios with data */ -static ssize_t -pool_stats_print_bios_page_cache_completed_write(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_page_cache_completed.write); -} - -static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_write = { - .attr = { .name = "bios_page_cache_completed_write", .mode = 0444, }, - .print = pool_stats_print_bios_page_cache_completed_write, -}; - -/* Number of bios tagged with REQ_PREFLUSH and containing no data */ -static ssize_t -pool_stats_print_bios_page_cache_completed_empty_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_page_cache_completed.empty_flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_empty_flush = { - .attr = { .name = "bios_page_cache_completed_empty_flush", .mode = 0444, }, - .print = pool_stats_print_bios_page_cache_completed_empty_flush, -}; - -/* Number of REQ_OP_DISCARD bios */ -static ssize_t -pool_stats_print_bios_page_cache_completed_discard(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_page_cache_completed.discard); -} - -static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_discard = { - .attr = { .name = "bios_page_cache_completed_discard", .mode = 0444, }, - .print = pool_stats_print_bios_page_cache_completed_discard, -}; - -/* Number of bios tagged with REQ_PREFLUSH */ -static ssize_t -pool_stats_print_bios_page_cache_completed_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_page_cache_completed.flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_flush = { - .attr = { .name = "bios_page_cache_completed_flush", .mode = 0444, }, - .print = pool_stats_print_bios_page_cache_completed_flush, -}; - -/* Number of bios tagged with REQ_FUA */ -static ssize_t -pool_stats_print_bios_page_cache_completed_fua(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_page_cache_completed.fua); -} - -static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_fua = { - .attr = { .name = "bios_page_cache_completed_fua", .mode = 0444, }, - .print = pool_stats_print_bios_page_cache_completed_fua, -}; - -/* Number of REQ_OP_READ bios */ -static ssize_t -pool_stats_print_bios_acknowledged_read(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_acknowledged.read); -} - -static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_read = { - .attr = { .name = "bios_acknowledged_read", .mode = 0444, }, - .print = pool_stats_print_bios_acknowledged_read, -}; - -/* Number of REQ_OP_WRITE bios with data */ -static ssize_t -pool_stats_print_bios_acknowledged_write(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_acknowledged.write); -} - -static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_write = { - 
.attr = { .name = "bios_acknowledged_write", .mode = 0444, }, - .print = pool_stats_print_bios_acknowledged_write, -}; - -/* Number of bios tagged with REQ_PREFLUSH and containing no data */ -static ssize_t -pool_stats_print_bios_acknowledged_empty_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_acknowledged.empty_flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_empty_flush = { - .attr = { .name = "bios_acknowledged_empty_flush", .mode = 0444, }, - .print = pool_stats_print_bios_acknowledged_empty_flush, -}; - -/* Number of REQ_OP_DISCARD bios */ -static ssize_t -pool_stats_print_bios_acknowledged_discard(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_acknowledged.discard); -} - -static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_discard = { - .attr = { .name = "bios_acknowledged_discard", .mode = 0444, }, - .print = pool_stats_print_bios_acknowledged_discard, -}; - -/* Number of bios tagged with REQ_PREFLUSH */ -static ssize_t -pool_stats_print_bios_acknowledged_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_acknowledged.flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_flush = { - .attr = { .name = "bios_acknowledged_flush", .mode = 0444, }, - .print = pool_stats_print_bios_acknowledged_flush, -}; - -/* Number of bios tagged with REQ_FUA */ -static ssize_t -pool_stats_print_bios_acknowledged_fua(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_acknowledged.fua); -} - -static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_fua = { - .attr = { .name = "bios_acknowledged_fua", .mode = 0444, }, - .print = pool_stats_print_bios_acknowledged_fua, -}; - -/* Number of REQ_OP_READ bios */ -static ssize_t -pool_stats_print_bios_acknowledged_partial_read(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_acknowledged_partial.read); -} - -static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_read = { - .attr = { .name = "bios_acknowledged_partial_read", .mode = 0444, }, - .print = pool_stats_print_bios_acknowledged_partial_read, -}; - -/* Number of REQ_OP_WRITE bios with data */ -static ssize_t -pool_stats_print_bios_acknowledged_partial_write(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_acknowledged_partial.write); -} - -static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_write = { - .attr = { .name = "bios_acknowledged_partial_write", .mode = 0444, }, - .print = pool_stats_print_bios_acknowledged_partial_write, -}; - -/* Number of bios tagged with REQ_PREFLUSH and containing no data */ -static ssize_t -pool_stats_print_bios_acknowledged_partial_empty_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_acknowledged_partial.empty_flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_empty_flush = { - .attr = { .name = "bios_acknowledged_partial_empty_flush", .mode = 0444, }, - .print = pool_stats_print_bios_acknowledged_partial_empty_flush, -}; - -/* Number of REQ_OP_DISCARD bios */ -static ssize_t -pool_stats_print_bios_acknowledged_partial_discard(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_acknowledged_partial.discard); -} - -static struct pool_stats_attribute 
pool_stats_attr_bios_acknowledged_partial_discard = { - .attr = { .name = "bios_acknowledged_partial_discard", .mode = 0444, }, - .print = pool_stats_print_bios_acknowledged_partial_discard, -}; - -/* Number of bios tagged with REQ_PREFLUSH */ -static ssize_t -pool_stats_print_bios_acknowledged_partial_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_acknowledged_partial.flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_flush = { - .attr = { .name = "bios_acknowledged_partial_flush", .mode = 0444, }, - .print = pool_stats_print_bios_acknowledged_partial_flush, -}; - -/* Number of bios tagged with REQ_FUA */ -static ssize_t -pool_stats_print_bios_acknowledged_partial_fua(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_acknowledged_partial.fua); -} - -static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_fua = { - .attr = { .name = "bios_acknowledged_partial_fua", .mode = 0444, }, - .print = pool_stats_print_bios_acknowledged_partial_fua, -}; - -/* Number of REQ_OP_READ bios */ -static ssize_t -pool_stats_print_bios_in_progress_read(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_in_progress.read); -} - -static struct pool_stats_attribute pool_stats_attr_bios_in_progress_read = { - .attr = { .name = "bios_in_progress_read", .mode = 0444, }, - .print = pool_stats_print_bios_in_progress_read, -}; - -/* Number of REQ_OP_WRITE bios with data */ -static ssize_t -pool_stats_print_bios_in_progress_write(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_in_progress.write); -} - -static struct pool_stats_attribute pool_stats_attr_bios_in_progress_write = { - .attr = { .name = "bios_in_progress_write", .mode = 0444, }, - .print = pool_stats_print_bios_in_progress_write, -}; - -/* Number of bios tagged with REQ_PREFLUSH and containing no data */ -static ssize_t -pool_stats_print_bios_in_progress_empty_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_in_progress.empty_flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_in_progress_empty_flush = { - .attr = { .name = "bios_in_progress_empty_flush", .mode = 0444, }, - .print = pool_stats_print_bios_in_progress_empty_flush, -}; - -/* Number of REQ_OP_DISCARD bios */ -static ssize_t -pool_stats_print_bios_in_progress_discard(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_in_progress.discard); -} - -static struct pool_stats_attribute pool_stats_attr_bios_in_progress_discard = { - .attr = { .name = "bios_in_progress_discard", .mode = 0444, }, - .print = pool_stats_print_bios_in_progress_discard, -}; - -/* Number of bios tagged with REQ_PREFLUSH */ -static ssize_t -pool_stats_print_bios_in_progress_flush(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_in_progress.flush); -} - -static struct pool_stats_attribute pool_stats_attr_bios_in_progress_flush = { - .attr = { .name = "bios_in_progress_flush", .mode = 0444, }, - .print = pool_stats_print_bios_in_progress_flush, -}; - -/* Number of bios tagged with REQ_FUA */ -static ssize_t -pool_stats_print_bios_in_progress_fua(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->bios_in_progress.fua); -} - -static struct pool_stats_attribute pool_stats_attr_bios_in_progress_fua = { - .attr = { .name = "bios_in_progress_fua", 
.mode = 0444, }, - .print = pool_stats_print_bios_in_progress_fua, -}; - -/* Tracked bytes currently allocated. */ -static ssize_t -pool_stats_print_memory_usage_bytes_used(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->memory_usage.bytes_used); -} - -static struct pool_stats_attribute pool_stats_attr_memory_usage_bytes_used = { - .attr = { .name = "memory_usage_bytes_used", .mode = 0444, }, - .print = pool_stats_print_memory_usage_bytes_used, -}; - -/* Maximum tracked bytes allocated. */ -static ssize_t -pool_stats_print_memory_usage_peak_bytes_used(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->memory_usage.peak_bytes_used); -} - -static struct pool_stats_attribute pool_stats_attr_memory_usage_peak_bytes_used = { - .attr = { .name = "memory_usage_peak_bytes_used", .mode = 0444, }, - .print = pool_stats_print_memory_usage_peak_bytes_used, -}; - -/* Number of records stored in the index */ -static ssize_t -pool_stats_print_index_entries_indexed(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->index.entries_indexed); -} - -static struct pool_stats_attribute pool_stats_attr_index_entries_indexed = { - .attr = { .name = "index_entries_indexed", .mode = 0444, }, - .print = pool_stats_print_index_entries_indexed, -}; - -/* Number of post calls that found an existing entry */ -static ssize_t -pool_stats_print_index_posts_found(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->index.posts_found); -} - -static struct pool_stats_attribute pool_stats_attr_index_posts_found = { - .attr = { .name = "index_posts_found", .mode = 0444, }, - .print = pool_stats_print_index_posts_found, -}; - -/* Number of post calls that added a new entry */ -static ssize_t -pool_stats_print_index_posts_not_found(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->index.posts_not_found); -} - -static struct pool_stats_attribute pool_stats_attr_index_posts_not_found = { - .attr = { .name = "index_posts_not_found", .mode = 0444, }, - .print = pool_stats_print_index_posts_not_found, -}; - -/* Number of query calls that found an existing entry */ -static ssize_t -pool_stats_print_index_queries_found(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->index.queries_found); -} - -static struct pool_stats_attribute pool_stats_attr_index_queries_found = { - .attr = { .name = "index_queries_found", .mode = 0444, }, - .print = pool_stats_print_index_queries_found, -}; - -/* Number of query calls that added a new entry */ -static ssize_t -pool_stats_print_index_queries_not_found(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->index.queries_not_found); -} - -static struct pool_stats_attribute pool_stats_attr_index_queries_not_found = { - .attr = { .name = "index_queries_not_found", .mode = 0444, }, - .print = pool_stats_print_index_queries_not_found, -}; - -/* Number of update calls that found an existing entry */ -static ssize_t -pool_stats_print_index_updates_found(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->index.updates_found); -} - -static struct pool_stats_attribute pool_stats_attr_index_updates_found = { - .attr = { .name = "index_updates_found", .mode = 0444, }, - .print = pool_stats_print_index_updates_found, -}; - -/* Number of update calls that added a new entry */ -static ssize_t -pool_stats_print_index_updates_not_found(struct vdo_statistics *stats, 
char *buf) -{ - return sprintf(buf, "%llu\n", stats->index.updates_not_found); -} - -static struct pool_stats_attribute pool_stats_attr_index_updates_not_found = { - .attr = { .name = "index_updates_not_found", .mode = 0444, }, - .print = pool_stats_print_index_updates_not_found, -}; - -/* Number of entries discarded */ -static ssize_t -pool_stats_print_index_entries_discarded(struct vdo_statistics *stats, char *buf) -{ - return sprintf(buf, "%llu\n", stats->index.entries_discarded); -} - -static struct pool_stats_attribute pool_stats_attr_index_entries_discarded = { - .attr = { .name = "index_entries_discarded", .mode = 0444, }, - .print = pool_stats_print_index_entries_discarded, -}; - -struct attribute *vdo_pool_stats_attrs[] = { - &pool_stats_attr_data_blocks_used.attr, - &pool_stats_attr_overhead_blocks_used.attr, - &pool_stats_attr_logical_blocks_used.attr, - &pool_stats_attr_physical_blocks.attr, - &pool_stats_attr_logical_blocks.attr, - &pool_stats_attr_block_map_cache_size.attr, - &pool_stats_attr_block_size.attr, - &pool_stats_attr_complete_recoveries.attr, - &pool_stats_attr_read_only_recoveries.attr, - &pool_stats_attr_mode.attr, - &pool_stats_attr_in_recovery_mode.attr, - &pool_stats_attr_recovery_percentage.attr, - &pool_stats_attr_packer_compressed_fragments_written.attr, - &pool_stats_attr_packer_compressed_blocks_written.attr, - &pool_stats_attr_packer_compressed_fragments_in_packer.attr, - &pool_stats_attr_allocator_slab_count.attr, - &pool_stats_attr_allocator_slabs_opened.attr, - &pool_stats_attr_allocator_slabs_reopened.attr, - &pool_stats_attr_journal_disk_full.attr, - &pool_stats_attr_journal_slab_journal_commits_requested.attr, - &pool_stats_attr_journal_entries_started.attr, - &pool_stats_attr_journal_entries_written.attr, - &pool_stats_attr_journal_entries_committed.attr, - &pool_stats_attr_journal_blocks_started.attr, - &pool_stats_attr_journal_blocks_written.attr, - &pool_stats_attr_journal_blocks_committed.attr, - &pool_stats_attr_slab_journal_disk_full_count.attr, - &pool_stats_attr_slab_journal_flush_count.attr, - &pool_stats_attr_slab_journal_blocked_count.attr, - &pool_stats_attr_slab_journal_blocks_written.attr, - &pool_stats_attr_slab_journal_tail_busy_count.attr, - &pool_stats_attr_slab_summary_blocks_written.attr, - &pool_stats_attr_ref_counts_blocks_written.attr, - &pool_stats_attr_block_map_dirty_pages.attr, - &pool_stats_attr_block_map_clean_pages.attr, - &pool_stats_attr_block_map_free_pages.attr, - &pool_stats_attr_block_map_failed_pages.attr, - &pool_stats_attr_block_map_incoming_pages.attr, - &pool_stats_attr_block_map_outgoing_pages.attr, - &pool_stats_attr_block_map_cache_pressure.attr, - &pool_stats_attr_block_map_read_count.attr, - &pool_stats_attr_block_map_write_count.attr, - &pool_stats_attr_block_map_failed_reads.attr, - &pool_stats_attr_block_map_failed_writes.attr, - &pool_stats_attr_block_map_reclaimed.attr, - &pool_stats_attr_block_map_read_outgoing.attr, - &pool_stats_attr_block_map_found_in_cache.attr, - &pool_stats_attr_block_map_discard_required.attr, - &pool_stats_attr_block_map_wait_for_page.attr, - &pool_stats_attr_block_map_fetch_required.attr, - &pool_stats_attr_block_map_pages_loaded.attr, - &pool_stats_attr_block_map_pages_saved.attr, - &pool_stats_attr_block_map_flush_count.attr, - &pool_stats_attr_hash_lock_dedupe_advice_valid.attr, - &pool_stats_attr_hash_lock_dedupe_advice_stale.attr, - &pool_stats_attr_hash_lock_concurrent_data_matches.attr, - &pool_stats_attr_hash_lock_concurrent_hash_collisions.attr, - 
&pool_stats_attr_hash_lock_curr_dedupe_queries.attr, - &pool_stats_attr_errors_invalid_advice_pbn_count.attr, - &pool_stats_attr_errors_no_space_error_count.attr, - &pool_stats_attr_errors_read_only_error_count.attr, - &pool_stats_attr_instance.attr, - &pool_stats_attr_current_vios_in_progress.attr, - &pool_stats_attr_max_vios.attr, - &pool_stats_attr_dedupe_advice_timeouts.attr, - &pool_stats_attr_flush_out.attr, - &pool_stats_attr_logical_block_size.attr, - &pool_stats_attr_bios_in_read.attr, - &pool_stats_attr_bios_in_write.attr, - &pool_stats_attr_bios_in_empty_flush.attr, - &pool_stats_attr_bios_in_discard.attr, - &pool_stats_attr_bios_in_flush.attr, - &pool_stats_attr_bios_in_fua.attr, - &pool_stats_attr_bios_in_partial_read.attr, - &pool_stats_attr_bios_in_partial_write.attr, - &pool_stats_attr_bios_in_partial_empty_flush.attr, - &pool_stats_attr_bios_in_partial_discard.attr, - &pool_stats_attr_bios_in_partial_flush.attr, - &pool_stats_attr_bios_in_partial_fua.attr, - &pool_stats_attr_bios_out_read.attr, - &pool_stats_attr_bios_out_write.attr, - &pool_stats_attr_bios_out_empty_flush.attr, - &pool_stats_attr_bios_out_discard.attr, - &pool_stats_attr_bios_out_flush.attr, - &pool_stats_attr_bios_out_fua.attr, - &pool_stats_attr_bios_meta_read.attr, - &pool_stats_attr_bios_meta_write.attr, - &pool_stats_attr_bios_meta_empty_flush.attr, - &pool_stats_attr_bios_meta_discard.attr, - &pool_stats_attr_bios_meta_flush.attr, - &pool_stats_attr_bios_meta_fua.attr, - &pool_stats_attr_bios_journal_read.attr, - &pool_stats_attr_bios_journal_write.attr, - &pool_stats_attr_bios_journal_empty_flush.attr, - &pool_stats_attr_bios_journal_discard.attr, - &pool_stats_attr_bios_journal_flush.attr, - &pool_stats_attr_bios_journal_fua.attr, - &pool_stats_attr_bios_page_cache_read.attr, - &pool_stats_attr_bios_page_cache_write.attr, - &pool_stats_attr_bios_page_cache_empty_flush.attr, - &pool_stats_attr_bios_page_cache_discard.attr, - &pool_stats_attr_bios_page_cache_flush.attr, - &pool_stats_attr_bios_page_cache_fua.attr, - &pool_stats_attr_bios_out_completed_read.attr, - &pool_stats_attr_bios_out_completed_write.attr, - &pool_stats_attr_bios_out_completed_empty_flush.attr, - &pool_stats_attr_bios_out_completed_discard.attr, - &pool_stats_attr_bios_out_completed_flush.attr, - &pool_stats_attr_bios_out_completed_fua.attr, - &pool_stats_attr_bios_meta_completed_read.attr, - &pool_stats_attr_bios_meta_completed_write.attr, - &pool_stats_attr_bios_meta_completed_empty_flush.attr, - &pool_stats_attr_bios_meta_completed_discard.attr, - &pool_stats_attr_bios_meta_completed_flush.attr, - &pool_stats_attr_bios_meta_completed_fua.attr, - &pool_stats_attr_bios_journal_completed_read.attr, - &pool_stats_attr_bios_journal_completed_write.attr, - &pool_stats_attr_bios_journal_completed_empty_flush.attr, - &pool_stats_attr_bios_journal_completed_discard.attr, - &pool_stats_attr_bios_journal_completed_flush.attr, - &pool_stats_attr_bios_journal_completed_fua.attr, - &pool_stats_attr_bios_page_cache_completed_read.attr, - &pool_stats_attr_bios_page_cache_completed_write.attr, - &pool_stats_attr_bios_page_cache_completed_empty_flush.attr, - &pool_stats_attr_bios_page_cache_completed_discard.attr, - &pool_stats_attr_bios_page_cache_completed_flush.attr, - &pool_stats_attr_bios_page_cache_completed_fua.attr, - &pool_stats_attr_bios_acknowledged_read.attr, - &pool_stats_attr_bios_acknowledged_write.attr, - &pool_stats_attr_bios_acknowledged_empty_flush.attr, - &pool_stats_attr_bios_acknowledged_discard.attr, - 
&pool_stats_attr_bios_acknowledged_flush.attr, - &pool_stats_attr_bios_acknowledged_fua.attr, - &pool_stats_attr_bios_acknowledged_partial_read.attr, - &pool_stats_attr_bios_acknowledged_partial_write.attr, - &pool_stats_attr_bios_acknowledged_partial_empty_flush.attr, - &pool_stats_attr_bios_acknowledged_partial_discard.attr, - &pool_stats_attr_bios_acknowledged_partial_flush.attr, - &pool_stats_attr_bios_acknowledged_partial_fua.attr, - &pool_stats_attr_bios_in_progress_read.attr, - &pool_stats_attr_bios_in_progress_write.attr, - &pool_stats_attr_bios_in_progress_empty_flush.attr, - &pool_stats_attr_bios_in_progress_discard.attr, - &pool_stats_attr_bios_in_progress_flush.attr, - &pool_stats_attr_bios_in_progress_fua.attr, - &pool_stats_attr_memory_usage_bytes_used.attr, - &pool_stats_attr_memory_usage_peak_bytes_used.attr, - &pool_stats_attr_index_entries_indexed.attr, - &pool_stats_attr_index_posts_found.attr, - &pool_stats_attr_index_posts_not_found.attr, - &pool_stats_attr_index_queries_found.attr, - &pool_stats_attr_index_queries_not_found.attr, - &pool_stats_attr_index_updates_found.attr, - &pool_stats_attr_index_updates_not_found.attr, - &pool_stats_attr_index_entries_discarded.attr, - NULL, -}; diff --git a/drivers/md/dm-vdo/pool-sysfs.c b/drivers/md/dm-vdo/pool-sysfs.c deleted file mode 100644 index f2be0f2bbd68ee..00000000000000 --- a/drivers/md/dm-vdo/pool-sysfs.c +++ /dev/null @@ -1,198 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2023 Red Hat - */ - -#include "pool-sysfs.h" - -#include - -#include "memory-alloc.h" -#include "string-utils.h" - -#include "data-vio.h" -#include "dedupe.h" -#include "vdo.h" - -struct pool_attribute { - struct attribute attr; - ssize_t (*show)(struct vdo *vdo, char *buf); - ssize_t (*store)(struct vdo *vdo, const char *value, size_t count); -}; - -static ssize_t vdo_pool_attr_show(struct kobject *directory, struct attribute *attr, - char *buf) -{ - struct pool_attribute *pool_attr = container_of(attr, struct pool_attribute, - attr); - struct vdo *vdo = container_of(directory, struct vdo, vdo_directory); - - if (pool_attr->show == NULL) - return -EINVAL; - return pool_attr->show(vdo, buf); -} - -static ssize_t vdo_pool_attr_store(struct kobject *directory, struct attribute *attr, - const char *buf, size_t length) -{ - struct pool_attribute *pool_attr = container_of(attr, struct pool_attribute, - attr); - struct vdo *vdo = container_of(directory, struct vdo, vdo_directory); - - if (pool_attr->store == NULL) - return -EINVAL; - return pool_attr->store(vdo, buf, length); -} - -static const struct sysfs_ops vdo_pool_sysfs_ops = { - .show = vdo_pool_attr_show, - .store = vdo_pool_attr_store, -}; - -static ssize_t pool_compressing_show(struct vdo *vdo, char *buf) -{ - return sprintf(buf, "%s\n", (vdo_get_compressing(vdo) ? 
"1" : "0")); -} - -static ssize_t pool_discards_active_show(struct vdo *vdo, char *buf) -{ - return sprintf(buf, "%u\n", - get_data_vio_pool_active_discards(vdo->data_vio_pool)); -} - -static ssize_t pool_discards_limit_show(struct vdo *vdo, char *buf) -{ - return sprintf(buf, "%u\n", get_data_vio_pool_discard_limit(vdo->data_vio_pool)); -} - -static ssize_t pool_discards_limit_store(struct vdo *vdo, const char *buf, size_t length) -{ - unsigned int value; - int result; - - if ((length > 12) || (kstrtouint(buf, 10, &value) < 0) || (value < 1)) - return -EINVAL; - - result = set_data_vio_pool_discard_limit(vdo->data_vio_pool, value); - if (result != VDO_SUCCESS) - return -EINVAL; - - return length; -} - -static ssize_t pool_discards_maximum_show(struct vdo *vdo, char *buf) -{ - return sprintf(buf, "%u\n", - get_data_vio_pool_maximum_discards(vdo->data_vio_pool)); -} - -static ssize_t pool_instance_show(struct vdo *vdo, char *buf) -{ - return sprintf(buf, "%u\n", vdo->instance); -} - -static ssize_t pool_requests_active_show(struct vdo *vdo, char *buf) -{ - return sprintf(buf, "%u\n", - get_data_vio_pool_active_requests(vdo->data_vio_pool)); -} - -static ssize_t pool_requests_limit_show(struct vdo *vdo, char *buf) -{ - return sprintf(buf, "%u\n", get_data_vio_pool_request_limit(vdo->data_vio_pool)); -} - -static ssize_t pool_requests_maximum_show(struct vdo *vdo, char *buf) -{ - return sprintf(buf, "%u\n", - get_data_vio_pool_maximum_requests(vdo->data_vio_pool)); -} - -static void vdo_pool_release(struct kobject *directory) -{ - uds_free(container_of(directory, struct vdo, vdo_directory)); -} - -static struct pool_attribute vdo_pool_compressing_attr = { - .attr = { - .name = "compressing", - .mode = 0444, - }, - .show = pool_compressing_show, -}; - -static struct pool_attribute vdo_pool_discards_active_attr = { - .attr = { - .name = "discards_active", - .mode = 0444, - }, - .show = pool_discards_active_show, -}; - -static struct pool_attribute vdo_pool_discards_limit_attr = { - .attr = { - .name = "discards_limit", - .mode = 0644, - }, - .show = pool_discards_limit_show, - .store = pool_discards_limit_store, -}; - -static struct pool_attribute vdo_pool_discards_maximum_attr = { - .attr = { - .name = "discards_maximum", - .mode = 0444, - }, - .show = pool_discards_maximum_show, -}; - -static struct pool_attribute vdo_pool_instance_attr = { - .attr = { - .name = "instance", - .mode = 0444, - }, - .show = pool_instance_show, -}; - -static struct pool_attribute vdo_pool_requests_active_attr = { - .attr = { - .name = "requests_active", - .mode = 0444, - }, - .show = pool_requests_active_show, -}; - -static struct pool_attribute vdo_pool_requests_limit_attr = { - .attr = { - .name = "requests_limit", - .mode = 0444, - }, - .show = pool_requests_limit_show, -}; - -static struct pool_attribute vdo_pool_requests_maximum_attr = { - .attr = { - .name = "requests_maximum", - .mode = 0444, - }, - .show = pool_requests_maximum_show, -}; - -static struct attribute *pool_attrs[] = { - &vdo_pool_compressing_attr.attr, - &vdo_pool_discards_active_attr.attr, - &vdo_pool_discards_limit_attr.attr, - &vdo_pool_discards_maximum_attr.attr, - &vdo_pool_instance_attr.attr, - &vdo_pool_requests_active_attr.attr, - &vdo_pool_requests_limit_attr.attr, - &vdo_pool_requests_maximum_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(pool); - -const struct kobj_type vdo_directory_type = { - .release = vdo_pool_release, - .sysfs_ops = &vdo_pool_sysfs_ops, - .default_groups = pool_groups, -}; diff --git 
a/drivers/md/dm-vdo/pool-sysfs.h b/drivers/md/dm-vdo/pool-sysfs.h deleted file mode 100644 index 00e680924dc1fb..00000000000000 --- a/drivers/md/dm-vdo/pool-sysfs.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2023 Red Hat - */ - -#ifndef VDO_POOL_SYSFS_H -#define VDO_POOL_SYSFS_H - -#include - -/* The kobj_type used for setting up the kernel layer kobject. */ -extern const struct kobj_type vdo_directory_type; - -/* The sysfs_ops used for the "statistics" subdirectory. */ -extern const struct sysfs_ops vdo_pool_stats_sysfs_ops; -/* The attribute used for the "statistics" subdirectory. */ -extern struct attribute *vdo_pool_stats_attrs[]; - -#endif /* VDO_POOL_SYSFS_H */ diff --git a/drivers/md/dm-vdo/sysfs.c b/drivers/md/dm-vdo/sysfs.c deleted file mode 100644 index 70feffe9d4c447..00000000000000 --- a/drivers/md/dm-vdo/sysfs.c +++ /dev/null @@ -1,82 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2023 Red Hat - */ - -#include - -#include "logger.h" - -#include "constants.h" -#include "dedupe.h" -#include "vdo.h" - -static int vdo_log_level_show(char *buf, const struct kernel_param *kp) -{ - return sprintf(buf, "%s\n", uds_log_priority_to_string(uds_get_log_level())); -} - -static int vdo_log_level_store(const char *buf, const struct kernel_param *kp) -{ - static char internal_buf[11]; - - int n = strlen(buf); - - if (n > 10) - return -EINVAL; - - memset(internal_buf, '\000', sizeof(internal_buf)); - memcpy(internal_buf, buf, n); - if (internal_buf[n - 1] == '\n') - internal_buf[n - 1] = '\000'; - uds_set_log_level(uds_log_string_to_priority(internal_buf)); - return 0; -} - - -static int vdo_dedupe_timeout_interval_store(const char *buf, - const struct kernel_param *kp) -{ - int result = param_set_uint(buf, kp); - - if (result != 0) - return result; - vdo_set_dedupe_index_timeout_interval(*(uint *)kp->arg); - return 0; -} - -static int vdo_min_dedupe_timer_interval_store(const char *buf, - const struct kernel_param *kp) -{ - int result = param_set_uint(buf, kp); - - if (result != 0) - return result; - vdo_set_dedupe_index_min_timer_interval(*(uint *)kp->arg); - return 0; -} - -static const struct kernel_param_ops log_level_ops = { - .set = vdo_log_level_store, - .get = vdo_log_level_show, -}; - - -static const struct kernel_param_ops dedupe_timeout_ops = { - .set = vdo_dedupe_timeout_interval_store, - .get = param_get_uint, -}; - -static const struct kernel_param_ops dedupe_timer_ops = { - .set = vdo_min_dedupe_timer_interval_store, - .get = param_get_uint, -}; - -module_param_cb(log_level, &log_level_ops, NULL, 0644); - - -module_param_cb(deduplication_timeout_interval, &dedupe_timeout_ops, - &vdo_dedupe_index_timeout_interval, 0644); - -module_param_cb(min_deduplication_timer_interval, &dedupe_timer_ops, - &vdo_dedupe_index_min_timer_interval, 0644); diff --git a/drivers/md/dm-vdo/uds-sysfs.c b/drivers/md/dm-vdo/uds-sysfs.c deleted file mode 100644 index 101db86f476f21..00000000000000 --- a/drivers/md/dm-vdo/uds-sysfs.c +++ /dev/null @@ -1,186 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2023 Red Hat - */ - -#include "uds-sysfs.h" - -#include -#include -#include - -#include "logger.h" -#include "memory-alloc.h" -#include "string-utils.h" -#include "indexer/indexer.h" - -#define UDS_SYSFS_NAME "uds" - -static struct { - /* /sys/uds */ - struct kobject kobj; - /* /sys/uds/parameter */ - struct kobject parameter_kobj; - - /* These flags are used to ensure a clean shutdown */ - - /* /sys/uds flag */ - 
bool flag; - /* /sys/uds/parameter flag */ - bool parameter_flag; -} object_root; - -static char *buffer_to_string(const char *buf, size_t length) -{ - char *string; - - if (uds_allocate(length + 1, char, __func__, &string) != UDS_SUCCESS) - return NULL; - - memcpy(string, buf, length); - string[length] = '\0'; - if (string[length - 1] == '\n') - string[length - 1] = '\0'; - - return string; -} - -/* - * This is the code for any directory in the /sys/ tree that contains no regular files - * (only subdirectories). - */ - -static void empty_release(struct kobject *kobj) -{ -} - -static ssize_t empty_show(struct kobject *kobj, struct attribute *attr, char *buf) -{ - return 0; -} - -static ssize_t empty_store(struct kobject *kobj, struct attribute *attr, const char *buf, - size_t length) -{ - return length; -} - -static const struct sysfs_ops empty_ops = { - .show = empty_show, - .store = empty_store, -}; - -static struct attribute *empty_attrs[] = { - NULL, -}; -ATTRIBUTE_GROUPS(empty); - -static const struct kobj_type empty_object_type = { - .release = empty_release, - .sysfs_ops = &empty_ops, - .default_groups = empty_groups, -}; - -/* - * This is the code for the /sys//parameter directory. - * /log_level UDS_LOG_LEVEL - */ - -struct parameter_attribute { - struct attribute attr; - const char *(*show_string)(void); - void (*store_string)(const char *string); -}; - -static ssize_t parameter_show(struct kobject *kobj, struct attribute *attr, char *buf) -{ - struct parameter_attribute *pa; - - pa = container_of(attr, struct parameter_attribute, attr); - if (pa->show_string != NULL) - return sprintf(buf, "%s\n", pa->show_string()); - else - return -EINVAL; -} - -static ssize_t parameter_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t length) -{ - char *string; - struct parameter_attribute *pa; - - pa = container_of(attr, struct parameter_attribute, attr); - if (pa->store_string == NULL) - return -EINVAL; - string = buffer_to_string(buf, length); - if (string == NULL) - return -ENOMEM; - - pa->store_string(string); - uds_free(string); - return length; -} - -static const char *parameter_show_log_level(void) -{ - return uds_log_priority_to_string(uds_get_log_level()); -} - -static void parameter_store_log_level(const char *string) -{ - uds_set_log_level(uds_log_string_to_priority(string)); -} - -static struct parameter_attribute log_level_attr = { - .attr = { .name = "log_level", .mode = 0600 }, - .show_string = parameter_show_log_level, - .store_string = parameter_store_log_level, -}; - -static struct attribute *parameter_attrs[] = { - &log_level_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(parameter); - -static const struct sysfs_ops parameter_ops = { - .show = parameter_show, - .store = parameter_store, -}; - -static const struct kobj_type parameter_object_type = { - .release = empty_release, - .sysfs_ops = ¶meter_ops, - .default_groups = parameter_groups, -}; - -int uds_init_sysfs(void) -{ - int result; - - memset(&object_root, 0, sizeof(object_root)); - kobject_init(&object_root.kobj, &empty_object_type); - result = kobject_add(&object_root.kobj, NULL, UDS_SYSFS_NAME); - if (result == 0) { - object_root.flag = true; - kobject_init(&object_root.parameter_kobj, ¶meter_object_type); - result = kobject_add(&object_root.parameter_kobj, &object_root.kobj, - "parameter"); - if (result == 0) - object_root.parameter_flag = true; - } - - if (result != 0) - uds_put_sysfs(); - - return result; -} - -void uds_put_sysfs(void) -{ - if (object_root.parameter_flag) - 
kobject_put(&object_root.parameter_kobj); - - if (object_root.flag) - kobject_put(&object_root.kobj); -} diff --git a/drivers/md/dm-vdo/uds-sysfs.h b/drivers/md/dm-vdo/uds-sysfs.h deleted file mode 100644 index c3d00a7187bdb3..00000000000000 --- a/drivers/md/dm-vdo/uds-sysfs.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2023 Red Hat - */ - -#ifndef UDS_SYSFS_H -#define UDS_SYSFS_H - -int uds_init_sysfs(void); -void uds_put_sysfs(void); - -#endif /* UDS_SYSFS_H */ diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c index e0eddd4007b8ba..0fe32dfcf3dd05 100644 --- a/drivers/md/dm-vdo/vdo.c +++ b/drivers/md/dm-vdo/vdo.c @@ -53,7 +53,6 @@ #include "logical-zone.h" #include "packer.h" #include "physical-zone.h" -#include "pool-sysfs.h" #include "recovery-journal.h" #include "slab-depot.h" #include "statistics.h" @@ -694,13 +693,6 @@ void vdo_destroy(struct vdo *vdo) vdo->allocations_allowed = true; - /* Stop services that need to gather VDO statistics from the worker threads. */ - if (vdo->sysfs_added) { - init_completion(&vdo->stats_shutdown); - kobject_put(&vdo->stats_directory); - wait_for_completion(&vdo->stats_shutdown); - } - finish_vdo(vdo); unregister_vdo(vdo); free_data_vio_pool(vdo->data_vio_pool); @@ -735,15 +727,6 @@ void vdo_destroy(struct vdo *vdo) uds_free(uds_forget(vdo->compression_context)); } - - /* - * The call to kobject_put on the kobj sysfs node will decrement its reference count; when - * the count goes to zero the VDO object will be freed as a side effect. - */ - if (!vdo->sysfs_added) - uds_free(vdo); - else - kobject_put(&vdo->vdo_directory); } static int initialize_super_block(struct vdo *vdo, struct vdo_super_block *super_block) @@ -820,42 +803,6 @@ void vdo_load_super_block(struct vdo *vdo, struct vdo_completion *parent) REQ_OP_READ); } -/** - * pool_stats_release() - Signal that sysfs stats have been shut down. - * @directory: The vdo stats directory. - */ -static void pool_stats_release(struct kobject *directory) -{ - struct vdo *vdo = container_of(directory, struct vdo, stats_directory); - - complete(&vdo->stats_shutdown); -} - -ATTRIBUTE_GROUPS(vdo_pool_stats); -static const struct kobj_type stats_directory_type = { - .release = pool_stats_release, - .sysfs_ops = &vdo_pool_stats_sysfs_ops, - .default_groups = vdo_pool_stats_groups, -}; - -/** - * vdo_add_sysfs_stats_dir() - Add the stats directory to the vdo sysfs directory. - * @vdo: The vdo. - * - * Return: VDO_SUCCESS or an error. - */ -int vdo_add_sysfs_stats_dir(struct vdo *vdo) -{ - int result; - - kobject_init(&vdo->stats_directory, &stats_directory_type); - result = kobject_add(&vdo->stats_directory, &vdo->vdo_directory, "statistics"); - if (result != 0) - return VDO_CANT_ADD_SYSFS_NODE; - - return VDO_SUCCESS; -} - /** * vdo_get_backing_device() - Get the block device object underlying a vdo. * @vdo: The vdo. 
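For context on the lifecycle this patch unwinds: an object that embeds a struct kobject must not be freed directly once the kobject has been registered; it may only be freed from its kobj_type's release callback, after the last kobject_put() drops the reference count to zero. That is why the removed vdo_destroy() code had to choose between a plain free and kobject_put(). A minimal sketch of the pattern, with hypothetical names:

	struct thing {
		struct kobject kobj;	/* embedded; owns the lifetime */
		int payload;
	};

	static void thing_release(struct kobject *kobj)
	{
		/* Called only once the refcount reaches zero. */
		kfree(container_of(kobj, struct thing, kobj));
	}

	static const struct kobj_type thing_type = {
		.release = thing_release,
	};

With the sysfs support removed, the vdo no longer embeds any kobjects and so, presumably, can be freed directly.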
diff --git a/drivers/md/dm-vdo/vdo.h b/drivers/md/dm-vdo/vdo.h index 85536dc5835151..c46171d2701339 100644 --- a/drivers/md/dm-vdo/vdo.h +++ b/drivers/md/dm-vdo/vdo.h @@ -10,7 +10,6 @@ #include #include #include -#include #include #include @@ -249,11 +248,6 @@ struct vdo { struct vdo_statistics stats_buffer; /* Protects the stats_buffer */ struct mutex stats_mutex; - /* true if sysfs directory is set up */ - bool sysfs_added; - /* Used when shutting down the sysfs statistics */ - struct completion stats_shutdown; - /* A list of all device_configs referencing this vdo */ struct list_head device_config_list; @@ -265,10 +259,6 @@ struct vdo { u64 starting_sector_offset; struct volume_geometry geometry; - /* For sysfs */ - struct kobject vdo_directory; - struct kobject stats_directory; - /* N blobs of context data for LZ4 code, one per CPU thread. */ char **compression_context; }; From 1ba96866cc6fad9146b0b628ddb7b7c2cf19dc28 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Sat, 10 Feb 2024 10:42:00 -0600 Subject: [PATCH 0897/1406] dm vdo: add 'log_level' module parameter Expose control over dm-vdo's log level as a module parameter; it can be read and written via /sys/module/dm_vdo/parameters/log_level. Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/dm-vdo-target.c | 3 +++ drivers/md/dm-vdo/logger.c | 10 ++++++++-- drivers/md/dm-vdo/logger.h | 24 ++++++++++++++++-------- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c index d253c7078a798e..9996ac7005fb34 100644 --- a/drivers/md/dm-vdo/dm-vdo-target.c +++ b/drivers/md/dm-vdo/dm-vdo-target.c @@ -2916,6 +2916,9 @@ static void __exit vdo_exit(void) module_init(vdo_init); module_exit(vdo_exit); +module_param_named(log_level, log_level, uint, 0644); +MODULE_PARM_DESC(log_level, "Log-level for log messages"); + MODULE_DESCRIPTION(DM_NAME " target for transparent deduplication"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-vdo/logger.c b/drivers/md/dm-vdo/logger.c index 19a45f41d48f9c..2ec979617dbd21 100644 --- a/drivers/md/dm-vdo/logger.c +++ b/drivers/md/dm-vdo/logger.c @@ -48,11 +48,17 @@ static const char *const PRIORITY_STRINGS[] = { "DEBUG", }; -static int log_level = UDS_LOG_INFO; +int log_level = UDS_LOG_DEFAULT; int uds_get_log_level(void) { - return log_level; + int log_level_latch = READ_ONCE(log_level); + + if (unlikely(log_level_latch > UDS_LOG_MAX)) { + log_level_latch = UDS_LOG_DEFAULT; + WRITE_ONCE(log_level, log_level_latch); + } + return log_level_latch; } int uds_log_string_to_priority(const char *string) diff --git a/drivers/md/dm-vdo/logger.h b/drivers/md/dm-vdo/logger.h index ceb07aa3231fb3..2da2bd351578d6 100644 --- a/drivers/md/dm-vdo/logger.h +++ b/drivers/md/dm-vdo/logger.h @@ -6,20 +6,28 @@ #ifndef UDS_LOGGER_H #define UDS_LOGGER_H +#include #include #include #include /* Custom logging utilities for UDS */ -#define UDS_LOG_EMERG 0 -#define UDS_LOG_ALERT 1 -#define UDS_LOG_CRIT 2 -#define UDS_LOG_ERR 3 -#define UDS_LOG_WARNING 4 -#define UDS_LOG_NOTICE 5 -#define UDS_LOG_INFO 6 -#define UDS_LOG_DEBUG 7 +enum { + UDS_LOG_EMERG = LOGLEVEL_EMERG, + UDS_LOG_ALERT = LOGLEVEL_ALERT, + UDS_LOG_CRIT = LOGLEVEL_CRIT, + UDS_LOG_ERR = LOGLEVEL_ERR, + UDS_LOG_WARNING = LOGLEVEL_WARNING, + UDS_LOG_NOTICE = LOGLEVEL_NOTICE, + UDS_LOG_INFO = LOGLEVEL_INFO, + UDS_LOG_DEBUG = LOGLEVEL_DEBUG, + + UDS_LOG_MAX = UDS_LOG_DEBUG, + UDS_LOG_DEFAULT = UDS_LOG_INFO, +}; + +extern int log_level; #define DM_MSG_PREFIX "vdo" 
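/*
 * A sketch of the clamp-on-read pattern the logger.c hunk above uses for a
 * module parameter that userspace can rewrite at any time (hypothetical
 * names; READ_ONCE/WRITE_ONCE keep the racy access well-defined):
 *
 *	int v = READ_ONCE(param);
 *
 *	if (unlikely(v > PARAM_MAX)) {
 *		v = PARAM_DEFAULT;	// repair an out-of-range value
 *		WRITE_ONCE(param, v);
 *	}
 *	return v;
 */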
#define UDS_LOGGING_MODULE_NAME DM_NAME ": " DM_MSG_PREFIX From 26248f47f3b0264d3cf0cd13378ea6fcccaf1941 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 9 Feb 2024 16:06:05 +0300 Subject: [PATCH 0898/1406] dm vdo slab-depot: delete unnecessary check in allocate_components This is a duplicate check, so it can never be true. Delete it. Signed-off-by: Dan Carpenter Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/slab-depot.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index 42126bd60242f6..2f4a2ae5e0823e 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -4100,9 +4100,6 @@ static int allocate_components(struct slab_depot *depot, }; } - if (result != VDO_SUCCESS) - return result; - slab_count = vdo_compute_slab_count(depot->first_block, depot->last_block, depot->slab_size_shift); if (thread_config->physical_zone_count > slab_count) { From c6fa7712f0dfd4d73ce90f6f62974ad90bb8330a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Sun, 11 Feb 2024 14:07:18 -0500 Subject: [PATCH 0899/1406] dm vdo flush: initialize return to NULL in allocate_flush Otherwise, an error path could leave flush uninitialized, and allocate_flush's subsequent check for flush being non-NULL could yield a false positive. Reported-by: Dan Carpenter Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-vdo/flush.c b/drivers/md/dm-vdo/flush.c index 330b18715027e3..391b6203efc6de 100644 --- a/drivers/md/dm-vdo/flush.c +++ b/drivers/md/dm-vdo/flush.c @@ -100,7 +100,7 @@ static struct vdo_flush *vdo_waiter_as_flush(struct vdo_waiter *waiter) static void *allocate_flush(gfp_t gfp_mask, void *pool_data) { - struct vdo_flush *flush; + struct vdo_flush *flush = NULL; if ((gfp_mask & GFP_NOWAIT) == GFP_NOWAIT) { flush = uds_allocate_memory_nowait(sizeof(struct vdo_flush), __func__); From 817b308b090fc750fc0b0aa3f0b71d5d30d7151b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Sun, 11 Feb 2024 14:49:42 -0500 Subject: [PATCH 0900/1406] dm vdo indexer-volume: fix missing mutex_lock in process_entry The mutex_lock must come after dm_bufio_read and before dm_bufio_read's error handling; otherwise the process_entry error path will return without volume->read_threads_mutex held. This fixes a potential double mutex_unlock. 
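The fix below restores the usual shape for dropping a lock around blocking I/O: release the mutex before the call that may sleep, then re-acquire it before any error handling, so that every return leaves the lock in the state the caller expects. A minimal sketch of the pattern, with hypothetical names:

	static int read_page_locked(struct cache *cache, u32 page_number)
	{
		void *data;

		/* Do not hold the lock across the blocking read. */
		mutex_unlock(&cache->lock);
		data = blocking_read(cache, page_number);
		/* Re-acquire before *any* exit path, including errors. */
		mutex_lock(&cache->lock);
		if (IS_ERR(data))
			return PTR_ERR(data);

		/* ... use data with the lock held ... */
		return 0;
	}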
Reported-by: Dan Carpenter Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/indexer/volume.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-vdo/indexer/volume.c b/drivers/md/dm-vdo/indexer/volume.c index 8ce05a98d7e547..ee53bd9999d14b 100644 --- a/drivers/md/dm-vdo/indexer/volume.c +++ b/drivers/md/dm-vdo/indexer/volume.c @@ -557,6 +557,7 @@ static int process_entry(struct volume *volume, struct queued_read *entry) mutex_unlock(&volume->read_threads_mutex); page_data = dm_bufio_read(volume->client, page_number, &page->buffer); + mutex_lock(&volume->read_threads_mutex); if (IS_ERR(page_data)) { result = -PTR_ERR(page_data); uds_log_warning_strerror(result, @@ -565,7 +566,6 @@ static int process_entry(struct volume *volume, struct queued_read *entry) cancel_page_in_cache(&volume->page_cache, page_number, page); return result; } - mutex_lock(&volume->read_threads_mutex); if (entry->invalid) { uds_log_warning("Page %u invalidated after read", page_number); From c5a0654be4d259846b313ac850c9833f1e762404 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Mon, 12 Feb 2024 02:25:04 -0800 Subject: [PATCH 0901/1406] dm vdo volume-index: fix an assert statement in start_restoring_volume_sub_index() Use "==" instead of "=" in the ASSERT() statement. Fixes: ef074a31e88e ("dm vdo: implement the volume index") Signed-off-by: Harshit Mogalapalli Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/indexer/volume-index.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-vdo/indexer/volume-index.c b/drivers/md/dm-vdo/indexer/volume-index.c index 5fe34e6c1d9b88..d6526fe9bbfc5c 100644 --- a/drivers/md/dm-vdo/indexer/volume-index.c +++ b/drivers/md/dm-vdo/indexer/volume-index.c @@ -830,7 +830,7 @@ static int start_restoring_volume_sub_index(struct volume_sub_index *sub_index, decode_u32_le(buffer, &offset, &header.first_list); decode_u32_le(buffer, &offset, &header.list_count); - result = ASSERT(offset = sizeof(buffer), + result = ASSERT(offset == sizeof(buffer), "%zu bytes decoded of %zu expected", offset, sizeof(buffer)); if (result != UDS_SUCCESS) From 26640da64ee8d44678a8cfddd38a8423e7af0143 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 13 Feb 2024 10:51:19 -0500 Subject: [PATCH 0902/1406] dm vdo: include <linux/sched.h> to resolve current being undeclared Reported when building on loongarch. 
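For background, as I understand it: current is the per-task pointer to the running thread's task_struct. It is provided by the architecture's asm/current.h, which <linux/sched.h> pulls in, so code that uses current should include <linux/sched.h> itself rather than rely on a transitive include; architectures with leaner headers, such as loongarch here, expose the missing dependency. A minimal illustration:

	#include <linux/printk.h>
	#include <linux/sched.h>	/* declares current (via asm/current.h) */

	static void log_task_context(void)
	{
		/* comm and pid live in the running task's task_struct. */
		pr_info("running in %s (pid %d)\n", current->comm, current->pid);
	}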
Reported-by: Randy Dunlap Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/logger.c | 1 + drivers/md/dm-vdo/thread-registry.c | 1 + drivers/md/dm-vdo/thread-utils.c | 1 + 3 files changed, 3 insertions(+) diff --git a/drivers/md/dm-vdo/logger.c b/drivers/md/dm-vdo/logger.c index 2ec979617dbd21..6dc29219f70259 100644 --- a/drivers/md/dm-vdo/logger.c +++ b/drivers/md/dm-vdo/logger.c @@ -5,6 +5,7 @@ #include "logger.h" +#include #include #include #include diff --git a/drivers/md/dm-vdo/thread-registry.c b/drivers/md/dm-vdo/thread-registry.c index 1314d2b6a26f5e..03e2f45e8e7874 100644 --- a/drivers/md/dm-vdo/thread-registry.c +++ b/drivers/md/dm-vdo/thread-registry.c @@ -5,6 +5,7 @@ #include "thread-registry.h" +#include #include #include "permassert.h" diff --git a/drivers/md/dm-vdo/thread-utils.c b/drivers/md/dm-vdo/thread-utils.c index 160679984d72bd..aeca14bba8529b 100644 --- a/drivers/md/dm-vdo/thread-utils.c +++ b/drivers/md/dm-vdo/thread-utils.c @@ -5,6 +5,7 @@ #include "thread-utils.h" +#include #include #include #include From e3141c327acef4e24faff404a8c88e39a3844142 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 13 Feb 2024 11:04:11 -0600 Subject: [PATCH 0903/1406] dm vdo block-map: rename page state name from "UDS_FREE" to "FREE" Only used for log message, but no need for "UDS_" prefix. Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/block-map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c index cc87e1ac72b676..e3fadb5f2c2ddb 100644 --- a/drivers/md/dm-vdo/block-map.c +++ b/drivers/md/dm-vdo/block-map.c @@ -284,7 +284,7 @@ static const char * __must_check get_page_state_name(enum vdo_page_buffer_state { int result; static const char * const state_names[] = { - "UDS_FREE", "INCOMING", "FAILED", "RESIDENT", "DIRTY", "OUTGOING" + "FREE", "INCOMING", "FAILED", "RESIDENT", "DIRTY", "OUTGOING" }; BUILD_BUG_ON(ARRAY_SIZE(state_names) != PAGE_STATE_COUNT); From b6285412846d618f00d194d3703b2290c18a8c7c Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 13 Feb 2024 10:55:50 -0600 Subject: [PATCH 0904/1406] dm vdo memory-alloc: change from uds_ to vdo_ namespace Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/action-manager.c | 2 +- drivers/md/dm-vdo/admin-state.c | 2 +- drivers/md/dm-vdo/block-map.c | 60 ++++++------ drivers/md/dm-vdo/data-vio.c | 24 ++--- drivers/md/dm-vdo/dedupe.c | 32 +++---- drivers/md/dm-vdo/dm-vdo-target.c | 56 +++++------ drivers/md/dm-vdo/dump.c | 2 +- drivers/md/dm-vdo/encodings.c | 4 +- drivers/md/dm-vdo/flush.c | 12 +-- drivers/md/dm-vdo/funnel-queue.c | 6 +- drivers/md/dm-vdo/funnel-queue.h | 2 +- drivers/md/dm-vdo/funnel-requestqueue.c | 8 +- drivers/md/dm-vdo/funnel-workqueue.c | 32 +++---- drivers/md/dm-vdo/indexer/chapter-index.c | 8 +- drivers/md/dm-vdo/indexer/chapter-index.h | 2 +- drivers/md/dm-vdo/indexer/config.c | 10 +- drivers/md/dm-vdo/indexer/config.h | 2 +- drivers/md/dm-vdo/indexer/delta-index.c | 20 ++-- drivers/md/dm-vdo/indexer/geometry.c | 6 +- drivers/md/dm-vdo/indexer/geometry.h | 2 +- drivers/md/dm-vdo/indexer/index-layout.c | 106 ++++++++++----------- drivers/md/dm-vdo/indexer/index-layout.h | 2 +- drivers/md/dm-vdo/indexer/index-page-map.c | 24 ++--- drivers/md/dm-vdo/indexer/index-page-map.h | 2 +- drivers/md/dm-vdo/indexer/index-session.c | 10 +- drivers/md/dm-vdo/indexer/index.c | 56 +++++------ drivers/md/dm-vdo/indexer/index.h | 2 +- drivers/md/dm-vdo/indexer/io-factory.c | 18 ++-- drivers/md/dm-vdo/indexer/io-factory.h | 4 +- 
drivers/md/dm-vdo/indexer/open-chapter.c | 12 +-- drivers/md/dm-vdo/indexer/open-chapter.h | 2 +- drivers/md/dm-vdo/indexer/radix-sort.c | 6 +- drivers/md/dm-vdo/indexer/radix-sort.h | 2 +- drivers/md/dm-vdo/indexer/sparse-cache.c | 26 ++--- drivers/md/dm-vdo/indexer/sparse-cache.h | 2 +- drivers/md/dm-vdo/indexer/volume-index.c | 26 ++--- drivers/md/dm-vdo/indexer/volume-index.h | 2 +- drivers/md/dm-vdo/indexer/volume.c | 64 ++++++------- drivers/md/dm-vdo/indexer/volume.h | 4 +- drivers/md/dm-vdo/int-map.c | 14 +-- drivers/md/dm-vdo/io-submitter.c | 10 +- drivers/md/dm-vdo/logical-zone.c | 8 +- drivers/md/dm-vdo/memory-alloc.c | 38 ++++---- drivers/md/dm-vdo/memory-alloc.h | 52 +++++----- drivers/md/dm-vdo/message-stats.c | 4 +- drivers/md/dm-vdo/packer.c | 14 +-- drivers/md/dm-vdo/physical-zone.c | 16 ++-- drivers/md/dm-vdo/priority-table.c | 4 +- drivers/md/dm-vdo/recovery-journal.c | 36 +++---- drivers/md/dm-vdo/repair.c | 24 ++--- drivers/md/dm-vdo/slab-depot.c | 80 ++++++++-------- drivers/md/dm-vdo/slab-depot.h | 2 +- drivers/md/dm-vdo/thread-utils.c | 10 +- drivers/md/dm-vdo/vdo.c | 90 ++++++++--------- drivers/md/dm-vdo/vio.c | 20 ++-- 55 files changed, 541 insertions(+), 543 deletions(-) diff --git a/drivers/md/dm-vdo/action-manager.c b/drivers/md/dm-vdo/action-manager.c index 973901fc317450..709be4c17d2721 100644 --- a/drivers/md/dm-vdo/action-manager.c +++ b/drivers/md/dm-vdo/action-manager.c @@ -107,7 +107,7 @@ int vdo_make_action_manager(zone_count_t zones, struct action_manager **manager_ptr) { struct action_manager *manager; - int result = uds_allocate(1, struct action_manager, __func__, &manager); + int result = vdo_allocate(1, struct action_manager, __func__, &manager); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/admin-state.c b/drivers/md/dm-vdo/admin-state.c index 94533a802edbbc..603fd6e3406ab8 100644 --- a/drivers/md/dm-vdo/admin-state.c +++ b/drivers/md/dm-vdo/admin-state.c @@ -206,7 +206,7 @@ bool vdo_finish_operation(struct admin_state *state, int result) if (!state->starting) { vdo_set_admin_state_code(state, state->next_state); if (state->waiter != NULL) - vdo_launch_completion(uds_forget(state->waiter)); + vdo_launch_completion(vdo_forget(state->waiter)); } return true; diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c index e3fadb5f2c2ddb..5012ddbb5b0ecc 100644 --- a/drivers/md/dm-vdo/block-map.c +++ b/drivers/md/dm-vdo/block-map.c @@ -223,12 +223,12 @@ static int __must_check allocate_cache_components(struct vdo_page_cache *cache) u64 size = cache->page_count * (u64) VDO_BLOCK_SIZE; int result; - result = uds_allocate(cache->page_count, struct page_info, "page infos", + result = vdo_allocate(cache->page_count, struct page_info, "page infos", &cache->infos); if (result != UDS_SUCCESS) return result; - result = uds_allocate_memory(size, VDO_BLOCK_SIZE, "cache pages", &cache->pages); + result = vdo_allocate_memory(size, VDO_BLOCK_SIZE, "cache pages", &cache->pages); if (result != UDS_SUCCESS) return result; @@ -1343,7 +1343,7 @@ int vdo_invalidate_page_cache(struct vdo_page_cache *cache) } /* Reset the page map by re-allocating it. 
*/ - vdo_int_map_free(uds_forget(cache->page_map)); + vdo_int_map_free(vdo_forget(cache->page_map)); return vdo_int_map_create(cache->page_count, &cache->page_map); } @@ -2348,17 +2348,17 @@ static int make_segment(struct forest *old_forest, block_count_t new_pages, forest->segments = index + 1; - result = uds_allocate(forest->segments, struct boundary, + result = vdo_allocate(forest->segments, struct boundary, "forest boundary array", &forest->boundaries); if (result != VDO_SUCCESS) return result; - result = uds_allocate(forest->segments, struct tree_page *, + result = vdo_allocate(forest->segments, struct tree_page *, "forest page pointers", &forest->pages); if (result != VDO_SUCCESS) return result; - result = uds_allocate(new_pages, struct tree_page, + result = vdo_allocate(new_pages, struct tree_page, "new forest pages", &forest->pages[index]); if (result != VDO_SUCCESS) return result; @@ -2384,7 +2384,7 @@ static int make_segment(struct forest *old_forest, block_count_t new_pages, struct block_map_tree *tree = &(forest->trees[root]); height_t height; - int result = uds_allocate(forest->segments, + int result = vdo_allocate(forest->segments, struct block_map_tree_segment, "tree root segments", &tree->segments); if (result != VDO_SUCCESS) @@ -2426,15 +2426,15 @@ static void deforest(struct forest *forest, size_t first_page_segment) size_t segment; for (segment = first_page_segment; segment < forest->segments; segment++) - uds_free(forest->pages[segment]); - uds_free(forest->pages); + vdo_free(forest->pages[segment]); + vdo_free(forest->pages); } for (root = 0; root < forest->map->root_count; root++) - uds_free(forest->trees[root].segments); + vdo_free(forest->trees[root].segments); - uds_free(forest->boundaries); - uds_free(forest); + vdo_free(forest->boundaries); + vdo_free(forest); } /** @@ -2461,7 +2461,7 @@ static int make_forest(struct block_map *map, block_count_t entries) return VDO_SUCCESS; } - result = uds_allocate_extended(struct forest, map->root_count, + result = vdo_allocate_extended(struct forest, map->root_count, struct block_map_tree, __func__, &forest); if (result != VDO_SUCCESS) @@ -2487,7 +2487,7 @@ static void replace_forest(struct block_map *map) if (map->next_forest != NULL) { if (map->forest != NULL) deforest(map->forest, map->forest->segments); - map->forest = uds_forget(map->next_forest); + map->forest = vdo_forget(map->next_forest); } map->entry_count = map->next_entry_count; @@ -2503,11 +2503,11 @@ static void finish_cursor(struct cursor *cursor) struct cursors *cursors = cursor->parent; struct vdo_completion *completion = cursors->completion; - return_vio_to_pool(cursors->pool, uds_forget(cursor->vio)); + return_vio_to_pool(cursors->pool, vdo_forget(cursor->vio)); if (--cursors->active_roots > 0) return; - uds_free(cursors); + vdo_free(cursors); vdo_finish_completion(completion); } @@ -2683,7 +2683,7 @@ void vdo_traverse_forest(struct block_map *map, vdo_entry_callback_fn callback, struct cursors *cursors; int result; - result = uds_allocate_extended(struct cursors, map->root_count, + result = vdo_allocate_extended(struct cursors, map->root_count, struct cursor, __func__, &cursors); if (result != VDO_SUCCESS) { vdo_fail_completion(completion, result); @@ -2731,7 +2731,7 @@ static int __must_check initialize_block_map_zone(struct block_map *map, zone->thread_id = vdo->thread_config.logical_threads[zone_number]; zone->block_map = map; - result = uds_allocate_extended(struct dirty_lists, maximum_age, + result = vdo_allocate_extended(struct dirty_lists, 
maximum_age, dirty_era_t, __func__, &zone->dirty_lists); if (result != VDO_SUCCESS) @@ -2824,19 +2824,19 @@ static void uninitialize_block_map_zone(struct block_map_zone *zone) { struct vdo_page_cache *cache = &zone->page_cache; - uds_free(uds_forget(zone->dirty_lists)); - free_vio_pool(uds_forget(zone->vio_pool)); - vdo_int_map_free(uds_forget(zone->loading_pages)); + vdo_free(vdo_forget(zone->dirty_lists)); + free_vio_pool(vdo_forget(zone->vio_pool)); + vdo_int_map_free(vdo_forget(zone->loading_pages)); if (cache->infos != NULL) { struct page_info *info; for (info = cache->infos; info < cache->infos + cache->page_count; info++) - free_vio(uds_forget(info->vio)); + free_vio(vdo_forget(info->vio)); } - vdo_int_map_free(uds_forget(cache->page_map)); - uds_free(uds_forget(cache->infos)); - uds_free(uds_forget(cache->pages)); + vdo_int_map_free(vdo_forget(cache->page_map)); + vdo_free(vdo_forget(cache->infos)); + vdo_free(vdo_forget(cache->pages)); } void vdo_free_block_map(struct block_map *map) @@ -2851,9 +2851,9 @@ void vdo_free_block_map(struct block_map *map) vdo_abandon_block_map_growth(map); if (map->forest != NULL) - deforest(uds_forget(map->forest), 0); - uds_free(uds_forget(map->action_manager)); - uds_free(map); + deforest(vdo_forget(map->forest), 0); + vdo_free(vdo_forget(map->action_manager)); + vdo_free(map); } /* @journal may be NULL. */ @@ -2873,7 +2873,7 @@ int vdo_decode_block_map(struct block_map_state_2_0 state, block_count_t logical if (result != UDS_SUCCESS) return result; - result = uds_allocate_extended(struct block_map, + result = vdo_allocate_extended(struct block_map, vdo->thread_config.logical_zone_count, struct block_map_zone, __func__, &map); if (result != UDS_SUCCESS) @@ -3055,7 +3055,7 @@ void vdo_grow_block_map(struct block_map *map, struct vdo_completion *parent) void vdo_abandon_block_map_growth(struct block_map *map) { - struct forest *forest = uds_forget(map->next_forest); + struct forest *forest = vdo_forget(map->next_forest); if (forest != NULL) deforest(forest, forest->segments - 1); diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c index 26877c6bdc9e15..de3dd0afd93a4d 100644 --- a/drivers/md/dm-vdo/data-vio.c +++ b/drivers/md/dm-vdo/data-vio.c @@ -791,20 +791,20 @@ static int initialize_data_vio(struct data_vio *data_vio, struct vdo *vdo) int result; BUILD_BUG_ON(VDO_BLOCK_SIZE > PAGE_SIZE); - result = uds_allocate_memory(VDO_BLOCK_SIZE, 0, "data_vio data", + result = vdo_allocate_memory(VDO_BLOCK_SIZE, 0, "data_vio data", &data_vio->vio.data); if (result != VDO_SUCCESS) return uds_log_error_strerror(result, "data_vio data allocation failure"); - result = uds_allocate_memory(VDO_BLOCK_SIZE, 0, "compressed block", + result = vdo_allocate_memory(VDO_BLOCK_SIZE, 0, "compressed block", &data_vio->compression.block); if (result != VDO_SUCCESS) { return uds_log_error_strerror(result, "data_vio compressed block allocation failure"); } - result = uds_allocate_memory(VDO_BLOCK_SIZE, 0, "vio scratch", + result = vdo_allocate_memory(VDO_BLOCK_SIZE, 0, "vio scratch", &data_vio->scratch_block); if (result != VDO_SUCCESS) return uds_log_error_strerror(result, @@ -827,10 +827,10 @@ static void destroy_data_vio(struct data_vio *data_vio) if (data_vio == NULL) return; - vdo_free_bio(uds_forget(data_vio->vio.bio)); - uds_free(uds_forget(data_vio->vio.data)); - uds_free(uds_forget(data_vio->compression.block)); - uds_free(uds_forget(data_vio->scratch_block)); + vdo_free_bio(vdo_forget(data_vio->vio.bio)); + vdo_free(vdo_forget(data_vio->vio.data)); 
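	/*
	 * A note on the idiom being renamed throughout this teardown path:
	 * vdo_free(vdo_forget(ptr)) frees the object while clearing the field
	 * that held it, so the stale pointer cannot be used or freed a second
	 * time. Roughly, as a sketch rather than the verbatim memory-alloc.h
	 * definition:
	 *
	 *	static inline void *__vdo_forget(void **ptr_ptr)
	 *	{
	 *		void *ptr = *ptr_ptr;
	 *
	 *		*ptr_ptr = NULL;
	 *		return ptr;
	 *	}
	 *
	 *	#define vdo_forget(ptr) __vdo_forget((void **) &(ptr))
	 *
	 * Only the uds_ prefix changes in this patch; the behavior is identical.
	 */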
+ vdo_free(vdo_forget(data_vio->compression.block)); + vdo_free(vdo_forget(data_vio->scratch_block)); } /** @@ -847,7 +847,7 @@ int make_data_vio_pool(struct vdo *vdo, data_vio_count_t pool_size, struct data_vio_pool *pool; data_vio_count_t i; - result = uds_allocate_extended(struct data_vio_pool, pool_size, struct data_vio, + result = vdo_allocate_extended(struct data_vio_pool, pool_size, struct data_vio, __func__, &pool); if (result != UDS_SUCCESS) return result; @@ -869,7 +869,7 @@ int make_data_vio_pool(struct vdo *vdo, data_vio_count_t pool_size, result = uds_make_funnel_queue(&pool->queue); if (result != UDS_SUCCESS) { - free_data_vio_pool(uds_forget(pool)); + free_data_vio_pool(vdo_forget(pool)); return result; } @@ -926,8 +926,8 @@ void free_data_vio_pool(struct data_vio_pool *pool) destroy_data_vio(data_vio); } - uds_free_funnel_queue(uds_forget(pool->queue)); - uds_free(pool); + vdo_free_funnel_queue(vdo_forget(pool->queue)); + vdo_free(pool); } static bool acquire_permit(struct limiter *limiter) @@ -1433,7 +1433,7 @@ void release_data_vio_allocation_lock(struct data_vio *data_vio, bool reset) allocation->pbn = VDO_ZERO_BLOCK; vdo_release_physical_zone_pbn_lock(allocation->zone, locked_pbn, - uds_forget(allocation->lock)); + vdo_forget(allocation->lock)); } /** diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index 1b1edd50e75814..b88596f4ae5142 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -704,7 +704,7 @@ static void unlock_duplicate_pbn(struct vdo_completion *completion) "must have a duplicate lock to release"); vdo_release_physical_zone_pbn_lock(agent->duplicate.zone, agent->duplicate.pbn, - uds_forget(lock->duplicate_lock)); + vdo_forget(lock->duplicate_lock)); if (lock->state == VDO_HASH_LOCK_BYPASSING) { complete_data_vio(completion); return; @@ -900,7 +900,7 @@ static int __must_check acquire_lock(struct hash_zone *zone, result = vdo_int_map_put(zone->hash_lock_map, hash_lock_key(new_lock), new_lock, (replace_lock != NULL), (void **) &lock); if (result != VDO_SUCCESS) { - return_hash_lock_to_pool(zone, uds_forget(new_lock)); + return_hash_lock_to_pool(zone, vdo_forget(new_lock)); return result; } @@ -919,7 +919,7 @@ static int __must_check acquire_lock(struct hash_zone *zone, lock->registered = true; } else { /* There's already a lock for the hash, so we don't need the borrowed lock. */ - return_hash_lock_to_pool(zone, uds_forget(new_lock)); + return_hash_lock_to_pool(zone, vdo_forget(new_lock)); } *lock_ptr = lock; @@ -1984,7 +1984,7 @@ static void transfer_allocation_lock(struct data_vio *data_vio) * Since the lock is being transferred, the holder count doesn't change (and isn't even * safe to examine on this thread). 
*/ - hash_lock->duplicate_lock = uds_forget(allocation->lock); + hash_lock->duplicate_lock = vdo_forget(allocation->lock); } /** @@ -2039,12 +2039,12 @@ static void start_uds_queue(void *ptr) */ struct vdo_thread *thread = vdo_get_work_queue_owner(vdo_get_current_work_queue()); - uds_register_allocating_thread(&thread->allocating_thread, NULL); + vdo_register_allocating_thread(&thread->allocating_thread, NULL); } static void finish_uds_queue(void *ptr __always_unused) { - uds_unregister_allocating_thread(); + vdo_unregister_allocating_thread(); } static void close_index(struct hash_zones *zones) @@ -2215,7 +2215,7 @@ static int initialize_index(struct vdo *vdo, struct hash_zones *zones) result = vdo_make_thread(vdo, vdo->thread_config.dedupe_thread, &uds_queue_type, 1, NULL); if (result != VDO_SUCCESS) { - uds_destroy_index_session(uds_forget(zones->index_session)); + uds_destroy_index_session(vdo_forget(zones->index_session)); uds_log_error("UDS index queue initialization failed (%d)", result); return result; } @@ -2372,7 +2372,7 @@ static int __must_check initialize_zone(struct vdo *vdo, struct hash_zones *zone vdo_set_completion_callback(&zone->completion, timeout_index_operations_callback, zone->thread_id); INIT_LIST_HEAD(&zone->lock_pool); - result = uds_allocate(LOCK_POOL_CAPACITY, struct hash_lock, "hash_lock array", + result = vdo_allocate(LOCK_POOL_CAPACITY, struct hash_lock, "hash_lock array", &zone->lock_array); if (result != VDO_SUCCESS) return result; @@ -2426,14 +2426,14 @@ int vdo_make_hash_zones(struct vdo *vdo, struct hash_zones **zones_ptr) if (zone_count == 0) return VDO_SUCCESS; - result = uds_allocate_extended(struct hash_zones, zone_count, struct hash_zone, + result = vdo_allocate_extended(struct hash_zones, zone_count, struct hash_zone, __func__, &zones); if (result != VDO_SUCCESS) return result; result = initialize_index(vdo, zones); if (result != VDO_SUCCESS) { - uds_free(zones); + vdo_free(zones); return result; } @@ -2465,7 +2465,7 @@ void vdo_finish_dedupe_index(struct hash_zones *zones) if (zones == NULL) return; - uds_destroy_index_session(uds_forget(zones->index_session)); + uds_destroy_index_session(vdo_forget(zones->index_session)); } /** @@ -2479,14 +2479,14 @@ void vdo_free_hash_zones(struct hash_zones *zones) if (zones == NULL) return; - uds_free(uds_forget(zones->manager)); + vdo_free(vdo_forget(zones->manager)); for (i = 0; i < zones->zone_count; i++) { struct hash_zone *zone = &zones->zones[i]; - uds_free_funnel_queue(uds_forget(zone->timed_out_complete)); - vdo_int_map_free(uds_forget(zone->hash_lock_map)); - uds_free(uds_forget(zone->lock_array)); + vdo_free_funnel_queue(vdo_forget(zone->timed_out_complete)); + vdo_int_map_free(vdo_forget(zone->hash_lock_map)); + vdo_free(vdo_forget(zone->lock_array)); } if (zones->index_session != NULL) @@ -2494,7 +2494,7 @@ void vdo_free_hash_zones(struct hash_zones *zones) ratelimit_state_exit(&zones->ratelimiter); if (vdo_get_admin_state_code(&zones->state) == VDO_ADMIN_STATE_NEW) - uds_free(zones); + vdo_free(zones); } static void initiate_suspend_index(struct admin_state *state) diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c index 9996ac7005fb34..bffa48fc71e6ff 100644 --- a/drivers/md/dm-vdo/dm-vdo-target.c +++ b/drivers/md/dm-vdo/dm-vdo-target.c @@ -189,12 +189,12 @@ static void free_device_config(struct device_config *config) if (config->owned_device != NULL) dm_put_device(config->owning_target, config->owned_device); - uds_free(config->parent_device_name); - 
uds_free(config->original_string); + vdo_free(config->parent_device_name); + vdo_free(config->original_string); /* Reduce the chance a use-after-free (as in BZ 1669960) happens to work. */ memset(config, 0, sizeof(*config)); - uds_free(config); + vdo_free(config); } /** @@ -249,15 +249,15 @@ static void free_string_array(char **string_array) unsigned int offset; for (offset = 0; string_array[offset] != NULL; offset++) - uds_free(string_array[offset]); - uds_free(string_array); + vdo_free(string_array[offset]); + vdo_free(string_array); } /* * Split the input string into substrings, separated at occurrences of the indicated character, * returning a null-terminated list of string pointers. * - * The string pointers and the pointer array itself should both be freed with uds_free() when no + * The string pointers and the pointer array itself should both be freed with vdo_free() when no * longer needed. This can be done with vdo_free_string_array (below) if the pointers in the array * are not changed. Since the array and copied strings are allocated by this function, it may only * be used in contexts where allocation is permitted. @@ -278,7 +278,7 @@ static int split_string(const char *string, char separator, char ***substring_ar substring_count++; } - result = uds_allocate(substring_count + 1, char *, "string-splitting array", + result = vdo_allocate(substring_count + 1, char *, "string-splitting array", &substrings); if (result != UDS_SUCCESS) return result; @@ -287,7 +287,7 @@ static int split_string(const char *string, char separator, char ***substring_ar if (*s == separator) { ptrdiff_t length = s - string; - result = uds_allocate(length + 1, char, "split string", + result = vdo_allocate(length + 1, char, "split string", &substrings[current_substring]); if (result != UDS_SUCCESS) { free_string_array(substrings); @@ -308,7 +308,7 @@ static int split_string(const char *string, char separator, char ***substring_ar BUG_ON(current_substring != (substring_count - 1)); length = strlen(string); - result = uds_allocate(length + 1, char, "split string", + result = vdo_allocate(length + 1, char, "split string", &substrings[current_substring]); if (result != UDS_SUCCESS) { free_string_array(substrings); @@ -337,7 +337,7 @@ static int join_strings(char **substring_array, size_t array_length, char separa for (i = 0; (i < array_length) && (substring_array[i] != NULL); i++) string_length += strlen(substring_array[i]) + 1; - result = uds_allocate(string_length, char, __func__, &output); + result = vdo_allocate(string_length, char, __func__, &output); if (result != VDO_SUCCESS) return result; @@ -731,7 +731,7 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, return VDO_BAD_CONFIGURATION; } - result = uds_allocate(1, struct device_config, "device_config", &config); + result = vdo_allocate(1, struct device_config, "device_config", &config); if (result != VDO_SUCCESS) { handle_parse_error(config, error_ptr, "Could not allocate config structure"); @@ -777,7 +777,7 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, if (config->version >= 1) dm_shift_arg(&arg_set); - result = uds_duplicate_string(dm_shift_arg(&arg_set), "parent device name", + result = vdo_duplicate_string(dm_shift_arg(&arg_set), "parent device name", &config->parent_device_name); if (result != VDO_SUCCESS) { handle_parse_error(config, error_ptr, @@ -1102,7 +1102,7 @@ static int vdo_message(struct dm_target *ti, unsigned int argc, char **argv, } vdo = get_vdo_for_target(ti); - 
uds_register_allocating_thread(&allocating_thread, NULL); + vdo_register_allocating_thread(&allocating_thread, NULL); vdo_register_thread_device_id(&instance_thread, &vdo->instance); /* @@ -1117,7 +1117,7 @@ static int vdo_message(struct dm_target *ti, unsigned int argc, char **argv, } vdo_unregister_thread_device_id(); - uds_unregister_allocating_thread(); + vdo_unregister_allocating_thread(); return result; } @@ -1538,7 +1538,7 @@ static int grow_bit_array(void) unsigned long *new_words; int result; - result = uds_reallocate_memory(instances.words, + result = vdo_reallocate_memory(instances.words, get_bit_array_size(instances.bit_count), get_bit_array_size(new_count), "instance number bit array", &new_words); @@ -1704,7 +1704,7 @@ static int grow_layout(struct vdo *vdo, block_count_t old_size, block_count_t ne VDO_SLAB_SUMMARY_PARTITION), &vdo->next_layout); if (result != VDO_SUCCESS) { - dm_kcopyd_client_destroy(uds_forget(vdo->partition_copier)); + dm_kcopyd_client_destroy(vdo_forget(vdo->partition_copier)); return result; } @@ -1717,7 +1717,7 @@ static int grow_layout(struct vdo *vdo, block_count_t old_size, block_count_t ne if (min_new_size > new_size) { /* Copying the journal and summary would destroy some old metadata. */ vdo_uninitialize_layout(&vdo->next_layout); - dm_kcopyd_client_destroy(uds_forget(vdo->partition_copier)); + dm_kcopyd_client_destroy(vdo_forget(vdo->partition_copier)); return VDO_INCREMENT_TOO_SMALL; } @@ -1903,7 +1903,7 @@ static int vdo_ctr(struct dm_target *ti, unsigned int argc, char **argv) const char *device_name; struct vdo *vdo; - uds_register_allocating_thread(&allocating_thread, NULL); + vdo_register_allocating_thread(&allocating_thread, NULL); device_name = vdo_get_device_name(ti); vdo = vdo_find_matching(vdo_is_named, device_name); if (vdo == NULL) { @@ -1914,14 +1914,14 @@ static int vdo_ctr(struct dm_target *ti, unsigned int argc, char **argv) vdo_unregister_thread_device_id(); } - uds_unregister_allocating_thread(); + vdo_unregister_allocating_thread(); return result; } static void vdo_dtr(struct dm_target *ti) { struct device_config *config = ti->private; - struct vdo *vdo = uds_forget(config->vdo); + struct vdo *vdo = vdo_forget(config->vdo); list_del_init(&config->config_list); if (list_empty(&vdo->device_config_list)) { @@ -1932,17 +1932,17 @@ static void vdo_dtr(struct dm_target *ti) struct registered_thread allocating_thread, instance_thread; vdo_register_thread_device_id(&instance_thread, &instance); - uds_register_allocating_thread(&allocating_thread, NULL); + vdo_register_allocating_thread(&allocating_thread, NULL); device_name = vdo_get_device_name(ti); uds_log_info("stopping device '%s'", device_name); if (vdo->dump_on_shutdown) vdo_dump_all(vdo, "device shutdown"); - vdo_destroy(uds_forget(vdo)); + vdo_destroy(vdo_forget(vdo)); uds_log_info("device '%s' stopped", device_name); vdo_unregister_thread_device_id(); - uds_unregister_allocating_thread(); + vdo_unregister_allocating_thread(); release_instance(instance); } else if (config == vdo->device_config) { /* @@ -2295,7 +2295,7 @@ static void handle_load_error(struct vdo_completion *completion) (vdo->admin.phase == LOAD_PHASE_MAKE_DIRTY)) { uds_log_error_strerror(completion->result, "aborting load"); vdo->admin.phase = LOAD_PHASE_DRAIN_JOURNAL; - load_callback(uds_forget(completion)); + load_callback(vdo_forget(completion)); return; } @@ -2605,7 +2605,7 @@ static void grow_physical_callback(struct vdo_completion *completion) case GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS: 
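		/*
		 * Ownership handoff: the struct assignment below leaves
		 * vdo->layout and vdo->next_layout sharing one partition list,
		 * so next_layout's head pointer is forgotten (nulled) to give
		 * vdo->layout sole ownership; a later
		 * vdo_uninitialize_layout(&vdo->next_layout) then frees
		 * nothing. The hunk itself only renames uds_forget() to
		 * vdo_forget().
		 */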
vdo_uninitialize_layout(&vdo->layout); vdo->layout = vdo->next_layout; - uds_forget(vdo->next_layout.head); + vdo_forget(vdo->next_layout.head); vdo->states.vdo.config.physical_blocks = vdo->layout.size; vdo_update_slab_depot_size(vdo->depot); vdo_save_components(vdo, completion); @@ -2865,7 +2865,7 @@ static void vdo_module_destroy(void) ASSERT_LOG_ONLY(instances.count == 0, "should have no instance numbers still in use, but have %u", instances.count); - uds_free(instances.words); + vdo_free(instances.words); memset(&instances, 0, sizeof(struct instance_tracker)); uds_log_info("unloaded version %s", CURRENT_VERSION); @@ -2878,7 +2878,7 @@ static int __init vdo_init(void) /* * UDS module level initialization must be done first, as VDO initialization depends on it */ - uds_memory_init(); + vdo_memory_init(); vdo_initialize_thread_device_registry(); vdo_initialize_device_registry_once(); @@ -2910,7 +2910,7 @@ static void __exit vdo_exit(void) * UDS module level exit processing must be done after all VDO module exit processing is * complete. */ - uds_memory_exit(); + vdo_memory_exit(); } module_init(vdo_init); diff --git a/drivers/md/dm-vdo/dump.c b/drivers/md/dm-vdo/dump.c index 91bc8ed36aa7d3..f5cef6d80c9e37 100644 --- a/drivers/md/dm-vdo/dump.c +++ b/drivers/md/dm-vdo/dump.c @@ -79,7 +79,7 @@ static void do_dump(struct vdo *vdo, unsigned int dump_options_requested, if ((dump_options_requested & FLAG_SHOW_VDO_STATUS) != 0) vdo_dump_status(vdo); - uds_report_memory_usage(); + vdo_report_memory_usage(); uds_log_info("end of %s dump", UDS_LOGGING_MODULE_NAME); } diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c index 9e45411fe816e7..b08e3dc9af4107 100644 --- a/drivers/md/dm-vdo/encodings.c +++ b/drivers/md/dm-vdo/encodings.c @@ -801,7 +801,7 @@ static int allocate_partition(struct layout *layout, u8 id, struct partition *partition; int result; - result = uds_allocate(1, struct partition, __func__, &partition); + result = vdo_allocate(1, struct partition, __func__, &partition); if (result != UDS_SUCCESS) return result; @@ -930,7 +930,7 @@ void vdo_uninitialize_layout(struct layout *layout) struct partition *part = layout->head; layout->head = part->next; - uds_free(part); + vdo_free(part); } memset(layout, 0, sizeof(struct layout)); diff --git a/drivers/md/dm-vdo/flush.c b/drivers/md/dm-vdo/flush.c index 391b6203efc6de..91512f115c5620 100644 --- a/drivers/md/dm-vdo/flush.c +++ b/drivers/md/dm-vdo/flush.c @@ -103,9 +103,9 @@ static void *allocate_flush(gfp_t gfp_mask, void *pool_data) struct vdo_flush *flush = NULL; if ((gfp_mask & GFP_NOWAIT) == GFP_NOWAIT) { - flush = uds_allocate_memory_nowait(sizeof(struct vdo_flush), __func__); + flush = vdo_allocate_memory_nowait(sizeof(struct vdo_flush), __func__); } else { - int result = uds_allocate(1, struct vdo_flush, __func__, &flush); + int result = vdo_allocate(1, struct vdo_flush, __func__, &flush); if (result != VDO_SUCCESS) uds_log_error_strerror(result, "failed to allocate spare flush"); @@ -123,7 +123,7 @@ static void *allocate_flush(gfp_t gfp_mask, void *pool_data) static void free_flush(void *element, void *pool_data __always_unused) { - uds_free(element); + vdo_free(element); } /** @@ -134,7 +134,7 @@ static void free_flush(void *element, void *pool_data __always_unused) */ int vdo_make_flusher(struct vdo *vdo) { - int result = uds_allocate(1, struct flusher, __func__, &vdo->flusher); + int result = vdo_allocate(1, struct flusher, __func__, &vdo->flusher); if (result != VDO_SUCCESS) return result; @@ -162,8 +162,8 
@@ void vdo_free_flusher(struct flusher *flusher) return; if (flusher->flush_pool != NULL) - mempool_destroy(uds_forget(flusher->flush_pool)); - uds_free(flusher); + mempool_destroy(vdo_forget(flusher->flush_pool)); + vdo_free(flusher); } /** diff --git a/drivers/md/dm-vdo/funnel-queue.c b/drivers/md/dm-vdo/funnel-queue.c index 12dbf7d05adeea..0474cfc7ba46d0 100644 --- a/drivers/md/dm-vdo/funnel-queue.c +++ b/drivers/md/dm-vdo/funnel-queue.c @@ -15,7 +15,7 @@ int uds_make_funnel_queue(struct funnel_queue **queue_ptr) int result; struct funnel_queue *queue; - result = uds_allocate(1, struct funnel_queue, "funnel queue", &queue); + result = vdo_allocate(1, struct funnel_queue, "funnel queue", &queue); if (result != UDS_SUCCESS) return result; @@ -31,9 +31,9 @@ int uds_make_funnel_queue(struct funnel_queue **queue_ptr) return UDS_SUCCESS; } -void uds_free_funnel_queue(struct funnel_queue *queue) +void vdo_free_funnel_queue(struct funnel_queue *queue) { - uds_free(queue); + vdo_free(queue); } static struct funnel_queue_entry *get_oldest(struct funnel_queue *queue) diff --git a/drivers/md/dm-vdo/funnel-queue.h b/drivers/md/dm-vdo/funnel-queue.h index 88a30c593fdc87..5d5d249554f0c2 100644 --- a/drivers/md/dm-vdo/funnel-queue.h +++ b/drivers/md/dm-vdo/funnel-queue.h @@ -69,7 +69,7 @@ struct __aligned(L1_CACHE_BYTES) funnel_queue { int __must_check uds_make_funnel_queue(struct funnel_queue **queue_ptr); -void uds_free_funnel_queue(struct funnel_queue *queue); +void vdo_free_funnel_queue(struct funnel_queue *queue); /* * Put an entry on the end of the queue. diff --git a/drivers/md/dm-vdo/funnel-requestqueue.c b/drivers/md/dm-vdo/funnel-requestqueue.c index d2b49e39550c91..a3d241a6a42e41 100644 --- a/drivers/md/dm-vdo/funnel-requestqueue.c +++ b/drivers/md/dm-vdo/funnel-requestqueue.c @@ -198,7 +198,7 @@ int uds_make_request_queue(const char *queue_name, int result; struct uds_request_queue *queue; - result = uds_allocate(1, struct uds_request_queue, __func__, &queue); + result = vdo_allocate(1, struct uds_request_queue, __func__, &queue); if (result != UDS_SUCCESS) return result; @@ -273,7 +273,7 @@ void uds_request_queue_finish(struct uds_request_queue *queue) vdo_join_threads(queue->thread); } - uds_free_funnel_queue(queue->main_queue); - uds_free_funnel_queue(queue->retry_queue); - uds_free(queue); + vdo_free_funnel_queue(queue->main_queue); + vdo_free_funnel_queue(queue->retry_queue); + vdo_free(queue); } diff --git a/drivers/md/dm-vdo/funnel-workqueue.c b/drivers/md/dm-vdo/funnel-workqueue.c index 8f0ada13e54997..8dbaeb8326b08b 100644 --- a/drivers/md/dm-vdo/funnel-workqueue.c +++ b/drivers/md/dm-vdo/funnel-workqueue.c @@ -275,9 +275,9 @@ static void free_simple_work_queue(struct simple_work_queue *queue) unsigned int i; for (i = 0; i <= VDO_WORK_Q_MAX_PRIORITY; i++) - uds_free_funnel_queue(queue->priority_lists[i]); - uds_free(queue->common.name); - uds_free(queue); + vdo_free_funnel_queue(queue->priority_lists[i]); + vdo_free(queue->common.name); + vdo_free(queue); } static void free_round_robin_work_queue(struct round_robin_work_queue *queue) @@ -290,9 +290,9 @@ static void free_round_robin_work_queue(struct round_robin_work_queue *queue) for (i = 0; i < count; i++) free_simple_work_queue(queue_table[i]); - uds_free(queue_table); - uds_free(queue->common.name); - uds_free(queue); + vdo_free(queue_table); + vdo_free(queue->common.name); + vdo_free(queue); } void vdo_free_work_queue(struct vdo_work_queue *queue) @@ -323,7 +323,7 @@ static int make_simple_work_queue(const char 
*thread_name_prefix, const char *na "queue priority count %u within limit %u", type->max_priority, VDO_WORK_Q_MAX_PRIORITY); - result = uds_allocate(1, struct simple_work_queue, "simple work queue", &queue); + result = vdo_allocate(1, struct simple_work_queue, "simple work queue", &queue); if (result != UDS_SUCCESS) return result; @@ -333,9 +333,9 @@ static int make_simple_work_queue(const char *thread_name_prefix, const char *na queue->common.owner = owner; init_waitqueue_head(&queue->waiting_worker_threads); - result = uds_duplicate_string(name, "queue name", &queue->common.name); + result = vdo_duplicate_string(name, "queue name", &queue->common.name); if (result != VDO_SUCCESS) { - uds_free(queue); + vdo_free(queue); return -ENOMEM; } @@ -399,15 +399,15 @@ int vdo_make_work_queue(const char *thread_name_prefix, const char *name, return result; } - result = uds_allocate(1, struct round_robin_work_queue, "round-robin work queue", + result = vdo_allocate(1, struct round_robin_work_queue, "round-robin work queue", &queue); if (result != UDS_SUCCESS) return result; - result = uds_allocate(thread_count, struct simple_work_queue *, + result = vdo_allocate(thread_count, struct simple_work_queue *, "subordinate work queues", &queue->service_queues); if (result != UDS_SUCCESS) { - uds_free(queue); + vdo_free(queue); return result; } @@ -415,10 +415,10 @@ int vdo_make_work_queue(const char *thread_name_prefix, const char *name, queue->common.round_robin_mode = true; queue->common.owner = owner; - result = uds_duplicate_string(name, "queue name", &queue->common.name); + result = vdo_duplicate_string(name, "queue name", &queue->common.name); if (result != VDO_SUCCESS) { - uds_free(queue->service_queues); - uds_free(queue); + vdo_free(queue->service_queues); + vdo_free(queue); return -ENOMEM; } @@ -433,7 +433,7 @@ int vdo_make_work_queue(const char *thread_name_prefix, const char *name, if (result != VDO_SUCCESS) { queue->num_service_queues = i; /* Destroy previously created subordinates. */ - vdo_free_work_queue(uds_forget(*queue_ptr)); + vdo_free_work_queue(vdo_forget(*queue_ptr)); return result; } } diff --git a/drivers/md/dm-vdo/indexer/chapter-index.c b/drivers/md/dm-vdo/indexer/chapter-index.c index ca1f3ea978b04d..94b9fadc26437b 100644 --- a/drivers/md/dm-vdo/indexer/chapter-index.c +++ b/drivers/md/dm-vdo/indexer/chapter-index.c @@ -19,7 +19,7 @@ int uds_make_open_chapter_index(struct open_chapter_index **chapter_index, size_t memory_size; struct open_chapter_index *index; - result = uds_allocate(1, struct open_chapter_index, "open chapter index", &index); + result = vdo_allocate(1, struct open_chapter_index, "open chapter index", &index); if (result != UDS_SUCCESS) return result; @@ -36,7 +36,7 @@ int uds_make_open_chapter_index(struct open_chapter_index **chapter_index, geometry->chapter_payload_bits, memory_size, 'm'); if (result != UDS_SUCCESS) { - uds_free(index); + vdo_free(index); return result; } @@ -45,13 +45,13 @@ int uds_make_open_chapter_index(struct open_chapter_index **chapter_index, return UDS_SUCCESS; } -void uds_free_open_chapter_index(struct open_chapter_index *chapter_index) +void vdo_free_open_chapter_index(struct open_chapter_index *chapter_index) { if (chapter_index == NULL) return; uds_uninitialize_delta_index(&chapter_index->delta_index); - uds_free(chapter_index); + vdo_free(chapter_index); } /* Re-initialize an open chapter index for a new chapter. 
*/ diff --git a/drivers/md/dm-vdo/indexer/chapter-index.h b/drivers/md/dm-vdo/indexer/chapter-index.h index be8bf2b675b1c7..4112506d6847a3 100644 --- a/drivers/md/dm-vdo/indexer/chapter-index.h +++ b/drivers/md/dm-vdo/indexer/chapter-index.h @@ -33,7 +33,7 @@ int __must_check uds_make_open_chapter_index(struct open_chapter_index **chapter const struct index_geometry *geometry, u64 volume_nonce); -void uds_free_open_chapter_index(struct open_chapter_index *chapter_index); +void vdo_free_open_chapter_index(struct open_chapter_index *chapter_index); void uds_empty_open_chapter_index(struct open_chapter_index *chapter_index, u64 virtual_chapter_number); diff --git a/drivers/md/dm-vdo/indexer/config.c b/drivers/md/dm-vdo/indexer/config.c index 350075ba69b11e..153da2273b6fed 100644 --- a/drivers/md/dm-vdo/indexer/config.c +++ b/drivers/md/dm-vdo/indexer/config.c @@ -327,7 +327,7 @@ int uds_make_configuration(const struct uds_parameters *params, if (result != UDS_SUCCESS) return result; - result = uds_allocate(1, struct uds_configuration, __func__, &config); + result = vdo_allocate(1, struct uds_configuration, __func__, &config); if (result != UDS_SUCCESS) return result; @@ -335,7 +335,7 @@ int uds_make_configuration(const struct uds_parameters *params, chapters_per_volume, sparse_chapters_per_volume, 0, 0, &config->geometry); if (result != UDS_SUCCESS) { - uds_free_configuration(config); + vdo_free_configuration(config); return result; } @@ -354,11 +354,11 @@ int uds_make_configuration(const struct uds_parameters *params, return UDS_SUCCESS; } -void uds_free_configuration(struct uds_configuration *config) +void vdo_free_configuration(struct uds_configuration *config) { if (config != NULL) { - uds_free_index_geometry(config->geometry); - uds_free(config); + vdo_free_index_geometry(config->geometry); + vdo_free(config); } } diff --git a/drivers/md/dm-vdo/indexer/config.h b/drivers/md/dm-vdo/indexer/config.h index 3cfa7a6c35aecd..fe7958263ed670 100644 --- a/drivers/md/dm-vdo/indexer/config.h +++ b/drivers/md/dm-vdo/indexer/config.h @@ -111,7 +111,7 @@ struct uds_configuration_6_02 { int __must_check uds_make_configuration(const struct uds_parameters *params, struct uds_configuration **config_ptr); -void uds_free_configuration(struct uds_configuration *config); +void vdo_free_configuration(struct uds_configuration *config); int __must_check uds_validate_config_contents(struct buffered_reader *reader, struct uds_configuration *config); diff --git a/drivers/md/dm-vdo/indexer/delta-index.c b/drivers/md/dm-vdo/indexer/delta-index.c index 0381ab64b07c56..86f777a1c95b17 100644 --- a/drivers/md/dm-vdo/indexer/delta-index.c +++ b/drivers/md/dm-vdo/indexer/delta-index.c @@ -314,12 +314,12 @@ void uds_uninitialize_delta_index(struct delta_index *delta_index) return; for (z = 0; z < delta_index->zone_count; z++) { - uds_free(uds_forget(delta_index->delta_zones[z].new_offsets)); - uds_free(uds_forget(delta_index->delta_zones[z].delta_lists)); - uds_free(uds_forget(delta_index->delta_zones[z].memory)); + vdo_free(vdo_forget(delta_index->delta_zones[z].new_offsets)); + vdo_free(vdo_forget(delta_index->delta_zones[z].delta_lists)); + vdo_free(vdo_forget(delta_index->delta_zones[z].memory)); } - uds_free(delta_index->delta_zones); + vdo_free(delta_index->delta_zones); memset(delta_index, 0, sizeof(struct delta_index)); } @@ -329,17 +329,17 @@ static int initialize_delta_zone(struct delta_zone *delta_zone, size_t size, { int result; - result = uds_allocate(size, u8, "delta list", &delta_zone->memory); + result = 
vdo_allocate(size, u8, "delta list", &delta_zone->memory); if (result != UDS_SUCCESS) return result; - result = uds_allocate(list_count + 2, u64, "delta list temp", + result = vdo_allocate(list_count + 2, u64, "delta list temp", &delta_zone->new_offsets); if (result != UDS_SUCCESS) return result; /* Allocate the delta lists. */ - result = uds_allocate(list_count + 2, struct delta_list, "delta lists", + result = vdo_allocate(list_count + 2, struct delta_list, "delta lists", &delta_zone->delta_lists); if (result != UDS_SUCCESS) return result; @@ -370,7 +370,7 @@ int uds_initialize_delta_index(struct delta_index *delta_index, unsigned int zon unsigned int z; size_t zone_memory; - result = uds_allocate(zone_count, struct delta_zone, "Delta Index Zones", + result = vdo_allocate(zone_count, struct delta_zone, "Delta Index Zones", &delta_index->delta_zones); if (result != UDS_SUCCESS) return result; @@ -1065,7 +1065,7 @@ int uds_finish_restoring_delta_index(struct delta_index *delta_index, unsigned int z; u8 *data; - result = uds_allocate(DELTA_LIST_MAX_BYTE_COUNT, u8, __func__, &data); + result = vdo_allocate(DELTA_LIST_MAX_BYTE_COUNT, u8, __func__, &data); if (result != UDS_SUCCESS) return result; @@ -1080,7 +1080,7 @@ int uds_finish_restoring_delta_index(struct delta_index *delta_index, } } - uds_free(data); + vdo_free(data); return saved_result; } diff --git a/drivers/md/dm-vdo/indexer/geometry.c b/drivers/md/dm-vdo/indexer/geometry.c index e73d43de155be2..18479ad78c8913 100644 --- a/drivers/md/dm-vdo/indexer/geometry.c +++ b/drivers/md/dm-vdo/indexer/geometry.c @@ -61,7 +61,7 @@ int uds_make_index_geometry(size_t bytes_per_page, u32 record_pages_per_chapter, int result; struct index_geometry *geometry; - result = uds_allocate(1, struct index_geometry, "geometry", &geometry); + result = vdo_allocate(1, struct index_geometry, "geometry", &geometry); if (result != UDS_SUCCESS) return result; @@ -119,9 +119,9 @@ int uds_copy_index_geometry(struct index_geometry *source, geometry_ptr); } -void uds_free_index_geometry(struct index_geometry *geometry) +void vdo_free_index_geometry(struct index_geometry *geometry) { - uds_free(geometry); + vdo_free(geometry); } u32 __must_check uds_map_to_physical_chapter(const struct index_geometry *geometry, diff --git a/drivers/md/dm-vdo/indexer/geometry.h b/drivers/md/dm-vdo/indexer/geometry.h index a2ecdb238cf2df..173bcb8b155e2c 100644 --- a/drivers/md/dm-vdo/indexer/geometry.h +++ b/drivers/md/dm-vdo/indexer/geometry.h @@ -104,7 +104,7 @@ int __must_check uds_make_index_geometry(size_t bytes_per_page, u32 record_pages int __must_check uds_copy_index_geometry(struct index_geometry *source, struct index_geometry **geometry_ptr); -void uds_free_index_geometry(struct index_geometry *geometry); +void vdo_free_index_geometry(struct index_geometry *geometry); u32 __must_check uds_map_to_physical_chapter(const struct index_geometry *geometry, u64 virtual_chapter); diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c index cee36e6caf0e1d..bc7e9aabc27bee 100644 --- a/drivers/md/dm-vdo/indexer/index-layout.c +++ b/drivers/md/dm-vdo/indexer/index-layout.c @@ -272,7 +272,7 @@ int uds_compute_index_size(const struct uds_parameters *parameters, u64 *index_s } result = compute_sizes(index_config, &sizes); - uds_free_configuration(index_config); + vdo_free_configuration(index_config); if (result != UDS_SUCCESS) return uds_status_to_errno(result); @@ -490,7 +490,7 @@ static int __must_check make_index_save_region_table(struct 
index_save_layout *i type = RH_TYPE_UNSAVED; } - result = uds_allocate_extended(struct region_table, region_count, + result = vdo_allocate_extended(struct region_table, region_count, struct layout_region, "layout region table for ISL", &table); if (result != UDS_SUCCESS) @@ -551,7 +551,7 @@ static int __must_check write_index_save_header(struct index_save_layout *isl, u8 *buffer; size_t offset = 0; - result = uds_allocate(table->encoded_size, u8, "index save data", &buffer); + result = vdo_allocate(table->encoded_size, u8, "index save data", &buffer); if (result != UDS_SUCCESS) return result; @@ -570,7 +570,7 @@ static int __must_check write_index_save_header(struct index_save_layout *isl, } result = uds_write_to_buffered_writer(writer, buffer, offset); - uds_free(buffer); + vdo_free(buffer); if (result != UDS_SUCCESS) return result; @@ -590,13 +590,13 @@ static int write_index_save_layout(struct index_layout *layout, result = open_region_writer(layout, &isl->header, &writer); if (result != UDS_SUCCESS) { - uds_free(table); + vdo_free(table); return result; } result = write_index_save_header(isl, table, writer); - uds_free(table); - uds_free_buffered_writer(writer); + vdo_free(table); + vdo_free_buffered_writer(writer); return result; } @@ -673,7 +673,7 @@ static int __must_check make_layout_region_table(struct index_layout *layout, struct region_table *table; struct layout_region *lr; - result = uds_allocate_extended(struct region_table, region_count, + result = vdo_allocate_extended(struct region_table, region_count, struct layout_region, "layout region table", &table); if (result != UDS_SUCCESS) @@ -721,7 +721,7 @@ static int __must_check write_layout_header(struct index_layout *layout, u8 *buffer; size_t offset = 0; - result = uds_allocate(table->encoded_size, u8, "layout data", &buffer); + result = vdo_allocate(table->encoded_size, u8, "layout data", &buffer); if (result != UDS_SUCCESS) return result; @@ -745,7 +745,7 @@ static int __must_check write_layout_header(struct index_layout *layout, } result = uds_write_to_buffered_writer(writer, buffer, offset); - uds_free(buffer); + vdo_free(buffer); if (result != UDS_SUCCESS) return result; @@ -765,17 +765,17 @@ static int __must_check write_uds_index_config(struct index_layout *layout, result = uds_write_config_contents(writer, config, layout->super.version); if (result != UDS_SUCCESS) { - uds_free_buffered_writer(writer); + vdo_free_buffered_writer(writer); return uds_log_error_strerror(result, "failed to write config region"); } result = uds_flush_buffered_writer(writer); if (result != UDS_SUCCESS) { - uds_free_buffered_writer(writer); + vdo_free_buffered_writer(writer); return uds_log_error_strerror(result, "cannot flush config writer"); } - uds_free_buffered_writer(writer); + vdo_free_buffered_writer(writer); return UDS_SUCCESS; } @@ -791,13 +791,13 @@ static int __must_check save_layout(struct index_layout *layout, off_t offset) result = open_layout_writer(layout, &layout->header, offset, &writer); if (result != UDS_SUCCESS) { - uds_free(table); + vdo_free(table); return result; } result = write_layout_header(layout, table, writer); - uds_free(table); - uds_free_buffered_writer(writer); + vdo_free(table); + vdo_free_buffered_writer(writer); return result; } @@ -811,7 +811,7 @@ static int create_index_layout(struct index_layout *layout, struct uds_configura if (result != UDS_SUCCESS) return result; - result = uds_allocate(sizes.save_count, struct index_save_layout, __func__, + result = vdo_allocate(sizes.save_count, struct 
index_save_layout, __func__, &layout->index.saves); if (result != UDS_SUCCESS) return result; @@ -902,12 +902,12 @@ int uds_discard_open_chapter(struct index_layout *layout) result = uds_write_to_buffered_writer(writer, NULL, UDS_BLOCK_SIZE); if (result != UDS_SUCCESS) { - uds_free_buffered_writer(writer); + vdo_free_buffered_writer(writer); return result; } result = uds_flush_buffered_writer(writer); - uds_free_buffered_writer(writer); + vdo_free_buffered_writer(writer); return result; } @@ -931,7 +931,7 @@ int uds_load_index_state(struct index_layout *layout, struct uds_index *index) return result; result = uds_load_open_chapter(index, readers[0]); - uds_free_buffered_reader(readers[0]); + vdo_free_buffered_reader(readers[0]); if (result != UDS_SUCCESS) return result; @@ -940,7 +940,7 @@ int uds_load_index_state(struct index_layout *layout, struct uds_index *index) &readers[zone]); if (result != UDS_SUCCESS) { for (; zone > 0; zone--) - uds_free_buffered_reader(readers[zone - 1]); + vdo_free_buffered_reader(readers[zone - 1]); return result; } @@ -948,7 +948,7 @@ int uds_load_index_state(struct index_layout *layout, struct uds_index *index) result = uds_load_volume_index(index->volume_index, readers, isl->zone_count); for (zone = 0; zone < isl->zone_count; zone++) - uds_free_buffered_reader(readers[zone]); + vdo_free_buffered_reader(readers[zone]); if (result != UDS_SUCCESS) return result; @@ -957,7 +957,7 @@ int uds_load_index_state(struct index_layout *layout, struct uds_index *index) return result; result = uds_read_index_page_map(index->volume->index_page_map, readers[0]); - uds_free_buffered_reader(readers[0]); + vdo_free_buffered_reader(readers[0]); return result; } @@ -1096,7 +1096,7 @@ int uds_save_index_state(struct index_layout *layout, struct uds_index *index) } result = uds_save_open_chapter(index, writers[0]); - uds_free_buffered_writer(writers[0]); + vdo_free_buffered_writer(writers[0]); if (result != UDS_SUCCESS) { cancel_uds_index_save(isl); return result; @@ -1107,7 +1107,7 @@ int uds_save_index_state(struct index_layout *layout, struct uds_index *index) &writers[zone]); if (result != UDS_SUCCESS) { for (; zone > 0; zone--) - uds_free_buffered_writer(writers[zone - 1]); + vdo_free_buffered_writer(writers[zone - 1]); cancel_uds_index_save(isl); return result; @@ -1116,7 +1116,7 @@ int uds_save_index_state(struct index_layout *layout, struct uds_index *index) result = uds_save_volume_index(index->volume_index, writers, index->zone_count); for (zone = 0; zone < index->zone_count; zone++) - uds_free_buffered_writer(writers[zone]); + vdo_free_buffered_writer(writers[zone]); if (result != UDS_SUCCESS) { cancel_uds_index_save(isl); return result; @@ -1129,7 +1129,7 @@ int uds_save_index_state(struct index_layout *layout, struct uds_index *index) } result = uds_write_index_page_map(index->volume->index_page_map, writers[0]); - uds_free_buffered_writer(writers[0]); + vdo_free_buffered_writer(writers[0]); if (result != UDS_SUCCESS) { cancel_uds_index_save(isl); return result; @@ -1168,7 +1168,7 @@ static int __must_check load_region_table(struct buffered_reader *reader, header.version); } - result = uds_allocate_extended(struct region_table, header.region_count, + result = vdo_allocate_extended(struct region_table, header.region_count, struct layout_region, "single file layout region table", &table); if (result != UDS_SUCCESS) @@ -1182,7 +1182,7 @@ static int __must_check load_region_table(struct buffered_reader *reader, result = uds_read_from_buffered_reader(reader, 
region_buffer, sizeof(region_buffer)); if (result != UDS_SUCCESS) { - uds_free(table); + vdo_free(table); return uds_log_error_strerror(UDS_CORRUPT_DATA, "cannot read region table layouts"); } @@ -1207,13 +1207,13 @@ static int __must_check read_super_block_data(struct buffered_reader *reader, u8 *buffer; size_t offset = 0; - result = uds_allocate(saved_size, u8, "super block data", &buffer); + result = vdo_allocate(saved_size, u8, "super block data", &buffer); if (result != UDS_SUCCESS) return result; result = uds_read_from_buffered_reader(reader, buffer, saved_size); if (result != UDS_SUCCESS) { - uds_free(buffer); + vdo_free(buffer); return uds_log_error_strerror(result, "cannot read region table header"); } @@ -1238,7 +1238,7 @@ static int __must_check read_super_block_data(struct buffered_reader *reader, super->start_offset = 0; } - uds_free(buffer); + vdo_free(buffer); if (memcmp(super->magic_label, LAYOUT_MAGIC, MAGIC_SIZE) != 0) return uds_log_error_strerror(UDS_CORRUPT_DATA, @@ -1341,7 +1341,7 @@ static int __must_check reconstitute_layout(struct index_layout *layout, int result; u64 next_block = first_block; - result = uds_allocate(layout->super.max_saves, struct index_save_layout, + result = vdo_allocate(layout->super.max_saves, struct index_save_layout, __func__, &layout->index.saves); if (result != UDS_SUCCESS) return result; @@ -1392,19 +1392,19 @@ static int __must_check load_super_block(struct index_layout *layout, size_t blo return result; if (table->header.type != RH_TYPE_SUPER) { - uds_free(table); + vdo_free(table); return uds_log_error_strerror(UDS_CORRUPT_DATA, "not a superblock region table"); } result = read_super_block_data(reader, layout, table->header.payload); if (result != UDS_SUCCESS) { - uds_free(table); + vdo_free(table); return uds_log_error_strerror(result, "unknown superblock format"); } if (super->block_size != block_size) { - uds_free(table); + vdo_free(table); return uds_log_error_strerror(UDS_CORRUPT_DATA, "superblock saved block_size %u differs from supplied block_size %zu", super->block_size, block_size); @@ -1412,7 +1412,7 @@ static int __must_check load_super_block(struct index_layout *layout, size_t blo first_block -= (super->volume_offset - super->start_offset); result = reconstitute_layout(layout, table, first_block); - uds_free(table); + vdo_free(table); return result; } @@ -1551,7 +1551,7 @@ static int __must_check load_index_save(struct index_save_layout *isl, if (table->header.region_blocks != isl->index_save.block_count) { u64 region_blocks = table->header.region_blocks; - uds_free(table); + vdo_free(table); return uds_log_error_strerror(UDS_CORRUPT_DATA, "unexpected index save %u region block count %llu", instance, @@ -1559,14 +1559,14 @@ static int __must_check load_index_save(struct index_save_layout *isl, } if (table->header.type == RH_TYPE_UNSAVED) { - uds_free(table); + vdo_free(table); reset_index_save_layout(isl, 0); return UDS_SUCCESS; } if (table->header.type != RH_TYPE_SAVE) { - uds_free(table); + vdo_free(table); return uds_log_error_strerror(UDS_CORRUPT_DATA, "unexpected index save %u header type %u", instance, table->header.type); @@ -1574,14 +1574,14 @@ static int __must_check load_index_save(struct index_save_layout *isl, result = read_index_save_data(reader, isl, table->header.payload); if (result != UDS_SUCCESS) { - uds_free(table); + vdo_free(table); return uds_log_error_strerror(result, "unknown index save %u data format", instance); } result = reconstruct_index_save(isl, table); - uds_free(table); + vdo_free(table); if 
(result != UDS_SUCCESS) { return uds_log_error_strerror(result, "cannot reconstruct index save %u", instance); @@ -1609,7 +1609,7 @@ static int __must_check load_sub_index_regions(struct index_layout *layout) } result = load_index_save(isl, reader, j); - uds_free_buffered_reader(reader); + vdo_free_buffered_reader(reader); if (result != UDS_SUCCESS) { /* Another save slot might be valid. */ reset_index_save_layout(isl, 0); @@ -1634,11 +1634,11 @@ static int __must_check verify_uds_index_config(struct index_layout *layout, result = uds_validate_config_contents(reader, config); if (result != UDS_SUCCESS) { - uds_free_buffered_reader(reader); + vdo_free_buffered_reader(reader); return uds_log_error_strerror(result, "failed to read config region"); } - uds_free_buffered_reader(reader); + vdo_free_buffered_reader(reader); return UDS_SUCCESS; } @@ -1654,7 +1654,7 @@ static int load_index_layout(struct index_layout *layout, struct uds_configurati result = load_super_block(layout, UDS_BLOCK_SIZE, layout->offset / UDS_BLOCK_SIZE, reader); - uds_free_buffered_reader(reader); + vdo_free_buffered_reader(reader); if (result != UDS_SUCCESS) return result; @@ -1701,13 +1701,13 @@ int uds_make_index_layout(struct uds_configuration *config, bool new_layout, if (result != UDS_SUCCESS) return result; - result = uds_allocate(1, struct index_layout, __func__, &layout); + result = vdo_allocate(1, struct index_layout, __func__, &layout); if (result != UDS_SUCCESS) return result; result = create_layout_factory(layout, config); if (result != UDS_SUCCESS) { - uds_free_index_layout(layout); + vdo_free_index_layout(layout); return result; } @@ -1715,7 +1715,7 @@ int uds_make_index_layout(struct uds_configuration *config, bool new_layout, uds_log_error("index storage (%zu) is smaller than the required size %llu", layout->factory_size, (unsigned long long) sizes.total_size); - uds_free_index_layout(layout); + vdo_free_index_layout(layout); return -ENOSPC; } @@ -1724,7 +1724,7 @@ int uds_make_index_layout(struct uds_configuration *config, bool new_layout, else result = load_index_layout(layout, config); if (result != UDS_SUCCESS) { - uds_free_index_layout(layout); + vdo_free_index_layout(layout); return result; } @@ -1732,16 +1732,16 @@ int uds_make_index_layout(struct uds_configuration *config, bool new_layout, return UDS_SUCCESS; } -void uds_free_index_layout(struct index_layout *layout) +void vdo_free_index_layout(struct index_layout *layout) { if (layout == NULL) return; - uds_free(layout->index.saves); + vdo_free(layout->index.saves); if (layout->factory != NULL) uds_put_io_factory(layout->factory); - uds_free(layout); + vdo_free(layout); } int uds_replace_index_layout_storage(struct index_layout *layout, diff --git a/drivers/md/dm-vdo/indexer/index-layout.h b/drivers/md/dm-vdo/indexer/index-layout.h index edb5c73ab7705f..bd9b90c84a70ca 100644 --- a/drivers/md/dm-vdo/indexer/index-layout.h +++ b/drivers/md/dm-vdo/indexer/index-layout.h @@ -21,7 +21,7 @@ struct index_layout; int __must_check uds_make_index_layout(struct uds_configuration *config, bool new_layout, struct index_layout **layout_ptr); -void uds_free_index_layout(struct index_layout *layout); +void vdo_free_index_layout(struct index_layout *layout); int __must_check uds_replace_index_layout_storage(struct index_layout *layout, struct block_device *bdev); diff --git a/drivers/md/dm-vdo/indexer/index-page-map.c b/drivers/md/dm-vdo/indexer/index-page-map.c index eb4bf5f9146a7d..41940574731a4b 100644 --- a/drivers/md/dm-vdo/indexer/index-page-map.c +++ 
b/drivers/md/dm-vdo/indexer/index-page-map.c @@ -40,16 +40,16 @@ int uds_make_index_page_map(const struct index_geometry *geometry, int result; struct index_page_map *map; - result = uds_allocate(1, struct index_page_map, "page map", &map); + result = vdo_allocate(1, struct index_page_map, "page map", &map); if (result != UDS_SUCCESS) return result; map->geometry = geometry; map->entries_per_chapter = geometry->index_pages_per_chapter - 1; - result = uds_allocate(get_entry_count(geometry), u16, "Index Page Map Entries", + result = vdo_allocate(get_entry_count(geometry), u16, "Index Page Map Entries", &map->entries); if (result != UDS_SUCCESS) { - uds_free_index_page_map(map); + vdo_free_index_page_map(map); return result; } @@ -57,11 +57,11 @@ int uds_make_index_page_map(const struct index_geometry *geometry, return UDS_SUCCESS; } -void uds_free_index_page_map(struct index_page_map *map) +void vdo_free_index_page_map(struct index_page_map *map) { if (map != NULL) { - uds_free(map->entries); - uds_free(map); + vdo_free(map->entries); + vdo_free(map); } } @@ -120,7 +120,7 @@ int uds_write_index_page_map(struct index_page_map *map, struct buffered_writer u64 saved_size = uds_compute_index_page_map_save_size(map->geometry); u32 i; - result = uds_allocate(saved_size, u8, "page map data", &buffer); + result = vdo_allocate(saved_size, u8, "page map data", &buffer); if (result != UDS_SUCCESS) return result; @@ -131,7 +131,7 @@ int uds_write_index_page_map(struct index_page_map *map, struct buffered_writer encode_u16_le(buffer, &offset, map->entries[i]); result = uds_write_to_buffered_writer(writer, buffer, offset); - uds_free(buffer); + vdo_free(buffer); if (result != UDS_SUCCESS) return result; @@ -147,20 +147,20 @@ int uds_read_index_page_map(struct index_page_map *map, struct buffered_reader * u64 saved_size = uds_compute_index_page_map_save_size(map->geometry); u32 i; - result = uds_allocate(saved_size, u8, "page map data", &buffer); + result = vdo_allocate(saved_size, u8, "page map data", &buffer); if (result != UDS_SUCCESS) return result; result = uds_read_from_buffered_reader(reader, buffer, saved_size); if (result != UDS_SUCCESS) { - uds_free(buffer); + vdo_free(buffer); return result; } memcpy(&magic, buffer, PAGE_MAP_MAGIC_LENGTH); offset += PAGE_MAP_MAGIC_LENGTH; if (memcmp(magic, PAGE_MAP_MAGIC, PAGE_MAP_MAGIC_LENGTH) != 0) { - uds_free(buffer); + vdo_free(buffer); return UDS_CORRUPT_DATA; } @@ -168,7 +168,7 @@ int uds_read_index_page_map(struct index_page_map *map, struct buffered_reader * for (i = 0; i < get_entry_count(map->geometry); i++) decode_u16_le(buffer, &offset, &map->entries[i]); - uds_free(buffer); + vdo_free(buffer); uds_log_debug("read index page map, last update %llu", (unsigned long long) map->last_update); return UDS_SUCCESS; diff --git a/drivers/md/dm-vdo/indexer/index-page-map.h b/drivers/md/dm-vdo/indexer/index-page-map.h index b327c0bb965627..0fb7ef467bae0d 100644 --- a/drivers/md/dm-vdo/indexer/index-page-map.h +++ b/drivers/md/dm-vdo/indexer/index-page-map.h @@ -25,7 +25,7 @@ struct index_page_map { int __must_check uds_make_index_page_map(const struct index_geometry *geometry, struct index_page_map **map_ptr); -void uds_free_index_page_map(struct index_page_map *map); +void vdo_free_index_page_map(struct index_page_map *map); int __must_check uds_read_index_page_map(struct index_page_map *map, struct buffered_reader *reader); diff --git a/drivers/md/dm-vdo/indexer/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c index 6aadba678f08eb..7fada13abc445c 
100644 --- a/drivers/md/dm-vdo/indexer/index-session.c +++ b/drivers/md/dm-vdo/indexer/index-session.c @@ -221,7 +221,7 @@ static int __must_check make_empty_index_session(struct uds_index_session **inde int result; struct uds_index_session *session; - result = uds_allocate(1, struct uds_index_session, __func__, &session); + result = vdo_allocate(1, struct uds_index_session, __func__, &session); if (result != UDS_SUCCESS) return result; @@ -233,7 +233,7 @@ static int __must_check make_empty_index_session(struct uds_index_session **inde result = uds_make_request_queue("callbackW", &handle_callbacks, &session->callback_queue); if (result != UDS_SUCCESS) { - uds_free(session); + vdo_free(session); return result; } @@ -302,7 +302,7 @@ static int initialize_index_session(struct uds_index_session *index_session, else uds_log_configuration(config); - uds_free_configuration(config); + vdo_free_configuration(config); return result; } @@ -565,7 +565,7 @@ static int save_and_free_index(struct uds_index_session *index_session) uds_log_warning_strerror(result, "ignoring error from save_index"); } - uds_free_index(index); + vdo_free_index(index); index_session->index = NULL; /* @@ -673,7 +673,7 @@ int uds_destroy_index_session(struct uds_index_session *index_session) uds_request_queue_finish(index_session->callback_queue); index_session->callback_queue = NULL; uds_log_debug("Destroyed index session"); - uds_free(index_session); + vdo_free(index_session); return uds_status_to_errno(result); } diff --git a/drivers/md/dm-vdo/indexer/index.c b/drivers/md/dm-vdo/indexer/index.c index 6d5c30995d5f2e..b114043f16281f 100644 --- a/drivers/md/dm-vdo/indexer/index.c +++ b/drivers/md/dm-vdo/indexer/index.c @@ -86,7 +86,7 @@ static int launch_zone_message(struct uds_zone_message message, unsigned int zon int result; struct uds_request *request; - result = uds_allocate(1, struct uds_request, __func__, &request); + result = vdo_allocate(1, struct uds_request, __func__, &request); if (result != UDS_SUCCESS) return result; @@ -287,7 +287,7 @@ static int open_next_chapter(struct index_zone *zone) return UDS_SUCCESS; while (expire_chapters-- > 0) - uds_forget_chapter(zone->index->volume, expiring++); + vdo_forget_chapter(zone->index->volume, expiring++); return UDS_SUCCESS; } @@ -621,7 +621,7 @@ static void execute_zone_request(struct uds_request *request) } /* Once the message is processed it can be freed. 
*/ - uds_free(uds_forget(request)); + vdo_free(vdo_forget(request)); return; } @@ -752,9 +752,9 @@ static void free_chapter_writer(struct chapter_writer *writer) return; stop_chapter_writer(writer); - uds_free_open_chapter_index(writer->open_chapter_index); - uds_free(writer->collated_records); - uds_free(writer); + vdo_free_open_chapter_index(writer->open_chapter_index); + vdo_free(writer->collated_records); + vdo_free(writer); } static int make_chapter_writer(struct uds_index *index, @@ -765,7 +765,7 @@ static int make_chapter_writer(struct uds_index *index, size_t collated_records_size = (sizeof(struct uds_volume_record) * index->volume->geometry->records_per_chapter); - result = uds_allocate_extended(struct chapter_writer, index->zone_count, + result = vdo_allocate_extended(struct chapter_writer, index->zone_count, struct open_chapter_zone *, "Chapter Writer", &writer); if (result != UDS_SUCCESS) @@ -775,7 +775,7 @@ static int make_chapter_writer(struct uds_index *index, mutex_init(&writer->mutex); uds_init_cond(&writer->cond); - result = uds_allocate_cache_aligned(collated_records_size, "collated records", + result = vdo_allocate_cache_aligned(collated_records_size, "collated records", &writer->collated_records); if (result != UDS_SUCCESS) { free_chapter_writer(writer); @@ -1114,9 +1114,9 @@ static void free_index_zone(struct index_zone *zone) if (zone == NULL) return; - uds_free_open_chapter(zone->open_chapter); - uds_free_open_chapter(zone->writing_chapter); - uds_free(zone); + vdo_free_open_chapter(zone->open_chapter); + vdo_free_open_chapter(zone->writing_chapter); + vdo_free(zone); } static int make_index_zone(struct uds_index *index, unsigned int zone_number) @@ -1124,7 +1124,7 @@ static int make_index_zone(struct uds_index *index, unsigned int zone_number) int result; struct index_zone *zone; - result = uds_allocate(1, struct index_zone, "index zone", &zone); + result = vdo_allocate(1, struct index_zone, "index zone", &zone); if (result != UDS_SUCCESS) return result; @@ -1161,7 +1161,7 @@ int uds_make_index(struct uds_configuration *config, enum uds_open_index_type op u64 nonce; unsigned int z; - result = uds_allocate_extended(struct uds_index, config->zone_count, + result = vdo_allocate_extended(struct uds_index, config->zone_count, struct uds_request_queue *, "index", &index); if (result != UDS_SUCCESS) return result; @@ -1170,20 +1170,20 @@ int uds_make_index(struct uds_configuration *config, enum uds_open_index_type op result = uds_make_index_layout(config, new, &index->layout); if (result != UDS_SUCCESS) { - uds_free_index(index); + vdo_free_index(index); return result; } - result = uds_allocate(index->zone_count, struct index_zone *, "zones", + result = vdo_allocate(index->zone_count, struct index_zone *, "zones", &index->zones); if (result != UDS_SUCCESS) { - uds_free_index(index); + vdo_free_index(index); return result; } result = uds_make_volume(config, index->layout, &index->volume); if (result != UDS_SUCCESS) { - uds_free_index(index); + vdo_free_index(index); return result; } @@ -1191,7 +1191,7 @@ int uds_make_index(struct uds_configuration *config, enum uds_open_index_type op for (z = 0; z < index->zone_count; z++) { result = make_index_zone(index, z); if (result != UDS_SUCCESS) { - uds_free_index(index); + vdo_free_index(index); return uds_log_error_strerror(result, "Could not create index zone"); } @@ -1200,7 +1200,7 @@ int uds_make_index(struct uds_configuration *config, enum uds_open_index_type op nonce = uds_get_volume_nonce(index->layout); result = 
uds_make_volume_index(config, nonce, &index->volume_index); if (result != UDS_SUCCESS) { - uds_free_index(index); + vdo_free_index(index); return uds_log_error_strerror(result, "could not make volume index"); } @@ -1209,13 +1209,13 @@ int uds_make_index(struct uds_configuration *config, enum uds_open_index_type op result = initialize_index_queues(index, config->geometry); if (result != UDS_SUCCESS) { - uds_free_index(index); + vdo_free_index(index); return result; } result = make_chapter_writer(index, &index->chapter_writer); if (result != UDS_SUCCESS) { - uds_free_index(index); + vdo_free_index(index); return result; } @@ -1243,7 +1243,7 @@ int uds_make_index(struct uds_configuration *config, enum uds_open_index_type op } if (result != UDS_SUCCESS) { - uds_free_index(index); + vdo_free_index(index); return uds_log_error_strerror(result, "fatal error in %s()", __func__); } @@ -1270,7 +1270,7 @@ int uds_make_index(struct uds_configuration *config, enum uds_open_index_type op return UDS_SUCCESS; } -void uds_free_index(struct uds_index *index) +void vdo_free_index(struct uds_index *index) { unsigned int i; @@ -1283,16 +1283,16 @@ void uds_free_index(struct uds_index *index) free_chapter_writer(index->chapter_writer); - uds_free_volume_index(index->volume_index); + vdo_free_volume_index(index->volume_index); if (index->zones != NULL) { for (i = 0; i < index->zone_count; i++) free_index_zone(index->zones[i]); - uds_free(index->zones); + vdo_free(index->zones); } - uds_free_volume(index->volume); - uds_free_index_layout(uds_forget(index->layout)); - uds_free(index); + vdo_free_volume(index->volume); + vdo_free_index_layout(vdo_forget(index->layout)); + vdo_free(index); } /* Wait for the chapter writer to complete any outstanding writes. */ diff --git a/drivers/md/dm-vdo/indexer/index.h b/drivers/md/dm-vdo/indexer/index.h index edabb239548ec9..7fbc63db41312f 100644 --- a/drivers/md/dm-vdo/indexer/index.h +++ b/drivers/md/dm-vdo/indexer/index.h @@ -69,7 +69,7 @@ int __must_check uds_make_index(struct uds_configuration *config, int __must_check uds_save_index(struct uds_index *index); -void uds_free_index(struct uds_index *index); +void vdo_free_index(struct uds_index *index); int __must_check uds_replace_index_storage(struct uds_index *index, struct block_device *bdev); diff --git a/drivers/md/dm-vdo/indexer/io-factory.c b/drivers/md/dm-vdo/indexer/io-factory.c index a2d0f09b4b9f4c..795aa5238c047f 100644 --- a/drivers/md/dm-vdo/indexer/io-factory.c +++ b/drivers/md/dm-vdo/indexer/io-factory.c @@ -64,7 +64,7 @@ int uds_make_io_factory(struct block_device *bdev, struct io_factory **factory_p int result; struct io_factory *factory; - result = uds_allocate(1, struct io_factory, __func__, &factory); + result = vdo_allocate(1, struct io_factory, __func__, &factory); if (result != UDS_SUCCESS) return result; @@ -85,7 +85,7 @@ int uds_replace_storage(struct io_factory *factory, struct block_device *bdev) void uds_put_io_factory(struct io_factory *factory) { if (atomic_add_return(-1, &factory->ref_count) <= 0) - uds_free(factory); + vdo_free(factory); } size_t uds_get_writable_size(struct io_factory *factory) @@ -119,7 +119,7 @@ static void read_ahead(struct buffered_reader *reader, sector_t block_number) } } -void uds_free_buffered_reader(struct buffered_reader *reader) +void vdo_free_buffered_reader(struct buffered_reader *reader) { if (reader == NULL) return; @@ -129,7 +129,7 @@ void uds_free_buffered_reader(struct buffered_reader *reader) dm_bufio_client_destroy(reader->client); 
uds_put_io_factory(reader->factory); - uds_free(reader); + vdo_free(reader); } /* Create a buffered reader for an index region starting at offset. */ @@ -144,7 +144,7 @@ int uds_make_buffered_reader(struct io_factory *factory, off_t offset, u64 block if (result != UDS_SUCCESS) return result; - result = uds_allocate(1, struct buffered_reader, "buffered reader", &reader); + result = vdo_allocate(1, struct buffered_reader, "buffered reader", &reader); if (result != UDS_SUCCESS) { dm_bufio_client_destroy(client); return result; @@ -177,7 +177,7 @@ static int position_reader(struct buffered_reader *reader, sector_t block_number return UDS_OUT_OF_RANGE; if (reader->buffer != NULL) - dm_bufio_release(uds_forget(reader->buffer)); + dm_bufio_release(vdo_forget(reader->buffer)); data = dm_bufio_read(reader->client, block_number, &buffer); if (IS_ERR(data)) @@ -282,7 +282,7 @@ int uds_make_buffered_writer(struct io_factory *factory, off_t offset, u64 block if (result != UDS_SUCCESS) return result; - result = uds_allocate(1, struct buffered_writer, "buffered writer", &writer); + result = vdo_allocate(1, struct buffered_writer, "buffered writer", &writer); if (result != UDS_SUCCESS) { dm_bufio_client_destroy(client); return result; @@ -355,7 +355,7 @@ static int flush_previous_buffer(struct buffered_writer *writer) return writer->error; } -void uds_free_buffered_writer(struct buffered_writer *writer) +void vdo_free_buffered_writer(struct buffered_writer *writer) { int result; @@ -369,7 +369,7 @@ void uds_free_buffered_writer(struct buffered_writer *writer) dm_bufio_client_destroy(writer->client); uds_put_io_factory(writer->factory); - uds_free(writer); + vdo_free(writer); } /* diff --git a/drivers/md/dm-vdo/indexer/io-factory.h b/drivers/md/dm-vdo/indexer/io-factory.h index 7fb5a0616a791c..60749a9ff756e6 100644 --- a/drivers/md/dm-vdo/indexer/io-factory.h +++ b/drivers/md/dm-vdo/indexer/io-factory.h @@ -42,7 +42,7 @@ int __must_check uds_make_buffered_reader(struct io_factory *factory, off_t offs u64 block_count, struct buffered_reader **reader_ptr); -void uds_free_buffered_reader(struct buffered_reader *reader); +void vdo_free_buffered_reader(struct buffered_reader *reader); int __must_check uds_read_from_buffered_reader(struct buffered_reader *reader, u8 *data, size_t length); @@ -54,7 +54,7 @@ int __must_check uds_make_buffered_writer(struct io_factory *factory, off_t offs u64 block_count, struct buffered_writer **writer_ptr); -void uds_free_buffered_writer(struct buffered_writer *buffer); +void vdo_free_buffered_writer(struct buffered_writer *buffer); int __must_check uds_write_to_buffered_writer(struct buffered_writer *writer, const u8 *data, size_t length); diff --git a/drivers/md/dm-vdo/indexer/open-chapter.c b/drivers/md/dm-vdo/indexer/open-chapter.c index 28b2f472c29ee5..6dd055ab47deef 100644 --- a/drivers/md/dm-vdo/indexer/open-chapter.c +++ b/drivers/md/dm-vdo/indexer/open-chapter.c @@ -70,7 +70,7 @@ int uds_make_open_chapter(const struct index_geometry *geometry, unsigned int zo size_t capacity = geometry->records_per_chapter / zone_count; size_t slot_count = (1 << bits_per(capacity * LOAD_RATIO)); - result = uds_allocate_extended(struct open_chapter_zone, slot_count, + result = vdo_allocate_extended(struct open_chapter_zone, slot_count, struct open_chapter_zone_slot, "open chapter", &open_chapter); if (result != UDS_SUCCESS) @@ -78,10 +78,10 @@ int uds_make_open_chapter(const struct index_geometry *geometry, unsigned int zo open_chapter->slot_count = slot_count; open_chapter->capacity = 
capacity; - result = uds_allocate_cache_aligned(records_size(open_chapter), "record pages", + result = vdo_allocate_cache_aligned(records_size(open_chapter), "record pages", &open_chapter->records); if (result != UDS_SUCCESS) { - uds_free_open_chapter(open_chapter); + vdo_free_open_chapter(open_chapter); return result; } @@ -193,11 +193,11 @@ void uds_remove_from_open_chapter(struct open_chapter_zone *open_chapter, } } -void uds_free_open_chapter(struct open_chapter_zone *open_chapter) +void vdo_free_open_chapter(struct open_chapter_zone *open_chapter) { if (open_chapter != NULL) { - uds_free(open_chapter->records); - uds_free(open_chapter); + vdo_free(open_chapter->records); + vdo_free(open_chapter); } } diff --git a/drivers/md/dm-vdo/indexer/open-chapter.h b/drivers/md/dm-vdo/indexer/open-chapter.h index a4250bb19525ec..42f20b8aaafb85 100644 --- a/drivers/md/dm-vdo/indexer/open-chapter.h +++ b/drivers/md/dm-vdo/indexer/open-chapter.h @@ -60,7 +60,7 @@ int __must_check uds_put_open_chapter(struct open_chapter_zone *open_chapter, void uds_remove_from_open_chapter(struct open_chapter_zone *open_chapter, const struct uds_record_name *name); -void uds_free_open_chapter(struct open_chapter_zone *open_chapter); +void vdo_free_open_chapter(struct open_chapter_zone *open_chapter); int __must_check uds_close_open_chapter(struct open_chapter_zone **chapter_zones, unsigned int zone_count, struct volume *volume, diff --git a/drivers/md/dm-vdo/indexer/radix-sort.c b/drivers/md/dm-vdo/indexer/radix-sort.c index e1c40521c7ee77..1b8233c99ae696 100644 --- a/drivers/md/dm-vdo/indexer/radix-sort.c +++ b/drivers/md/dm-vdo/indexer/radix-sort.c @@ -213,7 +213,7 @@ int uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter) unsigned int stack_size = count / INSERTION_SORT_THRESHOLD; struct radix_sorter *radix_sorter; - result = uds_allocate_extended(struct radix_sorter, stack_size, struct task, + result = vdo_allocate_extended(struct radix_sorter, stack_size, struct task, __func__, &radix_sorter); if (result != UDS_SUCCESS) return result; @@ -224,9 +224,9 @@ int uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter) return UDS_SUCCESS; } -void uds_free_radix_sorter(struct radix_sorter *sorter) +void vdo_free_radix_sorter(struct radix_sorter *sorter) { - uds_free(sorter); + vdo_free(sorter); } /* diff --git a/drivers/md/dm-vdo/indexer/radix-sort.h b/drivers/md/dm-vdo/indexer/radix-sort.h index 812949bc2cee90..8565d13161ccea 100644 --- a/drivers/md/dm-vdo/indexer/radix-sort.h +++ b/drivers/md/dm-vdo/indexer/radix-sort.h @@ -18,7 +18,7 @@ struct radix_sorter; int __must_check uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter); -void uds_free_radix_sorter(struct radix_sorter *sorter); +void vdo_free_radix_sorter(struct radix_sorter *sorter); int __must_check uds_radix_sort(struct radix_sorter *sorter, const unsigned char *keys[], unsigned int count, unsigned short length); diff --git a/drivers/md/dm-vdo/indexer/sparse-cache.c b/drivers/md/dm-vdo/indexer/sparse-cache.c index dded013426e3ff..cb222bda32c29c 100644 --- a/drivers/md/dm-vdo/indexer/sparse-cache.c +++ b/drivers/md/dm-vdo/indexer/sparse-cache.c @@ -224,12 +224,12 @@ static int __must_check initialize_cached_chapter_index(struct cached_chapter_in chapter->virtual_chapter = NO_CHAPTER; chapter->index_pages_count = geometry->index_pages_per_chapter; - result = uds_allocate(chapter->index_pages_count, struct delta_index_page, + result = vdo_allocate(chapter->index_pages_count, struct delta_index_page, 
__func__, &chapter->index_pages); if (result != UDS_SUCCESS) return result; - return uds_allocate(chapter->index_pages_count, struct dm_buffer *, + return vdo_allocate(chapter->index_pages_count, struct dm_buffer *, "sparse index volume pages", &chapter->page_buffers); } @@ -243,7 +243,7 @@ static int __must_check make_search_list(struct sparse_cache *cache, bytes = (sizeof(struct search_list) + (cache->capacity * sizeof(struct cached_chapter_index *))); - result = uds_allocate_cache_aligned(bytes, "search list", &list); + result = vdo_allocate_cache_aligned(bytes, "search list", &list); if (result != UDS_SUCCESS) return result; @@ -266,7 +266,7 @@ int uds_make_sparse_cache(const struct index_geometry *geometry, unsigned int ca unsigned int bytes; bytes = (sizeof(struct sparse_cache) + (capacity * sizeof(struct cached_chapter_index))); - result = uds_allocate_cache_aligned(bytes, "sparse cache", &cache); + result = vdo_allocate_cache_aligned(bytes, "sparse cache", &cache); if (result != UDS_SUCCESS) return result; @@ -296,7 +296,7 @@ int uds_make_sparse_cache(const struct index_geometry *geometry, unsigned int ca } /* purge_search_list() needs some temporary lists for sorting. */ - result = uds_allocate(capacity * 2, struct cached_chapter_index *, + result = vdo_allocate(capacity * 2, struct cached_chapter_index *, "scratch entries", &cache->scratch_entries); if (result != UDS_SUCCESS) goto out; @@ -304,7 +304,7 @@ int uds_make_sparse_cache(const struct index_geometry *geometry, unsigned int ca *cache_ptr = cache; return UDS_SUCCESS; out: - uds_free_sparse_cache(cache); + vdo_free_sparse_cache(cache); return result; } @@ -340,29 +340,29 @@ static void release_cached_chapter_index(struct cached_chapter_index *chapter) for (i = 0; i < chapter->index_pages_count; i++) { if (chapter->page_buffers[i] != NULL) - dm_bufio_release(uds_forget(chapter->page_buffers[i])); + dm_bufio_release(vdo_forget(chapter->page_buffers[i])); } } -void uds_free_sparse_cache(struct sparse_cache *cache) +void vdo_free_sparse_cache(struct sparse_cache *cache) { unsigned int i; if (cache == NULL) return; - uds_free(cache->scratch_entries); + vdo_free(cache->scratch_entries); for (i = 0; i < cache->zone_count; i++) - uds_free(cache->search_lists[i]); + vdo_free(cache->search_lists[i]); for (i = 0; i < cache->capacity; i++) { release_cached_chapter_index(&cache->chapters[i]); - uds_free(cache->chapters[i].index_pages); - uds_free(cache->chapters[i].page_buffers); + vdo_free(cache->chapters[i].index_pages); + vdo_free(cache->chapters[i].page_buffers); } - uds_free(cache); + vdo_free(cache); } /* diff --git a/drivers/md/dm-vdo/indexer/sparse-cache.h b/drivers/md/dm-vdo/indexer/sparse-cache.h index 45e2dcf165b51d..cb1839812ae739 100644 --- a/drivers/md/dm-vdo/indexer/sparse-cache.h +++ b/drivers/md/dm-vdo/indexer/sparse-cache.h @@ -30,7 +30,7 @@ int __must_check uds_make_sparse_cache(const struct index_geometry *geometry, unsigned int capacity, unsigned int zone_count, struct sparse_cache **cache_ptr); -void uds_free_sparse_cache(struct sparse_cache *cache); +void vdo_free_sparse_cache(struct sparse_cache *cache); bool uds_sparse_cache_contains(struct sparse_cache *cache, u64 virtual_chapter, unsigned int zone_number); diff --git a/drivers/md/dm-vdo/indexer/volume-index.c b/drivers/md/dm-vdo/indexer/volume-index.c index d6526fe9bbfc5c..eb54f5b478318a 100644 --- a/drivers/md/dm-vdo/indexer/volume-index.c +++ b/drivers/md/dm-vdo/indexer/volume-index.c @@ -277,22 +277,22 @@ static int compute_volume_sub_index_parameters(const 
struct uds_configuration *c static void uninitialize_volume_sub_index(struct volume_sub_index *sub_index) { - uds_free(uds_forget(sub_index->flush_chapters)); - uds_free(uds_forget(sub_index->zones)); + vdo_free(vdo_forget(sub_index->flush_chapters)); + vdo_free(vdo_forget(sub_index->zones)); uds_uninitialize_delta_index(&sub_index->delta_index); } -void uds_free_volume_index(struct volume_index *volume_index) +void vdo_free_volume_index(struct volume_index *volume_index) { if (volume_index == NULL) return; if (volume_index->zones != NULL) - uds_free(uds_forget(volume_index->zones)); + vdo_free(vdo_forget(volume_index->zones)); uninitialize_volume_sub_index(&volume_index->vi_non_hook); uninitialize_volume_sub_index(&volume_index->vi_hook); - uds_free(volume_index); + vdo_free(volume_index); } @@ -1209,12 +1209,12 @@ static int initialize_volume_sub_index(const struct uds_configuration *config, (zone_count * sizeof(struct volume_sub_index_zone))); /* The following arrays are initialized to all zeros. */ - result = uds_allocate(params.list_count, u64, "first chapter to flush", + result = vdo_allocate(params.list_count, u64, "first chapter to flush", &sub_index->flush_chapters); if (result != UDS_SUCCESS) return result; - return uds_allocate(zone_count, struct volume_sub_index_zone, + return vdo_allocate(zone_count, struct volume_sub_index_zone, "volume index zones", &sub_index->zones); } @@ -1226,7 +1226,7 @@ int uds_make_volume_index(const struct uds_configuration *config, u64 volume_non struct volume_index *volume_index; int result; - result = uds_allocate(1, struct volume_index, "volume index", &volume_index); + result = vdo_allocate(1, struct volume_index, "volume index", &volume_index); if (result != UDS_SUCCESS) return result; @@ -1236,7 +1236,7 @@ int uds_make_volume_index(const struct uds_configuration *config, u64 volume_non result = initialize_volume_sub_index(config, volume_nonce, 'm', &volume_index->vi_non_hook); if (result != UDS_SUCCESS) { - uds_free_volume_index(volume_index); + vdo_free_volume_index(volume_index); return result; } @@ -1247,10 +1247,10 @@ int uds_make_volume_index(const struct uds_configuration *config, u64 volume_non volume_index->sparse_sample_rate = config->sparse_sample_rate; - result = uds_allocate(config->zone_count, struct volume_index_zone, + result = vdo_allocate(config->zone_count, struct volume_index_zone, "volume index zones", &volume_index->zones); if (result != UDS_SUCCESS) { - uds_free_volume_index(volume_index); + vdo_free_volume_index(volume_index); return result; } @@ -1261,7 +1261,7 @@ int uds_make_volume_index(const struct uds_configuration *config, u64 volume_non result = initialize_volume_sub_index(&split.non_hook_config, volume_nonce, 'd', &volume_index->vi_non_hook); if (result != UDS_SUCCESS) { - uds_free_volume_index(volume_index); + vdo_free_volume_index(volume_index); return uds_log_error_strerror(result, "Error creating non hook volume index"); } @@ -1269,7 +1269,7 @@ int uds_make_volume_index(const struct uds_configuration *config, u64 volume_non result = initialize_volume_sub_index(&split.hook_config, volume_nonce, 's', &volume_index->vi_hook); if (result != UDS_SUCCESS) { - uds_free_volume_index(volume_index); + vdo_free_volume_index(volume_index); return uds_log_error_strerror(result, "Error creating hook volume index"); } diff --git a/drivers/md/dm-vdo/indexer/volume-index.h b/drivers/md/dm-vdo/indexer/volume-index.h index 1fa34166b91abb..94b6c794a4d67f 100644 --- a/drivers/md/dm-vdo/indexer/volume-index.h +++ 
b/drivers/md/dm-vdo/indexer/volume-index.h @@ -141,7 +141,7 @@ int __must_check uds_make_volume_index(const struct uds_configuration *config, u64 volume_nonce, struct volume_index **volume_index); -void uds_free_volume_index(struct volume_index *volume_index); +void vdo_free_volume_index(struct volume_index *volume_index); int __must_check uds_compute_volume_index_save_blocks(const struct uds_configuration *config, size_t block_size, diff --git a/drivers/md/dm-vdo/indexer/volume.c b/drivers/md/dm-vdo/indexer/volume.c index ee53bd9999d14b..f8e47f1c54c7ff 100644 --- a/drivers/md/dm-vdo/indexer/volume.c +++ b/drivers/md/dm-vdo/indexer/volume.c @@ -200,7 +200,7 @@ static void wait_for_pending_searches(struct page_cache *cache, u32 physical_pag static void release_page_buffer(struct cached_page *page) { if (page->buffer != NULL) - dm_bufio_release(uds_forget(page->buffer)); + dm_bufio_release(vdo_forget(page->buffer)); } static void clear_cache_page(struct page_cache *cache, struct cached_page *page) @@ -1046,7 +1046,7 @@ static void invalidate_page(struct page_cache *cache, u32 physical_page) } } -void uds_forget_chapter(struct volume *volume, u64 virtual_chapter) +void vdo_forget_chapter(struct volume *volume, u64 virtual_chapter) { u32 physical_chapter = uds_map_to_physical_chapter(volume->geometry, virtual_chapter); @@ -1484,7 +1484,7 @@ int __must_check uds_replace_volume_storage(struct volume *volume, if (volume->sparse_cache != NULL) uds_invalidate_sparse_cache(volume->sparse_cache); if (volume->client != NULL) - dm_bufio_client_destroy(uds_forget(volume->client)); + dm_bufio_client_destroy(vdo_forget(volume->client)); return uds_open_volume_bufio(layout, volume->geometry->bytes_per_page, volume->reserved_buffers, &volume->client); @@ -1509,22 +1509,22 @@ static int __must_check initialize_page_cache(struct page_cache *cache, if (result != UDS_SUCCESS) return result; - result = uds_allocate(VOLUME_CACHE_MAX_QUEUED_READS, struct queued_read, + result = vdo_allocate(VOLUME_CACHE_MAX_QUEUED_READS, struct queued_read, "volume read queue", &cache->read_queue); if (result != UDS_SUCCESS) return result; - result = uds_allocate(cache->zone_count, struct search_pending_counter, + result = vdo_allocate(cache->zone_count, struct search_pending_counter, "Volume Cache Zones", &cache->search_pending_counters); if (result != UDS_SUCCESS) return result; - result = uds_allocate(cache->indexable_pages, u16, "page cache index", + result = vdo_allocate(cache->indexable_pages, u16, "page cache index", &cache->index); if (result != UDS_SUCCESS) return result; - result = uds_allocate(cache->cache_slots, struct cached_page, "page cache cache", + result = vdo_allocate(cache->cache_slots, struct cached_page, "page cache cache", &cache->cache); if (result != UDS_SUCCESS) return result; @@ -1548,7 +1548,7 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout unsigned int reserved_buffers; int result; - result = uds_allocate(1, struct volume, "volume", &volume); + result = vdo_allocate(1, struct volume, "volume", &volume); if (result != UDS_SUCCESS) return result; @@ -1556,7 +1556,7 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout result = uds_copy_index_geometry(config->geometry, &volume->geometry); if (result != UDS_SUCCESS) { - uds_free_volume(volume); + vdo_free_volume(volume); return uds_log_warning_strerror(result, "failed to allocate geometry: error"); } @@ -1574,22 +1574,22 @@ int uds_make_volume(const struct uds_configuration *config, struct 
index_layout result = uds_open_volume_bufio(layout, geometry->bytes_per_page, volume->reserved_buffers, &volume->client); if (result != UDS_SUCCESS) { - uds_free_volume(volume); + vdo_free_volume(volume); return result; } result = uds_make_radix_sorter(geometry->records_per_page, &volume->radix_sorter); if (result != UDS_SUCCESS) { - uds_free_volume(volume); + vdo_free_volume(volume); return result; } - result = uds_allocate(geometry->records_per_page, + result = vdo_allocate(geometry->records_per_page, const struct uds_volume_record *, "record pointers", &volume->record_pointers); if (result != UDS_SUCCESS) { - uds_free_volume(volume); + vdo_free_volume(volume); return result; } @@ -1600,7 +1600,7 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout config->zone_count, &volume->sparse_cache); if (result != UDS_SUCCESS) { - uds_free_volume(volume); + vdo_free_volume(volume); return result; } @@ -1611,14 +1611,14 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout result = initialize_page_cache(&volume->page_cache, geometry, config->cache_chapters, config->zone_count); if (result != UDS_SUCCESS) { - uds_free_volume(volume); + vdo_free_volume(volume); return result; } volume->cache_size += volume->page_cache.cache_slots * sizeof(struct delta_index_page); result = uds_make_index_page_map(geometry, &volume->index_page_map); if (result != UDS_SUCCESS) { - uds_free_volume(volume); + vdo_free_volume(volume); return result; } @@ -1626,10 +1626,10 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout uds_init_cond(&volume->read_threads_read_done_cond); uds_init_cond(&volume->read_threads_cond); - result = uds_allocate(config->read_threads, struct thread *, "reader threads", + result = vdo_allocate(config->read_threads, struct thread *, "reader threads", &volume->reader_threads); if (result != UDS_SUCCESS) { - uds_free_volume(volume); + vdo_free_volume(volume); return result; } @@ -1637,7 +1637,7 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout result = vdo_create_thread(read_thread_function, (void *) volume, "reader", &volume->reader_threads[i]); if (result != UDS_SUCCESS) { - uds_free_volume(volume); + vdo_free_volume(volume); return result; } @@ -1656,13 +1656,13 @@ static void uninitialize_page_cache(struct page_cache *cache) for (i = 0; i < cache->cache_slots; i++) release_page_buffer(&cache->cache[i]); } - uds_free(cache->index); - uds_free(cache->cache); - uds_free(cache->search_pending_counters); - uds_free(cache->read_queue); + vdo_free(cache->index); + vdo_free(cache->cache); + vdo_free(cache->search_pending_counters); + vdo_free(cache->read_queue); } -void uds_free_volume(struct volume *volume) +void vdo_free_volume(struct volume *volume) { if (volume == NULL) return; @@ -1677,19 +1677,19 @@ void uds_free_volume(struct volume *volume) mutex_unlock(&volume->read_threads_mutex); for (i = 0; i < volume->read_thread_count; i++) vdo_join_threads(volume->reader_threads[i]); - uds_free(volume->reader_threads); + vdo_free(volume->reader_threads); volume->reader_threads = NULL; } /* Must destroy the client AFTER freeing the cached pages. 
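The "destroy the client AFTER freeing the cached pages" rule above is a general teardown-ordering constraint: every outstanding dm_buffer must be released back to its dm_bufio client before the client itself is destroyed. A minimal userspace analogue of that ordering, with hypothetical buffer/client types standing in for dm-bufio:

#include <stdlib.h>

/* Hypothetical stand-ins for dm_buffer and dm_bufio_client. */
struct buffer { void *data; };
struct client { struct buffer *held[8]; size_t held_count; };

static void release_buffer(struct client *client, struct buffer *buffer)
{
	/* Return one buffer to its owning client. */
	free(buffer->data);
	free(buffer);
	client->held_count--;
}

static void destroy_client(struct client *client)
{
	/*
	 * By now the client must own no outstanding buffers; destroying a
	 * dm-bufio client while buffers are still held is a bug, which is
	 * why the volume teardown empties the page cache first.
	 */
	free(client);
}

static void teardown(struct client *client)
{
	while (client->held_count > 0)
		release_buffer(client, client->held[client->held_count - 1]);
	destroy_client(client);	/* only after every buffer is back */
}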
*/ uninitialize_page_cache(&volume->page_cache); - uds_free_sparse_cache(volume->sparse_cache); + vdo_free_sparse_cache(volume->sparse_cache); if (volume->client != NULL) - dm_bufio_client_destroy(uds_forget(volume->client)); + dm_bufio_client_destroy(vdo_forget(volume->client)); - uds_free_index_page_map(volume->index_page_map); - uds_free_radix_sorter(volume->radix_sorter); - uds_free(volume->geometry); - uds_free(volume->record_pointers); - uds_free(volume); + vdo_free_index_page_map(volume->index_page_map); + vdo_free_radix_sorter(volume->radix_sorter); + vdo_free(volume->geometry); + vdo_free(volume->record_pointers); + vdo_free(volume); } diff --git a/drivers/md/dm-vdo/indexer/volume.h b/drivers/md/dm-vdo/indexer/volume.h index c260b22eaa9c01..7fdd44464db202 100644 --- a/drivers/md/dm-vdo/indexer/volume.h +++ b/drivers/md/dm-vdo/indexer/volume.h @@ -127,7 +127,7 @@ int __must_check uds_make_volume(const struct uds_configuration *config, struct index_layout *layout, struct volume **new_volume); -void uds_free_volume(struct volume *volume); +void vdo_free_volume(struct volume *volume); int __must_check uds_replace_volume_storage(struct volume *volume, struct index_layout *layout, @@ -150,7 +150,7 @@ int __must_check uds_search_cached_record_page(struct volume *volume, struct uds_request *request, u32 chapter, u16 record_page_number, bool *found); -void uds_forget_chapter(struct volume *volume, u64 chapter); +void vdo_forget_chapter(struct volume *volume, u64 chapter); int __must_check uds_write_chapter(struct volume *volume, struct open_chapter_index *chapter_index, diff --git a/drivers/md/dm-vdo/int-map.c b/drivers/md/dm-vdo/int-map.c index 99ccbb1339c6c9..b8a955449737e1 100644 --- a/drivers/md/dm-vdo/int-map.c +++ b/drivers/md/dm-vdo/int-map.c @@ -166,7 +166,7 @@ static int allocate_buckets(struct int_map *map, size_t capacity) * without have to wrap back around to element zero. */ map->bucket_count = capacity + (NEIGHBORHOOD - 1); - return uds_allocate(map->bucket_count, struct bucket, + return vdo_allocate(map->bucket_count, struct bucket, "struct int_map buckets", &map->buckets); } @@ -184,7 +184,7 @@ int vdo_int_map_create(size_t initial_capacity, struct int_map **map_ptr) int result; size_t capacity; - result = uds_allocate(1, struct int_map, "struct int_map", &map); + result = vdo_allocate(1, struct int_map, "struct int_map", &map); if (result != UDS_SUCCESS) return result; @@ -199,7 +199,7 @@ int vdo_int_map_create(size_t initial_capacity, struct int_map **map_ptr) result = allocate_buckets(map, capacity); if (result != UDS_SUCCESS) { - vdo_int_map_free(uds_forget(map)); + vdo_int_map_free(vdo_forget(map)); return result; } @@ -219,8 +219,8 @@ void vdo_int_map_free(struct int_map *map) if (map == NULL) return; - uds_free(uds_forget(map->buckets)); - uds_free(uds_forget(map)); + vdo_free(vdo_forget(map->buckets)); + vdo_free(vdo_forget(map)); } /** @@ -401,14 +401,14 @@ static int resize_buckets(struct int_map *map) result = vdo_int_map_put(map, entry->key, entry->value, true, NULL); if (result != UDS_SUCCESS) { /* Destroy the new partial map and restore the map from the stack. */ - uds_free(uds_forget(map->buckets)); + vdo_free(vdo_forget(map->buckets)); *map = old_map; return result; } } /* Destroy the old bucket array. 
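resize_buckets() above saves the whole map structure on the stack, points the live map at a freshly allocated bucket array, re-inserts every entry, and on any failure frees the partial array and restores the saved copy. A simplified sketch of that save/rebuild/rollback shape, with hypothetical types and the open-addressing details omitted:

#include <stdlib.h>

struct map { size_t capacity; long *slots; };

/* Insert a nonzero key into the first empty slot; fail when full. */
static int put(struct map *map, long key)
{
	size_t i;

	for (i = 0; i < map->capacity; i++) {
		if (map->slots[i] == 0) {
			map->slots[i] = key;
			return 0;
		}
	}
	return -1;
}

static int resize(struct map *map, size_t new_capacity)
{
	size_t i;
	struct map old_map = *map;	/* save the map on the stack */

	map->slots = calloc(new_capacity, sizeof(*map->slots));
	if (map->slots == NULL) {
		*map = old_map;
		return -1;
	}
	map->capacity = new_capacity;

	for (i = 0; i < old_map.capacity; i++) {
		if (old_map.slots[i] == 0)
			continue;
		if (put(map, old_map.slots[i]) != 0) {
			/* Destroy the partial map, restore from the stack. */
			free(map->slots);
			*map = old_map;
			return -1;
		}
	}

	free(old_map.slots);	/* destroy the old bucket array */
	return 0;
}

Either way the caller sees a map that is fully resized or exactly as it was, never half-migrated.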
*/ - uds_free(uds_forget(old_map.buckets)); + vdo_free(vdo_forget(old_map.buckets)); return UDS_SUCCESS; } diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c index 6c050f2b3b4411..23549b7e9e6d60 100644 --- a/drivers/md/dm-vdo/io-submitter.c +++ b/drivers/md/dm-vdo/io-submitter.c @@ -380,7 +380,7 @@ int vdo_make_io_submitter(unsigned int thread_count, unsigned int rotation_inter struct io_submitter *io_submitter; int result; - result = uds_allocate_extended(struct io_submitter, thread_count, + result = vdo_allocate_extended(struct io_submitter, thread_count, struct bio_queue_data, "bio submission data", &io_submitter); if (result != UDS_SUCCESS) @@ -422,7 +422,7 @@ int vdo_make_io_submitter(unsigned int thread_count, unsigned int rotation_inter * Clean up the partially initialized bio-queue entirely and indicate that * initialization failed. */ - vdo_int_map_free(uds_forget(bio_queue_data->map)); + vdo_int_map_free(vdo_forget(bio_queue_data->map)); uds_log_error("bio queue initialization failed %d", result); vdo_cleanup_io_submitter(io_submitter); vdo_free_io_submitter(io_submitter); @@ -470,8 +470,8 @@ void vdo_free_io_submitter(struct io_submitter *io_submitter) for (i = io_submitter->num_bio_queues_used - 1; i >= 0; i--) { io_submitter->num_bio_queues_used--; /* vdo_destroy() will free the work queue, so just give up our reference to it. */ - uds_forget(io_submitter->bio_queue_data[i].queue); - vdo_int_map_free(uds_forget(io_submitter->bio_queue_data[i].map)); + vdo_forget(io_submitter->bio_queue_data[i].queue); + vdo_int_map_free(vdo_forget(io_submitter->bio_queue_data[i].map)); } - uds_free(io_submitter); + vdo_free(io_submitter); } diff --git a/drivers/md/dm-vdo/logical-zone.c b/drivers/md/dm-vdo/logical-zone.c index cfbf1701ca844a..de231c3a4850a1 100644 --- a/drivers/md/dm-vdo/logical-zone.c +++ b/drivers/md/dm-vdo/logical-zone.c @@ -96,7 +96,7 @@ int vdo_make_logical_zones(struct vdo *vdo, struct logical_zones **zones_ptr) if (zone_count == 0) return VDO_SUCCESS; - result = uds_allocate_extended(struct logical_zones, zone_count, + result = vdo_allocate_extended(struct logical_zones, zone_count, struct logical_zone, __func__, &zones); if (result != VDO_SUCCESS) return result; @@ -134,12 +134,12 @@ void vdo_free_logical_zones(struct logical_zones *zones) if (zones == NULL) return; - uds_free(uds_forget(zones->manager)); + vdo_free(vdo_forget(zones->manager)); for (index = 0; index < zones->zone_count; index++) - vdo_int_map_free(uds_forget(zones->zones[index].lbn_operations)); + vdo_int_map_free(vdo_forget(zones->zones[index].lbn_operations)); - uds_free(zones); + vdo_free(zones); } static inline void assert_on_zone_thread(struct logical_zone *zone, const char *what) diff --git a/drivers/md/dm-vdo/memory-alloc.c b/drivers/md/dm-vdo/memory-alloc.c index f1ba6f3bef61c7..b0191001e1ee4a 100644 --- a/drivers/md/dm-vdo/memory-alloc.c +++ b/drivers/md/dm-vdo/memory-alloc.c @@ -37,7 +37,7 @@ static inline bool allocations_allowed(void) * @new_thread: registered_thread structure to use for the current thread * @flag_ptr: Location of the allocation-allowed flag */ -void uds_register_allocating_thread(struct registered_thread *new_thread, +void vdo_register_allocating_thread(struct registered_thread *new_thread, const bool *flag_ptr) { if (flag_ptr == NULL) { @@ -50,7 +50,7 @@ void uds_register_allocating_thread(struct registered_thread *new_thread, } /* Unregister the current thread as an allocating thread. 
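The registration helper above stores a per-thread pointer to an "allocation allowed" flag, with NULL meaning always allowed, so allocation paths can consult allocations_allowed(). A userspace sketch of the same bracket using C11 thread-local storage; the names here are illustrative, not the driver's:

#include <stdbool.h>
#include <stdio.h>

static _Thread_local const bool *allocations_allowed_flag;

static void register_allocating_thread(const bool *flag_ptr)
{
	static const bool always = true;

	/* NULL means "always allowed", as in the kernel helper. */
	allocations_allowed_flag = flag_ptr ? flag_ptr : &always;
}

static void unregister_allocating_thread(void)
{
	allocations_allowed_flag = NULL;
}

static bool allocations_allowed(void)
{
	return allocations_allowed_flag != NULL && *allocations_allowed_flag;
}

int main(void)
{
	bool allowed = true;

	register_allocating_thread(&allowed);
	printf("may allocate: %d\n", allocations_allowed());
	unregister_allocating_thread();
	return 0;
}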
*/ -void uds_unregister_allocating_thread(void) +void vdo_unregister_allocating_thread(void) { vdo_unregister_thread(&allocating_threads); } @@ -148,7 +148,7 @@ static void remove_vmalloc_block(void *ptr) spin_unlock_irqrestore(&memory_stats.lock, flags); if (block != NULL) - uds_free(block); + vdo_free(block); else uds_log_info("attempting to remove ptr %px not found in vmalloc list", ptr); } @@ -196,7 +196,7 @@ static inline bool use_kmalloc(size_t size) * * Return: UDS_SUCCESS or an error code */ -int uds_allocate_memory(size_t size, size_t align, const char *what, void *ptr) +int vdo_allocate_memory(size_t size, size_t align, const char *what, void *ptr) { /* * The __GFP_RETRY_MAYFAIL flag means the VM implementation will retry memory reclaim @@ -245,8 +245,7 @@ int uds_allocate_memory(size_t size, size_t align, const char *what, void *ptr) } else { struct vmalloc_block_info *block; - if (uds_allocate(1, struct vmalloc_block_info, __func__, &block) == - UDS_SUCCESS) { + if (vdo_allocate(1, struct vmalloc_block_info, __func__, &block) == UDS_SUCCESS) { /* * It is possible for __vmalloc to fail to allocate memory because there * are no pages available (see VDO-3661). A short sleep may allow the page @@ -259,7 +258,6 @@ int uds_allocate_memory(size_t size, size_t align, const char *what, void *ptr) */ for (;;) { p = __vmalloc(size, gfp_flags | __GFP_NOWARN); - if (p != NULL) break; @@ -273,7 +271,7 @@ int uds_allocate_memory(size_t size, size_t align, const char *what, void *ptr) } if (p == NULL) { - uds_free(block); + vdo_free(block); } else { block->ptr = p; block->size = PAGE_ALIGN(size); @@ -304,7 +302,7 @@ int uds_allocate_memory(size_t size, size_t align, const char *what, void *ptr) * * Return: pointer to the allocated memory, or NULL if the required space is not available. 
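The loop above keeps retrying __vmalloc() after a short sleep because a transient shortage of pages can clear once the reclaimer runs. A bounded userspace rendition of that retry shape (the kernel loop and its gfp flags are more involved; this mirrors only the allocate/sleep/retry structure):

#include <stdlib.h>
#include <unistd.h>

static void *allocate_with_retry(size_t size)
{
	int attempt;

	for (attempt = 0; attempt < 1000; attempt++) {
		void *p = calloc(1, size);

		if (p != NULL)
			return p;
		usleep(1000);	/* a short sleep may let reclaim free pages */
	}
	return NULL;	/* persistent failure */
}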
*/ -void *uds_allocate_memory_nowait(size_t size, const char *what __maybe_unused) +void *vdo_allocate_memory_nowait(size_t size, const char *what __maybe_unused) { void *p = kmalloc(size, GFP_NOWAIT | __GFP_ZERO); @@ -314,7 +312,7 @@ void *uds_allocate_memory_nowait(size_t size, const char *what __maybe_unused) return p; } -void uds_free(void *ptr) +void vdo_free(void *ptr) { if (ptr != NULL) { if (is_vmalloc_addr(ptr)) { @@ -339,18 +337,18 @@ void uds_free(void *ptr) * * Return: UDS_SUCCESS or an error code */ -int uds_reallocate_memory(void *ptr, size_t old_size, size_t size, const char *what, +int vdo_reallocate_memory(void *ptr, size_t old_size, size_t size, const char *what, void *new_ptr) { int result; if (size == 0) { - uds_free(ptr); + vdo_free(ptr); *(void **) new_ptr = NULL; return UDS_SUCCESS; } - result = uds_allocate(size, char, what, new_ptr); + result = vdo_allocate(size, char, what, new_ptr); if (result != UDS_SUCCESS) return result; @@ -359,18 +357,18 @@ int uds_reallocate_memory(void *ptr, size_t old_size, size_t size, const char *w size = old_size; memcpy(*((void **) new_ptr), ptr, size); - uds_free(ptr); + vdo_free(ptr); } return UDS_SUCCESS; } -int uds_duplicate_string(const char *string, const char *what, char **new_string) +int vdo_duplicate_string(const char *string, const char *what, char **new_string) { int result; u8 *dup; - result = uds_allocate(strlen(string) + 1, u8, what, &dup); + result = vdo_allocate(strlen(string) + 1, u8, what, &dup); if (result != UDS_SUCCESS) return result; @@ -379,13 +377,13 @@ int uds_duplicate_string(const char *string, const char *what, char **new_string return UDS_SUCCESS; } -void uds_memory_init(void) +void vdo_memory_init(void) { spin_lock_init(&memory_stats.lock); vdo_initialize_thread_registry(&allocating_threads); } -void uds_memory_exit(void) +void vdo_memory_exit(void) { ASSERT_LOG_ONLY(memory_stats.kmalloc_bytes == 0, "kmalloc memory used (%zd bytes in %zd blocks) is returned to the kernel", @@ -396,7 +394,7 @@ void uds_memory_exit(void) uds_log_debug("peak usage %zd bytes", memory_stats.peak_bytes); } -void uds_get_memory_stats(u64 *bytes_used, u64 *peak_bytes_used) +void vdo_get_memory_stats(u64 *bytes_used, u64 *peak_bytes_used) { unsigned long flags; @@ -410,7 +408,7 @@ void uds_get_memory_stats(u64 *bytes_used, u64 *peak_bytes_used) * Report stats on any allocated memory that we're tracking. Not all allocation types are * guaranteed to be tracked in bytes (e.g., bios). */ -void uds_report_memory_usage(void) +void vdo_report_memory_usage(void) { unsigned long flags; u64 kmalloc_blocks; diff --git a/drivers/md/dm-vdo/memory-alloc.h b/drivers/md/dm-vdo/memory-alloc.h index d72d597f98cf31..3f27dd722a2d2b 100644 --- a/drivers/md/dm-vdo/memory-alloc.h +++ b/drivers/md/dm-vdo/memory-alloc.h @@ -3,8 +3,8 @@ * Copyright 2023 Red Hat */ -#ifndef UDS_MEMORY_ALLOC_H -#define UDS_MEMORY_ALLOC_H +#ifndef VDO_MEMORY_ALLOC_H +#define VDO_MEMORY_ALLOC_H #include #include /* for PAGE_SIZE */ @@ -12,8 +12,8 @@ #include "permassert.h" #include "thread-registry.h" -/* Custom memory allocation function for UDS that tracks memory usage */ -int __must_check uds_allocate_memory(size_t size, size_t align, const char *what, void *ptr); +/* Custom memory allocation function that tracks memory usage */ +int __must_check vdo_allocate_memory(size_t size, size_t align, const char *what, void *ptr); /* * Allocate storage based on element counts, sizes, and alignment. 
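The kernel-doc closing this hunk introduces the count/size/alignment allocator, and the hunk that follows shows its core: the do_allocation helper computes count * size + extra and saturates the total to SIZE_MAX when the multiplication would overflow, so the request fails cleanly rather than allocating a wrapped-around tiny size. A sketch of that guard, assuming only that an allocation of SIZE_MAX always fails:

#include <stdint.h>
#include <stdlib.h>

static int do_allocation(size_t count, size_t size, size_t extra, void **ptr)
{
	size_t total = count * size + extra;

	/* Saturate on overflow so the allocation below fails cleanly. */
	if (size > 0 && count > (SIZE_MAX - extra) / size)
		total = SIZE_MAX;

	*ptr = calloc(1, total);
	return (*ptr != NULL) ? 0 : -1;
}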
@@ -37,7 +37,7 @@ int __must_check uds_allocate_memory(size_t size, size_t align, const char *what * * Return: UDS_SUCCESS or an error code */ -static inline int uds_do_allocation(size_t count, size_t size, size_t extra, +static inline int vdo_do_allocation(size_t count, size_t size, size_t extra, size_t align, const char *what, void *ptr) { size_t total_size = count * size + extra; @@ -53,7 +53,7 @@ static inline int uds_do_allocation(size_t count, size_t size, size_t extra, total_size = SIZE_MAX; } - return uds_allocate_memory(total_size, align, what, ptr); + return vdo_allocate_memory(total_size, align, what, ptr); } /* @@ -67,8 +67,8 @@ static inline int uds_do_allocation(size_t count, size_t size, size_t extra, * * Return: UDS_SUCCESS or an error code */ -#define uds_allocate(COUNT, TYPE, WHAT, PTR) \ - uds_do_allocation(COUNT, sizeof(TYPE), 0, __alignof__(TYPE), WHAT, PTR) +#define vdo_allocate(COUNT, TYPE, WHAT, PTR) \ + vdo_do_allocation(COUNT, sizeof(TYPE), 0, __alignof__(TYPE), WHAT, PTR) /* * Allocate one object of an indicated type, followed by one or more elements of a second type, @@ -83,12 +83,12 @@ static inline int uds_do_allocation(size_t count, size_t size, size_t extra, * * Return: UDS_SUCCESS or an error code */ -#define uds_allocate_extended(TYPE1, COUNT, TYPE2, WHAT, PTR) \ +#define vdo_allocate_extended(TYPE1, COUNT, TYPE2, WHAT, PTR) \ __extension__({ \ int _result; \ TYPE1 **_ptr = (PTR); \ BUILD_BUG_ON(__alignof__(TYPE1) < __alignof__(TYPE2)); \ - _result = uds_do_allocation(COUNT, \ + _result = vdo_do_allocation(COUNT, \ sizeof(TYPE2), \ sizeof(TYPE1), \ __alignof__(TYPE1), \ @@ -107,9 +107,9 @@ static inline int uds_do_allocation(size_t count, size_t size, size_t extra, * * Return: UDS_SUCCESS or an error code */ -static inline int __must_check uds_allocate_cache_aligned(size_t size, const char *what, void *ptr) +static inline int __must_check vdo_allocate_cache_aligned(size_t size, const char *what, void *ptr) { - return uds_allocate_memory(size, L1_CACHE_BYTES, what, ptr); + return vdo_allocate_memory(size, L1_CACHE_BYTES, what, ptr); } /* @@ -121,18 +121,18 @@ static inline int __must_check uds_allocate_cache_aligned(size_t size, const cha * * Return: pointer to the memory, or NULL if the memory is not available. */ -void *__must_check uds_allocate_memory_nowait(size_t size, const char *what); +void *__must_check vdo_allocate_memory_nowait(size_t size, const char *what); -int __must_check uds_reallocate_memory(void *ptr, size_t old_size, size_t size, +int __must_check vdo_reallocate_memory(void *ptr, size_t old_size, size_t size, const char *what, void *new_ptr); -int __must_check uds_duplicate_string(const char *string, const char *what, +int __must_check vdo_duplicate_string(const char *string, const char *what, char **new_string); -/* Free memory allocated with uds_allocate(). */ -void uds_free(void *ptr); +/* Free memory allocated with vdo_allocate(). */ +void vdo_free(void *ptr); -static inline void *__uds_forget(void **ptr_ptr) +static inline void *__vdo_forget(void **ptr_ptr) { void *ptr = *ptr_ptr; @@ -144,19 +144,19 @@ static inline void *__uds_forget(void **ptr_ptr) * Null out a pointer and return a copy to it. This macro should be used when passing a pointer to * a function for which it is not safe to access the pointer once the function returns. 
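The forget macro documented just above has a simple but effective contract: it returns the pointer's current value while nulling the caller's copy, so code that hands a pointer to a consuming function cannot accidentally touch, or double-free, it afterwards. A standalone sketch of the idiom:

#include <stdlib.h>

static inline void *forget(void **ptr_ptr)
{
	void *ptr = *ptr_ptr;

	*ptr_ptr = NULL;
	return ptr;
}

/* Null out a pointer while passing its old value on. */
#define FORGET(ptr) forget((void **) &(ptr))

struct widget { int x; };

int main(void)
{
	struct widget *w = calloc(1, sizeof(*w));

	free(FORGET(w));	/* w is NULL afterwards... */
	free(w);		/* ...so a stray second free is a no-op. */
	return 0;
}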
*/ -#define uds_forget(ptr) __uds_forget((void **) &(ptr)) +#define vdo_forget(ptr) __vdo_forget((void **) &(ptr)) -void uds_memory_init(void); +void vdo_memory_init(void); -void uds_memory_exit(void); +void vdo_memory_exit(void); -void uds_register_allocating_thread(struct registered_thread *new_thread, +void vdo_register_allocating_thread(struct registered_thread *new_thread, const bool *flag_ptr); -void uds_unregister_allocating_thread(void); +void vdo_unregister_allocating_thread(void); -void uds_get_memory_stats(u64 *bytes_used, u64 *peak_bytes_used); +void vdo_get_memory_stats(u64 *bytes_used, u64 *peak_bytes_used); -void uds_report_memory_usage(void); +void vdo_report_memory_usage(void); -#endif /* UDS_MEMORY_ALLOC_H */ +#endif /* VDO_MEMORY_ALLOC_H */ diff --git a/drivers/md/dm-vdo/message-stats.c b/drivers/md/dm-vdo/message-stats.c index af964e55b98c31..e9c71793033d16 100644 --- a/drivers/md/dm-vdo/message-stats.c +++ b/drivers/md/dm-vdo/message-stats.c @@ -761,12 +761,12 @@ int vdo_write_stats(struct vdo *vdo, char *buf, unsigned int maxlen) struct vdo_statistics *stats; int result; - result = uds_allocate(1, struct vdo_statistics, __func__, &stats); + result = vdo_allocate(1, struct vdo_statistics, __func__, &stats); if (result != VDO_SUCCESS) return result; vdo_fetch_statistics(vdo, stats); result = write_vdo_statistics(NULL, stats, NULL, &buf, &maxlen); - uds_free(stats); + vdo_free(stats); return result; } diff --git a/drivers/md/dm-vdo/packer.c b/drivers/md/dm-vdo/packer.c index e391cac6c92d70..59820f91a70246 100644 --- a/drivers/md/dm-vdo/packer.c +++ b/drivers/md/dm-vdo/packer.c @@ -122,7 +122,7 @@ static int __must_check make_bin(struct packer *packer) struct packer_bin *bin; int result; - result = uds_allocate_extended(struct packer_bin, VDO_MAX_COMPRESSION_SLOTS, + result = vdo_allocate_extended(struct packer_bin, VDO_MAX_COMPRESSION_SLOTS, struct vio *, __func__, &bin); if (result != VDO_SUCCESS) return result; @@ -148,7 +148,7 @@ int vdo_make_packer(struct vdo *vdo, block_count_t bin_count, struct packer **pa block_count_t i; int result; - result = uds_allocate(1, struct packer, __func__, &packer); + result = vdo_allocate(1, struct packer, __func__, &packer); if (result != VDO_SUCCESS) return result; @@ -170,7 +170,7 @@ int vdo_make_packer(struct vdo *vdo, block_count_t bin_count, struct packer **pa * bin must have a canceler for which it is waiting, and any canceler will only have * canceled one lock holder at a time. 
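make_bin() above uses the extended allocator to obtain one packer_bin followed by VDO_MAX_COMPRESSION_SLOTS trailing vio pointers in a single allocation. In plain C the same layout is a flexible array member; a sketch with hypothetical types:

#include <stdlib.h>

struct vio;

/*
 * One allocation for the header plus a trailing array of slot pointers --
 * the layout the extended allocator produces for struct packer_bin.
 */
struct bin {
	size_t slot_count;
	struct vio *slots[];	/* flexible array member */
};

static struct bin *make_bin(size_t slot_count)
{
	struct bin *bin = calloc(1, sizeof(*bin) +
				 slot_count * sizeof(struct vio *));

	if (bin != NULL)
		bin->slot_count = slot_count;
	return bin;
}

One allocation instead of two keeps the bin and its slots adjacent in memory and leaves only a single pointer to free.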
*/ - result = uds_allocate_extended(struct packer_bin, MAXIMUM_VDO_USER_VIOS / 2, + result = vdo_allocate_extended(struct packer_bin, MAXIMUM_VDO_USER_VIOS / 2, struct vio *, __func__, &packer->canceled_bin); if (result != VDO_SUCCESS) { vdo_free_packer(packer); @@ -200,11 +200,11 @@ void vdo_free_packer(struct packer *packer) list_for_each_entry_safe(bin, tmp, &packer->bins, list) { list_del_init(&bin->list); - uds_free(bin); + vdo_free(bin); } - uds_free(uds_forget(packer->canceled_bin)); - uds_free(packer); + vdo_free(vdo_forget(packer->canceled_bin)); + vdo_free(packer); } /** @@ -673,7 +673,7 @@ void vdo_remove_lock_holder_from_packer(struct vdo_completion *completion) assert_data_vio_in_packer_zone(data_vio); - lock_holder = uds_forget(data_vio->compression.lock_holder); + lock_holder = vdo_forget(data_vio->compression.lock_holder); bin = lock_holder->compression.bin; ASSERT_LOG_ONLY((bin != NULL), "data_vio in packer has a bin"); diff --git a/drivers/md/dm-vdo/physical-zone.c b/drivers/md/dm-vdo/physical-zone.c index 3bcf6f1ba77f2f..b0a1d75567bac0 100644 --- a/drivers/md/dm-vdo/physical-zone.c +++ b/drivers/md/dm-vdo/physical-zone.c @@ -241,7 +241,7 @@ static int make_pbn_lock_pool(size_t capacity, struct pbn_lock_pool **pool_ptr) struct pbn_lock_pool *pool; int result; - result = uds_allocate_extended(struct pbn_lock_pool, capacity, idle_pbn_lock, + result = vdo_allocate_extended(struct pbn_lock_pool, capacity, idle_pbn_lock, __func__, &pool); if (result != VDO_SUCCESS) return result; @@ -272,7 +272,7 @@ static void free_pbn_lock_pool(struct pbn_lock_pool *pool) ASSERT_LOG_ONLY(pool->borrowed == 0, "All PBN locks must be returned to the pool before it is freed, but %zu locks are still on loan", pool->borrowed); - uds_free(pool); + vdo_free(pool); } /** @@ -346,7 +346,7 @@ static int initialize_zone(struct vdo *vdo, struct physical_zones *zones) zone->next = &zones->zones[(zone_number + 1) % vdo->thread_config.physical_zone_count]; result = vdo_make_default_thread(vdo, zone->thread_id); if (result != VDO_SUCCESS) { - free_pbn_lock_pool(uds_forget(zone->lock_pool)); + free_pbn_lock_pool(vdo_forget(zone->lock_pool)); vdo_int_map_free(zone->pbn_operations); return result; } @@ -369,7 +369,7 @@ int vdo_make_physical_zones(struct vdo *vdo, struct physical_zones **zones_ptr) if (zone_count == 0) return VDO_SUCCESS; - result = uds_allocate_extended(struct physical_zones, zone_count, + result = vdo_allocate_extended(struct physical_zones, zone_count, struct physical_zone, __func__, &zones); if (result != VDO_SUCCESS) return result; @@ -400,11 +400,11 @@ void vdo_free_physical_zones(struct physical_zones *zones) for (index = 0; index < zones->zone_count; index++) { struct physical_zone *zone = &zones->zones[index]; - free_pbn_lock_pool(uds_forget(zone->lock_pool)); - vdo_int_map_free(uds_forget(zone->pbn_operations)); + free_pbn_lock_pool(vdo_forget(zone->lock_pool)); + vdo_int_map_free(vdo_forget(zone->pbn_operations)); } - uds_free(zones); + vdo_free(zones); } /** @@ -462,7 +462,7 @@ int vdo_attempt_physical_zone_pbn_lock(struct physical_zone *zone, if (lock != NULL) { /* The lock is already held, so we don't need the borrowed one. 
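The path above returns a freshly borrowed lock straight back to the pool once it turns out the lock is already held. The pool itself behaves as a fixed-capacity idle list: borrowing pops a lock, failing when capacity is exhausted, and returning pushes it back. A minimal single-threaded sketch of that borrow/return pair; the driver's version also asserts that every lock is back before the pool is freed:

#include <stdlib.h>

struct lock { struct lock *next_idle; };

struct pool {
	size_t borrowed;
	struct lock *idle;	/* singly linked idle list */
};

static struct lock *borrow(struct pool *pool)
{
	struct lock *lock = pool->idle;

	if (lock == NULL)
		return NULL;	/* capacity exhausted */
	pool->idle = lock->next_idle;
	pool->borrowed++;
	return lock;
}

static void give_back(struct pool *pool, struct lock *lock)
{
	lock->next_idle = pool->idle;
	pool->idle = lock;
	pool->borrowed--;
}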
*/ - return_pbn_lock_to_pool(zone->lock_pool, uds_forget(new_lock)); + return_pbn_lock_to_pool(zone->lock_pool, vdo_forget(new_lock)); result = ASSERT(lock->holder_count > 0, "physical block %llu lock held", (unsigned long long) pbn); if (result != VDO_SUCCESS) diff --git a/drivers/md/dm-vdo/priority-table.c b/drivers/md/dm-vdo/priority-table.c index 9408219b5700bd..bb98fb06b73fcd 100644 --- a/drivers/md/dm-vdo/priority-table.c +++ b/drivers/md/dm-vdo/priority-table.c @@ -62,7 +62,7 @@ int vdo_make_priority_table(unsigned int max_priority, struct priority_table **t if (max_priority > MAX_PRIORITY) return UDS_INVALID_ARGUMENT; - result = uds_allocate_extended(struct priority_table, max_priority + 1, + result = vdo_allocate_extended(struct priority_table, max_priority + 1, struct bucket, __func__, &table); if (result != VDO_SUCCESS) return result; @@ -98,7 +98,7 @@ void vdo_free_priority_table(struct priority_table *table) */ vdo_reset_priority_table(table); - uds_free(table); + vdo_free(table); } /** diff --git a/drivers/md/dm-vdo/recovery-journal.c b/drivers/md/dm-vdo/recovery-journal.c index 1e15bfe42cfcf8..c0f3dea5d64b92 100644 --- a/drivers/md/dm-vdo/recovery-journal.c +++ b/drivers/md/dm-vdo/recovery-journal.c @@ -593,31 +593,31 @@ static int __must_check initialize_lock_counter(struct recovery_journal *journal struct thread_config *config = &vdo->thread_config; struct lock_counter *counter = &journal->lock_counter; - result = uds_allocate(journal->size, u16, __func__, &counter->journal_counters); + result = vdo_allocate(journal->size, u16, __func__, &counter->journal_counters); if (result != VDO_SUCCESS) return result; - result = uds_allocate(journal->size, atomic_t, __func__, + result = vdo_allocate(journal->size, atomic_t, __func__, &counter->journal_decrement_counts); if (result != VDO_SUCCESS) return result; - result = uds_allocate(journal->size * config->logical_zone_count, u16, __func__, + result = vdo_allocate(journal->size * config->logical_zone_count, u16, __func__, &counter->logical_counters); if (result != VDO_SUCCESS) return result; - result = uds_allocate(journal->size, atomic_t, __func__, + result = vdo_allocate(journal->size, atomic_t, __func__, &counter->logical_zone_counts); if (result != VDO_SUCCESS) return result; - result = uds_allocate(journal->size * config->physical_zone_count, u16, __func__, + result = vdo_allocate(journal->size * config->physical_zone_count, u16, __func__, &counter->physical_counters); if (result != VDO_SUCCESS) return result; - result = uds_allocate(journal->size, atomic_t, __func__, + result = vdo_allocate(journal->size, atomic_t, __func__, &counter->physical_zone_counts); if (result != VDO_SUCCESS) return result; @@ -672,14 +672,14 @@ static int initialize_recovery_block(struct vdo *vdo, struct recovery_journal *j * Allocate a full block for the journal block even though not all of the space is used * since the VIO needs to write a full disk block. 
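Two things are worth noting in the hunk that follows: the journal block's buffer is a full VDO_BLOCK_SIZE allocation even though not all of it is used, because the device only writes whole blocks, and the error path frees that buffer when the subsequent vio setup fails. A sketch of that allocate-then-unwind shape with hypothetical helpers:

#include <stdlib.h>

#define BLOCK_SIZE 4096

struct block { char *data; };

/* Hypothetical second initialization step that may fail. */
static int init_vio(struct block *block, char *data)
{
	(void) block;
	return data ? 0 : -1;
}

static int init_block(struct block *block)
{
	/* A full block even if partly used: the device writes whole blocks. */
	char *data = calloc(1, BLOCK_SIZE);

	if (data == NULL)
		return -1;

	if (init_vio(block, data) != 0) {
		free(data);	/* unwind the step that already succeeded */
		return -1;
	}

	block->data = data;
	return 0;
}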
*/ - result = uds_allocate(VDO_BLOCK_SIZE, char, __func__, &data); + result = vdo_allocate(VDO_BLOCK_SIZE, char, __func__, &data); if (result != VDO_SUCCESS) return result; result = allocate_vio_components(vdo, VIO_TYPE_RECOVERY_JOURNAL, VIO_PRIORITY_HIGH, block, 1, data, &block->vio); if (result != VDO_SUCCESS) { - uds_free(data); + vdo_free(data); return result; } @@ -711,7 +711,7 @@ int vdo_decode_recovery_journal(struct recovery_journal_state_7_0 state, nonce_t struct recovery_journal *journal; int result; - result = uds_allocate_extended(struct recovery_journal, + result = vdo_allocate_extended(struct recovery_journal, RECOVERY_JOURNAL_RESERVED_BLOCKS, struct recovery_journal_block, __func__, &journal); @@ -789,13 +789,13 @@ void vdo_free_recovery_journal(struct recovery_journal *journal) if (journal == NULL) return; - uds_free(uds_forget(journal->lock_counter.logical_zone_counts)); - uds_free(uds_forget(journal->lock_counter.physical_zone_counts)); - uds_free(uds_forget(journal->lock_counter.journal_counters)); - uds_free(uds_forget(journal->lock_counter.journal_decrement_counts)); - uds_free(uds_forget(journal->lock_counter.logical_counters)); - uds_free(uds_forget(journal->lock_counter.physical_counters)); - free_vio(uds_forget(journal->flush_vio)); + vdo_free(vdo_forget(journal->lock_counter.logical_zone_counts)); + vdo_free(vdo_forget(journal->lock_counter.physical_zone_counts)); + vdo_free(vdo_forget(journal->lock_counter.journal_counters)); + vdo_free(vdo_forget(journal->lock_counter.journal_decrement_counts)); + vdo_free(vdo_forget(journal->lock_counter.logical_counters)); + vdo_free(vdo_forget(journal->lock_counter.physical_counters)); + free_vio(vdo_forget(journal->flush_vio)); /* * FIXME: eventually, the journal should be constructed in a quiescent state which @@ -812,11 +812,11 @@ void vdo_free_recovery_journal(struct recovery_journal *journal) for (i = 0; i < RECOVERY_JOURNAL_RESERVED_BLOCKS; i++) { struct recovery_journal_block *block = &journal->blocks[i]; - uds_free(uds_forget(block->vio.data)); + vdo_free(vdo_forget(block->vio.data)); free_vio_components(&block->vio); } - uds_free(journal); + vdo_free(journal); } /** diff --git a/drivers/md/dm-vdo/repair.c b/drivers/md/dm-vdo/repair.c index a75278eb8aa4ae..83322afa454bbe 100644 --- a/drivers/md/dm-vdo/repair.c +++ b/drivers/md/dm-vdo/repair.c @@ -226,7 +226,7 @@ static void uninitialize_vios(struct repair_completion *repair) while (repair->vio_count > 0) free_vio_components(&repair->vios[--repair->vio_count]); - uds_free(uds_forget(repair->vios)); + vdo_free(vdo_forget(repair->vios)); } static void free_repair_completion(struct repair_completion *repair) @@ -241,9 +241,9 @@ static void free_repair_completion(struct repair_completion *repair) repair->completion.vdo->block_map->zones[0].page_cache.rebuilding = false; uninitialize_vios(repair); - uds_free(uds_forget(repair->journal_data)); - uds_free(uds_forget(repair->entries)); - uds_free(repair); + vdo_free(vdo_forget(repair->journal_data)); + vdo_free(vdo_forget(repair->entries)); + vdo_free(repair); } static void finish_repair(struct vdo_completion *completion) @@ -262,7 +262,7 @@ static void finish_repair(struct vdo_completion *completion) repair->highest_tail, repair->logical_blocks_used, repair->block_map_data_blocks); - free_repair_completion(uds_forget(repair)); + free_repair_completion(vdo_forget(repair)); if (vdo_state_requires_read_only_rebuild(vdo->load_state)) { uds_log_info("Read-only rebuild complete"); @@ -295,7 +295,7 @@ static void abort_repair(struct 
vdo_completion *completion) else uds_log_warning("Recovery aborted"); - free_repair_completion(uds_forget(repair)); + free_repair_completion(vdo_forget(repair)); vdo_continue_completion(parent, result); } @@ -1108,7 +1108,7 @@ static void recover_block_map(struct vdo_completion *completion) if (repair->block_map_entry_count == 0) { uds_log_info("Replaying 0 recovery entries into block map"); - uds_free(uds_forget(repair->journal_data)); + vdo_free(vdo_forget(repair->journal_data)); launch_repair_completion(repair, load_slab_depot, VDO_ZONE_TYPE_ADMIN); return; } @@ -1418,7 +1418,7 @@ static int parse_journal_for_rebuild(struct repair_completion *repair) * packed_recovery_journal_entry from every valid journal block. */ count = ((repair->highest_tail - repair->block_map_head + 1) * entries_per_block); - result = uds_allocate(count, struct numbered_block_mapping, __func__, + result = vdo_allocate(count, struct numbered_block_mapping, __func__, &repair->entries); if (result != VDO_SUCCESS) return result; @@ -1464,7 +1464,7 @@ static int extract_new_mappings(struct repair_completion *repair) * Allocate an array of numbered_block_mapping structs just large enough to transcribe * every packed_recovery_journal_entry from every valid journal block. */ - result = uds_allocate(repair->entry_count, struct numbered_block_mapping, + result = vdo_allocate(repair->entry_count, struct numbered_block_mapping, __func__, &repair->entries); if (result != VDO_SUCCESS) return result; @@ -1709,7 +1709,7 @@ void vdo_repair(struct vdo_completion *parent) uds_log_warning("Device was dirty, rebuilding reference counts"); } - result = uds_allocate_extended(struct repair_completion, page_count, + result = vdo_allocate_extended(struct repair_completion, page_count, struct vdo_page_completion, __func__, &repair); if (result != VDO_SUCCESS) { @@ -1723,12 +1723,12 @@ void vdo_repair(struct vdo_completion *parent) prepare_repair_completion(repair, finish_repair, VDO_ZONE_TYPE_ADMIN); repair->page_count = page_count; - result = uds_allocate(remaining * VDO_BLOCK_SIZE, char, __func__, + result = vdo_allocate(remaining * VDO_BLOCK_SIZE, char, __func__, &repair->journal_data); if (abort_on_error(result, repair)) return; - result = uds_allocate(vio_count, struct vio, __func__, &repair->vios); + result = vdo_allocate(vio_count, struct vio, __func__, &repair->vios); if (abort_on_error(result, repair)) return; diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index 2f4a2ae5e0823e..eaf2930752090c 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -411,7 +411,7 @@ static void complete_reaping(struct vdo_completion *completion) struct slab_journal *journal = completion->parent; return_vio_to_pool(journal->slab->allocator->vio_pool, - vio_as_pooled_vio(as_vio(uds_forget(completion)))); + vio_as_pooled_vio(as_vio(vdo_forget(completion)))); finish_reaping(journal); reap_slab_journal(journal); } @@ -694,7 +694,7 @@ static void complete_write(struct vdo_completion *completion) sequence_number_t committed = get_committing_sequence_number(pooled); list_del_init(&pooled->list_entry); - return_vio_to_pool(journal->slab->allocator->vio_pool, uds_forget(pooled)); + return_vio_to_pool(journal->slab->allocator->vio_pool, vdo_forget(pooled)); if (result != VDO_SUCCESS) { vio_record_metadata_io_error(as_vio(completion)); @@ -772,7 +772,7 @@ static void write_slab_journal_block(struct vdo_waiter *waiter, void *context) * This block won't be read in recovery until the slab summary is updated 
to refer to it. * The slab summary update does a flush which is sufficient to protect us from VDO-2331. */ - vdo_submit_metadata_vio(uds_forget(vio), block_number, write_slab_journal_endio, + vdo_submit_metadata_vio(vdo_forget(vio), block_number, write_slab_journal_endio, complete_write, REQ_OP_WRITE); /* Since the write is submitted, the tail block structure can be reused. */ @@ -2361,7 +2361,7 @@ static int allocate_slab_counters(struct vdo_slab *slab) if (result != VDO_SUCCESS) return result; - result = uds_allocate(slab->reference_block_count, struct reference_block, + result = vdo_allocate(slab->reference_block_count, struct reference_block, __func__, &slab->reference_blocks); if (result != VDO_SUCCESS) return result; @@ -2371,10 +2371,10 @@ static int allocate_slab_counters(struct vdo_slab *slab) * so we can word-search even at the very end. */ bytes = (slab->reference_block_count * COUNTS_PER_BLOCK) + (2 * BYTES_PER_WORD); - result = uds_allocate(bytes, vdo_refcount_t, "ref counts array", + result = vdo_allocate(bytes, vdo_refcount_t, "ref counts array", &slab->counters); if (result != UDS_SUCCESS) { - uds_free(uds_forget(slab->reference_blocks)); + vdo_free(vdo_forget(slab->reference_blocks)); return result; } @@ -2652,7 +2652,7 @@ static inline bool __must_check has_slabs_to_scrub(struct slab_scrubber *scrubbe */ static void uninitialize_scrubber_vio(struct slab_scrubber *scrubber) { - uds_free(uds_forget(scrubber->vio.data)); + vdo_free(vdo_forget(scrubber->vio.data)); free_vio_components(&scrubber->vio); } @@ -2673,7 +2673,7 @@ static void finish_scrubbing(struct slab_scrubber *scrubber, int result) if (scrubber->high_priority_only) { scrubber->high_priority_only = false; - vdo_fail_completion(uds_forget(scrubber->vio.completion.parent), result); + vdo_fail_completion(vdo_forget(scrubber->vio.completion.parent), result); } else if (done && (atomic_add_return(-1, &allocator->depot->zones_to_scrub) == 0)) { /* All of our slabs were scrubbed, and we're the last allocator to finish. 
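finish_scrubbing() above uses atomic_add_return(-1, &allocator->depot->zones_to_scrub) == 0 to let exactly one zone, the last to finish, perform the depot-wide state transition. The C11 equivalent of that last-one-out test:

#include <stdatomic.h>
#include <stdbool.h>

/*
 * Decrement the shared counter; only the caller that drives it to zero
 * gets true and may run the once-only completion step.
 */
static bool last_one_out(atomic_int *remaining)
{
	return atomic_fetch_sub(remaining, 1) == 1;
}

Initialized to the zone count, each zone calls this when it finishes; the decrement and the zero test happen in one atomic operation, so no two zones can both believe they were last.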
*/ enum vdo_state prior_state = @@ -3376,7 +3376,7 @@ static void finish_loading_allocator(struct vdo_completion *completion) vdo_get_admin_state_code(&allocator->state); if (allocator->eraser != NULL) - dm_kcopyd_client_destroy(uds_forget(allocator->eraser)); + dm_kcopyd_client_destroy(vdo_forget(allocator->eraser)); if (operation == VDO_ADMIN_STATE_LOADING_FOR_RECOVERY) { void *context = @@ -3479,7 +3479,7 @@ static int get_slab_statuses(struct block_allocator *allocator, struct slab_status *statuses; struct slab_iterator iterator = get_slab_iterator(allocator); - result = uds_allocate(allocator->slab_count, struct slab_status, __func__, + result = vdo_allocate(allocator->slab_count, struct slab_status, __func__, &statuses); if (result != VDO_SUCCESS) return result; @@ -3546,7 +3546,7 @@ static int __must_check vdo_prepare_slabs_for_allocation(struct block_allocator register_slab_for_scrubbing(slab, high_priority); } - uds_free(slab_statuses); + vdo_free(slab_statuses); return VDO_SUCCESS; } @@ -3642,11 +3642,11 @@ static void free_slab(struct vdo_slab *slab) return; list_del(&slab->allocq_entry); - uds_free(uds_forget(slab->journal.block)); - uds_free(uds_forget(slab->journal.locks)); - uds_free(uds_forget(slab->counters)); - uds_free(uds_forget(slab->reference_blocks)); - uds_free(slab); + vdo_free(vdo_forget(slab->journal.block)); + vdo_free(vdo_forget(slab->journal.locks)); + vdo_free(vdo_forget(slab->counters)); + vdo_free(vdo_forget(slab->reference_blocks)); + vdo_free(slab); } static int initialize_slab_journal(struct vdo_slab *slab) @@ -3655,12 +3655,12 @@ static int initialize_slab_journal(struct vdo_slab *slab) const struct slab_config *slab_config = &slab->allocator->depot->slab_config; int result; - result = uds_allocate(slab_config->slab_journal_blocks, struct journal_lock, + result = vdo_allocate(slab_config->slab_journal_blocks, struct journal_lock, __func__, &journal->locks); if (result != VDO_SUCCESS) return result; - result = uds_allocate(VDO_BLOCK_SIZE, char, "struct packed_slab_journal_block", + result = vdo_allocate(VDO_BLOCK_SIZE, char, "struct packed_slab_journal_block", (char **) &journal->block); if (result != VDO_SUCCESS) return result; @@ -3716,7 +3716,7 @@ static int __must_check make_slab(physical_block_number_t slab_origin, struct vdo_slab *slab; int result; - result = uds_allocate(1, struct vdo_slab, __func__, &slab); + result = vdo_allocate(1, struct vdo_slab, __func__, &slab); if (result != VDO_SUCCESS) return result; @@ -3773,7 +3773,7 @@ static int allocate_slabs(struct slab_depot *depot, slab_count_t slab_count) physical_block_number_t slab_origin; int result; - result = uds_allocate(slab_count, struct vdo_slab *, + result = vdo_allocate(slab_count, struct vdo_slab *, "slab pointer array", &depot->new_slabs); if (result != VDO_SUCCESS) return result; @@ -3815,10 +3815,10 @@ void vdo_abandon_new_slabs(struct slab_depot *depot) return; for (i = depot->slab_count; i < depot->new_slab_count; i++) - free_slab(uds_forget(depot->new_slabs[i])); + free_slab(vdo_forget(depot->new_slabs[i])); depot->new_slab_count = 0; depot->new_size = 0; - uds_free(uds_forget(depot->new_slabs)); + vdo_free(vdo_forget(depot->new_slabs)); } /** @@ -3928,7 +3928,7 @@ static int initialize_slab_scrubber(struct block_allocator *allocator) char *journal_data; int result; - result = uds_allocate(VDO_BLOCK_SIZE * slab_journal_size, + result = vdo_allocate(VDO_BLOCK_SIZE * slab_journal_size, char, __func__, &journal_data); if (result != VDO_SUCCESS) return result; @@ -3939,7 +3939,7 @@ 
static int initialize_slab_scrubber(struct block_allocator *allocator) allocator, slab_journal_size, journal_data, &scrubber->vio); if (result != VDO_SUCCESS) { - uds_free(journal_data); + vdo_free(journal_data); return result; } @@ -3962,7 +3962,7 @@ static int __must_check initialize_slab_summary_block(struct block_allocator *al struct slab_summary_block *block = &allocator->summary_blocks[index]; int result; - result = uds_allocate(VDO_BLOCK_SIZE, char, __func__, &block->outgoing_entries); + result = vdo_allocate(VDO_BLOCK_SIZE, char, __func__, &block->outgoing_entries); if (result != VDO_SUCCESS) return result; @@ -4018,7 +4018,7 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot, if (result != VDO_SUCCESS) return result; - result = uds_allocate(VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE, + result = vdo_allocate(VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE, struct slab_summary_block, __func__, &allocator->summary_blocks); if (result != VDO_SUCCESS) @@ -4078,7 +4078,7 @@ static int allocate_components(struct slab_depot *depot, depot->summary_origin = summary_partition->offset; depot->hint_shift = vdo_get_slab_summary_hint_shift(depot->slab_size_shift); - result = uds_allocate(MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES, + result = vdo_allocate(MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES, struct slab_summary_entry, __func__, &depot->summary_entries); if (result != VDO_SUCCESS) @@ -4166,7 +4166,7 @@ int vdo_decode_slab_depot(struct slab_depot_state_2_0 state, struct vdo *vdo, } slab_size_shift = ilog2(slab_size); - result = uds_allocate_extended(struct slab_depot, + result = vdo_allocate_extended(struct slab_depot, vdo->thread_config.physical_zone_count, struct block_allocator, __func__, &depot); if (result != VDO_SUCCESS) @@ -4199,10 +4199,10 @@ static void uninitialize_allocator_summary(struct block_allocator *allocator) for (i = 0; i < VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE; i++) { free_vio_components(&allocator->summary_blocks[i].vio); - uds_free(uds_forget(allocator->summary_blocks[i].outgoing_entries)); + vdo_free(vdo_forget(allocator->summary_blocks[i].outgoing_entries)); } - uds_free(uds_forget(allocator->summary_blocks)); + vdo_free(vdo_forget(allocator->summary_blocks)); } /** @@ -4222,25 +4222,25 @@ void vdo_free_slab_depot(struct slab_depot *depot) struct block_allocator *allocator = &depot->allocators[zone]; if (allocator->eraser != NULL) - dm_kcopyd_client_destroy(uds_forget(allocator->eraser)); + dm_kcopyd_client_destroy(vdo_forget(allocator->eraser)); uninitialize_allocator_summary(allocator); uninitialize_scrubber_vio(&allocator->scrubber); - free_vio_pool(uds_forget(allocator->vio_pool)); - vdo_free_priority_table(uds_forget(allocator->prioritized_slabs)); + free_vio_pool(vdo_forget(allocator->vio_pool)); + vdo_free_priority_table(vdo_forget(allocator->prioritized_slabs)); } if (depot->slabs != NULL) { slab_count_t i; for (i = 0; i < depot->slab_count; i++) - free_slab(uds_forget(depot->slabs[i])); + free_slab(vdo_forget(depot->slabs[i])); } - uds_free(uds_forget(depot->slabs)); - uds_free(uds_forget(depot->action_manager)); - uds_free(uds_forget(depot->summary_entries)); - uds_free(depot); + vdo_free(vdo_forget(depot->slabs)); + vdo_free(vdo_forget(depot->action_manager)); + vdo_free(vdo_forget(depot->summary_entries)); + vdo_free(depot); } /** @@ -4441,7 +4441,7 @@ static void finish_combining_zones(struct vdo_completion *completion) int result = completion->result; struct vdo_completion *parent = completion->parent; - free_vio(as_vio(uds_forget(completion))); + 
free_vio(as_vio(vdo_forget(completion))); vdo_fail_completion(parent, result); } @@ -4702,7 +4702,7 @@ static int finish_registration(void *context) struct slab_depot *depot = context; WRITE_ONCE(depot->slab_count, depot->new_slab_count); - uds_free(depot->slabs); + vdo_free(depot->slabs); depot->slabs = depot->new_slabs; depot->new_slabs = NULL; depot->new_slab_count = 0; diff --git a/drivers/md/dm-vdo/slab-depot.h b/drivers/md/dm-vdo/slab-depot.h index fba293f9713e38..f234853501cabd 100644 --- a/drivers/md/dm-vdo/slab-depot.h +++ b/drivers/md/dm-vdo/slab-depot.h @@ -241,7 +241,7 @@ struct vdo_slab { /* The number of free blocks */ u32 free_blocks; /* The array of reference counts */ - vdo_refcount_t *counters; /* use uds_allocate() to align data ptr */ + vdo_refcount_t *counters; /* use vdo_allocate() to align data ptr */ /* The saved block pointer and array indexes for the free block search */ struct search_cursor search_cursor; diff --git a/drivers/md/dm-vdo/thread-utils.c b/drivers/md/dm-vdo/thread-utils.c index aeca14bba8529b..f328307a5f1d26 100644 --- a/drivers/md/dm-vdo/thread-utils.c +++ b/drivers/md/dm-vdo/thread-utils.c @@ -68,9 +68,9 @@ static int thread_starter(void *arg) mutex_lock(&thread_mutex); hlist_add_head(&thread->thread_links, &thread_list); mutex_unlock(&thread_mutex); - uds_register_allocating_thread(&allocating_thread, NULL); + vdo_register_allocating_thread(&allocating_thread, NULL); thread->thread_function(thread->thread_data); - uds_unregister_allocating_thread(); + vdo_unregister_allocating_thread(); complete(&thread->thread_done); return 0; } @@ -84,7 +84,7 @@ int vdo_create_thread(void (*thread_function)(void *), void *thread_data, struct thread *thread; int result; - result = uds_allocate(1, struct thread, __func__, &thread); + result = vdo_allocate(1, struct thread, __func__, &thread); if (result != UDS_SUCCESS) { uds_log_warning("Error allocating memory for %s", name); return result; @@ -116,7 +116,7 @@ int vdo_create_thread(void (*thread_function)(void *), void *thread_data, } if (IS_ERR(task)) { - uds_free(thread); + vdo_free(thread); return PTR_ERR(task); } @@ -132,5 +132,5 @@ void vdo_join_threads(struct thread *thread) mutex_lock(&thread_mutex); hlist_del(&thread->thread_links); mutex_unlock(&thread_mutex); - uds_free(thread); + vdo_free(thread); } diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c index 0fe32dfcf3dd05..f9b8edf1292d73 100644 --- a/drivers/md/dm-vdo/vdo.c +++ b/drivers/md/dm-vdo/vdo.c @@ -136,13 +136,13 @@ static void start_vdo_request_queue(void *ptr) { struct vdo_thread *thread = vdo_get_work_queue_owner(vdo_get_current_work_queue()); - uds_register_allocating_thread(&thread->allocating_thread, + vdo_register_allocating_thread(&thread->allocating_thread, &thread->vdo->allocations_allowed); } static void finish_vdo_request_queue(void *ptr) { - uds_unregister_allocating_thread(); + vdo_unregister_allocating_thread(); } #ifdef MODULE @@ -174,10 +174,10 @@ static const struct vdo_work_queue_type cpu_q_type = { static void uninitialize_thread_config(struct thread_config *config) { - uds_free(uds_forget(config->logical_threads)); - uds_free(uds_forget(config->physical_threads)); - uds_free(uds_forget(config->hash_zone_threads)); - uds_free(uds_forget(config->bio_threads)); + vdo_free(vdo_forget(config->logical_threads)); + vdo_free(vdo_forget(config->physical_threads)); + vdo_free(vdo_forget(config->hash_zone_threads)); + vdo_free(vdo_forget(config->bio_threads)); memset(config, 0, sizeof(struct thread_config)); } @@ -216,28 
+216,28 @@ static int __must_check initialize_thread_config(struct thread_count_config coun config->hash_zone_count = counts.hash_zones; } - result = uds_allocate(config->logical_zone_count, thread_id_t, + result = vdo_allocate(config->logical_zone_count, thread_id_t, "logical thread array", &config->logical_threads); if (result != VDO_SUCCESS) { uninitialize_thread_config(config); return result; } - result = uds_allocate(config->physical_zone_count, thread_id_t, + result = vdo_allocate(config->physical_zone_count, thread_id_t, "physical thread array", &config->physical_threads); if (result != VDO_SUCCESS) { uninitialize_thread_config(config); return result; } - result = uds_allocate(config->hash_zone_count, thread_id_t, + result = vdo_allocate(config->hash_zone_count, thread_id_t, "hash thread array", &config->hash_zone_threads); if (result != VDO_SUCCESS) { uninitialize_thread_config(config); return result; } - result = uds_allocate(config->bio_thread_count, thread_id_t, + result = vdo_allocate(config->bio_thread_count, thread_id_t, "bio thread array", &config->bio_threads); if (result != VDO_SUCCESS) { uninitialize_thread_config(config); @@ -278,14 +278,14 @@ static int __must_check read_geometry_block(struct vdo *vdo) char *block; int result; - result = uds_allocate(VDO_BLOCK_SIZE, u8, __func__, &block); + result = vdo_allocate(VDO_BLOCK_SIZE, u8, __func__, &block); if (result != VDO_SUCCESS) return result; result = create_metadata_vio(vdo, VIO_TYPE_GEOMETRY, VIO_PRIORITY_HIGH, NULL, block, &vio); if (result != VDO_SUCCESS) { - uds_free(block); + vdo_free(block); return result; } @@ -297,23 +297,23 @@ static int __must_check read_geometry_block(struct vdo *vdo) result = vio_reset_bio(vio, block, NULL, REQ_OP_READ, VDO_GEOMETRY_BLOCK_LOCATION); if (result != VDO_SUCCESS) { - free_vio(uds_forget(vio)); - uds_free(block); + free_vio(vdo_forget(vio)); + vdo_free(block); return result; } bio_set_dev(vio->bio, vdo_get_backing_device(vdo)); submit_bio_wait(vio->bio); result = blk_status_to_errno(vio->bio->bi_status); - free_vio(uds_forget(vio)); + free_vio(vdo_forget(vio)); if (result != 0) { uds_log_error_strerror(result, "synchronous read failed"); - uds_free(block); + vdo_free(block); return -EIO; } result = vdo_parse_geometry_block((u8 *) block, &vdo->geometry); - uds_free(block); + vdo_free(block); return result; } @@ -502,7 +502,7 @@ static int initialize_vdo(struct vdo *vdo, struct device_config *config, config->thread_counts.hash_zones, vdo->thread_config.thread_count); /* Compression context storage */ - result = uds_allocate(config->thread_counts.cpu_threads, char *, "LZ4 context", + result = vdo_allocate(config->thread_counts.cpu_threads, char *, "LZ4 context", &vdo->compression_context); if (result != VDO_SUCCESS) { *reason = "cannot allocate LZ4 context"; @@ -510,7 +510,7 @@ static int initialize_vdo(struct vdo *vdo, struct device_config *config, } for (i = 0; i < config->thread_counts.cpu_threads; i++) { - result = uds_allocate(LZ4_MEM_COMPRESS, char, "LZ4 context", + result = vdo_allocate(LZ4_MEM_COMPRESS, char, "LZ4 context", &vdo->compression_context[i]); if (result != VDO_SUCCESS) { *reason = "cannot allocate LZ4 context"; @@ -546,7 +546,7 @@ int vdo_make(unsigned int instance, struct device_config *config, char **reason, /* VDO-3769 - Set a generic reason so we don't ever return garbage. 
*/ *reason = "Unspecified error"; - result = uds_allocate(1, struct vdo, __func__, &vdo); + result = vdo_allocate(1, struct vdo, __func__, &vdo); if (result != UDS_SUCCESS) { *reason = "Cannot allocate VDO"; return result; @@ -564,7 +564,7 @@ int vdo_make(unsigned int instance, struct device_config *config, char **reason, snprintf(vdo->thread_name_prefix, sizeof(vdo->thread_name_prefix), "%s%u", MODULE_NAME, instance); BUG_ON(vdo->thread_name_prefix[0] == '\0'); - result = uds_allocate(vdo->thread_config.thread_count, + result = vdo_allocate(vdo->thread_config.thread_count, struct vdo_thread, __func__, &vdo->threads); if (result != VDO_SUCCESS) { *reason = "Cannot allocate thread structures"; @@ -652,16 +652,16 @@ static void free_listeners(struct vdo_thread *thread) { struct read_only_listener *listener, *next; - for (listener = uds_forget(thread->listeners); listener != NULL; listener = next) { - next = uds_forget(listener->next); - uds_free(listener); + for (listener = vdo_forget(thread->listeners); listener != NULL; listener = next) { + next = vdo_forget(listener->next); + vdo_free(listener); } } static void uninitialize_super_block(struct vdo_super_block *super_block) { free_vio_components(&super_block->vio); - uds_free(super_block->buffer); + vdo_free(super_block->buffer); } /** @@ -696,36 +696,36 @@ void vdo_destroy(struct vdo *vdo) finish_vdo(vdo); unregister_vdo(vdo); free_data_vio_pool(vdo->data_vio_pool); - vdo_free_io_submitter(uds_forget(vdo->io_submitter)); - vdo_free_flusher(uds_forget(vdo->flusher)); - vdo_free_packer(uds_forget(vdo->packer)); - vdo_free_recovery_journal(uds_forget(vdo->recovery_journal)); - vdo_free_slab_depot(uds_forget(vdo->depot)); + vdo_free_io_submitter(vdo_forget(vdo->io_submitter)); + vdo_free_flusher(vdo_forget(vdo->flusher)); + vdo_free_packer(vdo_forget(vdo->packer)); + vdo_free_recovery_journal(vdo_forget(vdo->recovery_journal)); + vdo_free_slab_depot(vdo_forget(vdo->depot)); vdo_uninitialize_layout(&vdo->layout); vdo_uninitialize_layout(&vdo->next_layout); if (vdo->partition_copier) - dm_kcopyd_client_destroy(uds_forget(vdo->partition_copier)); + dm_kcopyd_client_destroy(vdo_forget(vdo->partition_copier)); uninitialize_super_block(&vdo->super_block); - vdo_free_block_map(uds_forget(vdo->block_map)); - vdo_free_hash_zones(uds_forget(vdo->hash_zones)); - vdo_free_physical_zones(uds_forget(vdo->physical_zones)); - vdo_free_logical_zones(uds_forget(vdo->logical_zones)); + vdo_free_block_map(vdo_forget(vdo->block_map)); + vdo_free_hash_zones(vdo_forget(vdo->hash_zones)); + vdo_free_physical_zones(vdo_forget(vdo->physical_zones)); + vdo_free_logical_zones(vdo_forget(vdo->logical_zones)); if (vdo->threads != NULL) { for (i = 0; i < vdo->thread_config.thread_count; i++) { free_listeners(&vdo->threads[i]); - vdo_free_work_queue(uds_forget(vdo->threads[i].queue)); + vdo_free_work_queue(vdo_forget(vdo->threads[i].queue)); } - uds_free(uds_forget(vdo->threads)); + vdo_free(vdo_forget(vdo->threads)); } uninitialize_thread_config(&vdo->thread_config); if (vdo->compression_context != NULL) { for (i = 0; i < vdo->device_config->thread_counts.cpu_threads; i++) - uds_free(uds_forget(vdo->compression_context[i])); + vdo_free(vdo_forget(vdo->compression_context[i])); - uds_free(uds_forget(vdo->compression_context)); + vdo_free(vdo_forget(vdo->compression_context)); } } @@ -733,7 +733,7 @@ static int initialize_super_block(struct vdo *vdo, struct vdo_super_block *super { int result; - result = uds_allocate(VDO_BLOCK_SIZE, char, "encoded super block", + result = 
vdo_allocate(VDO_BLOCK_SIZE, char, "encoded super block", (char **) &vdo->super_block.buffer); if (result != VDO_SUCCESS) return result; @@ -755,7 +755,7 @@ static void finish_reading_super_block(struct vdo_completion *completion) struct vdo_super_block *super_block = container_of(as_vio(completion), struct vdo_super_block, vio); - vdo_continue_completion(uds_forget(completion->parent), + vdo_continue_completion(vdo_forget(completion->parent), vdo_decode_super_block(super_block->buffer)); } @@ -915,7 +915,7 @@ static void record_vdo(struct vdo *vdo) */ static void continue_super_block_parent(struct vdo_completion *completion) { - vdo_continue_completion(uds_forget(completion->parent), completion->result); + vdo_continue_completion(vdo_forget(completion->parent), completion->result); } /** @@ -1005,7 +1005,7 @@ int vdo_register_read_only_listener(struct vdo *vdo, void *listener, if (result != VDO_SUCCESS) return result; - result = uds_allocate(1, struct read_only_listener, __func__, + result = vdo_allocate(1, struct read_only_listener, __func__, &read_only_listener); if (result != VDO_SUCCESS) return result; @@ -1134,7 +1134,7 @@ static void finish_entering_read_only_mode(struct vdo_completion *completion) spin_unlock(¬ifier->lock); if (notifier->waiter != NULL) - vdo_continue_completion(uds_forget(notifier->waiter), + vdo_continue_completion(vdo_forget(notifier->waiter), completion->result); } @@ -1571,7 +1571,7 @@ static void get_vdo_statistics(const struct vdo *vdo, struct vdo_statistics *sta copy_bio_stat(&stats->bios_acknowledged_partial, &vdo->stats.bios_acknowledged_partial); stats->bios_in_progress = subtract_bio_stats(stats->bios_in, stats->bios_acknowledged); - uds_get_memory_stats(&stats->memory_usage.bytes_used, + vdo_get_memory_stats(&stats->memory_usage.bytes_used, &stats->memory_usage.peak_bytes_used); } diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index eb6838ddabbb83..f4441f9ff77235 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -52,7 +52,7 @@ static int create_multi_block_bio(block_count_t size, struct bio **bio_ptr) struct bio *bio = NULL; int result; - result = uds_allocate_extended(struct bio, size + 1, struct bio_vec, + result = vdo_allocate_extended(struct bio, size + 1, struct bio_vec, "bio", &bio); if (result != VDO_SUCCESS) return result; @@ -72,7 +72,7 @@ void vdo_free_bio(struct bio *bio) return; bio_uninit(bio); - uds_free(uds_forget(bio)); + vdo_free(vdo_forget(bio)); } int allocate_vio_components(struct vdo *vdo, enum vio_type vio_type, @@ -130,7 +130,7 @@ int create_multi_block_metadata_vio(struct vdo *vdo, enum vio_type vio_type, * Metadata vios should use direct allocation and not use the buffer pool, which is * reserved for submissions from the linux block layer. 
*/ - result = uds_allocate(1, struct vio, __func__, &vio); + result = vdo_allocate(1, struct vio, __func__, &vio); if (result != VDO_SUCCESS) { uds_log_error("metadata vio allocation failure %d", result); return result; } @@ -139,7 +139,7 @@ int create_multi_block_metadata_vio(struct vdo *vdo, enum vio_type vio_type, result = allocate_vio_components(vdo, vio_type, priority, parent, block_count, data, vio); if (result != VDO_SUCCESS) { - uds_free(vio); + vdo_free(vio); return result; } @@ -157,7 +157,7 @@ void free_vio_components(struct vio *vio) return; BUG_ON(is_data_vio(vio)); - vdo_free_bio(uds_forget(vio->bio)); + vdo_free_bio(vdo_forget(vio->bio)); } /** @@ -167,7 +167,7 @@ void free_vio_components(struct vio *vio) void free_vio(struct vio *vio) { free_vio_components(vio); - uds_free(vio); + vdo_free(vio); } /* Set bio properties for a VDO read or write. */ @@ -317,7 +317,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, char *ptr; int result; - result = uds_allocate_extended(struct vio_pool, pool_size, struct pooled_vio, + result = vdo_allocate_extended(struct vio_pool, pool_size, struct pooled_vio, __func__, &pool); if (result != VDO_SUCCESS) return result; @@ -326,7 +326,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, INIT_LIST_HEAD(&pool->available); INIT_LIST_HEAD(&pool->busy); - result = uds_allocate(pool_size * VDO_BLOCK_SIZE, char, + result = vdo_allocate(pool_size * VDO_BLOCK_SIZE, char, "VIO pool buffer", &pool->buffer); if (result != VDO_SUCCESS) { free_vio_pool(pool); @@ -381,8 +381,8 @@ void free_vio_pool(struct vio_pool *pool) ASSERT_LOG_ONLY(pool->size == 0, "VIO pool must not have missing entries when being freed"); - uds_free(uds_forget(pool->buffer)); - uds_free(pool); + vdo_free(vdo_forget(pool->buffer)); + vdo_free(pool); } /** From 4450012c39f81dee99b783785cf86318bf166a49 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 13 Feb 2024 11:17:00 -0600 Subject: [PATCH 0905/1406] dm vdo memory-alloc: rename vdo_do_allocation to __vdo_do_allocation __vdo_do_allocation shouldn't be used outside of memory-alloc.h, so add a hidden prefix. Also, tabify the vdo_allocate_extended macro.
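For illustration only, a sketch of the resulting convention (the local variables and the "thread IDs" tag below are hypothetical, not taken from this patch): callers keep going through the public macro, which expands to the now-hidden helper:

	thread_id_t *ids;
	int result = vdo_allocate(16, thread_id_t, "thread IDs", &ids);
	/*
	 * The macro expands to:
	 *   __vdo_do_allocation(16, sizeof(thread_id_t), 0,
	 *                       __alignof__(thread_id_t), "thread IDs", &ids);
	 * Only memory-alloc.h itself should name the double-underscore helper.
	 */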
Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/memory-alloc.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-vdo/memory-alloc.h b/drivers/md/dm-vdo/memory-alloc.h index 3f27dd722a2d2b..417a378882e16d 100644 --- a/drivers/md/dm-vdo/memory-alloc.h +++ b/drivers/md/dm-vdo/memory-alloc.h @@ -37,8 +37,8 @@ int __must_check vdo_allocate_memory(size_t size, size_t align, const char *what * * Return: UDS_SUCCESS or an error code */ -static inline int vdo_do_allocation(size_t count, size_t size, size_t extra, - size_t align, const char *what, void *ptr) +static inline int __vdo_do_allocation(size_t count, size_t size, size_t extra, + size_t align, const char *what, void *ptr) { size_t total_size = count * size + extra; @@ -68,7 +68,7 @@ static inline int vdo_do_allocation(size_t count, size_t size, size_t extra, * Return: UDS_SUCCESS or an error code */ #define vdo_allocate(COUNT, TYPE, WHAT, PTR) \ - vdo_do_allocation(COUNT, sizeof(TYPE), 0, __alignof__(TYPE), WHAT, PTR) + __vdo_do_allocation(COUNT, sizeof(TYPE), 0, __alignof__(TYPE), WHAT, PTR) /* * Allocate one object of an indicated type, followed by one or more elements of a second type, @@ -83,18 +83,18 @@ static inline int vdo_do_allocation(size_t count, size_t size, size_t extra, * * Return: UDS_SUCCESS or an error code */ -#define vdo_allocate_extended(TYPE1, COUNT, TYPE2, WHAT, PTR) \ - __extension__({ \ +#define vdo_allocate_extended(TYPE1, COUNT, TYPE2, WHAT, PTR) \ + __extension__({ \ int _result; \ - TYPE1 **_ptr = (PTR); \ - BUILD_BUG_ON(__alignof__(TYPE1) < __alignof__(TYPE2)); \ - _result = vdo_do_allocation(COUNT, \ - sizeof(TYPE2), \ - sizeof(TYPE1), \ - __alignof__(TYPE1), \ - WHAT, \ - _ptr); \ - _result; \ + TYPE1 **_ptr = (PTR); \ + BUILD_BUG_ON(__alignof__(TYPE1) < __alignof__(TYPE2)); \ + _result = __vdo_do_allocation(COUNT, \ + sizeof(TYPE2), \ + sizeof(TYPE1), \ + __alignof__(TYPE1), \ + WHAT, \ + _ptr); \ + _result; \ }) /* From 7a90afce84e4e3e20547fee0772d2338c0170f34 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 13 Feb 2024 11:37:05 -0600 Subject: [PATCH 0906/1406] dm vdo memory-alloc: return VDO_SUCCESS on success Return VDO_SUCCESS (rather than UDS_SUCCESS) from VDO's memory-alloc functions on success. Also rename UDS_BLOCK_SIZE to UDS_ERRORS_BLOCK_SIZE to avoid a conflict with UDS_BLOCK_SIZE in the indexer.
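A minimal sketch of the new calling contract (the buffer name and surrounding context are illustrative, not taken from this patch):

	char *block;
	int result = vdo_allocate(VDO_BLOCK_SIZE, char, "example block", &block);
	if (result != VDO_SUCCESS)
		/* Success is now always VDO_SUCCESS; a NULL output
		 * pointer yields -EINVAL, per this patch. */
		return result;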
Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/memory-alloc.c | 24 ++++++++++++------------ drivers/md/dm-vdo/memory-alloc.h | 8 ++++---- drivers/md/dm-vdo/permassert.h | 2 +- drivers/md/dm-vdo/status-codes.h | 4 ++-- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/md/dm-vdo/memory-alloc.c b/drivers/md/dm-vdo/memory-alloc.c index b0191001e1ee4a..f8b13b755e1556 100644 --- a/drivers/md/dm-vdo/memory-alloc.c +++ b/drivers/md/dm-vdo/memory-alloc.c @@ -194,7 +194,7 @@ static inline bool use_kmalloc(size_t size) * @what: What is being allocated (for error logging) * @ptr: A pointer to hold the allocated memory * - * Return: UDS_SUCCESS or an error code + * Return: VDO_SUCCESS or an error code */ int vdo_allocate_memory(size_t size, size_t align, const char *what, void *ptr) { @@ -216,12 +216,12 @@ int vdo_allocate_memory(size_t size, size_t align, const char *what, void *ptr) unsigned long start_time; void *p = NULL; - if (ptr == NULL) - return UDS_INVALID_ARGUMENT; + if (unlikely(ptr == NULL)) + return -EINVAL; if (size == 0) { *((void **) ptr) = NULL; - return UDS_SUCCESS; + return VDO_SUCCESS; } if (allocations_restricted) @@ -245,7 +245,7 @@ int vdo_allocate_memory(size_t size, size_t align, const char *what, void *ptr) } else { struct vmalloc_block_info *block; - if (vdo_allocate(1, struct vmalloc_block_info, __func__, &block) == UDS_SUCCESS) { + if (vdo_allocate(1, struct vmalloc_block_info, __func__, &block) == VDO_SUCCESS) { /* * It is possible for __vmalloc to fail to allocate memory because there * are no pages available (see VDO-3661). A short sleep may allow the page @@ -290,7 +290,7 @@ int vdo_allocate_memory(size_t size, size_t align, const char *what, void *ptr) } *((void **) ptr) = p; - return UDS_SUCCESS; + return VDO_SUCCESS; } /* @@ -335,7 +335,7 @@ void vdo_free(void *ptr) * @what: What is being allocated (for error logging) * @new_ptr: A pointer to hold the reallocated pointer * - * Return: UDS_SUCCESS or an error code + * Return: VDO_SUCCESS or an error code */ int vdo_reallocate_memory(void *ptr, size_t old_size, size_t size, const char *what, void *new_ptr) @@ -345,11 +345,11 @@ int vdo_reallocate_memory(void *ptr, size_t old_size, size_t size, const char *w if (size == 0) { vdo_free(ptr); *(void **) new_ptr = NULL; - return UDS_SUCCESS; + return VDO_SUCCESS; } result = vdo_allocate(size, char, what, new_ptr); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; if (ptr != NULL) { @@ -360,7 +360,7 @@ int vdo_reallocate_memory(void *ptr, size_t old_size, size_t size, const char *w vdo_free(ptr); } - return UDS_SUCCESS; + return VDO_SUCCESS; } int vdo_duplicate_string(const char *string, const char *what, char **new_string) @@ -369,12 +369,12 @@ int vdo_duplicate_string(const char *string, const char *what, char **new_string u8 *dup; result = vdo_allocate(strlen(string) + 1, u8, what, &dup); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; memcpy(dup, string, strlen(string) + 1); *new_string = dup; - return UDS_SUCCESS; + return VDO_SUCCESS; } void vdo_memory_init(void) diff --git a/drivers/md/dm-vdo/memory-alloc.h b/drivers/md/dm-vdo/memory-alloc.h index 417a378882e16d..9016d32022e6e5 100644 --- a/drivers/md/dm-vdo/memory-alloc.h +++ b/drivers/md/dm-vdo/memory-alloc.h @@ -35,7 +35,7 @@ int __must_check vdo_allocate_memory(size_t size, size_t align, const char *what * @what: What is being allocated (for error logging) * @ptr: A pointer to hold the allocated memory * - * Return: UDS_SUCCESS or an error 
code + * Return: VDO_SUCCESS or an error code */ static inline int __vdo_do_allocation(size_t count, size_t size, size_t extra, size_t align, const char *what, void *ptr) @@ -65,7 +65,7 @@ static inline int __vdo_do_allocation(size_t count, size_t size, size_t extra, * @WHAT: What is being allocated (for error logging) * @PTR: A pointer to hold the allocated memory * - * Return: UDS_SUCCESS or an error code + * Return: VDO_SUCCESS or an error code */ #define vdo_allocate(COUNT, TYPE, WHAT, PTR) \ __vdo_do_allocation(COUNT, sizeof(TYPE), 0, __alignof__(TYPE), WHAT, PTR) @@ -81,7 +81,7 @@ static inline int __vdo_do_allocation(size_t count, size_t size, size_t extra, * @WHAT: What is being allocated (for error logging) * @PTR: A pointer to hold the allocated memory * - * Return: UDS_SUCCESS or an error code + * Return: VDO_SUCCESS or an error code */ #define vdo_allocate_extended(TYPE1, COUNT, TYPE2, WHAT, PTR) \ __extension__({ \ @@ -105,7 +105,7 @@ static inline int __vdo_do_allocation(size_t count, size_t size, size_t extra, * @what: What is being allocated (for error logging) * @ptr: A pointer to hold the allocated memory * - * Return: UDS_SUCCESS or an error code + * Return: VDO_SUCCESS or an error code */ static inline int __must_check vdo_allocate_cache_aligned(size_t size, const char *what, void *ptr) { diff --git a/drivers/md/dm-vdo/permassert.h b/drivers/md/dm-vdo/permassert.h index ee978bc115eccd..8fb5f7d9c66fe5 100644 --- a/drivers/md/dm-vdo/permassert.h +++ b/drivers/md/dm-vdo/permassert.h @@ -8,7 +8,7 @@ #include -#include "errors.h" +#include "status-codes.h" /* Utilities for asserting that certain conditions are met */ diff --git a/drivers/md/dm-vdo/status-codes.h b/drivers/md/dm-vdo/status-codes.h index 5d1e8bbe54b466..389811ca6b33d6 100644 --- a/drivers/md/dm-vdo/status-codes.h +++ b/drivers/md/dm-vdo/status-codes.h @@ -9,9 +9,9 @@ #include "errors.h" enum { - UDS_BLOCK_SIZE = UDS_ERROR_CODE_BLOCK_END - UDS_ERROR_CODE_BASE, + UDS_ERRORS_BLOCK_SIZE = UDS_ERROR_CODE_BLOCK_END - UDS_ERROR_CODE_BASE, VDO_BLOCK_START = UDS_ERROR_CODE_BLOCK_END, - VDO_BLOCK_END = VDO_BLOCK_START + UDS_BLOCK_SIZE, + VDO_BLOCK_END = VDO_BLOCK_START + UDS_ERRORS_BLOCK_SIZE, }; /* VDO-specific status codes. */ From 86f9f4167258036ca897cd62ebd676b321ddb70b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 13 Feb 2024 12:06:53 -0600 Subject: [PATCH 0907/1406] dm vdo: check for VDO_SUCCESS return value from memory-alloc functions VDO_SUCCESS and UDS_SUCCESS were used interchangeably; update all callers of VDO's memory-alloc functions to consistently check for VDO_SUCCESS.
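In sketch form, the caller-side pattern this series converges on (struct widget is a hypothetical example type):

	struct widget *widget;
	int result = vdo_allocate(1, struct widget, __func__, &widget);
	if (result != VDO_SUCCESS)	/* previously compared against UDS_SUCCESS */
		return result;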
Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/block-map.c | 6 +++--- drivers/md/dm-vdo/data-vio.c | 2 +- drivers/md/dm-vdo/dm-vdo-target.c | 20 ++++++++++---------- drivers/md/dm-vdo/encodings.c | 2 +- drivers/md/dm-vdo/funnel-queue.c | 2 +- drivers/md/dm-vdo/funnel-requestqueue.c | 2 +- drivers/md/dm-vdo/funnel-workqueue.c | 6 +++--- drivers/md/dm-vdo/indexer/chapter-index.c | 2 +- drivers/md/dm-vdo/indexer/config.c | 2 +- drivers/md/dm-vdo/indexer/delta-index.c | 10 +++++----- drivers/md/dm-vdo/indexer/geometry.c | 2 +- drivers/md/dm-vdo/indexer/index-layout.c | 18 +++++++++--------- drivers/md/dm-vdo/indexer/index-page-map.c | 8 ++++---- drivers/md/dm-vdo/indexer/index-session.c | 2 +- drivers/md/dm-vdo/indexer/index.c | 12 ++++++------ drivers/md/dm-vdo/indexer/io-factory.c | 6 +++--- drivers/md/dm-vdo/indexer/open-chapter.c | 4 ++-- drivers/md/dm-vdo/indexer/radix-sort.c | 2 +- drivers/md/dm-vdo/indexer/sparse-cache.c | 8 ++++---- drivers/md/dm-vdo/indexer/volume-index.c | 6 +++--- drivers/md/dm-vdo/indexer/volume.c | 14 +++++++------- drivers/md/dm-vdo/int-map.c | 18 +++++++++--------- drivers/md/dm-vdo/io-submitter.c | 2 +- drivers/md/dm-vdo/slab-depot.c | 2 +- drivers/md/dm-vdo/thread-utils.c | 4 ++-- drivers/md/dm-vdo/vdo.c | 2 +- 26 files changed, 82 insertions(+), 82 deletions(-) diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c index 5012ddbb5b0ecc..ac226bdae7993f 100644 --- a/drivers/md/dm-vdo/block-map.c +++ b/drivers/md/dm-vdo/block-map.c @@ -225,11 +225,11 @@ static int __must_check allocate_cache_components(struct vdo_page_cache *cache) result = vdo_allocate(cache->page_count, struct page_info, "page infos", &cache->infos); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; result = vdo_allocate_memory(size, VDO_BLOCK_SIZE, "cache pages", &cache->pages); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; result = vdo_int_map_create(cache->page_count, &cache->page_map); @@ -2876,7 +2876,7 @@ int vdo_decode_block_map(struct block_map_state_2_0 state, block_count_t logical result = vdo_allocate_extended(struct block_map, vdo->thread_config.logical_zone_count, struct block_map_zone, __func__, &map); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; map->vdo = vdo; diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c index de3dd0afd93a4d..dcb0d1075239fe 100644 --- a/drivers/md/dm-vdo/data-vio.c +++ b/drivers/md/dm-vdo/data-vio.c @@ -849,7 +849,7 @@ int make_data_vio_pool(struct vdo *vdo, data_vio_count_t pool_size, result = vdo_allocate_extended(struct data_vio_pool, pool_size, struct data_vio, __func__, &pool); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; ASSERT_LOG_ONLY((discard_limit <= pool_size), diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c index bffa48fc71e6ff..d8769eb46f0a99 100644 --- a/drivers/md/dm-vdo/dm-vdo-target.c +++ b/drivers/md/dm-vdo/dm-vdo-target.c @@ -280,7 +280,7 @@ static int split_string(const char *string, char separator, char ***substring_ar result = vdo_allocate(substring_count + 1, char *, "string-splitting array", &substrings); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; for (s = string; *s != 0; s++) { @@ -289,7 +289,7 @@ static int split_string(const char *string, char separator, char ***substring_ar result = vdo_allocate(length + 1, char, "split string", &substrings[current_substring]); - if (result != UDS_SUCCESS) { + if (result != 
VDO_SUCCESS) { free_string_array(substrings); return result; } @@ -310,7 +310,7 @@ static int split_string(const char *string, char separator, char ***substring_ar result = vdo_allocate(length + 1, char, "split string", &substrings[current_substring]); - if (result != UDS_SUCCESS) { + if (result != VDO_SUCCESS) { free_string_array(substrings); return result; } @@ -1529,7 +1529,7 @@ static size_t get_bit_array_size(unsigned int bit_count) * Since the array is initially NULL, this also initializes the array the first time we allocate an * instance number. * - * Return: UDS_SUCCESS or an error code from the allocation + * Return: VDO_SUCCESS or an error code from the allocation */ static int grow_bit_array(void) { @@ -1542,19 +1542,19 @@ static int grow_bit_array(void) get_bit_array_size(instances.bit_count), get_bit_array_size(new_count), "instance number bit array", &new_words); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; instances.bit_count = new_count; instances.words = new_words; - return UDS_SUCCESS; + return VDO_SUCCESS; } /** * allocate_instance() - Allocate an instance number. * @instance_ptr: A point to hold the instance number * - * Return: UDS_SUCCESS or an error code + * Return: VDO_SUCCESS or an error code * * This function must be called while holding the instances lock. */ @@ -1566,7 +1566,7 @@ static int allocate_instance(unsigned int *instance_ptr) /* If there are no unallocated instances, grow the bit array. */ if (instances.count >= instances.bit_count) { result = grow_bit_array(); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; } @@ -1581,7 +1581,7 @@ static int allocate_instance(unsigned int *instance_ptr) instance = find_first_zero_bit(instances.words, instances.bit_count); result = ASSERT(instance < instances.bit_count, "impossibly, no zero bit found"); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; } @@ -1589,7 +1589,7 @@ static int allocate_instance(unsigned int *instance_ptr) instances.count++; instances.next = instance + 1; *instance_ptr = instance; - return UDS_SUCCESS; + return VDO_SUCCESS; } static int construct_new_vdo_registered(struct dm_target *ti, unsigned int argc, diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c index b08e3dc9af4107..9388f475ad44d2 100644 --- a/drivers/md/dm-vdo/encodings.c +++ b/drivers/md/dm-vdo/encodings.c @@ -802,7 +802,7 @@ static int allocate_partition(struct layout *layout, u8 id, int result; result = vdo_allocate(1, struct partition, __func__, &partition); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; partition->id = id; diff --git a/drivers/md/dm-vdo/funnel-queue.c b/drivers/md/dm-vdo/funnel-queue.c index 0474cfc7ba46d0..1dba6b776f915b 100644 --- a/drivers/md/dm-vdo/funnel-queue.c +++ b/drivers/md/dm-vdo/funnel-queue.c @@ -16,7 +16,7 @@ int uds_make_funnel_queue(struct funnel_queue **queue_ptr) struct funnel_queue *queue; result = vdo_allocate(1, struct funnel_queue, "funnel queue", &queue); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; /* diff --git a/drivers/md/dm-vdo/funnel-requestqueue.c b/drivers/md/dm-vdo/funnel-requestqueue.c index a3d241a6a42e41..948b5e55617f99 100644 --- a/drivers/md/dm-vdo/funnel-requestqueue.c +++ b/drivers/md/dm-vdo/funnel-requestqueue.c @@ -199,7 +199,7 @@ int uds_make_request_queue(const char *queue_name, struct uds_request_queue *queue; result = vdo_allocate(1, struct uds_request_queue, __func__, &queue); - if (result != UDS_SUCCESS) + if 
(result != VDO_SUCCESS) return result; queue->processor = processor; diff --git a/drivers/md/dm-vdo/funnel-workqueue.c b/drivers/md/dm-vdo/funnel-workqueue.c index 8dbaeb8326b08b..c56a113f441790 100644 --- a/drivers/md/dm-vdo/funnel-workqueue.c +++ b/drivers/md/dm-vdo/funnel-workqueue.c @@ -324,7 +324,7 @@ static int make_simple_work_queue(const char *thread_name_prefix, const char *na VDO_WORK_Q_MAX_PRIORITY); result = vdo_allocate(1, struct simple_work_queue, "simple work queue", &queue); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; queue->private = private; @@ -401,12 +401,12 @@ int vdo_make_work_queue(const char *thread_name_prefix, const char *name, result = vdo_allocate(1, struct round_robin_work_queue, "round-robin work queue", &queue); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; result = vdo_allocate(thread_count, struct simple_work_queue *, "subordinate work queues", &queue->service_queues); - if (result != UDS_SUCCESS) { + if (result != VDO_SUCCESS) { vdo_free(queue); return result; } diff --git a/drivers/md/dm-vdo/indexer/chapter-index.c b/drivers/md/dm-vdo/indexer/chapter-index.c index 94b9fadc26437b..2caba57c83cca6 100644 --- a/drivers/md/dm-vdo/indexer/chapter-index.c +++ b/drivers/md/dm-vdo/indexer/chapter-index.c @@ -20,7 +20,7 @@ int uds_make_open_chapter_index(struct open_chapter_index **chapter_index, struct open_chapter_index *index; result = vdo_allocate(1, struct open_chapter_index, "open chapter index", &index); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; /* diff --git a/drivers/md/dm-vdo/indexer/config.c b/drivers/md/dm-vdo/indexer/config.c index 153da2273b6fed..5df57961856be4 100644 --- a/drivers/md/dm-vdo/indexer/config.c +++ b/drivers/md/dm-vdo/indexer/config.c @@ -328,7 +328,7 @@ int uds_make_configuration(const struct uds_parameters *params, return result; result = vdo_allocate(1, struct uds_configuration, __func__, &config); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; result = uds_make_index_geometry(DEFAULT_BYTES_PER_PAGE, record_pages_per_chapter, diff --git a/drivers/md/dm-vdo/indexer/delta-index.c b/drivers/md/dm-vdo/indexer/delta-index.c index 86f777a1c95b17..6448c891f80765 100644 --- a/drivers/md/dm-vdo/indexer/delta-index.c +++ b/drivers/md/dm-vdo/indexer/delta-index.c @@ -330,18 +330,18 @@ static int initialize_delta_zone(struct delta_zone *delta_zone, size_t size, int result; result = vdo_allocate(size, u8, "delta list", &delta_zone->memory); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; result = vdo_allocate(list_count + 2, u64, "delta list temp", &delta_zone->new_offsets); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; /* Allocate the delta lists. 
*/ result = vdo_allocate(list_count + 2, struct delta_list, "delta lists", &delta_zone->delta_lists); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; compute_coding_constants(mean_delta, &delta_zone->min_bits, @@ -372,7 +372,7 @@ int uds_initialize_delta_index(struct delta_index *delta_index, unsigned int zon result = vdo_allocate(zone_count, struct delta_zone, "Delta Index Zones", &delta_index->delta_zones); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; delta_index->zone_count = zone_count; @@ -1066,7 +1066,7 @@ int uds_finish_restoring_delta_index(struct delta_index *delta_index, u8 *data; result = vdo_allocate(DELTA_LIST_MAX_BYTE_COUNT, u8, __func__, &data); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; for (z = 0; z < reader_count; z++) { diff --git a/drivers/md/dm-vdo/indexer/geometry.c b/drivers/md/dm-vdo/indexer/geometry.c index 18479ad78c8913..7cc6faaffcd7e8 100644 --- a/drivers/md/dm-vdo/indexer/geometry.c +++ b/drivers/md/dm-vdo/indexer/geometry.c @@ -62,7 +62,7 @@ int uds_make_index_geometry(size_t bytes_per_page, u32 record_pages_per_chapter, struct index_geometry *geometry; result = vdo_allocate(1, struct index_geometry, "geometry", &geometry); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; geometry->bytes_per_page = bytes_per_page; diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c index bc7e9aabc27bee..3e380206c2f594 100644 --- a/drivers/md/dm-vdo/indexer/index-layout.c +++ b/drivers/md/dm-vdo/indexer/index-layout.c @@ -493,7 +493,7 @@ static int __must_check make_index_save_region_table(struct index_save_layout *i result = vdo_allocate_extended(struct region_table, region_count, struct layout_region, "layout region table for ISL", &table); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; lr = &table->regions[0]; @@ -552,7 +552,7 @@ static int __must_check write_index_save_header(struct index_save_layout *isl, size_t offset = 0; result = vdo_allocate(table->encoded_size, u8, "index save data", &buffer); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; encode_region_table(buffer, &offset, table); @@ -676,7 +676,7 @@ static int __must_check make_layout_region_table(struct index_layout *layout, result = vdo_allocate_extended(struct region_table, region_count, struct layout_region, "layout region table", &table); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; lr = &table->regions[0]; @@ -722,7 +722,7 @@ static int __must_check write_layout_header(struct index_layout *layout, size_t offset = 0; result = vdo_allocate(table->encoded_size, u8, "layout data", &buffer); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; encode_region_table(buffer, &offset, table); @@ -813,7 +813,7 @@ static int create_index_layout(struct index_layout *layout, struct uds_configura result = vdo_allocate(sizes.save_count, struct index_save_layout, __func__, &layout->index.saves); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; initialize_layout(layout, &sizes); @@ -1171,7 +1171,7 @@ static int __must_check load_region_table(struct buffered_reader *reader, result = vdo_allocate_extended(struct region_table, header.region_count, struct layout_region, "single file layout region table", &table); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; table->header = header; @@ -1208,7 +1208,7 @@ static int __must_check 
read_super_block_data(struct buffered_reader *reader, size_t offset = 0; result = vdo_allocate(saved_size, u8, "super block data", &buffer); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; result = uds_read_from_buffered_reader(reader, buffer, saved_size); @@ -1343,7 +1343,7 @@ static int __must_check reconstitute_layout(struct index_layout *layout, result = vdo_allocate(layout->super.max_saves, struct index_save_layout, __func__, &layout->index.saves); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; layout->total_blocks = table->header.region_blocks; @@ -1702,7 +1702,7 @@ int uds_make_index_layout(struct uds_configuration *config, bool new_layout, return result; result = vdo_allocate(1, struct index_layout, __func__, &layout); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; result = create_layout_factory(layout, config); diff --git a/drivers/md/dm-vdo/indexer/index-page-map.c b/drivers/md/dm-vdo/indexer/index-page-map.c index 41940574731a4b..f2ebcc76eef9cc 100644 --- a/drivers/md/dm-vdo/indexer/index-page-map.c +++ b/drivers/md/dm-vdo/indexer/index-page-map.c @@ -41,14 +41,14 @@ int uds_make_index_page_map(const struct index_geometry *geometry, struct index_page_map *map; result = vdo_allocate(1, struct index_page_map, "page map", &map); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; map->geometry = geometry; map->entries_per_chapter = geometry->index_pages_per_chapter - 1; result = vdo_allocate(get_entry_count(geometry), u16, "Index Page Map Entries", &map->entries); - if (result != UDS_SUCCESS) { + if (result != VDO_SUCCESS) { vdo_free_index_page_map(map); return result; } @@ -121,7 +121,7 @@ int uds_write_index_page_map(struct index_page_map *map, struct buffered_writer u32 i; result = vdo_allocate(saved_size, u8, "page map data", &buffer); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; memcpy(buffer, PAGE_MAP_MAGIC, PAGE_MAP_MAGIC_LENGTH); @@ -148,7 +148,7 @@ int uds_read_index_page_map(struct index_page_map *map, struct buffered_reader * u32 i; result = vdo_allocate(saved_size, u8, "page map data", &buffer); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; result = uds_read_from_buffered_reader(reader, buffer, saved_size); diff --git a/drivers/md/dm-vdo/indexer/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c index 7fada13abc445c..5d020783487f04 100644 --- a/drivers/md/dm-vdo/indexer/index-session.c +++ b/drivers/md/dm-vdo/indexer/index-session.c @@ -222,7 +222,7 @@ static int __must_check make_empty_index_session(struct uds_index_session **inde struct uds_index_session *session; result = vdo_allocate(1, struct uds_index_session, __func__, &session); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; mutex_init(&session->request_mutex); diff --git a/drivers/md/dm-vdo/indexer/index.c b/drivers/md/dm-vdo/indexer/index.c index b114043f16281f..0db6e458b6de4e 100644 --- a/drivers/md/dm-vdo/indexer/index.c +++ b/drivers/md/dm-vdo/indexer/index.c @@ -87,7 +87,7 @@ static int launch_zone_message(struct uds_zone_message message, unsigned int zon struct uds_request *request; result = vdo_allocate(1, struct uds_request, __func__, &request); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; request->index = index; @@ -768,7 +768,7 @@ static int make_chapter_writer(struct uds_index *index, result = vdo_allocate_extended(struct chapter_writer, index->zone_count, struct open_chapter_zone *, 
"Chapter Writer", &writer); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; writer->index = index; @@ -777,7 +777,7 @@ static int make_chapter_writer(struct uds_index *index, result = vdo_allocate_cache_aligned(collated_records_size, "collated records", &writer->collated_records); - if (result != UDS_SUCCESS) { + if (result != VDO_SUCCESS) { free_chapter_writer(writer); return result; } @@ -1125,7 +1125,7 @@ static int make_index_zone(struct uds_index *index, unsigned int zone_number) struct index_zone *zone; result = vdo_allocate(1, struct index_zone, "index zone", &zone); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; result = uds_make_open_chapter(index->volume->geometry, index->zone_count, @@ -1163,7 +1163,7 @@ int uds_make_index(struct uds_configuration *config, enum uds_open_index_type op result = vdo_allocate_extended(struct uds_index, config->zone_count, struct uds_request_queue *, "index", &index); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; index->zone_count = config->zone_count; @@ -1176,7 +1176,7 @@ int uds_make_index(struct uds_configuration *config, enum uds_open_index_type op result = vdo_allocate(index->zone_count, struct index_zone *, "zones", &index->zones); - if (result != UDS_SUCCESS) { + if (result != VDO_SUCCESS) { vdo_free_index(index); return result; } diff --git a/drivers/md/dm-vdo/indexer/io-factory.c b/drivers/md/dm-vdo/indexer/io-factory.c index 795aa5238c047f..8fe7c0b2802dbb 100644 --- a/drivers/md/dm-vdo/indexer/io-factory.c +++ b/drivers/md/dm-vdo/indexer/io-factory.c @@ -65,7 +65,7 @@ int uds_make_io_factory(struct block_device *bdev, struct io_factory **factory_p struct io_factory *factory; result = vdo_allocate(1, struct io_factory, __func__, &factory); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; factory->bdev = bdev; @@ -145,7 +145,7 @@ int uds_make_buffered_reader(struct io_factory *factory, off_t offset, u64 block return result; result = vdo_allocate(1, struct buffered_reader, "buffered reader", &reader); - if (result != UDS_SUCCESS) { + if (result != VDO_SUCCESS) { dm_bufio_client_destroy(client); return result; } @@ -283,7 +283,7 @@ int uds_make_buffered_writer(struct io_factory *factory, off_t offset, u64 block return result; result = vdo_allocate(1, struct buffered_writer, "buffered writer", &writer); - if (result != UDS_SUCCESS) { + if (result != VDO_SUCCESS) { dm_bufio_client_destroy(client); return result; } diff --git a/drivers/md/dm-vdo/indexer/open-chapter.c b/drivers/md/dm-vdo/indexer/open-chapter.c index 6dd055ab47deef..989b1946e55daf 100644 --- a/drivers/md/dm-vdo/indexer/open-chapter.c +++ b/drivers/md/dm-vdo/indexer/open-chapter.c @@ -73,14 +73,14 @@ int uds_make_open_chapter(const struct index_geometry *geometry, unsigned int zo result = vdo_allocate_extended(struct open_chapter_zone, slot_count, struct open_chapter_zone_slot, "open chapter", &open_chapter); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; open_chapter->slot_count = slot_count; open_chapter->capacity = capacity; result = vdo_allocate_cache_aligned(records_size(open_chapter), "record pages", &open_chapter->records); - if (result != UDS_SUCCESS) { + if (result != VDO_SUCCESS) { vdo_free_open_chapter(open_chapter); return result; } diff --git a/drivers/md/dm-vdo/indexer/radix-sort.c b/drivers/md/dm-vdo/indexer/radix-sort.c index 1b8233c99ae696..e76e27a859d891 100644 --- a/drivers/md/dm-vdo/indexer/radix-sort.c +++ 
b/drivers/md/dm-vdo/indexer/radix-sort.c @@ -215,7 +215,7 @@ int uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter) result = vdo_allocate_extended(struct radix_sorter, stack_size, struct task, __func__, &radix_sorter); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; radix_sorter->count = count; diff --git a/drivers/md/dm-vdo/indexer/sparse-cache.c b/drivers/md/dm-vdo/indexer/sparse-cache.c index cb222bda32c29c..687694d55e5448 100644 --- a/drivers/md/dm-vdo/indexer/sparse-cache.c +++ b/drivers/md/dm-vdo/indexer/sparse-cache.c @@ -226,7 +226,7 @@ static int __must_check initialize_cached_chapter_index(struct cached_chapter_in result = vdo_allocate(chapter->index_pages_count, struct delta_index_page, __func__, &chapter->index_pages); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; return vdo_allocate(chapter->index_pages_count, struct dm_buffer *, @@ -244,7 +244,7 @@ static int __must_check make_search_list(struct sparse_cache *cache, bytes = (sizeof(struct search_list) + (cache->capacity * sizeof(struct cached_chapter_index *))); result = vdo_allocate_cache_aligned(bytes, "search list", &list); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; list->capacity = cache->capacity; @@ -267,7 +267,7 @@ int uds_make_sparse_cache(const struct index_geometry *geometry, unsigned int ca bytes = (sizeof(struct sparse_cache) + (capacity * sizeof(struct cached_chapter_index))); result = vdo_allocate_cache_aligned(bytes, "sparse cache", &cache); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; cache->geometry = geometry; @@ -298,7 +298,7 @@ int uds_make_sparse_cache(const struct index_geometry *geometry, unsigned int ca /* purge_search_list() needs some temporary lists for sorting. */ result = vdo_allocate(capacity * 2, struct cached_chapter_index *, "scratch entries", &cache->scratch_entries); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) goto out; *cache_ptr = cache; diff --git a/drivers/md/dm-vdo/indexer/volume-index.c b/drivers/md/dm-vdo/indexer/volume-index.c index eb54f5b478318a..762607974f3587 100644 --- a/drivers/md/dm-vdo/indexer/volume-index.c +++ b/drivers/md/dm-vdo/indexer/volume-index.c @@ -1211,7 +1211,7 @@ static int initialize_volume_sub_index(const struct uds_configuration *config, /* The following arrays are initialized to all zeros. 
*/ result = vdo_allocate(params.list_count, u64, "first chapter to flush", &sub_index->flush_chapters); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; return vdo_allocate(zone_count, struct volume_sub_index_zone, @@ -1227,7 +1227,7 @@ int uds_make_volume_index(const struct uds_configuration *config, u64 volume_non int result; result = vdo_allocate(1, struct volume_index, "volume index", &volume_index); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; volume_index->zone_count = config->zone_count; @@ -1249,7 +1249,7 @@ int uds_make_volume_index(const struct uds_configuration *config, u64 volume_non result = vdo_allocate(config->zone_count, struct volume_index_zone, "volume index zones", &volume_index->zones); - if (result != UDS_SUCCESS) { + if (result != VDO_SUCCESS) { vdo_free_volume_index(volume_index); return result; } diff --git a/drivers/md/dm-vdo/indexer/volume.c b/drivers/md/dm-vdo/indexer/volume.c index f8e47f1c54c7ff..1dd9c563a6e16d 100644 --- a/drivers/md/dm-vdo/indexer/volume.c +++ b/drivers/md/dm-vdo/indexer/volume.c @@ -1511,22 +1511,22 @@ static int __must_check initialize_page_cache(struct page_cache *cache, result = vdo_allocate(VOLUME_CACHE_MAX_QUEUED_READS, struct queued_read, "volume read queue", &cache->read_queue); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; result = vdo_allocate(cache->zone_count, struct search_pending_counter, "Volume Cache Zones", &cache->search_pending_counters); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; result = vdo_allocate(cache->indexable_pages, u16, "page cache index", &cache->index); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; result = vdo_allocate(cache->cache_slots, struct cached_page, "page cache cache", &cache->cache); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; /* Initialize index values to invalid values. */ @@ -1549,7 +1549,7 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout int result; result = vdo_allocate(1, struct volume, "volume", &volume); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; volume->nonce = uds_get_volume_nonce(layout); @@ -1588,7 +1588,7 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout result = vdo_allocate(geometry->records_per_page, const struct uds_volume_record *, "record pointers", &volume->record_pointers); - if (result != UDS_SUCCESS) { + if (result != VDO_SUCCESS) { vdo_free_volume(volume); return result; } @@ -1628,7 +1628,7 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout result = vdo_allocate(config->read_threads, struct thread *, "reader threads", &volume->reader_threads); - if (result != UDS_SUCCESS) { + if (result != VDO_SUCCESS) { vdo_free_volume(volume); return result; } diff --git a/drivers/md/dm-vdo/int-map.c b/drivers/md/dm-vdo/int-map.c index b8a955449737e1..f6b4a3de9bdb67 100644 --- a/drivers/md/dm-vdo/int-map.c +++ b/drivers/md/dm-vdo/int-map.c @@ -154,7 +154,7 @@ static u64 hash_key(u64 key) * @map: The map to initialize. * @capacity: The initial capacity of the map. * - * Return: UDS_SUCCESS or an error code. + * Return: VDO_SUCCESS or an error code. */ static int allocate_buckets(struct int_map *map, size_t capacity) { @@ -176,7 +176,7 @@ static int allocate_buckets(struct int_map *map, size_t capacity) * tells the map to use its own small default). * @map_ptr: Output, a pointer to hold the new int_map. 
* - * Return: UDS_SUCCESS or an error code. + * Return: VDO_SUCCESS or an error code. */ int vdo_int_map_create(size_t initial_capacity, struct int_map **map_ptr) { @@ -185,7 +185,7 @@ int vdo_int_map_create(size_t initial_capacity, struct int_map **map_ptr) size_t capacity; result = vdo_allocate(1, struct int_map, "struct int_map", &map); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; /* Use the default capacity if the caller did not specify one. */ @@ -198,13 +198,13 @@ int vdo_int_map_create(size_t initial_capacity, struct int_map **map_ptr) capacity = capacity * 100 / DEFAULT_LOAD; result = allocate_buckets(map, capacity); - if (result != UDS_SUCCESS) { + if (result != VDO_SUCCESS) { vdo_int_map_free(vdo_forget(map)); return result; } *map_ptr = map; - return UDS_SUCCESS; + return VDO_SUCCESS; } /** @@ -370,7 +370,7 @@ void *vdo_int_map_get(struct int_map *map, u64 key) * * Resizes and rehashes all the existing entries, storing them in the new buckets. * - * Return: UDS_SUCCESS or an error code. + * Return: VDO_SUCCESS or an error code. */ static int resize_buckets(struct int_map *map) { @@ -386,7 +386,7 @@ static int resize_buckets(struct int_map *map) uds_log_info("%s: attempting resize from %zu to %zu, current size=%zu", __func__, map->capacity, new_capacity, map->size); result = allocate_buckets(map, new_capacity); - if (result != UDS_SUCCESS) { + if (result != VDO_SUCCESS) { *map = old_map; return result; } @@ -409,7 +409,7 @@ static int resize_buckets(struct int_map *map) /* Destroy the old bucket array. */ vdo_free(vdo_forget(old_map.buckets)); - return UDS_SUCCESS; + return VDO_SUCCESS; } /** @@ -649,7 +649,7 @@ int vdo_int_map_put(struct int_map *map, u64 key, void *new_value, bool update, * large maps). */ result = resize_buckets(map); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; /* diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c index 23549b7e9e6d60..b0f1ba810cd0c6 100644 --- a/drivers/md/dm-vdo/io-submitter.c +++ b/drivers/md/dm-vdo/io-submitter.c @@ -383,7 +383,7 @@ int vdo_make_io_submitter(unsigned int thread_count, unsigned int rotation_inter result = vdo_allocate_extended(struct io_submitter, thread_count, struct bio_queue_data, "bio submission data", &io_submitter); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; io_submitter->bio_queue_rotation_interval = rotation_interval; diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index eaf2930752090c..3a9b79ae9fb6a7 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -2373,7 +2373,7 @@ static int allocate_slab_counters(struct vdo_slab *slab) bytes = (slab->reference_block_count * COUNTS_PER_BLOCK) + (2 * BYTES_PER_WORD); result = vdo_allocate(bytes, vdo_refcount_t, "ref counts array", &slab->counters); - if (result != UDS_SUCCESS) { + if (result != VDO_SUCCESS) { vdo_free(vdo_forget(slab->reference_blocks)); return result; } diff --git a/drivers/md/dm-vdo/thread-utils.c b/drivers/md/dm-vdo/thread-utils.c index f328307a5f1d26..ee2c8f2b505f3f 100644 --- a/drivers/md/dm-vdo/thread-utils.c +++ b/drivers/md/dm-vdo/thread-utils.c @@ -11,9 +11,9 @@ #include #include -#include "errors.h" #include "logger.h" #include "memory-alloc.h" +#include "status-codes.h" static struct hlist_head thread_list; static struct mutex thread_mutex; @@ -85,7 +85,7 @@ int vdo_create_thread(void (*thread_function)(void *), void *thread_data, int result; result = vdo_allocate(1, struct 
thread, __func__, &thread); - if (result != UDS_SUCCESS) { + if (result != VDO_SUCCESS) { uds_log_warning("Error allocating memory for %s", name); return result; } diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c index f9b8edf1292d73..283bda0c830728 100644 --- a/drivers/md/dm-vdo/vdo.c +++ b/drivers/md/dm-vdo/vdo.c @@ -547,7 +547,7 @@ int vdo_make(unsigned int instance, struct device_config *config, char **reason, *reason = "Unspecified error"; result = vdo_allocate(1, struct vdo, __func__, &vdo); - if (result != UDS_SUCCESS) { + if (result != VDO_SUCCESS) { *reason = "Cannot allocate VDO"; return result; } From 8abe53f40e7426def7fbd8e2c0d5d57d3dcd783c Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 13 Feb 2024 12:17:53 -0600 Subject: [PATCH 0908/1406] dm vdo int-map: return VDO_SUCCESS on success Update all callers to check for VDO_SUCCESS (most already did). Also fix whitespace for update_mapping() parameters. Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/block-map.c | 4 ++-- drivers/md/dm-vdo/int-map.c | 22 +++++++++------------- drivers/md/dm-vdo/io-submitter.c | 4 ++-- 3 files changed, 13 insertions(+), 17 deletions(-) diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c index ac226bdae7993f..7490b0222fa7cf 100644 --- a/drivers/md/dm-vdo/block-map.c +++ b/drivers/md/dm-vdo/block-map.c @@ -233,7 +233,7 @@ static int __must_check allocate_cache_components(struct vdo_page_cache *cache) return result; result = vdo_int_map_create(cache->page_count, &cache->page_map); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; return initialize_info(cache); @@ -392,7 +392,7 @@ static int __must_check set_info_pbn(struct page_info *info, physical_block_numb if (pbn != NO_PAGE) { result = vdo_int_map_put(cache->page_map, pbn, info, true, NULL); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; } return VDO_SUCCESS; diff --git a/drivers/md/dm-vdo/int-map.c b/drivers/md/dm-vdo/int-map.c index f6b4a3de9bdb67..1bdd83a1dc2bb9 100644 --- a/drivers/md/dm-vdo/int-map.c +++ b/drivers/md/dm-vdo/int-map.c @@ -50,11 +50,11 @@ #include -#include "errors.h" #include "logger.h" #include "memory-alloc.h" #include "numeric.h" #include "permassert.h" +#include "status-codes.h" enum { DEFAULT_CAPACITY = 16, /* the number of neighborhoods in a new table */ @@ -399,7 +399,7 @@ static int resize_buckets(struct int_map *map) continue; result = vdo_int_map_put(map, entry->key, entry->value, true, NULL); - if (result != UDS_SUCCESS) { + if (result != VDO_SUCCESS) { /* Destroy the new partial map and restore the map from the stack. */ vdo_free(vdo_forget(map->buckets)); *map = old_map; @@ -527,12 +527,8 @@ static struct bucket *move_empty_bucket(struct int_map *map __always_unused, * * Return: true if the map contains a mapping for the key, false if it does not. */ -static bool update_mapping(struct int_map *map, - struct bucket *neighborhood, - u64 key, - void *new_value, - bool update, - void **old_value_ptr) +static bool update_mapping(struct int_map *map, struct bucket *neighborhood, + u64 key, void *new_value, bool update, void **old_value_ptr) { struct bucket *bucket = search_hop_list(map, neighborhood, key, NULL); @@ -611,15 +607,15 @@ static struct bucket *find_or_make_vacancy(struct int_map *map, * update is true. In either case the old value is returned. If the map does not already contain a * value for the specified key, the new value is added regardless of the value of update. * - * Return: UDS_SUCCESS or an error code. 
+ * Return: VDO_SUCCESS or an error code. */ int vdo_int_map_put(struct int_map *map, u64 key, void *new_value, bool update, void **old_value_ptr) { struct bucket *neighborhood, *bucket; - if (new_value == NULL) - return UDS_INVALID_ARGUMENT; + if (unlikely(new_value == NULL)) + return -EINVAL; /* * Select the bucket at the start of the neighborhood that must contain any entry for the @@ -632,7 +628,7 @@ int vdo_int_map_put(struct int_map *map, u64 key, void *new_value, bool update, * optionally update it, returning the old value. */ if (update_mapping(map, neighborhood, key, new_value, update, old_value_ptr)) - return UDS_SUCCESS; + return VDO_SUCCESS; /* * Find an empty bucket in the desired neighborhood for the new entry or re-arrange entries @@ -668,7 +664,7 @@ int vdo_int_map_put(struct int_map *map, u64 key, void *new_value, bool update, /* There was no existing entry, so there was no old value to be returned. */ if (old_value_ptr != NULL) *old_value_ptr = NULL; - return UDS_SUCCESS; + return VDO_SUCCESS; } /** diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c index b0f1ba810cd0c6..e82b4a8c6fc45f 100644 --- a/drivers/md/dm-vdo/io-submitter.c +++ b/drivers/md/dm-vdo/io-submitter.c @@ -300,7 +300,7 @@ static bool try_bio_map_merge(struct vio *vio) mutex_unlock(&bio_queue_data->lock); /* We don't care about failure of int_map_put in this case. */ - ASSERT_LOG_ONLY(result == UDS_SUCCESS, "bio map insertion succeeds"); + ASSERT_LOG_ONLY(result == VDO_SUCCESS, "bio map insertion succeeds"); return merged; } @@ -403,7 +403,7 @@ int vdo_make_io_submitter(unsigned int thread_count, unsigned int rotation_inter */ result = vdo_int_map_create(max_requests_active * 2, &bio_queue_data->map); - if (result != 0) { + if (result != VDO_SUCCESS) { /* * Clean up the partially initialized bio-queue entirely and indicate that * initialization failed. From 555797d8a43e8a4d2251a98d3b28f866eb78e468 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 13 Feb 2024 13:18:35 -0600 Subject: [PATCH 0909/1406] dm vdo thread-utils: return VDO_SUCCESS on vdo_create_thread success Update all callers to check for VDO_SUCCESS. 
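For example, a converted caller now follows the usual vdo pattern
(illustrative sketch only; the thread function, context, and cleanup
below are hypothetical, not taken from these hunks):

	struct thread *thread;
	int result;

	result = vdo_create_thread(worker_function, context, "worker",
				   &thread);
	if (result != VDO_SUCCESS) {
		/* Undo state set up before thread creation
		 * (hypothetical cleanup). */
		cleanup(context);
		return result;
	}
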
Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/funnel-requestqueue.c | 2 +- drivers/md/dm-vdo/indexer/index.c | 2 +- drivers/md/dm-vdo/indexer/volume.c | 2 +- drivers/md/dm-vdo/thread-utils.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-vdo/funnel-requestqueue.c b/drivers/md/dm-vdo/funnel-requestqueue.c index 948b5e55617f99..8bb4963f0ec55c 100644 --- a/drivers/md/dm-vdo/funnel-requestqueue.c +++ b/drivers/md/dm-vdo/funnel-requestqueue.c @@ -221,7 +221,7 @@ int uds_make_request_queue(const char *queue_name, result = vdo_create_thread(request_queue_worker, queue, queue_name, &queue->thread); - if (result != UDS_SUCCESS) { + if (result != VDO_SUCCESS) { uds_request_queue_finish(queue); return result; } diff --git a/drivers/md/dm-vdo/indexer/index.c b/drivers/md/dm-vdo/indexer/index.c index 0db6e458b6de4e..3d14c1e4035bf7 100644 --- a/drivers/md/dm-vdo/indexer/index.c +++ b/drivers/md/dm-vdo/indexer/index.c @@ -796,7 +796,7 @@ static int make_chapter_writer(struct uds_index *index, writer->open_chapter_index->memory_size); result = vdo_create_thread(close_chapters, writer, "writer", &writer->thread); - if (result != UDS_SUCCESS) { + if (result != VDO_SUCCESS) { free_chapter_writer(writer); return result; } diff --git a/drivers/md/dm-vdo/indexer/volume.c b/drivers/md/dm-vdo/indexer/volume.c index 1dd9c563a6e16d..7996b026a58b01 100644 --- a/drivers/md/dm-vdo/indexer/volume.c +++ b/drivers/md/dm-vdo/indexer/volume.c @@ -1636,7 +1636,7 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout for (i = 0; i < config->read_threads; i++) { result = vdo_create_thread(read_thread_function, (void *) volume, "reader", &volume->reader_threads[i]); - if (result != UDS_SUCCESS) { + if (result != VDO_SUCCESS) { vdo_free_volume(volume); return result; } diff --git a/drivers/md/dm-vdo/thread-utils.c b/drivers/md/dm-vdo/thread-utils.c index ee2c8f2b505f3f..a6cea9544d9a1e 100644 --- a/drivers/md/dm-vdo/thread-utils.c +++ b/drivers/md/dm-vdo/thread-utils.c @@ -121,7 +121,7 @@ int vdo_create_thread(void (*thread_function)(void *), void *thread_data, } *new_thread = thread; - return UDS_SUCCESS; + return VDO_SUCCESS; } void vdo_join_threads(struct thread *thread) From 8144264361eaf33b2668fa9f2304ef305e7ea6cb Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 13 Feb 2024 13:35:28 -0600 Subject: [PATCH 0910/1406] dm vdo funnel-queue: change from uds_ to vdo_ namespace Also return VDO_SUCCESS from vdo_make_funnel_queue. 
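Under the new names, typical client usage looks like this (minimal
sketch; struct item, an_item, and process() are hypothetical, but the
entry embedding and container_of() recovery follow the funnel-queue.h
comments):

	struct item {
		struct funnel_queue_entry entry;
		int payload;
	};

	struct funnel_queue *queue;
	struct funnel_queue_entry *link;
	int result;

	result = vdo_make_funnel_queue(&queue);
	if (result != VDO_SUCCESS)
		return result;

	/* Producers may put from any thread or context. */
	vdo_funnel_queue_put(queue, &an_item->entry);

	/* Only a single consumer thread may poll. */
	link = vdo_funnel_queue_poll(queue);
	if (link != NULL)
		process(container_of(link, struct item, entry));

	vdo_free_funnel_queue(queue);
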
Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/data-vio.c | 10 +++++----- drivers/md/dm-vdo/dedupe.c | 8 ++++---- drivers/md/dm-vdo/funnel-queue.c | 16 ++++++++-------- drivers/md/dm-vdo/funnel-queue.h | 24 ++++++++++++------------ drivers/md/dm-vdo/funnel-requestqueue.c | 18 +++++++++--------- drivers/md/dm-vdo/funnel-workqueue.c | 8 ++++---- 6 files changed, 42 insertions(+), 42 deletions(-) diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c index dcb0d1075239fe..d0913653bad9fc 100644 --- a/drivers/md/dm-vdo/data-vio.c +++ b/drivers/md/dm-vdo/data-vio.c @@ -720,7 +720,7 @@ static void process_release_callback(struct vdo_completion *completion) for (processed = 0; processed < DATA_VIO_RELEASE_BATCH_SIZE; processed++) { struct data_vio *data_vio; - struct funnel_queue_entry *entry = uds_funnel_queue_poll(pool->queue); + struct funnel_queue_entry *entry = vdo_funnel_queue_poll(pool->queue); if (entry == NULL) break; @@ -750,7 +750,7 @@ static void process_release_callback(struct vdo_completion *completion) /* Pairs with the barrier in schedule_releases(). */ smp_mb(); - reschedule = !uds_is_funnel_queue_empty(pool->queue); + reschedule = !vdo_is_funnel_queue_empty(pool->queue); drained = (!reschedule && vdo_is_state_draining(&pool->state) && check_for_drain_complete_locked(pool)); @@ -867,8 +867,8 @@ int make_data_vio_pool(struct vdo *vdo, data_vio_count_t pool_size, process_release_callback, vdo->thread_config.cpu_thread, NULL); - result = uds_make_funnel_queue(&pool->queue); - if (result != UDS_SUCCESS) { + result = vdo_make_funnel_queue(&pool->queue); + if (result != VDO_SUCCESS) { free_data_vio_pool(vdo_forget(pool)); return result; } @@ -1285,7 +1285,7 @@ static void finish_cleanup(struct data_vio *data_vio) (completion->result != VDO_SUCCESS)) { struct data_vio_pool *pool = completion->vdo->data_vio_pool; - uds_funnel_queue_put(pool->queue, &completion->work_queue_entry_link); + vdo_funnel_queue_put(pool->queue, &completion->work_queue_entry_link); schedule_releases(pool); return; } diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index b88596f4ae5142..43437d10399600 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -2255,7 +2255,7 @@ static void finish_index_operation(struct uds_request *request) atomic_read(&context->state)); } - uds_funnel_queue_put(context->zone->timed_out_complete, &context->queue_entry); + vdo_funnel_queue_put(context->zone->timed_out_complete, &context->queue_entry); } /** @@ -2284,7 +2284,7 @@ static void check_for_drain_complete(struct hash_zone *zone) struct dedupe_context *context; struct funnel_queue_entry *entry; - entry = uds_funnel_queue_poll(zone->timed_out_complete); + entry = vdo_funnel_queue_poll(zone->timed_out_complete); if (entry == NULL) break; @@ -2382,7 +2382,7 @@ static int __must_check initialize_zone(struct vdo *vdo, struct hash_zones *zone INIT_LIST_HEAD(&zone->available); INIT_LIST_HEAD(&zone->pending); - result = uds_make_funnel_queue(&zone->timed_out_complete); + result = vdo_make_funnel_queue(&zone->timed_out_complete); if (result != VDO_SUCCESS) return result; @@ -2885,7 +2885,7 @@ static struct dedupe_context * __must_check acquire_context(struct hash_zone *zo return context; } - entry = uds_funnel_queue_poll(zone->timed_out_complete); + entry = vdo_funnel_queue_poll(zone->timed_out_complete); return ((entry == NULL) ? 
NULL : container_of(entry, struct dedupe_context, queue_entry)); } diff --git a/drivers/md/dm-vdo/funnel-queue.c b/drivers/md/dm-vdo/funnel-queue.c index 1dba6b776f915b..e52846ad64f0b8 100644 --- a/drivers/md/dm-vdo/funnel-queue.c +++ b/drivers/md/dm-vdo/funnel-queue.c @@ -10,7 +10,7 @@ #include "permassert.h" #include "indexer/indexer.h" -int uds_make_funnel_queue(struct funnel_queue **queue_ptr) +int vdo_make_funnel_queue(struct funnel_queue **queue_ptr) { int result; struct funnel_queue *queue; @@ -28,7 +28,7 @@ int uds_make_funnel_queue(struct funnel_queue **queue_ptr) queue->oldest = &queue->stub; *queue_ptr = queue; - return UDS_SUCCESS; + return VDO_SUCCESS; } void vdo_free_funnel_queue(struct funnel_queue *queue) @@ -41,7 +41,7 @@ static struct funnel_queue_entry *get_oldest(struct funnel_queue *queue) /* * Barrier requirements: We need a read barrier between reading a "next" field pointer * value and reading anything it points to. There's an accompanying barrier in - * uds_funnel_queue_put() between its caller setting up the entry and making it visible. + * vdo_funnel_queue_put() between its caller setting up the entry and making it visible. */ struct funnel_queue_entry *oldest = queue->oldest; struct funnel_queue_entry *next = READ_ONCE(oldest->next); @@ -81,7 +81,7 @@ static struct funnel_queue_entry *get_oldest(struct funnel_queue *queue) * Put the stub entry back on the queue, ensuring a successor will eventually be * seen. */ - uds_funnel_queue_put(queue, &queue->stub); + vdo_funnel_queue_put(queue, &queue->stub); /* Check again for a successor. */ next = READ_ONCE(oldest->next); @@ -101,7 +101,7 @@ static struct funnel_queue_entry *get_oldest(struct funnel_queue *queue) * Poll a queue, removing the oldest entry if the queue is not empty. This function must only be * called from a single consumer thread. */ -struct funnel_queue_entry *uds_funnel_queue_poll(struct funnel_queue *queue) +struct funnel_queue_entry *vdo_funnel_queue_poll(struct funnel_queue *queue) { struct funnel_queue_entry *oldest = get_oldest(queue); @@ -135,7 +135,7 @@ struct funnel_queue_entry *uds_funnel_queue_poll(struct funnel_queue *queue) * or more entries being added such that the list view is incomplete, this function will report the * queue as empty. */ -bool uds_is_funnel_queue_empty(struct funnel_queue *queue) +bool vdo_is_funnel_queue_empty(struct funnel_queue *queue) { return get_oldest(queue) == NULL; } @@ -144,9 +144,9 @@ bool uds_is_funnel_queue_empty(struct funnel_queue *queue) * Check whether the funnel queue is idle or not. If the queue has entries available to be * retrieved, it is not idle. If the queue is in a transition state with one or more entries being * added such that the list view is incomplete, it may not be possible to retrieve an entry with - * the uds_funnel_queue_poll() function, but the queue will not be considered idle. + * the vdo_funnel_queue_poll() function, but the queue will not be considered idle. 
*/ -bool uds_is_funnel_queue_idle(struct funnel_queue *queue) +bool vdo_is_funnel_queue_idle(struct funnel_queue *queue) { /* * Oldest is not the stub, so there's another entry, though if next is NULL we can't diff --git a/drivers/md/dm-vdo/funnel-queue.h b/drivers/md/dm-vdo/funnel-queue.h index 5d5d249554f0c2..bde0f1deff98de 100644 --- a/drivers/md/dm-vdo/funnel-queue.h +++ b/drivers/md/dm-vdo/funnel-queue.h @@ -3,8 +3,8 @@ * Copyright 2023 Red Hat */ -#ifndef UDS_FUNNEL_QUEUE_H -#define UDS_FUNNEL_QUEUE_H +#ifndef VDO_FUNNEL_QUEUE_H +#define VDO_FUNNEL_QUEUE_H #include #include @@ -25,19 +25,19 @@ * the queue entries, and pointers to those structures are used exclusively by the queue. No macros * are defined to template the queue, so the offset of the funnel_queue_entry in the records placed * in the queue must all be the same so the client can derive their structure pointer from the - * entry pointer returned by uds_funnel_queue_poll(). + * entry pointer returned by vdo_funnel_queue_poll(). * * Callers are wholly responsible for allocating and freeing the entries. Entries may be freed as * soon as they are returned since this queue is not susceptible to the "ABA problem" present in * many lock-free data structures. The queue is dynamically allocated to ensure cache-line * alignment, but no other dynamic allocation is used. * - * The algorithm is not actually 100% lock-free. There is a single point in uds_funnel_queue_put() + * The algorithm is not actually 100% lock-free. There is a single point in vdo_funnel_queue_put() * at which a preempted producer will prevent the consumers from seeing items added to the queue by * later producers, and only if the queue is short enough or the consumer fast enough for it to * reach what was the end of the queue at the time of the preemption. * - * The consumer function, uds_funnel_queue_poll(), will return NULL when the queue is empty. To + * The consumer function, vdo_funnel_queue_poll(), will return NULL when the queue is empty. To * wait for data to consume, spin (if safe) or combine the queue with a struct event_count to * signal the presence of new entries. */ @@ -51,7 +51,7 @@ struct funnel_queue_entry { /* * The dynamically allocated queue structure, which is allocated on a cache line boundary so the * producer and consumer fields in the structure will land on separate cache lines. This should be - * consider opaque but it is exposed here so uds_funnel_queue_put() can be inlined. + * consider opaque but it is exposed here so vdo_funnel_queue_put() can be inlined. */ struct __aligned(L1_CACHE_BYTES) funnel_queue { /* @@ -67,7 +67,7 @@ struct __aligned(L1_CACHE_BYTES) funnel_queue { struct funnel_queue_entry stub; }; -int __must_check uds_make_funnel_queue(struct funnel_queue **queue_ptr); +int __must_check vdo_make_funnel_queue(struct funnel_queue **queue_ptr); void vdo_free_funnel_queue(struct funnel_queue *queue); @@ -79,7 +79,7 @@ void vdo_free_funnel_queue(struct funnel_queue *queue); * from the pointer that passed in here, so every entry in the queue must have the struct * funnel_queue_entry at the same offset within the client's structure. 
*/ -static inline void uds_funnel_queue_put(struct funnel_queue *queue, +static inline void vdo_funnel_queue_put(struct funnel_queue *queue, struct funnel_queue_entry *entry) { struct funnel_queue_entry *previous; @@ -101,10 +101,10 @@ static inline void uds_funnel_queue_put(struct funnel_queue *queue, WRITE_ONCE(previous->next, entry); } -struct funnel_queue_entry *__must_check uds_funnel_queue_poll(struct funnel_queue *queue); +struct funnel_queue_entry *__must_check vdo_funnel_queue_poll(struct funnel_queue *queue); -bool __must_check uds_is_funnel_queue_empty(struct funnel_queue *queue); +bool __must_check vdo_is_funnel_queue_empty(struct funnel_queue *queue); -bool __must_check uds_is_funnel_queue_idle(struct funnel_queue *queue); +bool __must_check vdo_is_funnel_queue_idle(struct funnel_queue *queue); -#endif /* UDS_FUNNEL_QUEUE_H */ +#endif /* VDO_FUNNEL_QUEUE_H */ diff --git a/drivers/md/dm-vdo/funnel-requestqueue.c b/drivers/md/dm-vdo/funnel-requestqueue.c index 8bb4963f0ec55c..1a5735375ddc06 100644 --- a/drivers/md/dm-vdo/funnel-requestqueue.c +++ b/drivers/md/dm-vdo/funnel-requestqueue.c @@ -69,11 +69,11 @@ static inline struct uds_request *poll_queues(struct uds_request_queue *queue) { struct funnel_queue_entry *entry; - entry = uds_funnel_queue_poll(queue->retry_queue); + entry = vdo_funnel_queue_poll(queue->retry_queue); if (entry != NULL) return container_of(entry, struct uds_request, queue_link); - entry = uds_funnel_queue_poll(queue->main_queue); + entry = vdo_funnel_queue_poll(queue->main_queue); if (entry != NULL) return container_of(entry, struct uds_request, queue_link); @@ -82,8 +82,8 @@ static inline struct uds_request *poll_queues(struct uds_request_queue *queue) static inline bool are_queues_idle(struct uds_request_queue *queue) { - return uds_is_funnel_queue_idle(queue->retry_queue) && - uds_is_funnel_queue_idle(queue->main_queue); + return vdo_is_funnel_queue_idle(queue->retry_queue) && + vdo_is_funnel_queue_idle(queue->main_queue); } /* @@ -207,14 +207,14 @@ int uds_make_request_queue(const char *queue_name, atomic_set(&queue->dormant, false); init_waitqueue_head(&queue->wait_head); - result = uds_make_funnel_queue(&queue->main_queue); - if (result != UDS_SUCCESS) { + result = vdo_make_funnel_queue(&queue->main_queue); + if (result != VDO_SUCCESS) { uds_request_queue_finish(queue); return result; } - result = uds_make_funnel_queue(&queue->retry_queue); - if (result != UDS_SUCCESS) { + result = vdo_make_funnel_queue(&queue->retry_queue); + if (result != VDO_SUCCESS) { uds_request_queue_finish(queue); return result; } @@ -244,7 +244,7 @@ void uds_request_queue_enqueue(struct uds_request_queue *queue, bool unbatched = request->unbatched; sub_queue = request->requeued ? queue->retry_queue : queue->main_queue; - uds_funnel_queue_put(sub_queue, &request->queue_link); + vdo_funnel_queue_put(sub_queue, &request->queue_link); /* * We must wake the worker thread when it is dormant. 
A read fence isn't needed here since diff --git a/drivers/md/dm-vdo/funnel-workqueue.c b/drivers/md/dm-vdo/funnel-workqueue.c index c56a113f441790..ffcfd2fdd894f8 100644 --- a/drivers/md/dm-vdo/funnel-workqueue.c +++ b/drivers/md/dm-vdo/funnel-workqueue.c @@ -98,7 +98,7 @@ static struct vdo_completion *poll_for_completion(struct simple_work_queue *queu int i; for (i = queue->common.type->max_priority; i >= 0; i--) { - struct funnel_queue_entry *link = uds_funnel_queue_poll(queue->priority_lists[i]); + struct funnel_queue_entry *link = vdo_funnel_queue_poll(queue->priority_lists[i]); if (link != NULL) return container_of(link, struct vdo_completion, work_queue_entry_link); @@ -123,7 +123,7 @@ static void enqueue_work_queue_completion(struct simple_work_queue *queue, completion->my_queue = &queue->common; /* Funnel queue handles the synchronization for the put. */ - uds_funnel_queue_put(queue->priority_lists[completion->priority], + vdo_funnel_queue_put(queue->priority_lists[completion->priority], &completion->work_queue_entry_link); /* @@ -340,8 +340,8 @@ static int make_simple_work_queue(const char *thread_name_prefix, const char *na } for (i = 0; i <= type->max_priority; i++) { - result = uds_make_funnel_queue(&queue->priority_lists[i]); - if (result != UDS_SUCCESS) { + result = vdo_make_funnel_queue(&queue->priority_lists[i]); + if (result != VDO_SUCCESS) { free_simple_work_queue(queue); return result; } From 533f49bcbe0ce9be60797ba4683b8e9e98617a00 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 13 Feb 2024 13:45:48 -0600 Subject: [PATCH 0911/1406] dm vdo: move funnel-requestqueue to dm-vdo/indexer/ Only ever used by indexer code. Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/Makefile | 2 +- drivers/md/dm-vdo/{ => indexer}/funnel-requestqueue.c | 8 ++++---- drivers/md/dm-vdo/{ => indexer}/funnel-requestqueue.h | 2 +- drivers/md/dm-vdo/indexer/index-session.c | 2 +- drivers/md/dm-vdo/indexer/index.c | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) rename drivers/md/dm-vdo/{ => indexer}/funnel-requestqueue.c (98%) rename drivers/md/dm-vdo/{ => indexer}/funnel-requestqueue.h (96%) diff --git a/drivers/md/dm-vdo/Makefile b/drivers/md/dm-vdo/Makefile index 923088f29ef20a..c16bd89eab6023 100644 --- a/drivers/md/dm-vdo/Makefile +++ b/drivers/md/dm-vdo/Makefile @@ -15,7 +15,6 @@ dm-vdo-objs := \ errors.o \ flush.o \ funnel-queue.o \ - funnel-requestqueue.o \ funnel-workqueue.o \ int-map.o \ io-submitter.o \ @@ -41,6 +40,7 @@ dm-vdo-objs := \ indexer/chapter-index.o \ indexer/config.o \ indexer/delta-index.o \ + indexer/funnel-requestqueue.o \ indexer/geometry.o \ indexer/index.o \ indexer/index-layout.o \ diff --git a/drivers/md/dm-vdo/funnel-requestqueue.c b/drivers/md/dm-vdo/indexer/funnel-requestqueue.c similarity index 98% rename from drivers/md/dm-vdo/funnel-requestqueue.c rename to drivers/md/dm-vdo/indexer/funnel-requestqueue.c index 1a5735375ddc06..fe7b6799cadaab 100644 --- a/drivers/md/dm-vdo/funnel-requestqueue.c +++ b/drivers/md/dm-vdo/indexer/funnel-requestqueue.c @@ -9,10 +9,10 @@ #include #include -#include "funnel-queue.h" -#include "logger.h" -#include "memory-alloc.h" -#include "thread-utils.h" +#include "../funnel-queue.h" +#include "../logger.h" +#include "../memory-alloc.h" +#include "../thread-utils.h" /* * This queue will attempt to handle requests in reasonably sized batches instead of reacting diff --git a/drivers/md/dm-vdo/funnel-requestqueue.h b/drivers/md/dm-vdo/indexer/funnel-requestqueue.h similarity index 96% rename from 
drivers/md/dm-vdo/funnel-requestqueue.h rename to drivers/md/dm-vdo/indexer/funnel-requestqueue.h index 6c6c5bf0d61bd5..9b0f53939b4dd0 100644 --- a/drivers/md/dm-vdo/funnel-requestqueue.h +++ b/drivers/md/dm-vdo/indexer/funnel-requestqueue.h @@ -6,7 +6,7 @@ #ifndef UDS_REQUEST_QUEUE_H #define UDS_REQUEST_QUEUE_H -#include "indexer/indexer.h" +#include "indexer.h" /* * A simple request queue which will handle new requests in the order in which they are received, diff --git a/drivers/md/dm-vdo/indexer/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c index 5d020783487f04..8a129d09392acd 100644 --- a/drivers/md/dm-vdo/indexer/index-session.c +++ b/drivers/md/dm-vdo/indexer/index-session.c @@ -7,11 +7,11 @@ #include -#include "../funnel-requestqueue.h" #include "../logger.h" #include "../memory-alloc.h" #include "../time-utils.h" +#include "funnel-requestqueue.h" #include "index.h" #include "index-layout.h" diff --git a/drivers/md/dm-vdo/indexer/index.c b/drivers/md/dm-vdo/indexer/index.c index 3d14c1e4035bf7..45bc163f65277b 100644 --- a/drivers/md/dm-vdo/indexer/index.c +++ b/drivers/md/dm-vdo/indexer/index.c @@ -3,11 +3,11 @@ * Copyright 2023 Red Hat */ +#include "funnel-requestqueue.h" #include "index.h" #include "hash-utils.h" #include "sparse-cache.h" -#include "../funnel-requestqueue.h" #include "../logger.h" #include "../memory-alloc.h" From f039f872c842657fa53d5f74bde2b793a3c118ee Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 13 Feb 2024 13:50:41 -0600 Subject: [PATCH 0912/1406] dm vdo funnel-workqueue: return VDO_SUCCESS from make_simple_work_queue Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/funnel-workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-vdo/funnel-workqueue.c b/drivers/md/dm-vdo/funnel-workqueue.c index ffcfd2fdd894f8..8758748e8222ce 100644 --- a/drivers/md/dm-vdo/funnel-workqueue.c +++ b/drivers/md/dm-vdo/funnel-workqueue.c @@ -367,7 +367,7 @@ static int make_simple_work_queue(const char *thread_name_prefix, const char *na wait_for_completion(&started); *queue_ptr = queue; - return UDS_SUCCESS; + return VDO_SUCCESS; } /** From 99a1b51f91a9b49f02eaa0da7464bff67499bce6 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 13 Feb 2024 14:57:33 -0600 Subject: [PATCH 0913/1406] dm vdo permassert: audit all of ASSERT to test for VDO_SUCCESS Also rename ASSERT to VDO_ASSERT and ASSERT_LOG_ONLY to VDO_ASSERT_LOG_ONLY. But re-introduce ASSERT and ASSERT_LOG_ONLY as placeholders for the benefit of dm-vdo/indexer (for historic UDS usage). Whether and when the indexer's usage will be updated is TBD.
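The converted pattern, representative of the hunks below: VDO_ASSERT()
returns VDO_SUCCESS or an error code that the caller must handle, while
VDO_ASSERT_LOG_ONLY() only logs on failure:

	result = VDO_ASSERT(info->busy == 0, "VDO Page must not be busy");
	if (result != VDO_SUCCESS)
		return result;

	VDO_ASSERT_LOG_ONLY(lock->locked, "lbn_lock with waiters is not locked");
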
Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/action-manager.c | 8 +- drivers/md/dm-vdo/block-map.c | 118 +++++++++---------- drivers/md/dm-vdo/completion.c | 10 +- drivers/md/dm-vdo/completion.h | 6 +- drivers/md/dm-vdo/data-vio.c | 108 +++++++++--------- drivers/md/dm-vdo/data-vio.h | 68 +++++------ drivers/md/dm-vdo/dedupe.c | 165 +++++++++++++-------------- drivers/md/dm-vdo/dm-vdo-target.c | 36 +++--- drivers/md/dm-vdo/encodings.c | 156 ++++++++++++------------- drivers/md/dm-vdo/errors.c | 5 +- drivers/md/dm-vdo/flush.c | 22 ++-- drivers/md/dm-vdo/funnel-workqueue.c | 22 ++-- drivers/md/dm-vdo/io-submitter.c | 8 +- drivers/md/dm-vdo/logical-zone.c | 22 ++-- drivers/md/dm-vdo/memory-alloc.c | 12 +- drivers/md/dm-vdo/packer.c | 12 +- drivers/md/dm-vdo/permassert.h | 15 ++- drivers/md/dm-vdo/physical-zone.c | 48 ++++---- drivers/md/dm-vdo/priority-table.c | 4 +- drivers/md/dm-vdo/recovery-journal.c | 60 +++++----- drivers/md/dm-vdo/repair.c | 12 +- drivers/md/dm-vdo/slab-depot.c | 116 +++++++++---------- drivers/md/dm-vdo/thread-registry.c | 4 +- drivers/md/dm-vdo/vdo.c | 32 +++--- drivers/md/dm-vdo/vio.c | 40 +++---- drivers/md/dm-vdo/vio.h | 8 +- 26 files changed, 560 insertions(+), 557 deletions(-) diff --git a/drivers/md/dm-vdo/action-manager.c b/drivers/md/dm-vdo/action-manager.c index 709be4c17d2721..a0e5e7077d1386 100644 --- a/drivers/md/dm-vdo/action-manager.c +++ b/drivers/md/dm-vdo/action-manager.c @@ -177,8 +177,8 @@ static void apply_to_zone(struct vdo_completion *completion) zone_count_t zone; struct action_manager *manager = as_action_manager(completion); - ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == get_acting_zone_thread_id(manager)), - "%s() called on acting zones's thread", __func__); + VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == get_acting_zone_thread_id(manager)), + "%s() called on acting zones's thread", __func__); zone = manager->acting_zone++; if (manager->acting_zone == manager->zones) { @@ -357,8 +357,8 @@ bool vdo_schedule_operation_with_context(struct action_manager *manager, { struct action *current_action; - ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == manager->initiator_thread_id), - "action initiated from correct thread"); + VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == manager->initiator_thread_id), + "action initiated from correct thread"); if (!manager->current_action->in_use) { current_action = manager->current_action; } else if (!manager->current_action->next->in_use) { diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c index 7490b0222fa7cf..cd81eb2715e829 100644 --- a/drivers/md/dm-vdo/block-map.c +++ b/drivers/md/dm-vdo/block-map.c @@ -248,16 +248,16 @@ static inline void assert_on_cache_thread(struct vdo_page_cache *cache, { thread_id_t thread_id = vdo_get_callback_thread_id(); - ASSERT_LOG_ONLY((thread_id == cache->zone->thread_id), - "%s() must only be called on cache thread %d, not thread %d", - function_name, cache->zone->thread_id, thread_id); + VDO_ASSERT_LOG_ONLY((thread_id == cache->zone->thread_id), + "%s() must only be called on cache thread %d, not thread %d", + function_name, cache->zone->thread_id, thread_id); } /** assert_io_allowed() - Assert that a page cache may issue I/O. 
*/ static inline void assert_io_allowed(struct vdo_page_cache *cache) { - ASSERT_LOG_ONLY(!vdo_is_state_quiescent(&cache->zone->state), - "VDO page cache may issue I/O"); + VDO_ASSERT_LOG_ONLY(!vdo_is_state_quiescent(&cache->zone->state), + "VDO page cache may issue I/O"); } /** report_cache_pressure() - Log and, if enabled, report cache pressure. */ @@ -289,9 +289,9 @@ static const char * __must_check get_page_state_name(enum vdo_page_buffer_state BUILD_BUG_ON(ARRAY_SIZE(state_names) != PAGE_STATE_COUNT); - result = ASSERT(state < ARRAY_SIZE(state_names), - "Unknown page_state value %d", state); - if (result != UDS_SUCCESS) + result = VDO_ASSERT(state < ARRAY_SIZE(state_names), + "Unknown page_state value %d", state); + if (result != VDO_SUCCESS) return "[UNKNOWN PAGE STATE]"; return state_names[state]; @@ -380,8 +380,8 @@ static int __must_check set_info_pbn(struct page_info *info, physical_block_numb struct vdo_page_cache *cache = info->cache; /* Either the new or the old page number must be NO_PAGE. */ - int result = ASSERT((pbn == NO_PAGE) || (info->pbn == NO_PAGE), - "Must free a page before reusing it."); + int result = VDO_ASSERT((pbn == NO_PAGE) || (info->pbn == NO_PAGE), + "Must free a page before reusing it."); if (result != VDO_SUCCESS) return result; @@ -403,13 +403,13 @@ static int reset_page_info(struct page_info *info) { int result; - result = ASSERT(info->busy == 0, "VDO Page must not be busy"); - if (result != UDS_SUCCESS) + result = VDO_ASSERT(info->busy == 0, "VDO Page must not be busy"); + if (result != VDO_SUCCESS) return result; - result = ASSERT(!vdo_waitq_has_waiters(&info->waiting), - "VDO Page must not have waiters"); - if (result != UDS_SUCCESS) + result = VDO_ASSERT(!vdo_waitq_has_waiters(&info->waiting), + "VDO Page must not have waiters"); + if (result != VDO_SUCCESS) return result; result = set_info_pbn(info, NO_PAGE); @@ -594,29 +594,29 @@ static int __must_check validate_completed_page(struct vdo_page_completion *comp { int result; - result = ASSERT(completion->ready, "VDO Page completion not ready"); - if (result != UDS_SUCCESS) + result = VDO_ASSERT(completion->ready, "VDO Page completion not ready"); + if (result != VDO_SUCCESS) return result; - result = ASSERT(completion->info != NULL, - "VDO Page Completion must be complete"); - if (result != UDS_SUCCESS) + result = VDO_ASSERT(completion->info != NULL, + "VDO Page Completion must be complete"); + if (result != VDO_SUCCESS) return result; - result = ASSERT(completion->info->pbn == completion->pbn, - "VDO Page Completion pbn must be consistent"); - if (result != UDS_SUCCESS) + result = VDO_ASSERT(completion->info->pbn == completion->pbn, + "VDO Page Completion pbn must be consistent"); + if (result != VDO_SUCCESS) return result; - result = ASSERT(is_valid(completion->info), - "VDO Page Completion page must be valid"); - if (result != UDS_SUCCESS) + result = VDO_ASSERT(is_valid(completion->info), + "VDO Page Completion page must be valid"); + if (result != VDO_SUCCESS) return result; if (writable) { - result = ASSERT(completion->writable, - "VDO Page Completion must be writable"); - if (result != UDS_SUCCESS) + result = VDO_ASSERT(completion->writable, + "VDO Page Completion must be writable"); + if (result != VDO_SUCCESS) return result; } @@ -778,7 +778,7 @@ static int __must_check launch_page_load(struct page_info *info, if (result != VDO_SUCCESS) return result; - result = ASSERT((info->busy == 0), "Page is not busy before loading."); + result = VDO_ASSERT((info->busy == 0), "Page is not busy before 
loading."); if (result != VDO_SUCCESS) return result; @@ -951,8 +951,8 @@ static void discard_a_page(struct vdo_page_cache *cache) return; } - ASSERT_LOG_ONLY(!is_in_flight(info), - "page selected for discard is not in flight"); + VDO_ASSERT_LOG_ONLY(!is_in_flight(info), + "page selected for discard is not in flight"); cache->discard_count++; info->write_status = WRITE_STATUS_DISCARD; @@ -1155,8 +1155,8 @@ void vdo_release_page_completion(struct vdo_completion *completion) discard_info = page_completion->info; } - ASSERT_LOG_ONLY((page_completion->waiter.next_waiter == NULL), - "Page being released after leaving all queues"); + VDO_ASSERT_LOG_ONLY((page_completion->waiter.next_waiter == NULL), + "Page being released after leaving all queues"); page_completion->info = NULL; cache = page_completion->cache; @@ -1219,8 +1219,8 @@ void vdo_get_page(struct vdo_page_completion *page_completion, struct page_info *info; assert_on_cache_thread(cache, __func__); - ASSERT_LOG_ONLY((page_completion->waiter.next_waiter == NULL), - "New page completion was not already on a wait queue"); + VDO_ASSERT_LOG_ONLY((page_completion->waiter.next_waiter == NULL), + "New page completion was not already on a wait queue"); *page_completion = (struct vdo_page_completion) { .pbn = pbn, @@ -1267,7 +1267,7 @@ void vdo_get_page(struct vdo_page_completion *page_completion, } /* Something horrible has gone wrong. */ - ASSERT_LOG_ONLY(false, "Info found in a usable state."); + VDO_ASSERT_LOG_ONLY(false, "Info found in a usable state."); } /* The page must be fetched. */ @@ -1336,7 +1336,7 @@ int vdo_invalidate_page_cache(struct vdo_page_cache *cache) /* Make sure we don't throw away any dirty pages. */ for (info = cache->infos; info < cache->infos + cache->page_count; info++) { - int result = ASSERT(!is_dirty(info), "cache must have no dirty pages"); + int result = VDO_ASSERT(!is_dirty(info), "cache must have no dirty pages"); if (result != VDO_SUCCESS) return result; @@ -1442,10 +1442,10 @@ static bool __must_check is_not_older(struct block_map_zone *zone, u8 a, u8 b) { int result; - result = ASSERT((in_cyclic_range(zone->oldest_generation, a, zone->generation, 1 << 8) && - in_cyclic_range(zone->oldest_generation, b, zone->generation, 1 << 8)), - "generation(s) %u, %u are out of range [%u, %u]", - a, b, zone->oldest_generation, zone->generation); + result = VDO_ASSERT((in_cyclic_range(zone->oldest_generation, a, zone->generation, 1 << 8) && + in_cyclic_range(zone->oldest_generation, b, zone->generation, 1 << 8)), + "generation(s) %u, %u are out of range [%u, %u]", + a, b, zone->oldest_generation, zone->generation); if (result != VDO_SUCCESS) { enter_zone_read_only_mode(zone, result); return true; @@ -1458,8 +1458,8 @@ static void release_generation(struct block_map_zone *zone, u8 generation) { int result; - result = ASSERT((zone->dirty_page_counts[generation] > 0), - "dirty page count underflow for generation %u", generation); + result = VDO_ASSERT((zone->dirty_page_counts[generation] > 0), + "dirty page count underflow for generation %u", generation); if (result != VDO_SUCCESS) { enter_zone_read_only_mode(zone, result); return; @@ -1484,8 +1484,8 @@ static void set_generation(struct block_map_zone *zone, struct tree_page *page, page->generation = new_generation; new_count = ++zone->dirty_page_counts[new_generation]; - result = ASSERT((new_count != 0), "dirty page count overflow for generation %u", - new_generation); + result = VDO_ASSERT((new_count != 0), "dirty page count overflow for generation %u", + new_generation); 
if (result != VDO_SUCCESS) { enter_zone_read_only_mode(zone, result); return; @@ -1700,15 +1700,15 @@ static void release_page_lock(struct data_vio *data_vio, char *what) struct tree_lock *lock_holder; struct tree_lock *lock = &data_vio->tree_lock; - ASSERT_LOG_ONLY(lock->locked, - "release of unlocked block map page %s for key %llu in tree %u", - what, (unsigned long long) lock->key, lock->root_index); + VDO_ASSERT_LOG_ONLY(lock->locked, + "release of unlocked block map page %s for key %llu in tree %u", + what, (unsigned long long) lock->key, lock->root_index); zone = data_vio->logical.zone->block_map_zone; lock_holder = vdo_int_map_remove(zone->loading_pages, lock->key); - ASSERT_LOG_ONLY((lock_holder == lock), - "block map page %s mismatch for key %llu in tree %u", - what, (unsigned long long) lock->key, lock->root_index); + VDO_ASSERT_LOG_ONLY((lock_holder == lock), + "block map page %s mismatch for key %llu in tree %u", + what, (unsigned long long) lock->key, lock->root_index); lock->locked = false; } @@ -2010,8 +2010,8 @@ static void write_expired_elements(struct block_map_zone *zone) list_del_init(&page->entry); - result = ASSERT(!vdo_waiter_is_waiting(&page->waiter), - "Newly expired page not already waiting to write"); + result = VDO_ASSERT(!vdo_waiter_is_waiting(&page->waiter), + "Newly expired page not already waiting to write"); if (result != VDO_SUCCESS) { enter_zone_read_only_mode(zone, result); continue; @@ -2869,8 +2869,8 @@ int vdo_decode_block_map(struct block_map_state_2_0 state, block_count_t logical BUILD_BUG_ON(VDO_BLOCK_MAP_ENTRIES_PER_PAGE != ((VDO_BLOCK_SIZE - sizeof(struct block_map_page)) / sizeof(struct block_map_entry))); - result = ASSERT(cache_size > 0, "block map cache size is specified"); - if (result != UDS_SUCCESS) + result = VDO_ASSERT(cache_size > 0, "block map cache size is specified"); + if (result != VDO_SUCCESS) return result; result = vdo_allocate_extended(struct block_map, @@ -2939,7 +2939,7 @@ void vdo_initialize_block_map_from_journal(struct block_map *map, for (z = 0; z < map->zone_count; z++) { struct dirty_lists *dirty_lists = map->zones[z].dirty_lists; - ASSERT_LOG_ONLY(dirty_lists->next_period == 0, "current period not set"); + VDO_ASSERT_LOG_ONLY(dirty_lists->next_period == 0, "current period not set"); dirty_lists->oldest_period = map->current_era_point; dirty_lists->next_period = map->current_era_point + 1; dirty_lists->offset = map->current_era_point % dirty_lists->maximum_age; @@ -2973,8 +2973,8 @@ static void initiate_drain(struct admin_state *state) { struct block_map_zone *zone = container_of(state, struct block_map_zone, state); - ASSERT_LOG_ONLY((zone->active_lookups == 0), - "%s() called with no active lookups", __func__); + VDO_ASSERT_LOG_ONLY((zone->active_lookups == 0), + "%s() called with no active lookups", __func__); if (!vdo_is_state_suspending(state)) { while (zone->dirty_lists->oldest_period < zone->dirty_lists->next_period) diff --git a/drivers/md/dm-vdo/completion.c b/drivers/md/dm-vdo/completion.c index 9e2381dc3683a3..5ad85334632d79 100644 --- a/drivers/md/dm-vdo/completion.c +++ b/drivers/md/dm-vdo/completion.c @@ -60,7 +60,7 @@ void vdo_initialize_completion(struct vdo_completion *completion, static inline void assert_incomplete(struct vdo_completion *completion) { - ASSERT_LOG_ONLY(!completion->complete, "completion is not complete"); + VDO_ASSERT_LOG_ONLY(!completion->complete, "completion is not complete"); } /** @@ -111,10 +111,10 @@ void vdo_enqueue_completion(struct vdo_completion *completion, struct vdo *vdo 
= completion->vdo; thread_id_t thread_id = completion->callback_thread_id; - if (ASSERT(thread_id < vdo->thread_config.thread_count, - "thread_id %u (completion type %d) is less than thread count %u", - thread_id, completion->type, - vdo->thread_config.thread_count) != UDS_SUCCESS) + if (VDO_ASSERT(thread_id < vdo->thread_config.thread_count, + "thread_id %u (completion type %d) is less than thread count %u", + thread_id, completion->type, + vdo->thread_config.thread_count) != VDO_SUCCESS) BUG(); completion->requeue = false; diff --git a/drivers/md/dm-vdo/completion.h b/drivers/md/dm-vdo/completion.h index aa145d73a6867f..3407f34ce58c9c 100644 --- a/drivers/md/dm-vdo/completion.h +++ b/drivers/md/dm-vdo/completion.h @@ -85,9 +85,9 @@ static inline void vdo_fail_completion(struct vdo_completion *completion, int re static inline int vdo_assert_completion_type(struct vdo_completion *completion, enum vdo_completion_type expected) { - return ASSERT(expected == completion->type, - "completion type should be %u, not %u", expected, - completion->type); + return VDO_ASSERT(expected == completion->type, + "completion type should be %u, not %u", expected, + completion->type); } static inline void vdo_set_completion_callback(struct vdo_completion *completion, diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c index d0913653bad9fc..5f6f0883850a6b 100644 --- a/drivers/md/dm-vdo/data-vio.c +++ b/drivers/md/dm-vdo/data-vio.c @@ -235,8 +235,8 @@ static bool check_for_drain_complete_locked(struct data_vio_pool *pool) if (pool->limiter.busy > 0) return false; - ASSERT_LOG_ONLY((pool->discard_limiter.busy == 0), - "no outstanding discard permits"); + VDO_ASSERT_LOG_ONLY((pool->discard_limiter.busy == 0), + "no outstanding discard permits"); return (bio_list_empty(&pool->limiter.new_waiters) && bio_list_empty(&pool->discard_limiter.new_waiters)); @@ -280,9 +280,9 @@ static void acknowledge_data_vio(struct data_vio *data_vio) if (bio == NULL) return; - ASSERT_LOG_ONLY((data_vio->remaining_discard <= - (u32) (VDO_BLOCK_SIZE - data_vio->offset)), - "data_vio to acknowledge is not an incomplete discard"); + VDO_ASSERT_LOG_ONLY((data_vio->remaining_discard <= + (u32) (VDO_BLOCK_SIZE - data_vio->offset)), + "data_vio to acknowledge is not an incomplete discard"); data_vio->user_bio = NULL; vdo_count_bios(&vdo->stats.bios_acknowledged, bio); @@ -446,7 +446,7 @@ static void attempt_logical_block_lock(struct vdo_completion *completion) return; } - result = ASSERT(lock_holder->logical.locked, "logical block lock held"); + result = VDO_ASSERT(lock_holder->logical.locked, "logical block lock held"); if (result != VDO_SUCCESS) { continue_data_vio_with_error(data_vio, result); return; @@ -629,9 +629,9 @@ static void update_limiter(struct limiter *limiter) struct bio_list *waiters = &limiter->waiters; data_vio_count_t available = limiter->limit - limiter->busy; - ASSERT_LOG_ONLY((limiter->release_count <= limiter->busy), - "Release count %u is not more than busy count %u", - limiter->release_count, limiter->busy); + VDO_ASSERT_LOG_ONLY((limiter->release_count <= limiter->busy), + "Release count %u is not more than busy count %u", + limiter->release_count, limiter->busy); get_waiters(limiter); for (; (limiter->release_count > 0) && !bio_list_empty(waiters); limiter->release_count--) @@ -852,8 +852,8 @@ int make_data_vio_pool(struct vdo *vdo, data_vio_count_t pool_size, if (result != VDO_SUCCESS) return result; - ASSERT_LOG_ONLY((discard_limit <= pool_size), - "discard limit does not exceed pool size"); + 
VDO_ASSERT_LOG_ONLY((discard_limit <= pool_size), + "discard limit does not exceed pool size"); initialize_limiter(&pool->discard_limiter, pool, assign_discard_permit, discard_limit); pool->discard_limiter.permitted_waiters = &pool->permitted_discards; @@ -910,15 +910,15 @@ void free_data_vio_pool(struct data_vio_pool *pool) BUG_ON(atomic_read(&pool->processing)); spin_lock(&pool->lock); - ASSERT_LOG_ONLY((pool->limiter.busy == 0), - "data_vio pool must not have %u busy entries when being freed", - pool->limiter.busy); - ASSERT_LOG_ONLY((bio_list_empty(&pool->limiter.waiters) && - bio_list_empty(&pool->limiter.new_waiters)), - "data_vio pool must not have threads waiting to read or write when being freed"); - ASSERT_LOG_ONLY((bio_list_empty(&pool->discard_limiter.waiters) && - bio_list_empty(&pool->discard_limiter.new_waiters)), - "data_vio pool must not have threads waiting to discard when being freed"); + VDO_ASSERT_LOG_ONLY((pool->limiter.busy == 0), + "data_vio pool must not have %u busy entries when being freed", + pool->limiter.busy); + VDO_ASSERT_LOG_ONLY((bio_list_empty(&pool->limiter.waiters) && + bio_list_empty(&pool->limiter.new_waiters)), + "data_vio pool must not have threads waiting to read or write when being freed"); + VDO_ASSERT_LOG_ONLY((bio_list_empty(&pool->discard_limiter.waiters) && + bio_list_empty(&pool->discard_limiter.new_waiters)), + "data_vio pool must not have threads waiting to discard when being freed"); spin_unlock(&pool->lock); list_for_each_entry_safe(data_vio, tmp, &pool->available, pool_entry) { @@ -963,8 +963,8 @@ void vdo_launch_bio(struct data_vio_pool *pool, struct bio *bio) { struct data_vio *data_vio; - ASSERT_LOG_ONLY(!vdo_is_state_quiescent(&pool->state), - "data_vio_pool not quiescent on acquire"); + VDO_ASSERT_LOG_ONLY(!vdo_is_state_quiescent(&pool->state), + "data_vio_pool not quiescent on acquire"); bio->bi_private = (void *) jiffies; spin_lock(&pool->lock); @@ -1000,8 +1000,8 @@ static void initiate_drain(struct admin_state *state) static void assert_on_vdo_cpu_thread(const struct vdo *vdo, const char *name) { - ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == vdo->thread_config.cpu_thread), - "%s called on cpu thread", name); + VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == vdo->thread_config.cpu_thread), + "%s called on cpu thread", name); } /** @@ -1175,17 +1175,17 @@ static void release_lock(struct data_vio *data_vio, struct lbn_lock *lock) /* The lock is not locked, so it had better not be registered in the lock map. */ struct data_vio *lock_holder = vdo_int_map_get(lock_map, lock->lbn); - ASSERT_LOG_ONLY((data_vio != lock_holder), - "no logical block lock held for block %llu", - (unsigned long long) lock->lbn); + VDO_ASSERT_LOG_ONLY((data_vio != lock_holder), + "no logical block lock held for block %llu", + (unsigned long long) lock->lbn); return; } /* Release the lock by removing the lock from the map. 
*/ lock_holder = vdo_int_map_remove(lock_map, lock->lbn); - ASSERT_LOG_ONLY((data_vio == lock_holder), - "logical block lock mismatch for block %llu", - (unsigned long long) lock->lbn); + VDO_ASSERT_LOG_ONLY((data_vio == lock_holder), + "logical block lock mismatch for block %llu", + (unsigned long long) lock->lbn); lock->locked = false; } @@ -1195,7 +1195,7 @@ static void transfer_lock(struct data_vio *data_vio, struct lbn_lock *lock) struct data_vio *lock_holder, *next_lock_holder; int result; - ASSERT_LOG_ONLY(lock->locked, "lbn_lock with waiters is not locked"); + VDO_ASSERT_LOG_ONLY(lock->locked, "lbn_lock with waiters is not locked"); /* Another data_vio is waiting for the lock, transfer it in a single lock map operation. */ next_lock_holder = @@ -1212,9 +1212,9 @@ static void transfer_lock(struct data_vio *data_vio, struct lbn_lock *lock) return; } - ASSERT_LOG_ONLY((lock_holder == data_vio), - "logical block lock mismatch for block %llu", - (unsigned long long) lock->lbn); + VDO_ASSERT_LOG_ONLY((lock_holder == data_vio), + "logical block lock mismatch for block %llu", + (unsigned long long) lock->lbn); lock->locked = false; /* @@ -1277,10 +1277,10 @@ static void finish_cleanup(struct data_vio *data_vio) { struct vdo_completion *completion = &data_vio->vio.completion; - ASSERT_LOG_ONLY(data_vio->allocation.lock == NULL, - "complete data_vio has no allocation lock"); - ASSERT_LOG_ONLY(data_vio->hash_lock == NULL, - "complete data_vio has no hash lock"); + VDO_ASSERT_LOG_ONLY(data_vio->allocation.lock == NULL, + "complete data_vio has no allocation lock"); + VDO_ASSERT_LOG_ONLY(data_vio->hash_lock == NULL, + "complete data_vio has no hash lock"); if ((data_vio->remaining_discard <= VDO_BLOCK_SIZE) || (completion->result != VDO_SUCCESS)) { struct data_vio_pool *pool = completion->vdo->data_vio_pool; @@ -1406,8 +1406,8 @@ void data_vio_allocate_data_block(struct data_vio *data_vio, { struct allocation *allocation = &data_vio->allocation; - ASSERT_LOG_ONLY((allocation->pbn == VDO_ZERO_BLOCK), - "data_vio does not have an allocation"); + VDO_ASSERT_LOG_ONLY((allocation->pbn == VDO_ZERO_BLOCK), + "data_vio does not have an allocation"); allocation->write_lock_type = write_lock_type; allocation->zone = vdo_get_next_allocation_zone(data_vio->logical.zone); allocation->first_allocation_zone = allocation->zone->zone_number; @@ -1798,11 +1798,11 @@ static void compress_data_vio(struct vdo_completion *completion) */ void launch_compress_data_vio(struct data_vio *data_vio) { - ASSERT_LOG_ONLY(!data_vio->is_duplicate, "compressing a non-duplicate block"); - ASSERT_LOG_ONLY(data_vio->hash_lock != NULL, - "data_vio to compress has a hash_lock"); - ASSERT_LOG_ONLY(data_vio_has_allocation(data_vio), - "data_vio to compress has an allocation"); + VDO_ASSERT_LOG_ONLY(!data_vio->is_duplicate, "compressing a non-duplicate block"); + VDO_ASSERT_LOG_ONLY(data_vio->hash_lock != NULL, + "data_vio to compress has a hash_lock"); + VDO_ASSERT_LOG_ONLY(data_vio_has_allocation(data_vio), + "data_vio to compress has an allocation"); /* * There are 4 reasons why a data_vio which has reached this point will not be eligible for @@ -1843,7 +1843,7 @@ static void hash_data_vio(struct vdo_completion *completion) struct data_vio *data_vio = as_data_vio(completion); assert_data_vio_on_cpu_thread(data_vio); - ASSERT_LOG_ONLY(!data_vio->is_zero, "zero blocks should not be hashed"); + VDO_ASSERT_LOG_ONLY(!data_vio->is_zero, "zero blocks should not be hashed"); murmurhash3_128(data_vio->vio.data, VDO_BLOCK_SIZE, 0x62ea60be, 
&data_vio->record_name); @@ -1858,7 +1858,7 @@ static void hash_data_vio(struct vdo_completion *completion) static void prepare_for_dedupe(struct data_vio *data_vio) { /* We don't care what thread we are on. */ - ASSERT_LOG_ONLY(!data_vio->is_zero, "must not prepare to dedupe zero blocks"); + VDO_ASSERT_LOG_ONLY(!data_vio->is_zero, "must not prepare to dedupe zero blocks"); /* * Before we can dedupe, we need to know the record name, so the first @@ -1931,11 +1931,11 @@ static void acknowledge_write_callback(struct vdo_completion *completion) struct data_vio *data_vio = as_data_vio(completion); struct vdo *vdo = completion->vdo; - ASSERT_LOG_ONLY((!vdo_uses_bio_ack_queue(vdo) || - (vdo_get_callback_thread_id() == vdo->thread_config.bio_ack_thread)), - "%s() called on bio ack queue", __func__); - ASSERT_LOG_ONLY(data_vio_has_flush_generation_lock(data_vio), - "write VIO to be acknowledged has a flush generation lock"); + VDO_ASSERT_LOG_ONLY((!vdo_uses_bio_ack_queue(vdo) || + (vdo_get_callback_thread_id() == vdo->thread_config.bio_ack_thread)), + "%s() called on bio ack queue", __func__); + VDO_ASSERT_LOG_ONLY(data_vio_has_flush_generation_lock(data_vio), + "write VIO to be acknowledged has a flush generation lock"); acknowledge_data_vio(data_vio); if (data_vio->new_mapped.pbn == VDO_ZERO_BLOCK) { /* This is a zero write or discard */ @@ -2000,8 +2000,8 @@ static void handle_allocation_error(struct vdo_completion *completion) static int assert_is_discard(struct data_vio *data_vio) { - int result = ASSERT(data_vio->is_discard, - "data_vio with no block map page is a discard"); + int result = VDO_ASSERT(data_vio->is_discard, + "data_vio with no block map page is a discard"); return ((result == VDO_SUCCESS) ? result : VDO_READ_ONLY); } diff --git a/drivers/md/dm-vdo/data-vio.h b/drivers/md/dm-vdo/data-vio.h index 51324153f622a5..a1d3da95f83910 100644 --- a/drivers/md/dm-vdo/data-vio.h +++ b/drivers/md/dm-vdo/data-vio.h @@ -279,7 +279,7 @@ struct data_vio { static inline struct data_vio *vio_as_data_vio(struct vio *vio) { - ASSERT_LOG_ONLY((vio->type == VIO_TYPE_DATA), "vio is a data_vio"); + VDO_ASSERT_LOG_ONLY((vio->type == VIO_TYPE_DATA), "vio is a data_vio"); return container_of(vio, struct data_vio, vio); } @@ -373,9 +373,9 @@ static inline void assert_data_vio_in_hash_zone(struct data_vio *data_vio) * It's odd to use the LBN, but converting the record name to hex is a bit clunky for an * inline, and the LBN better than nothing as an identifier. 
 */
-	ASSERT_LOG_ONLY((expected == thread_id),
-			"data_vio for logical block %llu on thread %u, should be on hash zone thread %u",
-			(unsigned long long) data_vio->logical.lbn, thread_id, expected);
+	VDO_ASSERT_LOG_ONLY((expected == thread_id),
+			    "data_vio for logical block %llu on thread %u, should be on hash zone thread %u",
+			    (unsigned long long) data_vio->logical.lbn, thread_id, expected);
 }
 
 static inline void set_data_vio_hash_zone_callback(struct data_vio *data_vio,
@@ -401,9 +401,9 @@ static inline void assert_data_vio_in_logical_zone(struct data_vio *data_vio)
 	thread_id_t expected = data_vio->logical.zone->thread_id;
 	thread_id_t thread_id = vdo_get_callback_thread_id();
 
-	ASSERT_LOG_ONLY((expected == thread_id),
-			"data_vio for logical block %llu on thread %u, should be on thread %u",
-			(unsigned long long) data_vio->logical.lbn, thread_id, expected);
+	VDO_ASSERT_LOG_ONLY((expected == thread_id),
+			    "data_vio for logical block %llu on thread %u, should be on thread %u",
+			    (unsigned long long) data_vio->logical.lbn, thread_id, expected);
 }
 
 static inline void set_data_vio_logical_callback(struct data_vio *data_vio,
@@ -429,10 +429,10 @@ static inline void assert_data_vio_in_allocated_zone(struct data_vio *data_vio)
 	thread_id_t expected = data_vio->allocation.zone->thread_id;
 	thread_id_t thread_id = vdo_get_callback_thread_id();
 
-	ASSERT_LOG_ONLY((expected == thread_id),
-			"struct data_vio for allocated physical block %llu on thread %u, should be on thread %u",
-			(unsigned long long) data_vio->allocation.pbn, thread_id,
-			expected);
+	VDO_ASSERT_LOG_ONLY((expected == thread_id),
+			    "struct data_vio for allocated physical block %llu on thread %u, should be on thread %u",
+			    (unsigned long long) data_vio->allocation.pbn, thread_id,
+			    expected);
 }
 
 static inline void set_data_vio_allocated_zone_callback(struct data_vio *data_vio,
@@ -459,10 +459,10 @@ static inline void assert_data_vio_in_duplicate_zone(struct data_vio *data_vio)
 	thread_id_t expected = data_vio->duplicate.zone->thread_id;
 	thread_id_t thread_id = vdo_get_callback_thread_id();
 
-	ASSERT_LOG_ONLY((expected == thread_id),
-			"data_vio for duplicate physical block %llu on thread %u, should be on thread %u",
-			(unsigned long long) data_vio->duplicate.pbn, thread_id,
-			expected);
+	VDO_ASSERT_LOG_ONLY((expected == thread_id),
+			    "data_vio for duplicate physical block %llu on thread %u, should be on thread %u",
+			    (unsigned long long) data_vio->duplicate.pbn, thread_id,
+			    expected);
 }
 
 static inline void set_data_vio_duplicate_zone_callback(struct data_vio *data_vio,
@@ -489,9 +489,9 @@ static inline void assert_data_vio_in_mapped_zone(struct data_vio *data_vio)
 	thread_id_t expected = data_vio->mapped.zone->thread_id;
 	thread_id_t thread_id = vdo_get_callback_thread_id();
 
-	ASSERT_LOG_ONLY((expected == thread_id),
-			"data_vio for mapped physical block %llu on thread %u, should be on thread %u",
-			(unsigned long long) data_vio->mapped.pbn, thread_id, expected);
+	VDO_ASSERT_LOG_ONLY((expected == thread_id),
+			    "data_vio for mapped physical block %llu on thread %u, should be on thread %u",
+			    (unsigned long long) data_vio->mapped.pbn, thread_id, expected);
 }
 
 static inline void set_data_vio_mapped_zone_callback(struct data_vio *data_vio,
@@ -506,10 +506,10 @@ static inline void assert_data_vio_in_new_mapped_zone(struct data_vio *data_vio)
 	thread_id_t expected = data_vio->new_mapped.zone->thread_id;
 	thread_id_t thread_id = vdo_get_callback_thread_id();
 
-	ASSERT_LOG_ONLY((expected == thread_id),
-			"data_vio for new_mapped physical block %llu on thread %u, should be on thread %u",
-			(unsigned long long) data_vio->new_mapped.pbn, thread_id,
-			expected);
+	VDO_ASSERT_LOG_ONLY((expected == thread_id),
+			    "data_vio for new_mapped physical block %llu on thread %u, should be on thread %u",
+			    (unsigned long long) data_vio->new_mapped.pbn, thread_id,
+			    expected);
 }
 
 static inline void set_data_vio_new_mapped_zone_callback(struct data_vio *data_vio,
@@ -524,10 +524,10 @@ static inline void assert_data_vio_in_journal_zone(struct data_vio *data_vio)
 	thread_id_t journal_thread = vdo_from_data_vio(data_vio)->thread_config.journal_thread;
 	thread_id_t thread_id = vdo_get_callback_thread_id();
 
-	ASSERT_LOG_ONLY((journal_thread == thread_id),
-			"data_vio for logical block %llu on thread %u, should be on journal thread %u",
-			(unsigned long long) data_vio->logical.lbn, thread_id,
-			journal_thread);
+	VDO_ASSERT_LOG_ONLY((journal_thread == thread_id),
+			    "data_vio for logical block %llu on thread %u, should be on journal thread %u",
+			    (unsigned long long) data_vio->logical.lbn, thread_id,
+			    journal_thread);
 }
 
 static inline void set_data_vio_journal_callback(struct data_vio *data_vio,
@@ -554,10 +554,10 @@ static inline void assert_data_vio_in_packer_zone(struct data_vio *data_vio)
 	thread_id_t packer_thread = vdo_from_data_vio(data_vio)->thread_config.packer_thread;
 	thread_id_t thread_id = vdo_get_callback_thread_id();
 
-	ASSERT_LOG_ONLY((packer_thread == thread_id),
-			"data_vio for logical block %llu on thread %u, should be on packer thread %u",
-			(unsigned long long) data_vio->logical.lbn, thread_id,
-			packer_thread);
+	VDO_ASSERT_LOG_ONLY((packer_thread == thread_id),
+			    "data_vio for logical block %llu on thread %u, should be on packer thread %u",
+			    (unsigned long long) data_vio->logical.lbn, thread_id,
+			    packer_thread);
 }
 
 static inline void set_data_vio_packer_callback(struct data_vio *data_vio,
@@ -584,10 +584,10 @@ static inline void assert_data_vio_on_cpu_thread(struct data_vio *data_vio)
 	thread_id_t cpu_thread = vdo_from_data_vio(data_vio)->thread_config.cpu_thread;
 	thread_id_t thread_id = vdo_get_callback_thread_id();
 
-	ASSERT_LOG_ONLY((cpu_thread == thread_id),
-			"data_vio for logical block %llu on thread %u, should be on cpu thread %u",
-			(unsigned long long) data_vio->logical.lbn, thread_id,
-			cpu_thread);
+	VDO_ASSERT_LOG_ONLY((cpu_thread == thread_id),
+			    "data_vio for logical block %llu on thread %u, should be on cpu thread %u",
+			    (unsigned long long) data_vio->logical.lbn, thread_id,
+			    cpu_thread);
 }
 
 static inline void set_data_vio_cpu_callback(struct data_vio *data_vio,
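Every hunk above instantiates the same pattern: look up the thread that owns a zone, look up the current callback thread, and log any mismatch without halting. A minimal standalone sketch of that pattern follows; struct zone_like and struct vio_like are hypothetical stand-ins, and only VDO_ASSERT_LOG_ONLY() and vdo_get_callback_thread_id() are taken from the sources above:

	typedef unsigned int thread_id_t;

	/* Hypothetical stand-ins for the real VDO structures. */
	struct zone_like {
		thread_id_t thread_id;	/* the only thread allowed to touch this zone */
	};

	struct vio_like {
		struct zone_like *zone;
		unsigned long long lbn;
	};

	extern thread_id_t vdo_get_callback_thread_id(void);

	static inline void assert_vio_in_zone(struct vio_like *vio)
	{
		thread_id_t expected = vio->zone->thread_id;
		thread_id_t thread_id = vdo_get_callback_thread_id();

		/* Log-only: a violation is reported, but execution continues. */
		VDO_ASSERT_LOG_ONLY((expected == thread_id),
				    "vio for block %llu on thread %u, should be on thread %u",
				    vio->lbn, thread_id, expected);
	}

Because each zone's state is confined to a single thread, these checks document and police that confinement rather than taking a lock.
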
diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c
index 43437d10399600..d7d1249cde8c5e 100644
--- a/drivers/md/dm-vdo/dedupe.c
+++ b/drivers/md/dm-vdo/dedupe.c
@@ -331,8 +331,8 @@ static inline struct hash_zones *as_hash_zones(struct vdo_completion *completion
 
 static inline void assert_in_hash_zone(struct hash_zone *zone, const char *name)
 {
-	ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == zone->thread_id),
-			"%s called on hash zone thread", name);
+	VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == zone->thread_id),
+			    "%s called on hash zone thread", name);
 }
 
 static inline bool change_context_state(struct dedupe_context *context, int old, int new)
@@ -408,8 +408,8 @@ static void assert_hash_lock_agent(struct data_vio *data_vio, const char *where)
 {
 	/* Not safe to access the agent field except from the hash zone. */
 	assert_data_vio_in_hash_zone(data_vio);
-	ASSERT_LOG_ONLY(data_vio == data_vio->hash_lock->agent,
-			"%s must be for the hash lock agent", where);
+	VDO_ASSERT_LOG_ONLY(data_vio == data_vio->hash_lock->agent,
+			    "%s must be for the hash lock agent", where);
 }
 
 /**
@@ -420,9 +420,8 @@ static void assert_hash_lock_agent(struct data_vio *data_vio, const char *where)
  */
 static void set_duplicate_lock(struct hash_lock *hash_lock, struct pbn_lock *pbn_lock)
 {
-	ASSERT_LOG_ONLY((hash_lock->duplicate_lock == NULL),
-			"hash lock must not already hold a duplicate lock");
-
+	VDO_ASSERT_LOG_ONLY((hash_lock->duplicate_lock == NULL),
+			    "hash lock must not already hold a duplicate lock");
 	pbn_lock->holder_count += 1;
 	hash_lock->duplicate_lock = pbn_lock;
 }
@@ -450,12 +449,12 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock)
 	struct hash_lock *old_lock = data_vio->hash_lock;
 
 	if (old_lock != NULL) {
-		ASSERT_LOG_ONLY(data_vio->hash_zone != NULL,
-				"must have a hash zone when holding a hash lock");
-		ASSERT_LOG_ONLY(!list_empty(&data_vio->hash_lock_entry),
-				"must be on a hash lock ring when holding a hash lock");
-		ASSERT_LOG_ONLY(old_lock->reference_count > 0,
-				"hash lock reference must be counted");
+		VDO_ASSERT_LOG_ONLY(data_vio->hash_zone != NULL,
+				    "must have a hash zone when holding a hash lock");
+		VDO_ASSERT_LOG_ONLY(!list_empty(&data_vio->hash_lock_entry),
+				    "must be on a hash lock ring when holding a hash lock");
+		VDO_ASSERT_LOG_ONLY(old_lock->reference_count > 0,
+				    "hash lock reference must be counted");
 
 		if ((old_lock->state != VDO_HASH_LOCK_BYPASSING) &&
 		    (old_lock->state != VDO_HASH_LOCK_UNLOCKING)) {
@@ -463,9 +462,9 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock)
 			 * If the reference count goes to zero in a non-terminal state, we're most
 			 * likely leaking this lock.
 			 */
-			ASSERT_LOG_ONLY(old_lock->reference_count > 1,
-					"hash locks should only become unreferenced in a terminal state, not state %s",
-					get_hash_lock_state_name(old_lock->state));
+			VDO_ASSERT_LOG_ONLY(old_lock->reference_count > 1,
+					    "hash locks should only become unreferenced in a terminal state, not state %s",
+					    get_hash_lock_state_name(old_lock->state));
 		}
 
 		list_del_init(&data_vio->hash_lock_entry);
@@ -645,8 +644,8 @@ static void finish_unlocking(struct vdo_completion *completion)
 
 	assert_hash_lock_agent(agent, __func__);
 
-	ASSERT_LOG_ONLY(lock->duplicate_lock == NULL,
-			"must have released the duplicate lock for the hash lock");
+	VDO_ASSERT_LOG_ONLY(lock->duplicate_lock == NULL,
+			    "must have released the duplicate lock for the hash lock");
 
 	if (!lock->verified) {
 		/*
@@ -700,8 +699,8 @@ static void unlock_duplicate_pbn(struct vdo_completion *completion)
 	struct hash_lock *lock = agent->hash_lock;
 
 	assert_data_vio_in_duplicate_zone(agent);
-	ASSERT_LOG_ONLY(lock->duplicate_lock != NULL,
-			"must have a duplicate lock to release");
+	VDO_ASSERT_LOG_ONLY(lock->duplicate_lock != NULL,
+			    "must have a duplicate lock to release");
 
 	vdo_release_physical_zone_pbn_lock(agent->duplicate.zone, agent->duplicate.pbn,
 					   vdo_forget(lock->duplicate_lock));
@@ -803,8 +802,8 @@ static void start_updating(struct hash_lock *lock, struct data_vio *agent)
 {
 	lock->state = VDO_HASH_LOCK_UPDATING;
 
-	ASSERT_LOG_ONLY(lock->verified, "new advice should have been verified");
-	ASSERT_LOG_ONLY(lock->update_advice, "should only update advice if needed");
+	VDO_ASSERT_LOG_ONLY(lock->verified, "new advice should have been verified");
+	VDO_ASSERT_LOG_ONLY(lock->update_advice, "should only update advice if needed");
 
 	agent->last_async_operation = VIO_ASYNC_OP_UPDATE_DEDUPE_INDEX;
 	set_data_vio_hash_zone_callback(agent, finish_updating);
@@ -826,9 +825,9 @@ static void finish_deduping(struct hash_lock *lock, struct data_vio *data_vio)
 {
 	struct data_vio *agent = data_vio;
 
-	ASSERT_LOG_ONLY(lock->agent == NULL, "shouldn't have an agent in DEDUPING");
-	ASSERT_LOG_ONLY(!vdo_waitq_has_waiters(&lock->waiters),
-			"shouldn't have any lock waiters in DEDUPING");
+	VDO_ASSERT_LOG_ONLY(lock->agent == NULL, "shouldn't have an agent in DEDUPING");
+	VDO_ASSERT_LOG_ONLY(!vdo_waitq_has_waiters(&lock->waiters),
+			    "shouldn't have any lock waiters in DEDUPING");
 
 	/* Just release the lock reference if other data_vios are still deduping. */
 	if (lock->reference_count > 1) {
@@ -883,8 +882,8 @@ static int __must_check acquire_lock(struct hash_zone *zone,
 	 * Borrow and prepare a lock from the pool so we don't have to do two int_map accesses
 	 * in the common case of no lock contention.
 	 */
-	result = ASSERT(!list_empty(&zone->lock_pool),
-			"never need to wait for a free hash lock");
+	result = VDO_ASSERT(!list_empty(&zone->lock_pool),
+			    "never need to wait for a free hash lock");
 	if (result != VDO_SUCCESS)
 		return result;
 
@@ -906,11 +905,11 @@ static int __must_check acquire_lock(struct hash_zone *zone,
 
 	if (replace_lock != NULL) {
 		/* On mismatch put the old lock back and return a severe error */
-		ASSERT_LOG_ONLY(lock == replace_lock,
-				"old lock must have been in the lock map");
+		VDO_ASSERT_LOG_ONLY(lock == replace_lock,
+				    "old lock must have been in the lock map");
 		/* TODO: Check earlier and bail out? */
-		ASSERT_LOG_ONLY(replace_lock->registered,
-				"old lock must have been marked registered");
+		VDO_ASSERT_LOG_ONLY(replace_lock->registered,
+				    "old lock must have been marked registered");
 		replace_lock->registered = false;
 	}
 
@@ -1022,15 +1021,15 @@ static void start_deduping(struct hash_lock *lock, struct data_vio *agent,
 	 * deduplicate against it.
 	 */
 	if (lock->duplicate_lock == NULL) {
-		ASSERT_LOG_ONLY(!vdo_is_state_compressed(agent->new_mapped.state),
-				"compression must have shared a lock");
-		ASSERT_LOG_ONLY(agent_is_done,
-				"agent must have written the new duplicate");
+		VDO_ASSERT_LOG_ONLY(!vdo_is_state_compressed(agent->new_mapped.state),
+				    "compression must have shared a lock");
+		VDO_ASSERT_LOG_ONLY(agent_is_done,
+				    "agent must have written the new duplicate");
 		transfer_allocation_lock(agent);
 	}
 
-	ASSERT_LOG_ONLY(vdo_is_pbn_read_lock(lock->duplicate_lock),
-			"duplicate_lock must be a PBN read lock");
+	VDO_ASSERT_LOG_ONLY(vdo_is_pbn_read_lock(lock->duplicate_lock),
+			    "duplicate_lock must be a PBN read lock");
 
 	/*
 	 * This state is not like any of the other states. There is no designated agent--the agent
@@ -1208,7 +1207,7 @@ static void start_verifying(struct hash_lock *lock, struct data_vio *agent)
 					   agent->scratch_block);
 
 	lock->state = VDO_HASH_LOCK_VERIFYING;
-	ASSERT_LOG_ONLY(!lock->verified, "hash lock only verifies advice once");
+	VDO_ASSERT_LOG_ONLY(!lock->verified, "hash lock only verifies advice once");
 
 	agent->last_async_operation = VIO_ASYNC_OP_VERIFY_DUPLICATION;
 	result = vio_reset_bio(vio, buffer, verify_endio, REQ_OP_READ,
@@ -1238,8 +1237,8 @@ static void finish_locking(struct vdo_completion *completion)
 	assert_hash_lock_agent(agent, __func__);
 
 	if (!agent->is_duplicate) {
-		ASSERT_LOG_ONLY(lock->duplicate_lock == NULL,
-				"must not hold duplicate_lock if not flagged as a duplicate");
+		VDO_ASSERT_LOG_ONLY(lock->duplicate_lock == NULL,
+				    "must not hold duplicate_lock if not flagged as a duplicate");
 		/*
 		 * LOCKING -> WRITING transition: The advice block is being modified or has no
 		 * available references, so try to write or compress the data, remembering to
@@ -1251,8 +1250,8 @@ static void finish_locking(struct vdo_completion *completion)
 		return;
 	}
 
-	ASSERT_LOG_ONLY(lock->duplicate_lock != NULL,
-			"must hold duplicate_lock if flagged as a duplicate");
+	VDO_ASSERT_LOG_ONLY(lock->duplicate_lock != NULL,
+			    "must hold duplicate_lock if flagged as a duplicate");
 
 	if (!lock->verified) {
 		/*
@@ -1422,8 +1421,8 @@ static void lock_duplicate_pbn(struct vdo_completion *completion)
  */
 static void start_locking(struct hash_lock *lock, struct data_vio *agent)
 {
-	ASSERT_LOG_ONLY(lock->duplicate_lock == NULL,
-			"must not acquire a duplicate lock when already holding it");
+	VDO_ASSERT_LOG_ONLY(lock->duplicate_lock == NULL,
+			    "must not acquire a duplicate lock when already holding it");
 
 	lock->state = VDO_HASH_LOCK_LOCKING;
 
@@ -1729,8 +1728,8 @@ static void start_querying(struct hash_lock *lock, struct data_vio *data_vio)
  */
 static void report_bogus_lock_state(struct hash_lock *lock, struct data_vio *data_vio)
 {
-	ASSERT_LOG_ONLY(false, "hash lock must not be in unimplemented state %s",
-			get_hash_lock_state_name(lock->state));
+	VDO_ASSERT_LOG_ONLY(false, "hash lock must not be in unimplemented state %s",
+			    get_hash_lock_state_name(lock->state));
 	continue_data_vio_with_error(data_vio, VDO_LOCK_ERROR);
 }
 
@@ -1752,8 +1751,8 @@ void vdo_continue_hash_lock(struct vdo_completion *completion)
 
 	switch (lock->state) {
 	case VDO_HASH_LOCK_WRITING:
-		ASSERT_LOG_ONLY(data_vio == lock->agent,
-				"only the lock agent may continue the lock");
+		VDO_ASSERT_LOG_ONLY(data_vio == lock->agent,
+				    "only the lock agent may continue the lock");
 		finish_writing(lock, data_vio);
 		break;
 
@@ -1819,18 +1818,18 @@ static inline int assert_hash_lock_preconditions(const struct data_vio *data_vio
 	int result;
 
 	/* FIXME: BUG_ON() and/or enter read-only mode? */
-	result = ASSERT(data_vio->hash_lock == NULL,
-			"must not already hold a hash lock");
+	result = VDO_ASSERT(data_vio->hash_lock == NULL,
+			    "must not already hold a hash lock");
 	if (result != VDO_SUCCESS)
 		return result;
 
-	result = ASSERT(list_empty(&data_vio->hash_lock_entry),
-			"must not already be a member of a hash lock ring");
+	result = VDO_ASSERT(list_empty(&data_vio->hash_lock_entry),
+			    "must not already be a member of a hash lock ring");
 	if (result != VDO_SUCCESS)
 		return result;
 
-	return ASSERT(data_vio->recovery_sequence_number == 0,
-		      "must not hold a recovery lock when getting a hash lock");
+	return VDO_ASSERT(data_vio->recovery_sequence_number == 0,
+			  "must not hold a recovery lock when getting a hash lock");
 }
 
 /**
@@ -1937,24 +1936,24 @@ void vdo_release_hash_lock(struct data_vio *data_vio)
 		struct hash_lock *removed;
 
 		removed = vdo_int_map_remove(zone->hash_lock_map, lock_key);
-		ASSERT_LOG_ONLY(lock == removed,
-				"hash lock being released must have been mapped");
+		VDO_ASSERT_LOG_ONLY(lock == removed,
+				    "hash lock being released must have been mapped");
 	} else {
-		ASSERT_LOG_ONLY(lock != vdo_int_map_get(zone->hash_lock_map, lock_key),
-				"unregistered hash lock must not be in the lock map");
-	}
-
-	ASSERT_LOG_ONLY(!vdo_waitq_has_waiters(&lock->waiters),
-			"hash lock returned to zone must have no waiters");
-	ASSERT_LOG_ONLY((lock->duplicate_lock == NULL),
-			"hash lock returned to zone must not reference a PBN lock");
-	ASSERT_LOG_ONLY((lock->state == VDO_HASH_LOCK_BYPASSING),
-			"returned hash lock must not be in use with state %s",
-			get_hash_lock_state_name(lock->state));
-	ASSERT_LOG_ONLY(list_empty(&lock->pool_node),
-			"hash lock returned to zone must not be in a pool ring");
-	ASSERT_LOG_ONLY(list_empty(&lock->duplicate_ring),
-			"hash lock returned to zone must not reference DataVIOs");
+		VDO_ASSERT_LOG_ONLY(lock != vdo_int_map_get(zone->hash_lock_map, lock_key),
+				    "unregistered hash lock must not be in the lock map");
+	}
+
+	VDO_ASSERT_LOG_ONLY(!vdo_waitq_has_waiters(&lock->waiters),
+			    "hash lock returned to zone must have no waiters");
+	VDO_ASSERT_LOG_ONLY((lock->duplicate_lock == NULL),
+			    "hash lock returned to zone must not reference a PBN lock");
+	VDO_ASSERT_LOG_ONLY((lock->state == VDO_HASH_LOCK_BYPASSING),
+			    "returned hash lock must not be in use with state %s",
+			    get_hash_lock_state_name(lock->state));
+	VDO_ASSERT_LOG_ONLY(list_empty(&lock->pool_node),
+			    "hash lock returned to zone must not be in a pool ring");
+	VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_ring),
+			    "hash lock returned to zone must not reference DataVIOs");
 
 	return_hash_lock_to_pool(zone, lock);
 }
@@ -1969,13 +1968,13 @@ static void transfer_allocation_lock(struct data_vio *data_vio)
 	struct allocation *allocation = &data_vio->allocation;
 	struct hash_lock *hash_lock = data_vio->hash_lock;
 
-	ASSERT_LOG_ONLY(data_vio->new_mapped.pbn == allocation->pbn,
-			"transferred lock must be for the block written");
+	VDO_ASSERT_LOG_ONLY(data_vio->new_mapped.pbn == allocation->pbn,
+			    "transferred lock must be for the block written");
 
 	allocation->pbn = VDO_ZERO_BLOCK;
 
-	ASSERT_LOG_ONLY(vdo_is_pbn_read_lock(allocation->lock),
-			"must have downgraded the allocation lock before transfer");
+	VDO_ASSERT_LOG_ONLY(vdo_is_pbn_read_lock(allocation->lock),
+			    "must have downgraded the allocation lock before transfer");
 
 	hash_lock->duplicate = data_vio->new_mapped;
 	data_vio->duplicate = data_vio->new_mapped;
@@ -2001,10 +2000,10 @@ void vdo_share_compressed_write_lock(struct data_vio *data_vio,
 {
 	bool claimed;
 
-	ASSERT_LOG_ONLY(vdo_get_duplicate_lock(data_vio) == NULL,
-			"a duplicate PBN lock should not exist when writing");
-	ASSERT_LOG_ONLY(vdo_is_state_compressed(data_vio->new_mapped.state),
-			"lock transfer must be for a compressed write");
+	VDO_ASSERT_LOG_ONLY(vdo_get_duplicate_lock(data_vio) == NULL,
+			    "a duplicate PBN lock should not exist when writing");
+	VDO_ASSERT_LOG_ONLY(vdo_is_state_compressed(data_vio->new_mapped.state),
+			    "lock transfer must be for a compressed write");
 	assert_data_vio_in_new_mapped_zone(data_vio);
 
 	/* First sharer downgrades the lock. */
@@ -2024,7 +2023,7 @@ void vdo_share_compressed_write_lock(struct data_vio *data_vio,
 	 * deduplicating against it before our incRef.
 	 */
 	claimed = vdo_claim_pbn_lock_increment(pbn_lock);
-	ASSERT_LOG_ONLY(claimed, "impossible to fail to claim an initial increment");
+	VDO_ASSERT_LOG_ONLY(claimed, "impossible to fail to claim an initial increment");
 }
 
 /*----------------------------------------------------------------------*/
@@ -2251,8 +2250,8 @@ static void finish_index_operation(struct uds_request *request)
 	 */
 	if (!change_context_state(context, DEDUPE_CONTEXT_TIMED_OUT,
				  DEDUPE_CONTEXT_TIMED_OUT_COMPLETE)) {
-		ASSERT_LOG_ONLY(false, "uds request was timed out (state %d)",
-				atomic_read(&context->state));
+		VDO_ASSERT_LOG_ONLY(false, "uds request was timed out (state %d)",
+				    atomic_read(&context->state));
 	}
 
 	vdo_funnel_queue_put(context->zone->timed_out_complete, &context->queue_entry);
@@ -2296,7 +2295,7 @@ static void check_for_drain_complete(struct hash_zone *zone)
 	if (recycled > 0)
 		WRITE_ONCE(zone->active, zone->active - recycled);
 
-	ASSERT_LOG_ONLY(READ_ONCE(zone->active) == 0, "all contexts inactive");
+	VDO_ASSERT_LOG_ONLY(READ_ONCE(zone->active) == 0, "all contexts inactive");
 	vdo_finish_draining(&zone->state);
 }
diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c
index d8769eb46f0a99..f1c5e656f4ddb7 100644
--- a/drivers/md/dm-vdo/dm-vdo-target.c
+++ b/drivers/md/dm-vdo/dm-vdo-target.c
@@ -904,8 +904,8 @@ static int vdo_map_bio(struct dm_target *ti, struct bio *bio)
 	struct vdo_work_queue *current_work_queue;
 	const struct admin_state_code *code = vdo_get_admin_state_code(&vdo->admin.state);
 
-	ASSERT_LOG_ONLY(code->normal, "vdo should not receive bios while in state %s",
-			code->name);
+	VDO_ASSERT_LOG_ONLY(code->normal, "vdo should not receive bios while in state %s",
+			    code->name);
 
 	/* Count all incoming bios. */
 	vdo_count_bios(&vdo->stats.bios_in, bio);
@@ -1246,9 +1246,9 @@ static int perform_admin_operation(struct vdo *vdo, u32 starting_phase,
 /* Assert that we are operating on the correct thread for the current phase. */
 static void assert_admin_phase_thread(struct vdo *vdo, const char *what)
 {
-	ASSERT_LOG_ONLY(vdo_get_callback_thread_id() == get_thread_id_for_phase(vdo),
-			"%s on correct thread for %s", what,
-			ADMIN_PHASE_NAMES[vdo->admin.phase]);
+	VDO_ASSERT_LOG_ONLY(vdo_get_callback_thread_id() == get_thread_id_for_phase(vdo),
+			    "%s on correct thread for %s", what,
+			    ADMIN_PHASE_NAMES[vdo->admin.phase]);
 }
 
 /**
@@ -1426,11 +1426,11 @@ static void release_instance(unsigned int instance)
 {
 	mutex_lock(&instances_lock);
 	if (instance >= instances.bit_count) {
-		ASSERT_LOG_ONLY(false,
-				"instance number %u must be less than bit count %u",
-				instance, instances.bit_count);
+		VDO_ASSERT_LOG_ONLY(false,
+				    "instance number %u must be less than bit count %u",
+				    instance, instances.bit_count);
 	} else if (test_bit(instance, instances.words) == 0) {
-		ASSERT_LOG_ONLY(false, "instance number %u must be allocated", instance);
+		VDO_ASSERT_LOG_ONLY(false, "instance number %u must be allocated", instance);
 	} else {
 		__clear_bit(instance, instances.words);
 		instances.count -= 1;
@@ -1579,8 +1579,8 @@ static int allocate_instance(unsigned int *instance_ptr)
 	if (instance >= instances.bit_count) {
 		/* Nothing free after next, so wrap around to instance zero. */
 		instance = find_first_zero_bit(instances.words, instances.bit_count);
-		result = ASSERT(instance < instances.bit_count,
-				"impossibly, no zero bit found");
+		result = VDO_ASSERT(instance < instances.bit_count,
+				    "impossibly, no zero bit found");
 		if (result != VDO_SUCCESS)
 			return result;
 	}
@@ -1731,8 +1731,8 @@ static int prepare_to_grow_physical(struct vdo *vdo, block_count_t new_physical_
 	uds_log_info("Preparing to resize physical to %llu",
		     (unsigned long long) new_physical_blocks);
-	ASSERT_LOG_ONLY((new_physical_blocks > current_physical_blocks),
-			"New physical size is larger than current physical size");
+	VDO_ASSERT_LOG_ONLY((new_physical_blocks > current_physical_blocks),
+			    "New physical size is larger than current physical size");
 	result = perform_admin_operation(vdo, PREPARE_GROW_PHYSICAL_PHASE_START,
					 check_may_grow_physical,
					 finish_operation_callback,
@@ -1831,8 +1831,8 @@ static int prepare_to_modify(struct dm_target *ti, struct device_config *config,
 		uds_log_info("Preparing to resize logical to %llu",
			     (unsigned long long) config->logical_blocks);
-		ASSERT_LOG_ONLY((config->logical_blocks > logical_blocks),
-				"New logical size is larger than current size");
+		VDO_ASSERT_LOG_ONLY((config->logical_blocks > logical_blocks),
+				    "New logical size is larger than current size");
 		result = vdo_prepare_to_grow_block_map(vdo->block_map,
						       config->logical_blocks);
@@ -2862,9 +2862,9 @@ static void vdo_module_destroy(void)
 	if (dm_registered)
 		dm_unregister_target(&vdo_target_bio);
 
-	ASSERT_LOG_ONLY(instances.count == 0,
-			"should have no instance numbers still in use, but have %u",
-			instances.count);
+	VDO_ASSERT_LOG_ONLY(instances.count == 0,
+			    "should have no instance numbers still in use, but have %u",
+			    instances.count);
 
 	vdo_free(instances.words);
 	memset(&instances, 0, sizeof(struct instance_tracker));
diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c
index 9388f475ad44d2..f330f67a4dca06 100644
--- a/drivers/md/dm-vdo/encodings.c
+++ b/drivers/md/dm-vdo/encodings.c
@@ -322,8 +322,8 @@ int __must_check vdo_parse_geometry_block(u8 *block, struct volume_geometry *geo
 
 	decode_volume_geometry(block, &offset, geometry, header.version.major_version);
 
-	result = ASSERT(header.size == offset + sizeof(u32),
-			"should have decoded up to the geometry checksum");
+	result = VDO_ASSERT(header.size == offset + sizeof(u32),
+			    "should have decoded up to the geometry checksum");
 	if (result != VDO_SUCCESS)
 		return result;
 
@@ -382,25 +382,25 @@ static int decode_block_map_state_2_0(u8 *buffer, size_t *offset,
 	initial_offset = *offset;
 
 	decode_u64_le(buffer, offset, &flat_page_origin);
-	result = ASSERT(flat_page_origin == VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN,
-			"Flat page origin must be %u (recorded as %llu)",
-			VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN,
-			(unsigned long long) state->flat_page_origin);
-	if (result != UDS_SUCCESS)
+	result = VDO_ASSERT(flat_page_origin == VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN,
+			    "Flat page origin must be %u (recorded as %llu)",
+			    VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN,
+			    (unsigned long long) state->flat_page_origin);
+	if (result != VDO_SUCCESS)
 		return result;
 
 	decode_u64_le(buffer, offset, &flat_page_count);
-	result = ASSERT(flat_page_count == 0,
-			"Flat page count must be 0 (recorded as %llu)",
-			(unsigned long long) state->flat_page_count);
-	if (result != UDS_SUCCESS)
+	result = VDO_ASSERT(flat_page_count == 0,
+			    "Flat page count must be 0 (recorded as %llu)",
+			    (unsigned long long) state->flat_page_count);
+	if (result != VDO_SUCCESS)
 		return result;
 
 	decode_u64_le(buffer, offset, &root_origin);
 	decode_u64_le(buffer, offset, &root_count);
 
-	result = ASSERT(VDO_BLOCK_MAP_HEADER_2_0.size == *offset - initial_offset,
-			"decoded block map component size must match header size");
+	result = VDO_ASSERT(VDO_BLOCK_MAP_HEADER_2_0.size == *offset - initial_offset,
+			    "decoded block map component size must match header size");
 	if (result != VDO_SUCCESS)
 		return result;
 
@@ -427,8 +427,8 @@ static void encode_block_map_state_2_0(u8 *buffer, size_t *offset,
 	encode_u64_le(buffer, offset, state.root_origin);
 	encode_u64_le(buffer, offset, state.root_count);
 
-	ASSERT_LOG_ONLY(VDO_BLOCK_MAP_HEADER_2_0.size == *offset - initial_offset,
-			"encoded block map component size must match header size");
+	VDO_ASSERT_LOG_ONLY(VDO_BLOCK_MAP_HEADER_2_0.size == *offset - initial_offset,
+			    "encoded block map component size must match header size");
 }
 
 /**
@@ -479,8 +479,8 @@ static void encode_recovery_journal_state_7_0(u8 *buffer, size_t *offset,
 	encode_u64_le(buffer, offset, state.logical_blocks_used);
 	encode_u64_le(buffer, offset, state.block_map_data_blocks);
 
-	ASSERT_LOG_ONLY(VDO_RECOVERY_JOURNAL_HEADER_7_0.size == *offset - initial_offset,
-			"encoded recovery journal component size must match header size");
+	VDO_ASSERT_LOG_ONLY(VDO_RECOVERY_JOURNAL_HEADER_7_0.size == *offset - initial_offset,
+			    "encoded recovery journal component size must match header size");
 }
 
 /**
@@ -510,9 +510,9 @@ static int __must_check decode_recovery_journal_state_7_0(u8 *buffer, size_t *of
 	decode_u64_le(buffer, offset, &logical_blocks_used);
 	decode_u64_le(buffer, offset, &block_map_data_blocks);
 
-	result = ASSERT(VDO_RECOVERY_JOURNAL_HEADER_7_0.size == *offset - initial_offset,
-			"decoded recovery journal component size must match header size");
-	if (result != UDS_SUCCESS)
+	result = VDO_ASSERT(VDO_RECOVERY_JOURNAL_HEADER_7_0.size == *offset - initial_offset,
+			    "decoded recovery journal component size must match header size");
+	if (result != VDO_SUCCESS)
 		return result;
 
 	*state = (struct recovery_journal_state_7_0) {
@@ -568,8 +568,8 @@ static void encode_slab_depot_state_2_0(u8 *buffer, size_t *offset,
 	encode_u64_le(buffer, offset, state.last_block);
 	buffer[(*offset)++] = state.zone_count;
 
-	ASSERT_LOG_ONLY(VDO_SLAB_DEPOT_HEADER_2_0.size == *offset - initial_offset,
-			"encoded block map component size must match header size");
+	VDO_ASSERT_LOG_ONLY(VDO_SLAB_DEPOT_HEADER_2_0.size == *offset - initial_offset,
+			    "encoded block map component size must match header size");
 }
 
 /**
@@ -620,9 +620,9 @@ static int decode_slab_depot_state_2_0(u8 *buffer, size_t *offset,
 	decode_u64_le(buffer, offset, &last_block);
 	zone_count = buffer[(*offset)++];
 
-	result = ASSERT(VDO_SLAB_DEPOT_HEADER_2_0.size == *offset - initial_offset,
-			"decoded slab depot component size must match header size");
-	if (result != UDS_SUCCESS)
+	result = VDO_ASSERT(VDO_SLAB_DEPOT_HEADER_2_0.size == *offset - initial_offset,
+			    "decoded slab depot component size must match header size");
+	if (result != VDO_SUCCESS)
 		return result;
 
 	*state = (struct slab_depot_state_2_0) {
@@ -972,7 +972,7 @@ struct partition *vdo_get_known_partition(struct layout *layout, enum partition_
 	struct partition *partition;
 	int result = vdo_get_partition(layout, id, &partition);
 
-	ASSERT_LOG_ONLY(result == VDO_SUCCESS, "layout has expected partition: %u", id);
+	VDO_ASSERT_LOG_ONLY(result == VDO_SUCCESS, "layout has expected partition: %u", id);
 
 	return partition;
 }
@@ -984,8 +984,8 @@ static void encode_layout(u8 *buffer, size_t *offset, const struct layout *layou
 	struct header header = VDO_LAYOUT_HEADER_3_0;
 
 	BUILD_BUG_ON(sizeof(enum partition_id) != sizeof(u8));
-	ASSERT_LOG_ONLY(layout->num_partitions <= U8_MAX,
-			"layout partition count must fit in a byte");
+	VDO_ASSERT_LOG_ONLY(layout->num_partitions <= U8_MAX,
+			    "layout partition count must fit in a byte");
 
 	vdo_encode_header(buffer, offset, &header);
 
@@ -994,8 +994,8 @@ static void encode_layout(u8 *buffer, size_t *offset, const struct layout *layou
 	encode_u64_le(buffer, offset, layout->last_free);
 	buffer[(*offset)++] = layout->num_partitions;
 
-	ASSERT_LOG_ONLY(sizeof(struct layout_3_0) == *offset - initial_offset,
-			"encoded size of a layout header must match structure");
+	VDO_ASSERT_LOG_ONLY(sizeof(struct layout_3_0) == *offset - initial_offset,
+			    "encoded size of a layout header must match structure");
 
 	for (partition = layout->head; partition != NULL; partition = partition->next) {
 		buffer[(*offset)++] = partition->id;
@@ -1005,8 +1005,8 @@ static void encode_layout(u8 *buffer, size_t *offset, const struct layout *layou
 		encode_u64_le(buffer, offset, partition->count);
 	}
 
-	ASSERT_LOG_ONLY(header.size == *offset - initial_offset,
-			"encoded size of a layout must match header size");
+	VDO_ASSERT_LOG_ONLY(header.size == *offset - initial_offset,
+			    "encoded size of a layout must match header size");
 }
 
 static int decode_layout(u8 *buffer, size_t *offset, physical_block_number_t start,
@@ -1037,8 +1037,8 @@ static int decode_layout(u8 *buffer, size_t *offset, physical_block_number_t sta
 		.partition_count = partition_count,
 	};
 
-	result = ASSERT(sizeof(struct layout_3_0) == *offset - initial_offset,
-			"decoded size of a layout header must match structure");
+	result = VDO_ASSERT(sizeof(struct layout_3_0) == *offset - initial_offset,
+			    "decoded size of a layout header must match structure");
 	if (result != VDO_SUCCESS)
 		return result;
 
@@ -1210,29 +1210,29 @@ int vdo_validate_config(const struct vdo_config *config,
 	struct slab_config slab_config;
 	int result;
 
-	result = ASSERT(config->slab_size > 0, "slab size unspecified");
-	if (result != UDS_SUCCESS)
+	result = VDO_ASSERT(config->slab_size > 0, "slab size unspecified");
+	if (result != VDO_SUCCESS)
 		return result;
 
-	result = ASSERT(is_power_of_2(config->slab_size),
-			"slab size must be a power of two");
-	if (result != UDS_SUCCESS)
+	result = VDO_ASSERT(is_power_of_2(config->slab_size),
+			    "slab size must be a power of two");
+	if (result != VDO_SUCCESS)
 		return result;
 
-	result = ASSERT(config->slab_size <= (1 << MAX_VDO_SLAB_BITS),
-			"slab size must be less than or equal to 2^%d",
-			MAX_VDO_SLAB_BITS);
+	result = VDO_ASSERT(config->slab_size <= (1 << MAX_VDO_SLAB_BITS),
+			    "slab size must be less than or equal to 2^%d",
+			    MAX_VDO_SLAB_BITS);
 	if (result != VDO_SUCCESS)
 		return result;
 
-	result = ASSERT(config->slab_journal_blocks >= MINIMUM_VDO_SLAB_JOURNAL_BLOCKS,
-			"slab journal size meets minimum size");
-	if (result != UDS_SUCCESS)
+	result = VDO_ASSERT(config->slab_journal_blocks >= MINIMUM_VDO_SLAB_JOURNAL_BLOCKS,
+			    "slab journal size meets minimum size");
+	if (result != VDO_SUCCESS)
 		return result;
 
-	result = ASSERT(config->slab_journal_blocks <= config->slab_size,
-			"slab journal size is within expected bound");
-	if (result != UDS_SUCCESS)
+	result = VDO_ASSERT(config->slab_journal_blocks <= config->slab_size,
+			    "slab journal size is within expected bound");
+	if (result != VDO_SUCCESS)
 		return result;
 
 	result = vdo_configure_slab(config->slab_size, config->slab_journal_blocks,
@@ -1240,20 +1240,20 @@ int vdo_validate_config(const struct vdo_config *config,
 	if (result != VDO_SUCCESS)
 		return result;
 
-	result = ASSERT((slab_config.data_blocks >= 1),
-			"slab must be able to hold at least one block");
-	if (result != UDS_SUCCESS)
+	result = VDO_ASSERT((slab_config.data_blocks >= 1),
+			    "slab must be able to hold at least one block");
+	if (result != VDO_SUCCESS)
 		return result;
 
-	result = ASSERT(config->physical_blocks > 0, "physical blocks unspecified");
-	if (result != UDS_SUCCESS)
+	result = VDO_ASSERT(config->physical_blocks > 0, "physical blocks unspecified");
+	if (result != VDO_SUCCESS)
 		return result;
 
-	result = ASSERT(config->physical_blocks <= MAXIMUM_VDO_PHYSICAL_BLOCKS,
-			"physical block count %llu exceeds maximum %llu",
-			(unsigned long long) config->physical_blocks,
-			(unsigned long long) MAXIMUM_VDO_PHYSICAL_BLOCKS);
-	if (result != UDS_SUCCESS)
+	result = VDO_ASSERT(config->physical_blocks <= MAXIMUM_VDO_PHYSICAL_BLOCKS,
+			    "physical block count %llu exceeds maximum %llu",
+			    (unsigned long long) config->physical_blocks,
+			    (unsigned long long) MAXIMUM_VDO_PHYSICAL_BLOCKS);
+	if (result != VDO_SUCCESS)
 		return VDO_OUT_OF_RANGE;
 
 	if (physical_block_count != config->physical_blocks) {
@@ -1264,9 +1264,9 @@ int vdo_validate_config(const struct vdo_config *config,
 	}
 
 	if (logical_block_count > 0) {
-		result = ASSERT((config->logical_blocks > 0),
-				"logical blocks unspecified");
-		if (result != UDS_SUCCESS)
+		result = VDO_ASSERT((config->logical_blocks > 0),
+				    "logical blocks unspecified");
+		if (result != VDO_SUCCESS)
 			return result;
 
 		if (logical_block_count != config->logical_blocks) {
@@ -1277,19 +1277,19 @@ int vdo_validate_config(const struct vdo_config *config,
 		}
 	}
 
-	result = ASSERT(config->logical_blocks <= MAXIMUM_VDO_LOGICAL_BLOCKS,
-			"logical blocks too large");
-	if (result != UDS_SUCCESS)
+	result = VDO_ASSERT(config->logical_blocks <= MAXIMUM_VDO_LOGICAL_BLOCKS,
+			    "logical blocks too large");
+	if (result != VDO_SUCCESS)
 		return result;
 
-	result = ASSERT(config->recovery_journal_size > 0,
-			"recovery journal size unspecified");
-	if (result != UDS_SUCCESS)
+	result = VDO_ASSERT(config->recovery_journal_size > 0,
+			    "recovery journal size unspecified");
+	if (result != VDO_SUCCESS)
 		return result;
 
-	result = ASSERT(is_power_of_2(config->recovery_journal_size),
-			"recovery journal size must be a power of two");
-	if (result != UDS_SUCCESS)
+	result = VDO_ASSERT(is_power_of_2(config->recovery_journal_size),
+			    "recovery journal size must be a power of two");
+	if (result != VDO_SUCCESS)
 		return result;
 
 	return result;
@@ -1343,8 +1343,8 @@ static int __must_check decode_components(u8 *buffer, size_t *offset,
 	if (result != VDO_SUCCESS)
 		return result;
 
-	ASSERT_LOG_ONLY(*offset == VDO_COMPONENT_DATA_OFFSET + VDO_COMPONENT_DATA_SIZE,
-			"All decoded component data was used");
+	VDO_ASSERT_LOG_ONLY(*offset == VDO_COMPONENT_DATA_OFFSET + VDO_COMPONENT_DATA_SIZE,
+			    "All decoded component data was used");
 
 	return VDO_SUCCESS;
 }
@@ -1418,8 +1418,8 @@ static void vdo_encode_component_states(u8 *buffer, size_t *offset,
 	encode_slab_depot_state_2_0(buffer, offset, states->slab_depot);
 	encode_block_map_state_2_0(buffer, offset, states->block_map);
 
-	ASSERT_LOG_ONLY(*offset == VDO_COMPONENT_DATA_OFFSET + VDO_COMPONENT_DATA_SIZE,
-			"All super block component data was encoded");
+	VDO_ASSERT_LOG_ONLY(*offset == VDO_COMPONENT_DATA_OFFSET + VDO_COMPONENT_DATA_SIZE,
+			    "All super block component data was encoded");
 }
 
 /**
@@ -1442,8 +1442,8 @@ void vdo_encode_super_block(u8 *buffer, struct vdo_component_states *states)
 	 * Even though the buffer is a full block, to avoid the potential corruption from a torn
 	 * write, the entire encoding must fit in the first sector.
 	 */
-	ASSERT_LOG_ONLY(offset <= VDO_SECTOR_SIZE,
-			"entire superblock must fit in one sector");
+	VDO_ASSERT_LOG_ONLY(offset <= VDO_SECTOR_SIZE,
+			    "entire superblock must fit in one sector");
 }
 
 /**
@@ -1478,8 +1478,8 @@ int vdo_decode_super_block(u8 *buffer)
 	checksum = vdo_crc32(buffer, offset);
 	decode_u32_le(buffer, &offset, &saved_checksum);
 
-	result = ASSERT(offset == VDO_SUPER_BLOCK_FIXED_SIZE + VDO_COMPONENT_DATA_SIZE,
-			"must have decoded entire superblock payload");
+	result = VDO_ASSERT(offset == VDO_SUPER_BLOCK_FIXED_SIZE + VDO_COMPONENT_DATA_SIZE,
+			    "must have decoded entire superblock payload");
 	if (result != VDO_SUCCESS)
 		return result;
diff --git a/drivers/md/dm-vdo/errors.c b/drivers/md/dm-vdo/errors.c
index e8599599a377e0..2da614d714c076 100644
--- a/drivers/md/dm-vdo/errors.c
+++ b/drivers/md/dm-vdo/errors.c
@@ -287,8 +287,9 @@ int uds_register_error_block(const char *block_name, int first_error,
 		.infos = infos,
 	};
 
-	result = ASSERT(first_error < next_free_error, "well-defined error block range");
-	if (result != UDS_SUCCESS)
+	result = VDO_ASSERT(first_error < next_free_error,
+			    "well-defined error block range");
+	if (result != VDO_SUCCESS)
 		return result;
 
 	if (registered_errors.count == registered_errors.allocated) {
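Note that the conversion is not purely mechanical: the hunks above also switch the follow-up checks from UDS_SUCCESS to VDO_SUCCESS. That is the consistent choice because, as the permassert.h hunk further below shows, __VDO_ASSERT now evaluates to VDO_SUCCESS when the expression holds. The resulting check-and-propagate idiom, sketched standalone (validate_slab_size() is a made-up illustration, not a VDO function):

	#include <linux/log2.h>		/* is_power_of_2() */

	static int validate_slab_size(unsigned long long slab_size)
	{
		int result;

		result = VDO_ASSERT(slab_size > 0, "slab size unspecified");
		if (result != VDO_SUCCESS)
			return result;

		/* The final check's result is returned to the caller directly. */
		return VDO_ASSERT(is_power_of_2(slab_size),
				  "slab size must be a power of two");
	}
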
diff --git a/drivers/md/dm-vdo/flush.c b/drivers/md/dm-vdo/flush.c
index 91512f115c5620..18d18e9a95576b 100644
--- a/drivers/md/dm-vdo/flush.c
+++ b/drivers/md/dm-vdo/flush.c
@@ -59,8 +59,8 @@ struct flusher {
  */
 static inline void assert_on_flusher_thread(struct flusher *flusher, const char *caller)
 {
-	ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == flusher->thread_id),
-			"%s() called from flusher thread", caller);
+	VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == flusher->thread_id),
+			    "%s() called from flusher thread", caller);
 }
 
 /**
@@ -272,8 +272,8 @@ static void flush_vdo(struct vdo_completion *completion)
 	int result;
 
 	assert_on_flusher_thread(flusher, __func__);
-	result = ASSERT(vdo_is_state_normal(&flusher->state),
-			"flusher is in normal operation");
+	result = VDO_ASSERT(vdo_is_state_normal(&flusher->state),
+			    "flusher is in normal operation");
 	if (result != VDO_SUCCESS) {
 		vdo_enter_read_only_mode(flusher->vdo, result);
 		vdo_complete_flush(flush);
@@ -330,11 +330,11 @@ void vdo_complete_flushes(struct flusher *flusher)
 		if (flush->flush_generation >= oldest_active_generation)
 			return;
 
-		ASSERT_LOG_ONLY((flush->flush_generation ==
-				 flusher->first_unacknowledged_generation),
-				"acknowledged next expected flush, %llu, was: %llu",
-				(unsigned long long) flusher->first_unacknowledged_generation,
-				(unsigned long long) flush->flush_generation);
+		VDO_ASSERT_LOG_ONLY((flush->flush_generation ==
+				     flusher->first_unacknowledged_generation),
+				    "acknowledged next expected flush, %llu, was: %llu",
+				    (unsigned long long) flusher->first_unacknowledged_generation,
+				    (unsigned long long) flush->flush_generation);
 		vdo_waitq_dequeue_waiter(&flusher->pending_flushes);
 		vdo_complete_flush(flush);
 		flusher->first_unacknowledged_generation++;
@@ -400,8 +400,8 @@ void vdo_launch_flush(struct vdo *vdo, struct bio *bio)
 	struct flusher *flusher = vdo->flusher;
 	const struct admin_state_code *code = vdo_get_admin_state_code(&flusher->state);
 
-	ASSERT_LOG_ONLY(!code->quiescent, "Flushing not allowed in state %s",
-			code->name);
+	VDO_ASSERT_LOG_ONLY(!code->quiescent, "Flushing not allowed in state %s",
+			    code->name);
 
 	spin_lock(&flusher->lock);
diff --git a/drivers/md/dm-vdo/funnel-workqueue.c b/drivers/md/dm-vdo/funnel-workqueue.c
index 8758748e8222ce..cf04cdef07500e 100644
--- a/drivers/md/dm-vdo/funnel-workqueue.c
+++ b/drivers/md/dm-vdo/funnel-workqueue.c
@@ -110,14 +110,14 @@ static struct vdo_completion *poll_for_completion(struct simple_work_queue *queu
 static void enqueue_work_queue_completion(struct simple_work_queue *queue,
					  struct vdo_completion *completion)
 {
-	ASSERT_LOG_ONLY(completion->my_queue == NULL,
-			"completion %px (fn %px) to enqueue (%px) is not already queued (%px)",
-			completion, completion->callback, queue, completion->my_queue);
+	VDO_ASSERT_LOG_ONLY(completion->my_queue == NULL,
+			    "completion %px (fn %px) to enqueue (%px) is not already queued (%px)",
+			    completion, completion->callback, queue, completion->my_queue);
 
 	if (completion->priority == VDO_WORK_Q_DEFAULT_PRIORITY)
 		completion->priority = queue->common.type->default_priority;
-	if (ASSERT(completion->priority <= queue->common.type->max_priority,
-		   "priority is in range for queue") != VDO_SUCCESS)
+	if (VDO_ASSERT(completion->priority <= queue->common.type->max_priority,
+		       "priority is in range for queue") != VDO_SUCCESS)
 		completion->priority = 0;
 
 	completion->my_queue = &queue->common;
@@ -222,9 +222,9 @@ static struct vdo_completion *wait_for_next_completion(struct simple_work_queue
 static void process_completion(struct simple_work_queue *queue,
			       struct vdo_completion *completion)
 {
-	if (ASSERT(completion->my_queue == &queue->common,
-		   "completion %px from queue %px marked as being in this queue (%px)",
-		   completion, queue, completion->my_queue) == UDS_SUCCESS)
+	if (VDO_ASSERT(completion->my_queue == &queue->common,
+		       "completion %px from queue %px marked as being in this queue (%px)",
+		       completion, queue, completion->my_queue) == VDO_SUCCESS)
 		completion->my_queue = NULL;
 
 	vdo_run_completion(completion);
@@ -319,9 +319,9 @@ static int make_simple_work_queue(const char *thread_name_prefix, const char *na
 	struct task_struct *thread = NULL;
 	int result;
 
-	ASSERT_LOG_ONLY((type->max_priority <= VDO_WORK_Q_MAX_PRIORITY),
-			"queue priority count %u within limit %u", type->max_priority,
-			VDO_WORK_Q_MAX_PRIORITY);
+	VDO_ASSERT_LOG_ONLY((type->max_priority <= VDO_WORK_Q_MAX_PRIORITY),
+			    "queue priority count %u within limit %u", type->max_priority,
+			    VDO_WORK_Q_MAX_PRIORITY);
 
 	result = vdo_allocate(1, struct simple_work_queue, "simple work queue", &queue);
 	if (result != VDO_SUCCESS)
diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c
index e82b4a8c6fc45f..61bb48068c3a28 100644
--- a/drivers/md/dm-vdo/io-submitter.c
+++ b/drivers/md/dm-vdo/io-submitter.c
@@ -94,7 +94,7 @@ static void count_all_bios(struct vio *vio, struct bio *bio)
  */
 static void assert_in_bio_zone(struct vio *vio)
 {
-	ASSERT_LOG_ONLY(!in_interrupt(), "not in interrupt context");
+	VDO_ASSERT_LOG_ONLY(!in_interrupt(), "not in interrupt context");
 	assert_vio_in_bio_zone(vio);
 }
 
@@ -300,7 +300,7 @@ static bool try_bio_map_merge(struct vio *vio)
 	mutex_unlock(&bio_queue_data->lock);
 
 	/* We don't care about failure of int_map_put in this case. */
-	ASSERT_LOG_ONLY(result == VDO_SUCCESS, "bio map insertion succeeds");
+	VDO_ASSERT_LOG_ONLY(result == VDO_SUCCESS, "bio map insertion succeeds");
 	return merged;
 }
 
@@ -345,8 +345,8 @@ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
 
 	const struct admin_state_code *code = vdo_get_admin_state(completion->vdo);
 
-	ASSERT_LOG_ONLY(!code->quiescent, "I/O not allowed in state %s", code->name);
-	ASSERT_LOG_ONLY(vio->bio->bi_next == NULL, "metadata bio has no next bio");
+	VDO_ASSERT_LOG_ONLY(!code->quiescent, "I/O not allowed in state %s", code->name);
+	VDO_ASSERT_LOG_ONLY(vio->bio->bi_next == NULL, "metadata bio has no next bio");
 
 	vdo_reset_completion(completion);
 	completion->error_handler = error_handler;
diff --git a/drivers/md/dm-vdo/logical-zone.c b/drivers/md/dm-vdo/logical-zone.c
index de231c3a4850a1..52aa9c48dcf807 100644
--- a/drivers/md/dm-vdo/logical-zone.c
+++ b/drivers/md/dm-vdo/logical-zone.c
@@ -144,8 +144,8 @@ void vdo_free_logical_zones(struct logical_zones *zones)
 
 static inline void assert_on_zone_thread(struct logical_zone *zone, const char *what)
 {
-	ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == zone->thread_id),
-			"%s() called on correct thread", what);
+	VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == zone->thread_id),
+			    "%s() called on correct thread", what);
 }
 
 /**
@@ -249,10 +249,10 @@ void vdo_increment_logical_zone_flush_generation(struct logical_zone *zone,
						 sequence_number_t expected_generation)
 {
 	assert_on_zone_thread(zone, __func__);
-	ASSERT_LOG_ONLY((zone->flush_generation == expected_generation),
-			"logical zone %u flush generation %llu should be %llu before increment",
-			zone->zone_number, (unsigned long long) zone->flush_generation,
-			(unsigned long long) expected_generation);
+	VDO_ASSERT_LOG_ONLY((zone->flush_generation == expected_generation),
+			    "logical zone %u flush generation %llu should be %llu before increment",
+			    zone->zone_number, (unsigned long long) zone->flush_generation,
+			    (unsigned long long) expected_generation);
 
 	zone->flush_generation++;
 	zone->ios_in_flush_generation = 0;
@@ -269,7 +269,7 @@ void vdo_acquire_flush_generation_lock(struct data_vio *data_vio)
 	struct logical_zone *zone = data_vio->logical.zone;
 
 	assert_on_zone_thread(zone, __func__);
-	ASSERT_LOG_ONLY(vdo_is_state_normal(&zone->state), "vdo state is normal");
+	VDO_ASSERT_LOG_ONLY(vdo_is_state_normal(&zone->state), "vdo state is normal");
 
 	data_vio->flush_generation = zone->flush_generation;
 	list_add_tail(&data_vio->write_entry, &zone->write_vios);
@@ -334,10 +334,10 @@ void vdo_release_flush_generation_lock(struct data_vio *data_vio)
 		return;
 
 	list_del_init(&data_vio->write_entry);
-	ASSERT_LOG_ONLY((zone->oldest_active_generation <= data_vio->flush_generation),
-			"data_vio releasing lock on generation %llu is not older than oldest active generation %llu",
-			(unsigned long long) data_vio->flush_generation,
-			(unsigned long long) zone->oldest_active_generation);
+	VDO_ASSERT_LOG_ONLY((zone->oldest_active_generation <= data_vio->flush_generation),
+			    "data_vio releasing lock on generation %llu is not older than oldest active generation %llu",
+			    (unsigned long long) data_vio->flush_generation,
+			    (unsigned long long) zone->oldest_active_generation);
 
 	if (!update_oldest_active_generation(zone) || zone->notifying)
 		return;
diff --git a/drivers/md/dm-vdo/memory-alloc.c b/drivers/md/dm-vdo/memory-alloc.c
index f8b13b755e1556..d2095516af282f 100644
--- a/drivers/md/dm-vdo/memory-alloc.c
+++ b/drivers/md/dm-vdo/memory-alloc.c
@@ -385,12 +385,12 @@ void vdo_memory_init(void)
 
 void vdo_memory_exit(void)
 {
-	ASSERT_LOG_ONLY(memory_stats.kmalloc_bytes == 0,
-			"kmalloc memory used (%zd bytes in %zd blocks) is returned to the kernel",
-			memory_stats.kmalloc_bytes, memory_stats.kmalloc_blocks);
-	ASSERT_LOG_ONLY(memory_stats.vmalloc_bytes == 0,
-			"vmalloc memory used (%zd bytes in %zd blocks) is returned to the kernel",
-			memory_stats.vmalloc_bytes, memory_stats.vmalloc_blocks);
+	VDO_ASSERT_LOG_ONLY(memory_stats.kmalloc_bytes == 0,
+			    "kmalloc memory used (%zd bytes in %zd blocks) is returned to the kernel",
+			    memory_stats.kmalloc_bytes, memory_stats.kmalloc_blocks);
+	VDO_ASSERT_LOG_ONLY(memory_stats.vmalloc_bytes == 0,
+			    "vmalloc memory used (%zd bytes in %zd blocks) is returned to the kernel",
+			    memory_stats.vmalloc_bytes, memory_stats.vmalloc_blocks);
 	uds_log_debug("peak usage %zd bytes", memory_stats.peak_bytes);
 }
diff --git a/drivers/md/dm-vdo/packer.c b/drivers/md/dm-vdo/packer.c
index 59820f91a70246..e849b4ad691f89 100644
--- a/drivers/md/dm-vdo/packer.c
+++ b/drivers/md/dm-vdo/packer.c
@@ -88,8 +88,8 @@ int vdo_get_compressed_block_fragment(enum block_mapping_state mapping_state,
  */
 static inline void assert_on_packer_thread(struct packer *packer, const char *caller)
 {
-	ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == packer->thread_id),
-			"%s() called from packer thread", caller);
+	VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == packer->thread_id),
+			    "%s() called from packer thread", caller);
 }
 
 /**
@@ -571,9 +571,9 @@ void vdo_attempt_packing(struct data_vio *data_vio)
 
 	assert_on_packer_thread(packer, __func__);
 
-	result = ASSERT((status.stage == DATA_VIO_COMPRESSING),
-			"attempt to pack data_vio not ready for packing, stage: %u",
-			status.stage);
+	result = VDO_ASSERT((status.stage == DATA_VIO_COMPRESSING),
+			    "attempt to pack data_vio not ready for packing, stage: %u",
+			    status.stage);
 	if (result != VDO_SUCCESS)
 		return;
 
@@ -675,7 +675,7 @@ void vdo_remove_lock_holder_from_packer(struct vdo_completion *completion)
 
 	lock_holder = vdo_forget(data_vio->compression.lock_holder);
 	bin = lock_holder->compression.bin;
-	ASSERT_LOG_ONLY((bin != NULL), "data_vio in packer has a bin");
+	VDO_ASSERT_LOG_ONLY((bin != NULL), "data_vio in packer has a bin");
 
 	slot = lock_holder->compression.slot;
 	bin->slots_used--;
diff --git a/drivers/md/dm-vdo/permassert.h b/drivers/md/dm-vdo/permassert.h
index 8fb5f7d9c66fe5..21e7e2dfd24c4a 100644
--- a/drivers/md/dm-vdo/permassert.h
+++ b/drivers/md/dm-vdo/permassert.h
@@ -13,7 +13,6 @@
 /* Utilities for asserting that certain conditions are met */
 
 #define STRINGIFY(X) #X
-#define STRINGIFY_VALUE(X) STRINGIFY(X)
 
 /*
  * A hack to apply the "warn if unused" attribute to an integral expression.
@@ -23,19 +22,23 @@
  * expression. With optimization enabled, this function contributes no additional instructions, but
 * the warn_unused_result attribute still applies to the code calling it.
  */
-static inline int __must_check uds_must_use(int value)
+static inline int __must_check vdo_must_use(int value)
 {
 	return value;
 }
 
 /* Assert that an expression is true and return an error if it is not. */
-#define ASSERT(expr, ...) uds_must_use(__UDS_ASSERT(expr, __VA_ARGS__))
+#define VDO_ASSERT(expr, ...) vdo_must_use(__VDO_ASSERT(expr, __VA_ARGS__))
 
 /* Log a message if the expression is not true. */
-#define ASSERT_LOG_ONLY(expr, ...) __UDS_ASSERT(expr, __VA_ARGS__)
+#define VDO_ASSERT_LOG_ONLY(expr, ...) __VDO_ASSERT(expr, __VA_ARGS__)
 
-#define __UDS_ASSERT(expr, ...) \
-	(likely(expr) ? UDS_SUCCESS \
+/* For use by UDS */
+#define ASSERT(expr, ...) VDO_ASSERT(expr, __VA_ARGS__)
+#define ASSERT_LOG_ONLY(expr, ...) __VDO_ASSERT(expr, __VA_ARGS__)
+
+#define __VDO_ASSERT(expr, ...) \
+	(likely(expr) ? VDO_SUCCESS \
		      : uds_assertion_failed(STRINGIFY(expr), __FILE__, __LINE__, __VA_ARGS__))
 
 /* Log an assertion failure message. */
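The permassert.h hunk above is the pivot of the whole rename, and its must-use wrapper trick can be reproduced in isolation. In this standalone user-space sketch, printf stands in for uds_assertion_failed() and the variadic message arguments are omitted for brevity; it shows why a caller cannot silently drop VDO_ASSERT's result while VDO_ASSERT_LOG_ONLY's is ignorable by design:

	#include <stdio.h>

	#define VDO_SUCCESS 0
	#define VDO_ASSERTION_FAILED 1	/* assumed error code for this sketch */

	#define STRINGIFY(X) #X

	/* Stand-in for uds_assertion_failed(): log and return an error. */
	static int assertion_failed(const char *expr, const char *file, int line)
	{
		fprintf(stderr, "assertion \"%s\" failed at %s:%d\n", expr, file, line);
		return VDO_ASSERTION_FAILED;
	}

	#define __VDO_ASSERT(expr) \
		((expr) ? VDO_SUCCESS \
			: assertion_failed(STRINGIFY(expr), __FILE__, __LINE__))

	/* The wrapper exists only to carry the warn_unused_result attribute. */
	static inline int __attribute__((warn_unused_result)) vdo_must_use(int value)
	{
		return value;
	}

	#define VDO_ASSERT(expr) vdo_must_use(__VDO_ASSERT(expr))
	#define VDO_ASSERT_LOG_ONLY(expr) __VDO_ASSERT(expr)

	int main(void)
	{
		int x = 3;

		if (VDO_ASSERT(x == 3) != VDO_SUCCESS)	/* result checked */
			return 1;
		VDO_ASSERT_LOG_ONLY(x > 0);	/* result ignored by design */
		/* A bare VDO_ASSERT(x > 0); would draw -Wunused-result. */
		return 0;
	}

Keeping ASSERT and ASSERT_LOG_ONLY as thin aliases ("For use by UDS") lets the UDS sources migrate separately while both spellings expand to the same __VDO_ASSERT.
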
diff --git a/drivers/md/dm-vdo/physical-zone.c b/drivers/md/dm-vdo/physical-zone.c
index b0a1d75567bac0..389e5ed2a0a1a2 100644
--- a/drivers/md/dm-vdo/physical-zone.c
+++ b/drivers/md/dm-vdo/physical-zone.c
@@ -82,13 +82,13 @@ static inline void set_pbn_lock_type(struct pbn_lock *lock, enum pbn_lock_type t
  */
 void vdo_downgrade_pbn_write_lock(struct pbn_lock *lock, bool compressed_write)
 {
-	ASSERT_LOG_ONLY(!vdo_is_pbn_read_lock(lock),
-			"PBN lock must not already have been downgraded");
-	ASSERT_LOG_ONLY(!has_lock_type(lock, VIO_BLOCK_MAP_WRITE_LOCK),
-			"must not downgrade block map write locks");
-	ASSERT_LOG_ONLY(lock->holder_count == 1,
-			"PBN write lock should have one holder but has %u",
-			lock->holder_count);
+	VDO_ASSERT_LOG_ONLY(!vdo_is_pbn_read_lock(lock),
+			    "PBN lock must not already have been downgraded");
+	VDO_ASSERT_LOG_ONLY(!has_lock_type(lock, VIO_BLOCK_MAP_WRITE_LOCK),
+			    "must not downgrade block map write locks");
+	VDO_ASSERT_LOG_ONLY(lock->holder_count == 1,
+			    "PBN write lock should have one holder but has %u",
+			    lock->holder_count);
 	/*
	 * data_vio write locks are downgraded in place--the writer retains the hold on the lock.
	 * If this was a compressed write, the holder has not yet journaled its own inc ref,
@@ -130,8 +130,8 @@ bool vdo_claim_pbn_lock_increment(struct pbn_lock *lock)
  */
 void vdo_assign_pbn_lock_provisional_reference(struct pbn_lock *lock)
 {
-	ASSERT_LOG_ONLY(!lock->has_provisional_reference,
-			"lock does not have a provisional reference");
+	VDO_ASSERT_LOG_ONLY(!lock->has_provisional_reference,
+			    "lock does not have a provisional reference");
 
 	lock->has_provisional_reference = true;
 }
@@ -223,7 +223,7 @@ static void return_pbn_lock_to_pool(struct pbn_lock_pool *pool, struct pbn_lock
 	INIT_LIST_HEAD(&idle->entry);
 	list_add_tail(&idle->entry, &pool->idle_list);
 
-	ASSERT_LOG_ONLY(pool->borrowed > 0, "shouldn't return more than borrowed");
+	VDO_ASSERT_LOG_ONLY(pool->borrowed > 0, "shouldn't return more than borrowed");
 	pool->borrowed -= 1;
 }
 
@@ -269,9 +269,9 @@ static void free_pbn_lock_pool(struct pbn_lock_pool *pool)
 	if (pool == NULL)
 		return;
 
-	ASSERT_LOG_ONLY(pool->borrowed == 0,
-			"All PBN locks must be returned to the pool before it is freed, but %zu locks are still on loan",
-			pool->borrowed);
+	VDO_ASSERT_LOG_ONLY(pool->borrowed == 0,
+			    "All PBN locks must be returned to the pool before it is freed, but %zu locks are still on loan",
+			    pool->borrowed);
 	vdo_free(pool);
 }
 
@@ -300,8 +300,8 @@ static int __must_check borrow_pbn_lock_from_pool(struct pbn_lock_pool *pool,
					  "no free PBN locks left to borrow");
 	pool->borrowed += 1;
 
-	result = ASSERT(!list_empty(&pool->idle_list),
-			"idle list should not be empty if pool not at capacity");
+	result = VDO_ASSERT(!list_empty(&pool->idle_list),
+			    "idle list should not be empty if pool not at capacity");
 	if (result != VDO_SUCCESS)
 		return result;
 
@@ -449,7 +449,7 @@ int vdo_attempt_physical_zone_pbn_lock(struct physical_zone *zone,
 
 	result = borrow_pbn_lock_from_pool(zone->lock_pool, type, &new_lock);
 	if (result != VDO_SUCCESS) {
-		ASSERT_LOG_ONLY(false, "must always be able to borrow a PBN lock");
+		VDO_ASSERT_LOG_ONLY(false, "must always be able to borrow a PBN lock");
 		return result;
 	}
 
@@ -463,8 +463,8 @@ int vdo_attempt_physical_zone_pbn_lock(struct physical_zone *zone,
 	if (lock != NULL) {
 		/* The lock is already held, so we don't need the borrowed one. */
 		return_pbn_lock_to_pool(zone->lock_pool, vdo_forget(new_lock));
-		result = ASSERT(lock->holder_count > 0, "physical block %llu lock held",
-				(unsigned long long) pbn);
+		result = VDO_ASSERT(lock->holder_count > 0, "physical block %llu lock held",
+				    (unsigned long long) pbn);
 		if (result != VDO_SUCCESS)
 			return result;
 		*lock_ptr = lock;
@@ -487,8 +487,8 @@ static int allocate_and_lock_block(struct allocation *allocation)
 	int result;
 	struct pbn_lock *lock;
 
-	ASSERT_LOG_ONLY(allocation->lock == NULL,
-			"must not allocate a block while already holding a lock on one");
+	VDO_ASSERT_LOG_ONLY(allocation->lock == NULL,
+			    "must not allocate a block while already holding a lock on one");
 
 	result = vdo_allocate_block(allocation->zone->allocator, &allocation->pbn);
 	if (result != VDO_SUCCESS)
@@ -619,8 +619,8 @@ void vdo_release_physical_zone_pbn_lock(struct physical_zone *zone,
 	if (lock == NULL)
 		return;
 
-	ASSERT_LOG_ONLY(lock->holder_count > 0,
-			"should not be releasing a lock that is not held");
+	VDO_ASSERT_LOG_ONLY(lock->holder_count > 0,
+			    "should not be releasing a lock that is not held");
 
 	lock->holder_count -= 1;
 	if (lock->holder_count > 0) {
@@ -629,8 +629,8 @@ void vdo_release_physical_zone_pbn_lock(struct physical_zone *zone,
 	}
 
 	holder = vdo_int_map_remove(zone->pbn_operations, locked_pbn);
-	ASSERT_LOG_ONLY((lock == holder), "physical block lock mismatch for block %llu",
-			(unsigned long long) locked_pbn);
+	VDO_ASSERT_LOG_ONLY((lock == holder), "physical block lock mismatch for block %llu",
+			    (unsigned long long) locked_pbn);
 
 	release_pbn_lock_provisional_reference(lock, locked_pbn, zone->allocator);
 	return_pbn_lock_to_pool(zone->lock_pool, lock);
diff --git a/drivers/md/dm-vdo/priority-table.c b/drivers/md/dm-vdo/priority-table.c
index bb98fb06b73fcd..e6931ae43ba691 100644
--- a/drivers/md/dm-vdo/priority-table.c
+++ b/drivers/md/dm-vdo/priority-table.c
@@ -129,8 +129,8 @@ void vdo_reset_priority_table(struct priority_table *table)
 void vdo_priority_table_enqueue(struct priority_table *table, unsigned int priority,
				struct list_head *entry)
 {
-	ASSERT_LOG_ONLY((priority <= table->max_priority),
-			"entry priority must be valid for the table");
+	VDO_ASSERT_LOG_ONLY((priority <= table->max_priority),
+			    "entry priority must be valid for the table");
 
 	/* Append the entry to the queue in the specified bucket. */
 	list_move_tail(entry, &table->buckets[priority].queue);
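The recovery-journal.c hunks below guard per-block lock counters in both directions: no decrement below zero on release, no increment past the counter width on acquire. A standalone sketch of that discipline (struct block_lock_count is a simplified stand-in; U16_MAX mirrors the kernel constant, and the message texts are taken from the hunks):

	typedef unsigned short u16;
	#define U16_MAX ((u16)~0U)

	struct block_lock_count {
		u16 count;	/* references pinning one journal block */
	};

	static void acquire_block_reference(struct block_lock_count *lock)
	{
		VDO_ASSERT_LOG_ONLY(lock->count < U16_MAX,
				    "increment of lock counter must not overflow");
		lock->count += 1;
	}

	static void release_block_reference(struct block_lock_count *lock)
	{
		VDO_ASSERT_LOG_ONLY(lock->count >= 1,
				    "decrement of lock counter must not underflow");
		lock->count -= 1;
	}

The log-only form fits here: a miscount is made visible in the log without turning it into an immediate crash on the I/O path.
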
diff --git a/drivers/md/dm-vdo/recovery-journal.c b/drivers/md/dm-vdo/recovery-journal.c
index c0f3dea5d64b92..271d172360773d 100644
--- a/drivers/md/dm-vdo/recovery-journal.c
+++ b/drivers/md/dm-vdo/recovery-journal.c
@@ -121,8 +121,8 @@ static bool is_journal_zone_locked(struct recovery_journal *journal,
 
 	/* Pairs with barrier in vdo_release_journal_entry_lock() */
 	smp_rmb();
-	ASSERT_LOG_ONLY((decrements <= journal_value),
-			"journal zone lock counter must not underflow");
+	VDO_ASSERT_LOG_ONLY((decrements <= journal_value),
+			    "journal zone lock counter must not underflow");
 	return (journal_value != decrements);
 }
 
@@ -152,8 +152,8 @@ void vdo_release_recovery_journal_block_reference(struct recovery_journal *journ
 
 	lock_number = vdo_get_recovery_journal_block_number(journal, sequence_number);
 	current_value = get_counter(journal, lock_number, zone_type, zone_id);
-	ASSERT_LOG_ONLY((*current_value >= 1),
-			"decrement of lock counter must not underflow");
+	VDO_ASSERT_LOG_ONLY((*current_value >= 1),
+			    "decrement of lock counter must not underflow");
 	*current_value -= 1;
 
 	if (zone_type == VDO_ZONE_TYPE_JOURNAL) {
@@ -256,8 +256,8 @@ static inline bool __must_check is_block_full(const struct recovery_journal_bloc
 static void assert_on_journal_thread(struct recovery_journal *journal,
				     const char *function_name)
 {
-	ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == journal->thread_id),
-			"%s() called on journal thread", function_name);
+	VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == journal->thread_id),
+			    "%s() called on journal thread", function_name);
 }
 
 /**
@@ -355,14 +355,14 @@ static void check_for_drain_complete(struct recovery_journal *journal)
 
 	if (vdo_is_state_saving(&journal->state)) {
 		if (journal->active_block != NULL) {
-			ASSERT_LOG_ONLY(((result == VDO_READ_ONLY) ||
-					 !is_block_dirty(journal->active_block)),
-					"journal being saved has clean active block");
+			VDO_ASSERT_LOG_ONLY(((result == VDO_READ_ONLY) ||
+					     !is_block_dirty(journal->active_block)),
+					    "journal being saved has clean active block");
			recycle_journal_block(journal->active_block);
		}
 
-		ASSERT_LOG_ONLY(list_empty(&journal->active_tail_blocks),
-				"all blocks in a journal being saved must be inactive");
+		VDO_ASSERT_LOG_ONLY(list_empty(&journal->active_tail_blocks),
+				    "all blocks in a journal being saved must be inactive");
	}
 
	vdo_finish_draining_with_result(&journal->state, result);
@@ -802,8 +802,8 @@ void vdo_free_recovery_journal(struct recovery_journal *journal)
	 * requires opening before use.
	 */
	if (!vdo_is_state_quiescent(&journal->state)) {
-		ASSERT_LOG_ONLY(list_empty(&journal->active_tail_blocks),
-				"journal being freed has no active tail blocks");
+		VDO_ASSERT_LOG_ONLY(list_empty(&journal->active_tail_blocks),
+				    "journal being freed has no active tail blocks");
	} else if (!vdo_is_state_saved(&journal->state) &&
		   !list_empty(&journal->active_tail_blocks)) {
		uds_log_warning("journal being freed has uncommitted entries");
@@ -991,8 +991,8 @@ static void initialize_lock_count(struct recovery_journal *journal)
		atomic_t *decrement_counter = get_decrement_counter(journal, lock_number);
 
		journal_value = get_counter(journal, lock_number, VDO_ZONE_TYPE_JOURNAL, 0);
-		ASSERT_LOG_ONLY((*journal_value == atomic_read(decrement_counter)),
-				"count to be initialized not in use");
+		VDO_ASSERT_LOG_ONLY((*journal_value == atomic_read(decrement_counter)),
+				    "count to be initialized not in use");
		*journal_value = journal->entries_per_block + 1;
		atomic_set(decrement_counter, 0);
	}
@@ -1177,13 +1177,13 @@ static void continue_committed_waiter(struct vdo_waiter *waiter, void *context)
	int result = (is_read_only(journal) ? VDO_READ_ONLY : VDO_SUCCESS);
	bool has_decrement;
 
-	ASSERT_LOG_ONLY(vdo_before_journal_point(&journal->commit_point,
-						 &data_vio->recovery_journal_point),
-			"DataVIOs released from recovery journal in order. Recovery journal point is (%llu, %u), but commit waiter point is (%llu, %u)",
-			(unsigned long long) journal->commit_point.sequence_number,
-			journal->commit_point.entry_count,
-			(unsigned long long) data_vio->recovery_journal_point.sequence_number,
-			data_vio->recovery_journal_point.entry_count);
+	VDO_ASSERT_LOG_ONLY(vdo_before_journal_point(&journal->commit_point,
+						     &data_vio->recovery_journal_point),
+			    "DataVIOs released from recovery journal in order. Recovery journal point is (%llu, %u), but commit waiter point is (%llu, %u)",
+			    (unsigned long long) journal->commit_point.sequence_number,
+			    journal->commit_point.entry_count,
+			    (unsigned long long) data_vio->recovery_journal_point.sequence_number,
+			    data_vio->recovery_journal_point.entry_count);
 
	journal->commit_point = data_vio->recovery_journal_point;
	data_vio->last_async_operation = VIO_ASYNC_OP_UPDATE_REFERENCE_COUNTS;
@@ -1283,8 +1283,8 @@ static void complete_write(struct vdo_completion *completion)
	journal->last_write_acknowledged = block->sequence_number;
	last_active_block = get_journal_block(&journal->active_tail_blocks);
-	ASSERT_LOG_ONLY((block->sequence_number >= last_active_block->sequence_number),
-			"completed journal write is still active");
+	VDO_ASSERT_LOG_ONLY((block->sequence_number >= last_active_block->sequence_number),
+			    "completed journal write is still active");
 
	notify_commit_waiters(journal);
 
@@ -1458,8 +1458,8 @@ void vdo_add_recovery_journal_entry(struct recovery_journal *journal,
		return;
	}
 
-	ASSERT_LOG_ONLY(data_vio->recovery_sequence_number == 0,
-			"journal lock not held for new entry");
+	VDO_ASSERT_LOG_ONLY(data_vio->recovery_sequence_number == 0,
+			    "journal lock not held for new entry");
 
	vdo_advance_journal_point(&journal->append_point, journal->entries_per_block);
	vdo_waitq_enqueue_waiter(&journal->entry_waiters, &data_vio->waiter);
@@ -1566,13 +1566,13 @@ void vdo_acquire_recovery_journal_block_reference(struct recovery_journal *journ
	if (sequence_number == 0)
		return;
 
-	ASSERT_LOG_ONLY((zone_type != VDO_ZONE_TYPE_JOURNAL),
-			"invalid lock count increment from journal zone");
+	VDO_ASSERT_LOG_ONLY((zone_type != VDO_ZONE_TYPE_JOURNAL),
+			    "invalid lock count increment from journal zone");
 
	lock_number = vdo_get_recovery_journal_block_number(journal, sequence_number);
	current_value = get_counter(journal, lock_number, zone_type, zone_id);
 
-	ASSERT_LOG_ONLY(*current_value < U16_MAX,
-			"increment of lock counter must not overflow");
+	VDO_ASSERT_LOG_ONLY(*current_value < U16_MAX,
+			    "increment of lock counter must not overflow");
 
	if (*current_value == 0) {
		/*
diff --git a/drivers/md/dm-vdo/repair.c b/drivers/md/dm-vdo/repair.c
index 83322afa454bbe..ce6f78d281f33f 100644
--- a/drivers/md/dm-vdo/repair.c
+++ b/drivers/md/dm-vdo/repair.c
@@ -976,8 +976,8 @@ find_entry_starting_next_page(struct repair_completion *repair,
	if (needs_sort) {
		struct numbered_block_mapping *just_sorted_entry =
			sort_next_heap_element(repair);
-		ASSERT_LOG_ONLY(just_sorted_entry < current_entry,
-				"heap is returning elements in an unexpected order");
+		VDO_ASSERT_LOG_ONLY(just_sorted_entry < current_entry,
+				    "heap is returning elements in an unexpected order");
	}
 
	current_entry--;
@@ -1129,8 +1129,8 @@ static void recover_block_map(struct vdo_completion *completion)
	repair->current_entry = &repair->entries[repair->block_map_entry_count - 1];
	first_sorted_entry = sort_next_heap_element(repair);
-	ASSERT_LOG_ONLY(first_sorted_entry == repair->current_entry,
-			"heap is returning elements in an unexpected order");
+	VDO_ASSERT_LOG_ONLY(first_sorted_entry == repair->current_entry,
+			    "heap is returning elements in an unexpected order");
 
	/* Prevent any page from being processed until all pages have been launched. */
	repair->launching = true;
@@ -1489,8 +1489,8 @@ static int extract_new_mappings(struct repair_completion *repair)
		repair->block_map_entry_count++;
	}
 
-	result = ASSERT((repair->block_map_entry_count <= repair->entry_count),
-			"approximate entry count is an upper bound");
+	result = VDO_ASSERT((repair->block_map_entry_count <= repair->entry_count),
+			    "approximate entry count is an upper bound");
	if (result != VDO_SUCCESS)
		vdo_enter_read_only_mode(vdo, result);
diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c
index 3a9b79ae9fb6a7..2c273b82336379 100644
--- a/drivers/md/dm-vdo/slab-depot.c
+++ b/drivers/md/dm-vdo/slab-depot.c
@@ -149,7 +149,7 @@ static void mark_slab_journal_dirty(struct slab_journal *journal, sequence_numbe
	struct slab_journal *dirty_journal;
	struct list_head *dirty_list = &journal->slab->allocator->dirty_slab_journals;
 
-	ASSERT_LOG_ONLY(journal->recovery_lock == 0, "slab journal was clean");
+	VDO_ASSERT_LOG_ONLY(journal->recovery_lock == 0, "slab journal was clean");
 
	journal->recovery_lock = lock;
	list_for_each_entry_reverse(dirty_journal, dirty_list, dirty_entry) {
@@ -216,7 +216,7 @@ static u8 __must_check compute_fullness_hint(struct slab_depot *depot,
 {
	block_count_t hint;
 
-	ASSERT_LOG_ONLY((free_blocks < (1 << 23)), "free blocks must be less than 2^23");
+	VDO_ASSERT_LOG_ONLY((free_blocks < (1 << 23)), "free blocks must be less than 2^23");
 
	if (free_blocks == 0)
		return 0;
@@ -528,13 +528,13 @@ static void adjust_slab_journal_block_reference(struct slab_journal *journal,
		return;
	}
 
-	ASSERT_LOG_ONLY((adjustment != 0), "adjustment must be non-zero");
+	VDO_ASSERT_LOG_ONLY((adjustment != 0), "adjustment must be non-zero");
	lock = get_lock(journal, sequence_number);
	if (adjustment < 0) {
-		ASSERT_LOG_ONLY((-adjustment <= lock->count),
-				"adjustment %d of lock count %u for slab journal block %llu must not underflow",
-				adjustment, lock->count,
-				(unsigned long long) sequence_number);
+		VDO_ASSERT_LOG_ONLY((-adjustment <= lock->count),
+				    "adjustment %d of lock count %u for slab journal block %llu must not underflow",
+				    adjustment, lock->count,
+				    (unsigned long long) sequence_number);
	}
 
	lock->count += adjustment;
@@ -657,16 +657,16 @@ static void reopen_slab_journal(struct vdo_slab *slab)
	struct slab_journal *journal = &slab->journal;
	sequence_number_t block;
 
-	ASSERT_LOG_ONLY(journal->tail_header.entry_count == 0,
-			"vdo_slab journal's active block empty before reopening");
+	VDO_ASSERT_LOG_ONLY(journal->tail_header.entry_count == 0,
+			    "vdo_slab journal's active block empty before reopening");
	journal->head = journal->tail;
	initialize_journal_state(journal);
 
	/* Ensure no locks are spuriously held on an empty journal. */
	for (block = 1; block <= journal->size; block++) {
-		ASSERT_LOG_ONLY((get_lock(journal, block)->count == 0),
-				"Scrubbed journal's block %llu is not locked",
-				(unsigned long long) block);
+		VDO_ASSERT_LOG_ONLY((get_lock(journal, block)->count == 0),
+				    "Scrubbed journal's block %llu is not locked",
+				    (unsigned long long) block);
	}
 
	add_entries(journal);
@@ -753,7 +753,7 @@ static void write_slab_journal_block(struct vdo_waiter *waiter, void *context)
 
	/* Copy the tail block into the vio.
*/ memcpy(pooled->vio.data, journal->block, VDO_BLOCK_SIZE); - ASSERT_LOG_ONLY(unused_entries >= 0, "vdo_slab journal block is not overfull"); + VDO_ASSERT_LOG_ONLY(unused_entries >= 0, "vdo_slab journal block is not overfull"); if (unused_entries > 0) { /* * Release the per-entry locks for any unused entries in the block we are about to @@ -902,22 +902,22 @@ static void add_entry(struct slab_journal *journal, physical_block_number_t pbn, struct packed_slab_journal_block *block = journal->block; int result; - result = ASSERT(vdo_before_journal_point(&journal->tail_header.recovery_point, - &recovery_point), - "recovery journal point is monotonically increasing, recovery point: %llu.%u, block recovery point: %llu.%u", - (unsigned long long) recovery_point.sequence_number, - recovery_point.entry_count, - (unsigned long long) journal->tail_header.recovery_point.sequence_number, - journal->tail_header.recovery_point.entry_count); + result = VDO_ASSERT(vdo_before_journal_point(&journal->tail_header.recovery_point, + &recovery_point), + "recovery journal point is monotonically increasing, recovery point: %llu.%u, block recovery point: %llu.%u", + (unsigned long long) recovery_point.sequence_number, + recovery_point.entry_count, + (unsigned long long) journal->tail_header.recovery_point.sequence_number, + journal->tail_header.recovery_point.entry_count); if (result != VDO_SUCCESS) { vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result); return; } if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) { - result = ASSERT((journal->tail_header.entry_count < - journal->full_entries_per_block), - "block has room for full entries"); + result = VDO_ASSERT((journal->tail_header.entry_count < + journal->full_entries_per_block), + "block has room for full entries"); if (result != VDO_SUCCESS) { vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result); @@ -1365,8 +1365,8 @@ static unsigned int calculate_slab_priority(struct vdo_slab *slab) */ static void prioritize_slab(struct vdo_slab *slab) { - ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), - "a slab must not already be on a ring when prioritizing"); + VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), + "a slab must not already be on a ring when prioritizing"); slab->priority = calculate_slab_priority(slab); vdo_priority_table_enqueue(slab->allocator->prioritized_slabs, slab->priority, &slab->allocq_entry); @@ -1649,8 +1649,8 @@ static int __must_check adjust_reference_count(struct vdo_slab *slab, * the last time it was clean. We must release the per-entry slab journal lock for * the entry associated with the update we are now doing. */ - result = ASSERT(is_valid_journal_point(slab_journal_point), - "Reference count adjustments need slab journal points."); + result = VDO_ASSERT(is_valid_journal_point(slab_journal_point), + "Reference count adjustments need slab journal points."); if (result != VDO_SUCCESS) return result; @@ -1819,16 +1819,16 @@ static void add_entries(struct slab_journal *journal) * scrubbing thresholds, this should never happen. */ if (lock->count > 0) { - ASSERT_LOG_ONLY((journal->head + journal->size) == journal->tail, - "New block has locks, but journal is not full"); + VDO_ASSERT_LOG_ONLY((journal->head + journal->size) == journal->tail, + "New block has locks, but journal is not full"); /* * The blocking threshold must let the journal fill up if the new * block has locks; if the blocking threshold is smaller than the * journal size, the new block cannot possibly have locks already. 
*/ - ASSERT_LOG_ONLY((journal->blocking_threshold >= journal->size), - "New block can have locks already iff blocking threshold is at the end of the journal"); + VDO_ASSERT_LOG_ONLY((journal->blocking_threshold >= journal->size), + "New block can have locks already iff blocking threshold is at the end of the journal"); WRITE_ONCE(journal->events->disk_full_count, journal->events->disk_full_count + 1); @@ -2355,9 +2355,9 @@ static int allocate_slab_counters(struct vdo_slab *slab) int result; size_t index, bytes; - result = ASSERT(slab->reference_blocks == NULL, - "vdo_slab %u doesn't allocate refcounts twice", - slab->slab_number); + result = VDO_ASSERT(slab->reference_blocks == NULL, + "vdo_slab %u doesn't allocate refcounts twice", + slab->slab_number); if (result != VDO_SUCCESS) return result; @@ -2497,9 +2497,9 @@ static void load_slab_journal(struct vdo_slab *slab) * 1. This is impossible, due to the scrubbing threshold, on a real system, so * don't bother reading the (bogus) data off disk. */ - ASSERT_LOG_ONLY(((journal->size < 16) || - (journal->scrubbing_threshold < (journal->size - 1))), - "Scrubbing threshold protects against reads of unwritten slab journal blocks"); + VDO_ASSERT_LOG_ONLY(((journal->size < 16) || + (journal->scrubbing_threshold < (journal->size - 1))), + "Scrubbing threshold protects against reads of unwritten slab journal blocks"); vdo_finish_loading_with_result(&slab->state, allocate_counters_if_clean(slab)); return; @@ -2513,8 +2513,8 @@ static void register_slab_for_scrubbing(struct vdo_slab *slab, bool high_priorit { struct slab_scrubber *scrubber = &slab->allocator->scrubber; - ASSERT_LOG_ONLY((slab->status != VDO_SLAB_REBUILT), - "slab to be scrubbed is unrecovered"); + VDO_ASSERT_LOG_ONLY((slab->status != VDO_SLAB_REBUILT), + "slab to be scrubbed is unrecovered"); if (slab->status != VDO_SLAB_REQUIRES_SCRUBBING) return; @@ -2541,17 +2541,17 @@ static void queue_slab(struct vdo_slab *slab) block_count_t free_blocks; int result; - ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), + VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), "a requeued slab must not already be on a ring"); if (vdo_is_read_only(allocator->depot->vdo)) return; free_blocks = slab->free_blocks; - result = ASSERT((free_blocks <= allocator->depot->slab_config.data_blocks), - "rebuilt slab %u must have a valid free block count (has %llu, expected maximum %llu)", - slab->slab_number, (unsigned long long) free_blocks, - (unsigned long long) allocator->depot->slab_config.data_blocks); + result = VDO_ASSERT((free_blocks <= allocator->depot->slab_config.data_blocks), + "rebuilt slab %u must have a valid free block count (has %llu, expected maximum %llu)", + slab->slab_number, (unsigned long long) free_blocks, + (unsigned long long) allocator->depot->slab_config.data_blocks); if (result != VDO_SUCCESS) { vdo_enter_read_only_mode(allocator->depot->vdo, result); return; @@ -2874,9 +2874,9 @@ static void apply_journal_entries(struct vdo_completion *completion) * At the end of rebuild, the reference counters should be accurate to the end of the * journal we just applied. 
*/ - result = ASSERT(!vdo_before_journal_point(&last_entry_applied, - &ref_counts_point), - "Refcounts are not more accurate than the slab journal"); + result = VDO_ASSERT(!vdo_before_journal_point(&last_entry_applied, + &ref_counts_point), + "Refcounts are not more accurate than the slab journal"); if (result != VDO_SUCCESS) { abort_scrubbing(scrubber, result); return; @@ -2987,8 +2987,8 @@ static void scrub_slabs(struct block_allocator *allocator, struct vdo_completion static inline void assert_on_allocator_thread(thread_id_t thread_id, const char *function_name) { - ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == thread_id), - "%s called on correct thread", function_name); + VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == thread_id), + "%s called on correct thread", function_name); } static void register_slab_with_allocator(struct block_allocator *allocator, @@ -3136,8 +3136,8 @@ static int __must_check allocate_slab_block(struct vdo_slab *slab, if (!search_reference_blocks(slab, &free_index)) return VDO_NO_SPACE; - ASSERT_LOG_ONLY((slab->counters[free_index] == EMPTY_REFERENCE_COUNT), - "free block must have ref count of zero"); + VDO_ASSERT_LOG_ONLY((slab->counters[free_index] == EMPTY_REFERENCE_COUNT), + "free block must have ref count of zero"); make_provisional_reference(slab, free_index); adjust_free_block_count(slab, false); @@ -3844,8 +3844,8 @@ static bool __must_check release_recovery_journal_lock(struct slab_journal *jour sequence_number_t recovery_lock) { if (recovery_lock > journal->recovery_lock) { - ASSERT_LOG_ONLY((recovery_lock < journal->recovery_lock), - "slab journal recovery lock is not older than the recovery journal head"); + VDO_ASSERT_LOG_ONLY((recovery_lock < journal->recovery_lock), + "slab journal recovery lock is not older than the recovery journal head"); return false; } @@ -4659,8 +4659,8 @@ int vdo_prepare_to_grow_slab_depot(struct slab_depot *depot, return VDO_INCREMENT_TOO_SMALL; /* Generate the depot configuration for the new block count. 
*/ - ASSERT_LOG_ONLY(depot->first_block == partition->offset, - "New slab depot partition doesn't change origin"); + VDO_ASSERT_LOG_ONLY(depot->first_block == partition->offset, + "New slab depot partition doesn't change origin"); result = vdo_configure_slab_depot(partition, depot->slab_config, depot->zone_count, &new_state); if (result != VDO_SUCCESS) @@ -4734,7 +4734,7 @@ static void register_new_slabs(void *context, zone_count_t zone_number, */ void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent) { - ASSERT_LOG_ONLY(depot->new_slabs != NULL, "Must have new slabs to use"); + VDO_ASSERT_LOG_ONLY(depot->new_slabs != NULL, "Must have new slabs to use"); vdo_schedule_operation(depot->action_manager, VDO_ADMIN_STATE_SUSPENDED_OPERATION, NULL, register_new_slabs, @@ -4790,8 +4790,8 @@ static void do_drain_step(struct vdo_completion *completion) return; case VDO_DRAIN_ALLOCATOR_STEP_FINISHED: - ASSERT_LOG_ONLY(!is_vio_pool_busy(allocator->vio_pool), - "vio pool not busy"); + VDO_ASSERT_LOG_ONLY(!is_vio_pool_busy(allocator->vio_pool), + "vio pool not busy"); vdo_finish_draining_with_result(&allocator->state, completion->result); return; diff --git a/drivers/md/dm-vdo/thread-registry.c b/drivers/md/dm-vdo/thread-registry.c index 03e2f45e8e7874..d4a077d58c60c4 100644 --- a/drivers/md/dm-vdo/thread-registry.c +++ b/drivers/md/dm-vdo/thread-registry.c @@ -44,7 +44,7 @@ void vdo_register_thread(struct thread_registry *registry, list_add_tail_rcu(&new_thread->links, ®istry->links); spin_unlock(®istry->lock); - ASSERT_LOG_ONLY(!found_it, "new thread not already in registry"); + VDO_ASSERT_LOG_ONLY(!found_it, "new thread not already in registry"); if (found_it) { /* Ensure no RCU iterators see it before re-initializing. */ synchronize_rcu(); @@ -67,7 +67,7 @@ void vdo_unregister_thread(struct thread_registry *registry) } spin_unlock(®istry->lock); - ASSERT_LOG_ONLY(found_it, "thread found in registry"); + VDO_ASSERT_LOG_ONLY(found_it, "thread found in registry"); if (found_it) { /* Ensure no RCU iterators see it before re-initializing. 
*/ synchronize_rcu(); diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c index 283bda0c830728..5fbdeccf3fc606 100644 --- a/drivers/md/dm-vdo/vdo.c +++ b/drivers/md/dm-vdo/vdo.c @@ -427,9 +427,9 @@ int vdo_make_thread(struct vdo *vdo, thread_id_t thread_id, type = &default_queue_type; if (thread->queue != NULL) { - return ASSERT(vdo_work_queue_type_is(thread->queue, type), - "already constructed vdo thread %u is of the correct type", - thread_id); + return VDO_ASSERT(vdo_work_queue_type_is(thread->queue, type), + "already constructed vdo thread %u is of the correct type", + thread_id); } thread->vdo = vdo; @@ -450,8 +450,8 @@ static int register_vdo(struct vdo *vdo) int result; write_lock(®istry.lock); - result = ASSERT(filter_vdos_locked(vdo_is_equal, vdo) == NULL, - "VDO not already registered"); + result = VDO_ASSERT(filter_vdos_locked(vdo_is_equal, vdo) == NULL, + "VDO not already registered"); if (result == VDO_SUCCESS) { INIT_LIST_HEAD(&vdo->registration); list_add_tail(&vdo->registration, ®istry.links); @@ -1000,8 +1000,8 @@ int vdo_register_read_only_listener(struct vdo *vdo, void *listener, struct read_only_listener *read_only_listener; int result; - result = ASSERT(thread_id != vdo->thread_config.dedupe_thread, - "read only listener not registered on dedupe thread"); + result = VDO_ASSERT(thread_id != vdo->thread_config.dedupe_thread, + "read only listener not registered on dedupe thread"); if (result != VDO_SUCCESS) return result; @@ -1654,8 +1654,8 @@ void vdo_dump_status(const struct vdo *vdo) */ void vdo_assert_on_admin_thread(const struct vdo *vdo, const char *name) { - ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == vdo->thread_config.admin_thread), - "%s called on admin thread", name); + VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == vdo->thread_config.admin_thread), + "%s called on admin thread", name); } /** @@ -1668,9 +1668,9 @@ void vdo_assert_on_admin_thread(const struct vdo *vdo, const char *name) void vdo_assert_on_logical_zone_thread(const struct vdo *vdo, zone_count_t logical_zone, const char *name) { - ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == - vdo->thread_config.logical_threads[logical_zone]), - "%s called on logical thread", name); + VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == + vdo->thread_config.logical_threads[logical_zone]), + "%s called on logical thread", name); } /** @@ -1683,9 +1683,9 @@ void vdo_assert_on_logical_zone_thread(const struct vdo *vdo, zone_count_t logic void vdo_assert_on_physical_zone_thread(const struct vdo *vdo, zone_count_t physical_zone, const char *name) { - ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == - vdo->thread_config.physical_threads[physical_zone]), - "%s called on physical thread", name); + VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == + vdo->thread_config.physical_threads[physical_zone]), + "%s called on physical thread", name); } /** @@ -1723,7 +1723,7 @@ int vdo_get_physical_zone(const struct vdo *vdo, physical_block_number_t pbn, /* With the PBN already checked, we should always succeed in finding a slab. 
*/ slab = vdo_get_slab(vdo->depot, pbn); - result = ASSERT(slab != NULL, "vdo_get_slab must succeed on all valid PBNs"); + result = VDO_ASSERT(slab != NULL, "vdo_get_slab must succeed on all valid PBNs"); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index f4441f9ff77235..edcb010ab125c6 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -82,14 +82,14 @@ int allocate_vio_components(struct vdo *vdo, enum vio_type vio_type, struct bio *bio; int result; - result = ASSERT(block_count <= MAX_BLOCKS_PER_VIO, - "block count %u does not exceed maximum %u", block_count, - MAX_BLOCKS_PER_VIO); + result = VDO_ASSERT(block_count <= MAX_BLOCKS_PER_VIO, + "block count %u does not exceed maximum %u", block_count, + MAX_BLOCKS_PER_VIO); if (result != VDO_SUCCESS) return result; - result = ASSERT(((vio_type != VIO_TYPE_UNINITIALIZED) && (vio_type != VIO_TYPE_DATA)), - "%d is a metadata type", vio_type); + result = VDO_ASSERT(((vio_type != VIO_TYPE_UNINITIALIZED) && (vio_type != VIO_TYPE_DATA)), + "%d is a metadata type", vio_type); if (result != VDO_SUCCESS) return result; @@ -364,13 +364,13 @@ void free_vio_pool(struct vio_pool *pool) return; /* Remove all available vios from the object pool. */ - ASSERT_LOG_ONLY(!vdo_waitq_has_waiters(&pool->waiting), - "VIO pool must not have any waiters when being freed"); - ASSERT_LOG_ONLY((pool->busy_count == 0), - "VIO pool must not have %zu busy entries when being freed", - pool->busy_count); - ASSERT_LOG_ONLY(list_empty(&pool->busy), - "VIO pool must not have busy entries when being freed"); + VDO_ASSERT_LOG_ONLY(!vdo_waitq_has_waiters(&pool->waiting), + "VIO pool must not have any waiters when being freed"); + VDO_ASSERT_LOG_ONLY((pool->busy_count == 0), + "VIO pool must not have %zu busy entries when being freed", + pool->busy_count); + VDO_ASSERT_LOG_ONLY(list_empty(&pool->busy), + "VIO pool must not have busy entries when being freed"); list_for_each_entry_safe(pooled, tmp, &pool->available, pool_entry) { list_del(&pooled->pool_entry); @@ -378,8 +378,8 @@ void free_vio_pool(struct vio_pool *pool) pool->size--; } - ASSERT_LOG_ONLY(pool->size == 0, - "VIO pool must not have missing entries when being freed"); + VDO_ASSERT_LOG_ONLY(pool->size == 0, + "VIO pool must not have missing entries when being freed"); vdo_free(vdo_forget(pool->buffer)); vdo_free(pool); @@ -404,8 +404,8 @@ void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter) { struct pooled_vio *pooled; - ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()), - "acquire from active vio_pool called from correct thread"); + VDO_ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()), + "acquire from active vio_pool called from correct thread"); if (list_empty(&pool->available)) { vdo_waitq_enqueue_waiter(&pool->waiting, waiter); @@ -425,8 +425,8 @@ void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter) */ void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio) { - ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()), - "vio pool entry returned on same thread as it was acquired"); + VDO_ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()), + "vio pool entry returned on same thread as it was acquired"); vio->vio.completion.error_handler = NULL; vio->vio.completion.parent = NULL; @@ -466,8 +466,8 @@ void vdo_count_bios(struct atomic_bio_stats *bio_stats, struct bio *bio) * shouldn't exist. 
*/ default: - ASSERT_LOG_ONLY(0, "Bio operation %d not a write, read, discard, or empty flush", - bio_op(bio)); + VDO_ASSERT_LOG_ONLY(0, "Bio operation %d not a write, read, discard, or empty flush", + bio_op(bio)); } if ((bio->bi_opf & REQ_PREFLUSH) != 0) diff --git a/drivers/md/dm-vdo/vio.h b/drivers/md/dm-vdo/vio.h index fbfee5e3415da4..3490e9f59b04aa 100644 --- a/drivers/md/dm-vdo/vio.h +++ b/drivers/md/dm-vdo/vio.h @@ -67,10 +67,10 @@ static inline void assert_vio_in_bio_zone(struct vio *vio) thread_id_t expected = get_vio_bio_zone_thread_id(vio); thread_id_t thread_id = vdo_get_callback_thread_id(); - ASSERT_LOG_ONLY((expected == thread_id), - "vio I/O for physical block %llu on thread %u, should be on bio zone thread %u", - (unsigned long long) pbn_from_vio_bio(vio->bio), thread_id, - expected); + VDO_ASSERT_LOG_ONLY((expected == thread_id), + "vio I/O for physical block %llu on thread %u, should be on bio zone thread %u", + (unsigned long long) pbn_from_vio_bio(vio->bio), thread_id, + expected); } int vdo_create_bio(struct bio **bio_ptr); From 16ee863acc220c4cb9f4f3c7f1449c066cbeef03 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 13 Feb 2024 16:03:01 -0600 Subject: [PATCH 0914/1406] dm vdo encodings: update some stale comments Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/encodings.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c index f330f67a4dca06..1f0fae2a4e8a26 100644 --- a/drivers/md/dm-vdo/encodings.c +++ b/drivers/md/dm-vdo/encodings.c @@ -546,8 +546,6 @@ const char *vdo_get_journal_operation_name(enum journal_operation operation) /** * encode_slab_depot_state_2_0() - Encode the state of a slab depot into a buffer. - * - * Return: UDS_SUCCESS or an error. */ static void encode_slab_depot_state_2_0(u8 *buffer, size_t *offset, struct slab_depot_state_2_0 state) @@ -575,7 +573,7 @@ static void encode_slab_depot_state_2_0(u8 *buffer, size_t *offset, /** * decode_slab_depot_state_2_0() - Decode slab depot component state version 2.0 from a buffer. * - * Return: UDS_SUCCESS or an error code. + * Return: VDO_SUCCESS or an error code. */ static int decode_slab_depot_state_2_0(u8 *buffer, size_t *offset, struct slab_depot_state_2_0 *state) From 86727d98f86075e91c8a8c0c547513b1d25c8faa Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 13 Feb 2024 16:03:47 -0600 Subject: [PATCH 0915/1406] dm vdo target: eliminate inappropriate uses of UDS_SUCCESS Most uses should be VDO_SUCCESS. But comparing the return from kstrtouint() with UDS_SUCCESS (which happens to be 0) made no sense.
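As an illustrative aside (an editor's sketch, not code from this patch): kstrtouint() returns 0 on success or a negative errno such as -EINVAL or -ERANGE, so the idiomatic check tests the result directly rather than comparing it against a success constant from an unrelated namespace. Assuming a hypothetical 'value' string:

	unsigned int count;
	int result;

	result = kstrtouint(value, 10, &count);
	if (result)		/* 0 on success, negative errno on failure */
		return result;	/* propagate -EINVAL/-ERANGE unchanged */

	/* ... use count ..., then report success with VDO's own code. */
	return VDO_SUCCESS;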
Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/dm-vdo-target.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c index f1c5e656f4ddb7..322ce52bcc2c43 100644 --- a/drivers/md/dm-vdo/dm-vdo-target.c +++ b/drivers/md/dm-vdo/dm-vdo-target.c @@ -318,7 +318,7 @@ static int split_string(const char *string, char separator, char ***substring_ar current_substring++; /* substrings[current_substring] is NULL already */ *substring_array_ptr = substrings; - return UDS_SUCCESS; + return VDO_SUCCESS; } /* @@ -356,7 +356,7 @@ static int join_strings(char **substring_array, size_t array_length, char separa *(current_position - 1) = '\0'; *string_ptr = output; - return UDS_SUCCESS; + return VDO_SUCCESS; } /** @@ -484,7 +484,7 @@ static int parse_one_thread_config_spec(const char *spec, int result; result = split_string(spec, '=', &fields); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; if ((fields[0] == NULL) || (fields[1] == NULL) || (fields[2] != NULL)) { @@ -495,7 +495,7 @@ static int parse_one_thread_config_spec(const char *spec, } result = kstrtouint(fields[1], 10, &count); - if (result != UDS_SUCCESS) { + if (result) { uds_log_error("thread config string error: integer value needed, found \"%s\"", fields[1]); free_string_array(fields); @@ -537,7 +537,7 @@ static int parse_thread_config_string(const char *string, unsigned int i; result = split_string(string, ',', &specs); - if (result != UDS_SUCCESS) + if (result != VDO_SUCCESS) return result; for (i = 0; specs[i] != NULL; i++) { @@ -607,7 +607,7 @@ static int parse_one_key_value_pair(const char *key, const char *value, /* The remaining arguments must have integral values. */ result = kstrtouint(value, 10, &count); - if (result != UDS_SUCCESS) { + if (result) { uds_log_error("optional config string error: integer value needed, found \"%s\"", value); return result; @@ -2886,7 +2886,7 @@ static int __init vdo_init(void) /* Add VDO errors to the already existing set of errors in UDS. */ result = vdo_register_status_codes(); - if (result != UDS_SUCCESS) { + if (result != VDO_SUCCESS) { uds_log_error("vdo_register_status_codes failed %d", result); vdo_module_destroy(); return result; From 958115d11d28762b4d35fb12b4181987b6ff906a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 14 Feb 2024 08:53:29 -0600 Subject: [PATCH 0916/1406] dm vdo logger: remove log level to string conversion code It was only used by sysfs code and can be reinstated if/when needed.
Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/logger.c | 53 -------------------------------------- drivers/md/dm-vdo/logger.h | 4 --- 2 files changed, 57 deletions(-) diff --git a/drivers/md/dm-vdo/logger.c b/drivers/md/dm-vdo/logger.c index 6dc29219f70259..aaab2f1f0c53d2 100644 --- a/drivers/md/dm-vdo/logger.c +++ b/drivers/md/dm-vdo/logger.c @@ -16,39 +16,6 @@ #include "thread-device.h" #include "thread-utils.h" -struct priority_name { - const char *name; - const int priority; -}; - -static const struct priority_name PRIORITIES[] = { - { "ALERT", UDS_LOG_ALERT }, - { "CRITICAL", UDS_LOG_CRIT }, - { "CRIT", UDS_LOG_CRIT }, - { "DEBUG", UDS_LOG_DEBUG }, - { "EMERGENCY", UDS_LOG_EMERG }, - { "EMERG", UDS_LOG_EMERG }, - { "ERROR", UDS_LOG_ERR }, - { "ERR", UDS_LOG_ERR }, - { "INFO", UDS_LOG_INFO }, - { "NOTICE", UDS_LOG_NOTICE }, - { "PANIC", UDS_LOG_EMERG }, - { "WARN", UDS_LOG_WARNING }, - { "WARNING", UDS_LOG_WARNING }, - { NULL, -1 }, -}; - -static const char *const PRIORITY_STRINGS[] = { - "EMERGENCY", - "ALERT", - "CRITICAL", - "ERROR", - "WARN", - "NOTICE", - "INFO", - "DEBUG", -}; - int log_level = UDS_LOG_DEFAULT; int uds_get_log_level(void) @@ -62,26 +29,6 @@ int uds_get_log_level(void) return log_level_latch; } -int uds_log_string_to_priority(const char *string) -{ - int i; - - for (i = 0; PRIORITIES[i].name != NULL; i++) { - if (strcasecmp(string, PRIORITIES[i].name) == 0) - return PRIORITIES[i].priority; - } - - return UDS_LOG_INFO; -} - -const char *uds_log_priority_to_string(int priority) -{ - if ((priority < 0) || (priority >= (int) ARRAY_SIZE(PRIORITY_STRINGS))) - return "unknown"; - - return PRIORITY_STRINGS[priority]; -} - static const char *get_current_interrupt_type(void) { if (in_nmi()) diff --git a/drivers/md/dm-vdo/logger.h b/drivers/md/dm-vdo/logger.h index 2da2bd351578d6..2e6e921c8d6311 100644 --- a/drivers/md/dm-vdo/logger.h +++ b/drivers/md/dm-vdo/logger.h @@ -45,10 +45,6 @@ extern int log_level; int uds_get_log_level(void); -int uds_log_string_to_priority(const char *string); - -const char *uds_log_priority_to_string(int priority); - void uds_log_embedded_message(int priority, const char *module, const char *prefix, const char *fmt1, va_list args1, const char *fmt2, ...) __printf(4, 0) __printf(6, 7); From 82cbad98d1f9f5ccebdda6c1208a3bfeed12afd3 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 14 Feb 2024 09:22:04 -0600 Subject: [PATCH 0917/1406] dm vdo logger: change from uds_ to vdo_ namespace Rename all uds_log_* to vdo_log_*. Also fixup some other related code (e.g. permassert renames). 
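As an editor's sketch (not code from this patch) of the renamed permassert macros, with semantics inferred from the call sites changed earlier in this series: VDO_ASSERT() returns a status the caller can propagate (VDO_SUCCESS or an error, often used to enter read-only mode), while VDO_ASSERT_LOG_ONLY() merely logs a failed condition without affecting control flow. The condition and message below are hypothetical:

	int result;

	/* Caller-visible assertion: the failure is propagated. */
	result = VDO_ASSERT(free_blocks <= data_blocks,
			    "free block count must not exceed slab capacity");
	if (result != VDO_SUCCESS)
		return result;

	/* Log-only assertion: the violation is logged, execution continues. */
	VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry),
			    "a slab must not already be on a ring");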
Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/admin-state.c | 8 +- drivers/md/dm-vdo/block-map.c | 20 +-- drivers/md/dm-vdo/data-vio.c | 18 +-- drivers/md/dm-vdo/dedupe.c | 36 +++--- drivers/md/dm-vdo/dm-vdo-target.c | 144 ++++++++++----------- drivers/md/dm-vdo/dump.c | 14 +- drivers/md/dm-vdo/encodings.c | 26 ++-- drivers/md/dm-vdo/errors.c | 6 +- drivers/md/dm-vdo/errors.h | 4 +- drivers/md/dm-vdo/flush.c | 8 +- drivers/md/dm-vdo/funnel-workqueue.c | 2 +- drivers/md/dm-vdo/indexer/chapter-index.c | 4 +- drivers/md/dm-vdo/indexer/config.c | 48 +++---- drivers/md/dm-vdo/indexer/delta-index.c | 50 +++---- drivers/md/dm-vdo/indexer/index-layout.c | 82 ++++++------ drivers/md/dm-vdo/indexer/index-page-map.c | 2 +- drivers/md/dm-vdo/indexer/index-session.c | 44 +++---- drivers/md/dm-vdo/indexer/index.c | 52 ++++---- drivers/md/dm-vdo/indexer/io-factory.c | 2 +- drivers/md/dm-vdo/indexer/open-chapter.c | 6 +- drivers/md/dm-vdo/indexer/volume-index.c | 46 +++---- drivers/md/dm-vdo/indexer/volume.c | 64 ++++----- drivers/md/dm-vdo/int-map.c | 2 +- drivers/md/dm-vdo/io-submitter.c | 4 +- drivers/md/dm-vdo/logger.c | 52 ++++---- drivers/md/dm-vdo/logger.h | 85 ++++++------ drivers/md/dm-vdo/logical-zone.c | 4 +- drivers/md/dm-vdo/memory-alloc.c | 14 +- drivers/md/dm-vdo/packer.c | 6 +- drivers/md/dm-vdo/permassert.c | 6 +- drivers/md/dm-vdo/permassert.h | 4 +- drivers/md/dm-vdo/physical-zone.c | 6 +- drivers/md/dm-vdo/recovery-journal.c | 16 +-- drivers/md/dm-vdo/repair.c | 46 +++---- drivers/md/dm-vdo/slab-depot.c | 54 ++++---- drivers/md/dm-vdo/status-codes.c | 6 +- drivers/md/dm-vdo/thread-utils.c | 2 +- drivers/md/dm-vdo/vdo.c | 14 +- drivers/md/dm-vdo/vio.c | 10 +- 39 files changed, 509 insertions(+), 508 deletions(-) diff --git a/drivers/md/dm-vdo/admin-state.c b/drivers/md/dm-vdo/admin-state.c index 603fd6e3406ab8..8663c1e916164a 100644 --- a/drivers/md/dm-vdo/admin-state.c +++ b/drivers/md/dm-vdo/admin-state.c @@ -228,12 +228,12 @@ static int __must_check begin_operation(struct admin_state *state, const struct admin_state_code *next_state = get_next_state(state, operation); if (next_state == NULL) { - result = uds_log_error_strerror(VDO_INVALID_ADMIN_STATE, + result = vdo_log_error_strerror(VDO_INVALID_ADMIN_STATE, "Can't start %s from %s", operation->name, vdo_get_admin_state_code(state)->name); } else if (state->waiter != NULL) { - result = uds_log_error_strerror(VDO_COMPONENT_BUSY, + result = vdo_log_error_strerror(VDO_COMPONENT_BUSY, "Can't start %s with extant waiter", operation->name); } else { @@ -291,7 +291,7 @@ static bool check_code(bool valid, const struct admin_state_code *code, const ch if (valid) return true; - result = uds_log_error_strerror(VDO_INVALID_ADMIN_STATE, + result = vdo_log_error_strerror(VDO_INVALID_ADMIN_STATE, "%s is not a %s", code->name, what); if (waiter != NULL) vdo_continue_completion(waiter, result); @@ -334,7 +334,7 @@ bool vdo_start_draining(struct admin_state *state, } if (!code->normal) { - uds_log_error_strerror(VDO_INVALID_ADMIN_STATE, "can't start %s from %s", + vdo_log_error_strerror(VDO_INVALID_ADMIN_STATE, "can't start %s from %s", operation->name, code->name); vdo_continue_completion(waiter, VDO_INVALID_ADMIN_STATE); return false; diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c index cd81eb2715e829..42ec3a252e7393 100644 --- a/drivers/md/dm-vdo/block-map.c +++ b/drivers/md/dm-vdo/block-map.c @@ -266,7 +266,7 @@ static void report_cache_pressure(struct vdo_page_cache *cache) ADD_ONCE(cache->stats.cache_pressure, 
1); if (cache->waiter_count > cache->page_count) { if ((cache->pressure_report % LOG_INTERVAL) == 0) - uds_log_info("page cache pressure %u", cache->stats.cache_pressure); + vdo_log_info("page cache pressure %u", cache->stats.cache_pressure); if (++cache->pressure_report >= DISPLAY_INTERVAL) cache->pressure_report = 0; @@ -485,7 +485,7 @@ static void complete_with_page(struct page_info *info, bool available = vdo_page_comp->writable ? is_present(info) : is_valid(info); if (!available) { - uds_log_error_strerror(VDO_BAD_PAGE, + vdo_log_error_strerror(VDO_BAD_PAGE, "Requested cache page %llu in state %s is not %s", (unsigned long long) info->pbn, get_page_state_name(info->state), @@ -565,7 +565,7 @@ static void set_persistent_error(struct vdo_page_cache *cache, const char *conte struct vdo *vdo = cache->vdo; if ((result != VDO_READ_ONLY) && !vdo_is_read_only(vdo)) { - uds_log_error_strerror(result, "VDO Page Cache persistent error: %s", + vdo_log_error_strerror(result, "VDO Page Cache persistent error: %s", context); vdo_enter_read_only_mode(vdo, result); } @@ -706,7 +706,7 @@ static void page_is_loaded(struct vdo_completion *completion) validity = vdo_validate_block_map_page(page, nonce, info->pbn); if (validity == VDO_BLOCK_MAP_PAGE_BAD) { physical_block_number_t pbn = vdo_get_block_map_page_pbn(page); - int result = uds_log_error_strerror(VDO_BAD_PAGE, + int result = vdo_log_error_strerror(VDO_BAD_PAGE, "Expected page %llu but got page %llu instead", (unsigned long long) info->pbn, (unsigned long long) pbn); @@ -896,7 +896,7 @@ static void allocate_free_page(struct page_info *info) if (!vdo_waitq_has_waiters(&cache->free_waiters)) { if (cache->stats.cache_pressure > 0) { - uds_log_info("page cache pressure relieved"); + vdo_log_info("page cache pressure relieved"); WRITE_ONCE(cache->stats.cache_pressure, 0); } @@ -1014,7 +1014,7 @@ static void handle_page_write_error(struct vdo_completion *completion) /* If we're already read-only, write failures are to be expected. */ if (result != VDO_READ_ONLY) { - uds_log_ratelimit(uds_log_error, + vdo_log_ratelimit(vdo_log_error, "failed to write block map page %llu", (unsigned long long) info->pbn); } @@ -1399,7 +1399,7 @@ bool vdo_copy_valid_page(char *buffer, nonce_t nonce, } if (validity == VDO_BLOCK_MAP_PAGE_BAD) { - uds_log_error_strerror(VDO_BAD_PAGE, + vdo_log_error_strerror(VDO_BAD_PAGE, "Expected page %llu but got page %llu instead", (unsigned long long) pbn, (unsigned long long) vdo_get_block_map_page_pbn(loaded)); @@ -1787,7 +1787,7 @@ static void continue_with_loaded_page(struct data_vio *data_vio, vdo_unpack_block_map_entry(&page->entries[slot.block_map_slot.slot]); if (is_invalid_tree_entry(vdo_from_data_vio(data_vio), &mapping, lock->height)) { - uds_log_error_strerror(VDO_BAD_MAPPING, + vdo_log_error_strerror(VDO_BAD_MAPPING, "Invalid block map tree PBN: %llu with state %u for page index %u at height %u", (unsigned long long) mapping.pbn, mapping.state, lock->tree_slots[lock->height - 1].page_index, @@ -2265,7 +2265,7 @@ void vdo_find_block_map_slot(struct data_vio *data_vio) /* The page at this height has been allocated and loaded. 
*/ mapping = vdo_unpack_block_map_entry(&page->entries[tree_slot.block_map_slot.slot]); if (is_invalid_tree_entry(vdo_from_data_vio(data_vio), &mapping, lock->height)) { - uds_log_error_strerror(VDO_BAD_MAPPING, + vdo_log_error_strerror(VDO_BAD_MAPPING, "Invalid block map tree PBN: %llu with state %u for page index %u at height %u", (unsigned long long) mapping.pbn, mapping.state, lock->tree_slots[lock->height - 1].page_index, @@ -3142,7 +3142,7 @@ static int __must_check set_mapped_location(struct data_vio *data_vio, * Log the corruption even if we wind up ignoring it for write VIOs, converting all cases * to VDO_BAD_MAPPING. */ - uds_log_error_strerror(VDO_BAD_MAPPING, + vdo_log_error_strerror(VDO_BAD_MAPPING, "PBN %llu with state %u read from the block map was invalid", (unsigned long long) mapped.pbn, mapped.state); diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c index 5f6f0883850a6b..544c0e10bd37d0 100644 --- a/drivers/md/dm-vdo/data-vio.c +++ b/drivers/md/dm-vdo/data-vio.c @@ -794,25 +794,25 @@ static int initialize_data_vio(struct data_vio *data_vio, struct vdo *vdo) result = vdo_allocate_memory(VDO_BLOCK_SIZE, 0, "data_vio data", &data_vio->vio.data); if (result != VDO_SUCCESS) - return uds_log_error_strerror(result, + return vdo_log_error_strerror(result, "data_vio data allocation failure"); result = vdo_allocate_memory(VDO_BLOCK_SIZE, 0, "compressed block", &data_vio->compression.block); if (result != VDO_SUCCESS) { - return uds_log_error_strerror(result, + return vdo_log_error_strerror(result, "data_vio compressed block allocation failure"); } result = vdo_allocate_memory(VDO_BLOCK_SIZE, 0, "vio scratch", &data_vio->scratch_block); if (result != VDO_SUCCESS) - return uds_log_error_strerror(result, + return vdo_log_error_strerror(result, "data_vio scratch allocation failure"); result = vdo_create_bio(&bio); if (result != VDO_SUCCESS) - return uds_log_error_strerror(result, + return vdo_log_error_strerror(result, "data_vio data bio allocation failure"); vdo_initialize_completion(&data_vio->decrement_completion, vdo, @@ -1027,7 +1027,7 @@ void resume_data_vio_pool(struct data_vio_pool *pool, struct vdo_completion *com static void dump_limiter(const char *name, struct limiter *limiter) { - uds_log_info("%s: %u of %u busy (max %u), %s", name, limiter->busy, + vdo_log_info("%s: %u of %u busy (max %u), %s", name, limiter->busy, limiter->limit, limiter->max_busy, ((bio_list_empty(&limiter->waiters) && bio_list_empty(&limiter->new_waiters)) ? 
@@ -1325,7 +1325,7 @@ static void perform_cleanup_stage(struct data_vio *data_vio, if ((data_vio->recovery_sequence_number > 0) && (READ_ONCE(vdo->read_only_notifier.read_only_error) == VDO_SUCCESS) && (data_vio->vio.completion.result != VDO_READ_ONLY)) - uds_log_warning("VDO not read-only when cleaning data_vio with RJ lock"); + vdo_log_warning("VDO not read-only when cleaning data_vio with RJ lock"); fallthrough; case VIO_RELEASE_LOGICAL: @@ -1355,7 +1355,7 @@ static void enter_read_only_mode(struct vdo_completion *completion) if (completion->result != VDO_READ_ONLY) { struct data_vio *data_vio = as_data_vio(completion); - uds_log_error_strerror(completion->result, + vdo_log_error_strerror(completion->result, "Preparing to enter read-only mode: data_vio for LBN %llu (becoming mapped to %llu, previously mapped to %llu, allocated %llu) is completing with a fatal error after operation %s", (unsigned long long) data_vio->logical.lbn, (unsigned long long) data_vio->new_mapped.pbn, @@ -1451,14 +1451,14 @@ int uncompress_data_vio(struct data_vio *data_vio, &fragment_offset, &fragment_size); if (result != VDO_SUCCESS) { - uds_log_debug("%s: compressed fragment error %d", __func__, result); + vdo_log_debug("%s: compressed fragment error %d", __func__, result); return result; } size = LZ4_decompress_safe((block->data + fragment_offset), buffer, fragment_size, VDO_BLOCK_SIZE); if (size != VDO_BLOCK_SIZE) { - uds_log_debug("%s: lz4 error", __func__); + vdo_log_debug("%s: lz4 error", __func__); return VDO_INVALID_FRAGMENT; } diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index d7d1249cde8c5e..a9b18939559208 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -1293,7 +1293,7 @@ static bool acquire_provisional_reference(struct data_vio *agent, struct pbn_loc if (result == VDO_SUCCESS) return true; - uds_log_warning_strerror(result, + vdo_log_warning_strerror(result, "Error acquiring provisional reference for dedupe candidate; aborting dedupe"); agent->is_duplicate = false; vdo_release_physical_zone_pbn_lock(agent->duplicate.zone, @@ -1620,7 +1620,7 @@ static bool decode_uds_advice(struct dedupe_context *context) version = encoding->data[offset++]; if (version != UDS_ADVICE_VERSION) { - uds_log_error("invalid UDS advice version code %u", version); + vdo_log_error("invalid UDS advice version code %u", version); return false; } @@ -1631,7 +1631,7 @@ static bool decode_uds_advice(struct dedupe_context *context) /* Don't use advice that's clearly meaningless. */ if ((advice->state == VDO_MAPPING_STATE_UNMAPPED) || (advice->pbn == VDO_ZERO_BLOCK)) { - uds_log_debug("Invalid advice from deduplication server: pbn %llu, state %u. Giving up on deduplication of logical block %llu", + vdo_log_debug("Invalid advice from deduplication server: pbn %llu, state %u. 
Giving up on deduplication of logical block %llu", (unsigned long long) advice->pbn, advice->state, (unsigned long long) data_vio->logical.lbn); atomic64_inc(&vdo->stats.invalid_advice_pbn_count); @@ -1640,7 +1640,7 @@ static bool decode_uds_advice(struct dedupe_context *context) result = vdo_get_physical_zone(vdo, advice->pbn, &advice->zone); if ((result != VDO_SUCCESS) || (advice->zone == NULL)) { - uds_log_debug("Invalid physical block number from deduplication server: %llu, giving up on deduplication of logical block %llu", + vdo_log_debug("Invalid physical block number from deduplication server: %llu, giving up on deduplication of logical block %llu", (unsigned long long) advice->pbn, (unsigned long long) data_vio->logical.lbn); atomic64_inc(&vdo->stats.invalid_advice_pbn_count); @@ -2061,7 +2061,7 @@ static void close_index(struct hash_zones *zones) result = uds_close_index(zones->index_session); if (result != UDS_SUCCESS) - uds_log_error_strerror(result, "Error closing index"); + vdo_log_error_strerror(result, "Error closing index"); spin_lock(&zones->lock); zones->index_state = IS_CLOSED; zones->error_flag |= result != UDS_SUCCESS; @@ -2088,7 +2088,7 @@ static void open_index(struct hash_zones *zones) result = uds_open_index(create_flag ? UDS_CREATE : UDS_LOAD, &zones->parameters, zones->index_session); if (result != UDS_SUCCESS) - uds_log_error_strerror(result, "Error opening index"); + vdo_log_error_strerror(result, "Error opening index"); spin_lock(&zones->lock); if (!create_flag) { @@ -2112,7 +2112,7 @@ static void open_index(struct hash_zones *zones) zones->index_target = IS_CLOSED; zones->error_flag = true; spin_unlock(&zones->lock); - uds_log_info("Setting UDS index target state to error"); + vdo_log_info("Setting UDS index target state to error"); spin_lock(&zones->lock); } /* @@ -2168,7 +2168,7 @@ static void report_dedupe_timeouts(struct hash_zones *zones, unsigned int timeou u64 unreported = atomic64_read(&zones->timeouts); unreported -= zones->reported_timeouts; - uds_log_debug("UDS index timeout on %llu requests", + vdo_log_debug("UDS index timeout on %llu requests", (unsigned long long) unreported); zones->reported_timeouts += unreported; } @@ -2215,7 +2215,7 @@ static int initialize_index(struct vdo *vdo, struct hash_zones *zones) 1, NULL); if (result != VDO_SUCCESS) { uds_destroy_index_session(vdo_forget(zones->index_session)); - uds_log_error("UDS index queue initialization failed (%d)", result); + vdo_log_error("UDS index queue initialization failed (%d)", result); return result; } @@ -2511,7 +2511,7 @@ static void initiate_suspend_index(struct admin_state *state) result = uds_suspend_index_session(zones->index_session, save); if (result != UDS_SUCCESS) - uds_log_error_strerror(result, "Error suspending dedupe index"); + vdo_log_error_strerror(result, "Error suspending dedupe index"); } vdo_finish_draining(state); @@ -2594,7 +2594,7 @@ static void resume_index(void *context, struct vdo_completion *parent) zones->parameters.bdev = config->owned_device->bdev; result = uds_resume_index_session(zones->index_session, zones->parameters.bdev); if (result != UDS_SUCCESS) - uds_log_error_strerror(result, "Error resuming dedupe index"); + vdo_log_error_strerror(result, "Error resuming dedupe index"); spin_lock(&zones->lock); vdo_resume_if_quiescent(&zones->state); @@ -2674,7 +2674,7 @@ static void get_index_statistics(struct hash_zones *zones, result = uds_get_index_session_stats(zones->index_session, &index_stats); if (result != UDS_SUCCESS) { - 
uds_log_error_strerror(result, "Error reading index stats"); + vdo_log_error_strerror(result, "Error reading index stats"); return; } @@ -2759,7 +2759,7 @@ static void dump_hash_lock(const struct hash_lock *lock) * unambiguous. 'U' indicates a lock not registered in the map. */ state = get_hash_lock_state_name(lock->state); - uds_log_info(" hl %px: %3.3s %c%llu/%u rc=%u wc=%zu agt=%px", + vdo_log_info(" hl %px: %3.3s %c%llu/%u rc=%u wc=%zu agt=%px", lock, state, (lock->registered ? 'D' : 'U'), (unsigned long long) lock->duplicate.pbn, lock->duplicate.state, lock->reference_count, @@ -2793,11 +2793,11 @@ static void dump_hash_zone(const struct hash_zone *zone) data_vio_count_t i; if (zone->hash_lock_map == NULL) { - uds_log_info("struct hash_zone %u: NULL map", zone->zone_number); + vdo_log_info("struct hash_zone %u: NULL map", zone->zone_number); return; } - uds_log_info("struct hash_zone %u: mapSize=%zu", + vdo_log_info("struct hash_zone %u: mapSize=%zu", zone->zone_number, vdo_int_map_size(zone->hash_lock_map)); for (i = 0; i < LOCK_POOL_CAPACITY; i++) dump_hash_lock(&zone->lock_array[i]); @@ -2817,9 +2817,9 @@ void vdo_dump_hash_zones(struct hash_zones *zones) target = (zones->changing ? index_state_to_string(zones, zones->index_target) : NULL); spin_unlock(&zones->lock); - uds_log_info("UDS index: state: %s", state); + vdo_log_info("UDS index: state: %s", state); if (target != NULL) - uds_log_info("UDS index: changing to state: %s", target); + vdo_log_info("UDS index: changing to state: %s", target); for (zone = 0; zone < zones->zone_count; zone++) dump_hash_zone(&zones->zones[zone]); @@ -2966,7 +2966,7 @@ static void set_target_state(struct hash_zones *zones, enum index_state target, spin_unlock(&zones->lock); if (old_state != new_state) - uds_log_info("Setting UDS index target state to %s", new_state); + vdo_log_info("Setting UDS index target state to %s", new_state); } const char *vdo_get_dedupe_index_state_name(struct hash_zones *zones) diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c index 322ce52bcc2c43..240bfa0aa3da7e 100644 --- a/drivers/md/dm-vdo/dm-vdo-target.c +++ b/drivers/md/dm-vdo/dm-vdo-target.c @@ -236,9 +236,9 @@ static int get_version_number(int argc, char **argv, char **error_ptr, } if (*version_ptr != TABLE_VERSION) { - uds_log_warning("Detected version mismatch between kernel module and tools kernel: %d, tool: %d", + vdo_log_warning("Detected version mismatch between kernel module and tools kernel: %d, tool: %d", TABLE_VERSION, *version_ptr); - uds_log_warning("Please consider upgrading management tools to match kernel."); + vdo_log_warning("Please consider upgrading management tools to match kernel."); } return VDO_SUCCESS; } @@ -403,10 +403,10 @@ static int process_one_thread_config_spec(const char *thread_param_type, /* Handle limited thread parameters */ if (strcmp(thread_param_type, "bioRotationInterval") == 0) { if (count == 0) { - uds_log_error("thread config string error: 'bioRotationInterval' of at least 1 is required"); + vdo_log_error("thread config string error: 'bioRotationInterval' of at least 1 is required"); return -EINVAL; } else if (count > VDO_BIO_ROTATION_INTERVAL_LIMIT) { - uds_log_error("thread config string error: 'bioRotationInterval' cannot be higher than %d", + vdo_log_error("thread config string error: 'bioRotationInterval' cannot be higher than %d", VDO_BIO_ROTATION_INTERVAL_LIMIT); return -EINVAL; } @@ -415,7 +415,7 @@ static int process_one_thread_config_spec(const char *thread_param_type, } if 
(strcmp(thread_param_type, "logical") == 0) { if (count > MAX_VDO_LOGICAL_ZONES) { - uds_log_error("thread config string error: at most %d 'logical' threads are allowed", + vdo_log_error("thread config string error: at most %d 'logical' threads are allowed", MAX_VDO_LOGICAL_ZONES); return -EINVAL; } @@ -424,7 +424,7 @@ static int process_one_thread_config_spec(const char *thread_param_type, } if (strcmp(thread_param_type, "physical") == 0) { if (count > MAX_VDO_PHYSICAL_ZONES) { - uds_log_error("thread config string error: at most %d 'physical' threads are allowed", + vdo_log_error("thread config string error: at most %d 'physical' threads are allowed", MAX_VDO_PHYSICAL_ZONES); return -EINVAL; } @@ -433,7 +433,7 @@ static int process_one_thread_config_spec(const char *thread_param_type, } /* Handle other thread count parameters */ if (count > MAXIMUM_VDO_THREADS) { - uds_log_error("thread config string error: at most %d '%s' threads are allowed", + vdo_log_error("thread config string error: at most %d '%s' threads are allowed", MAXIMUM_VDO_THREADS, thread_param_type); return -EINVAL; } @@ -443,7 +443,7 @@ static int process_one_thread_config_spec(const char *thread_param_type, } if (strcmp(thread_param_type, "cpu") == 0) { if (count == 0) { - uds_log_error("thread config string error: at least one 'cpu' thread required"); + vdo_log_error("thread config string error: at least one 'cpu' thread required"); return -EINVAL; } config->cpu_threads = count; @@ -455,7 +455,7 @@ static int process_one_thread_config_spec(const char *thread_param_type, } if (strcmp(thread_param_type, "bio") == 0) { if (count == 0) { - uds_log_error("thread config string error: at least one 'bio' thread required"); + vdo_log_error("thread config string error: at least one 'bio' thread required"); return -EINVAL; } config->bio_threads = count; @@ -466,7 +466,7 @@ static int process_one_thread_config_spec(const char *thread_param_type, * Don't fail, just log. This will handle version mismatches between user mode tools and * kernel. 
*/ - uds_log_info("unknown thread parameter type \"%s\"", thread_param_type); + vdo_log_info("unknown thread parameter type \"%s\"", thread_param_type); return VDO_SUCCESS; } @@ -488,7 +488,7 @@ static int parse_one_thread_config_spec(const char *spec, return result; if ((fields[0] == NULL) || (fields[1] == NULL) || (fields[2] != NULL)) { - uds_log_error("thread config string error: expected thread parameter assignment, saw \"%s\"", + vdo_log_error("thread config string error: expected thread parameter assignment, saw \"%s\"", spec); free_string_array(fields); return -EINVAL; @@ -496,7 +496,7 @@ static int parse_one_thread_config_spec(const char *spec, result = kstrtouint(fields[1], 10, &count); if (result) { - uds_log_error("thread config string error: integer value needed, found \"%s\"", + vdo_log_error("thread config string error: integer value needed, found \"%s\"", fields[1]); free_string_array(fields); return result; @@ -568,12 +568,12 @@ static int process_one_key_value_pair(const char *key, unsigned int value, /* Non thread optional parameters */ if (strcmp(key, "maxDiscard") == 0) { if (value == 0) { - uds_log_error("optional parameter error: at least one max discard block required"); + vdo_log_error("optional parameter error: at least one max discard block required"); return -EINVAL; } /* Max discard sectors in blkdev_issue_discard is UINT_MAX >> 9 */ if (value > (UINT_MAX / VDO_BLOCK_SIZE)) { - uds_log_error("optional parameter error: at most %d max discard blocks are allowed", + vdo_log_error("optional parameter error: at most %d max discard blocks are allowed", UINT_MAX / VDO_BLOCK_SIZE); return -EINVAL; } @@ -608,7 +608,7 @@ static int parse_one_key_value_pair(const char *key, const char *value, /* The remaining arguments must have integral values. 
*/ result = kstrtouint(value, 10, &count); if (result) { - uds_log_error("optional config string error: integer value needed, found \"%s\"", + vdo_log_error("optional config string error: integer value needed, found \"%s\"", value); return result; } @@ -749,7 +749,7 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, return VDO_BAD_CONFIGURATION; } - uds_log_info("table line: %s", config->original_string); + vdo_log_info("table line: %s", config->original_string); config->thread_counts = (struct thread_count_config) { .bio_ack_threads = 1, @@ -876,7 +876,7 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, result = dm_get_device(ti, config->parent_device_name, dm_table_get_mode(ti->table), &config->owned_device); if (result != 0) { - uds_log_error("couldn't open device \"%s\": error %d", + vdo_log_error("couldn't open device \"%s\": error %d", config->parent_device_name, result); handle_parse_error(config, error_ptr, "Unable to open storage device"); return VDO_BAD_CONFIGURATION; @@ -1035,12 +1035,12 @@ static int __must_check process_vdo_message_locked(struct vdo *vdo, unsigned int return 0; } - uds_log_warning("invalid argument '%s' to dmsetup compression message", + vdo_log_warning("invalid argument '%s' to dmsetup compression message", argv[1]); return -EINVAL; } - uds_log_warning("unrecognized dmsetup message '%s' received", argv[0]); + vdo_log_warning("unrecognized dmsetup message '%s' received", argv[0]); return -EINVAL; } @@ -1097,7 +1097,7 @@ static int vdo_message(struct dm_target *ti, unsigned int argc, char **argv, int result; if (argc == 0) { - uds_log_warning("unspecified dmsetup message"); + vdo_log_warning("unspecified dmsetup message"); return -EINVAL; } @@ -1217,7 +1217,7 @@ static int perform_admin_operation(struct vdo *vdo, u32 starting_phase, struct vdo_administrator *admin = &vdo->admin; if (atomic_cmpxchg(&admin->busy, 0, 1) != 0) { - return uds_log_error_strerror(VDO_COMPONENT_BUSY, + return vdo_log_error_strerror(VDO_COMPONENT_BUSY, "Can't start %s operation, another operation is already in progress", type); } @@ -1291,7 +1291,7 @@ static int __must_check decode_from_super_block(struct vdo *vdo) * block, just accept it. 
*/ if (vdo->states.vdo.config.logical_blocks < config->logical_blocks) { - uds_log_warning("Growing logical size: a logical size of %llu blocks was specified, but that differs from the %llu blocks configured in the vdo super block", + vdo_log_warning("Growing logical size: a logical size of %llu blocks was specified, but that differs from the %llu blocks configured in the vdo super block", (unsigned long long) config->logical_blocks, (unsigned long long) vdo->states.vdo.config.logical_blocks); vdo->states.vdo.config.logical_blocks = config->logical_blocks; @@ -1334,14 +1334,14 @@ static int __must_check decode_vdo(struct vdo *vdo) journal_length = vdo_get_recovery_journal_length(vdo->states.vdo.config.recovery_journal_size); if (maximum_age > (journal_length / 2)) { - return uds_log_error_strerror(VDO_BAD_CONFIGURATION, + return vdo_log_error_strerror(VDO_BAD_CONFIGURATION, "maximum age: %llu exceeds limit %llu", (unsigned long long) maximum_age, (unsigned long long) (journal_length / 2)); } if (maximum_age == 0) { - return uds_log_error_strerror(VDO_BAD_CONFIGURATION, + return vdo_log_error_strerror(VDO_BAD_CONFIGURATION, "maximum age must be greater than 0"); } @@ -1457,19 +1457,19 @@ static int vdo_initialize(struct dm_target *ti, unsigned int instance, u64 logical_size = to_bytes(ti->len); block_count_t logical_blocks = logical_size / block_size; - uds_log_info("loading device '%s'", vdo_get_device_name(ti)); - uds_log_debug("Logical block size = %llu", (u64) config->logical_block_size); - uds_log_debug("Logical blocks = %llu", logical_blocks); - uds_log_debug("Physical block size = %llu", (u64) block_size); - uds_log_debug("Physical blocks = %llu", config->physical_blocks); - uds_log_debug("Block map cache blocks = %u", config->cache_size); - uds_log_debug("Block map maximum age = %u", config->block_map_maximum_age); - uds_log_debug("Deduplication = %s", (config->deduplication ? "on" : "off")); - uds_log_debug("Compression = %s", (config->compression ? "on" : "off")); + vdo_log_info("loading device '%s'", vdo_get_device_name(ti)); + vdo_log_debug("Logical block size = %llu", (u64) config->logical_block_size); + vdo_log_debug("Logical blocks = %llu", logical_blocks); + vdo_log_debug("Physical block size = %llu", (u64) block_size); + vdo_log_debug("Physical blocks = %llu", config->physical_blocks); + vdo_log_debug("Block map cache blocks = %u", config->cache_size); + vdo_log_debug("Block map maximum age = %u", config->block_map_maximum_age); + vdo_log_debug("Deduplication = %s", (config->deduplication ? "on" : "off")); + vdo_log_debug("Compression = %s", (config->compression ? "on" : "off")); vdo = vdo_find_matching(vdo_uses_device, config); if (vdo != NULL) { - uds_log_error("Existing vdo already uses device %s", + vdo_log_error("Existing vdo already uses device %s", vdo->device_config->parent_device_name); ti->error = "Cannot share storage device with already-running VDO"; return VDO_BAD_CONFIGURATION; @@ -1477,7 +1477,7 @@ static int vdo_initialize(struct dm_target *ti, unsigned int instance, result = vdo_make(instance, config, &ti->error, &vdo); if (result != VDO_SUCCESS) { - uds_log_error("Could not create VDO device. (VDO error %d, message %s)", + vdo_log_error("Could not create VDO device. (VDO error %d, message %s)", result, ti->error); vdo_destroy(vdo); return result; @@ -1489,7 +1489,7 @@ static int vdo_initialize(struct dm_target *ti, unsigned int instance, ti->error = ((result == VDO_INVALID_ADMIN_STATE) ? 
"Pre-load is only valid immediately after initialization" : "Cannot load metadata from device"); - uds_log_error("Could not start VDO device. (VDO error %d, message %s)", + vdo_log_error("Could not start VDO device. (VDO error %d, message %s)", result, ti->error); vdo_destroy(vdo); return result; @@ -1600,7 +1600,7 @@ static int construct_new_vdo_registered(struct dm_target *ti, unsigned int argc, result = parse_device_config(argc, argv, ti, &config); if (result != VDO_SUCCESS) { - uds_log_error_strerror(result, "parsing failed: %s", ti->error); + vdo_log_error_strerror(result, "parsing failed: %s", ti->error); release_instance(instance); return -EINVAL; } @@ -1729,7 +1729,7 @@ static int prepare_to_grow_physical(struct vdo *vdo, block_count_t new_physical_ int result; block_count_t current_physical_blocks = vdo->states.vdo.config.physical_blocks; - uds_log_info("Preparing to resize physical to %llu", + vdo_log_info("Preparing to resize physical to %llu", (unsigned long long) new_physical_blocks); VDO_ASSERT_LOG_ONLY((new_physical_blocks > current_physical_blocks), "New physical size is larger than current physical size"); @@ -1752,7 +1752,7 @@ static int prepare_to_grow_physical(struct vdo *vdo, block_count_t new_physical_ return result; } - uds_log_info("Done preparing to resize physical"); + vdo_log_info("Done preparing to resize physical"); return VDO_SUCCESS; } @@ -1829,7 +1829,7 @@ static int prepare_to_modify(struct dm_target *ti, struct device_config *config, if (config->logical_blocks > vdo->device_config->logical_blocks) { block_count_t logical_blocks = vdo->states.vdo.config.logical_blocks; - uds_log_info("Preparing to resize logical to %llu", + vdo_log_info("Preparing to resize logical to %llu", (unsigned long long) config->logical_blocks); VDO_ASSERT_LOG_ONLY((config->logical_blocks > logical_blocks), "New logical size is larger than current size"); @@ -1841,7 +1841,7 @@ static int prepare_to_modify(struct dm_target *ti, struct device_config *config, return result; } - uds_log_info("Done preparing to resize logical"); + vdo_log_info("Done preparing to resize logical"); } if (config->physical_blocks > vdo->device_config->physical_blocks) { @@ -1867,7 +1867,7 @@ static int prepare_to_modify(struct dm_target *ti, struct device_config *config, if (strcmp(config->parent_device_name, vdo->device_config->parent_device_name) != 0) { const char *device_name = vdo_get_device_name(config->owning_target); - uds_log_info("Updating backing device of %s from %s to %s", device_name, + vdo_log_info("Updating backing device of %s from %s to %s", device_name, vdo->device_config->parent_device_name, config->parent_device_name); } @@ -1885,7 +1885,7 @@ static int update_existing_vdo(const char *device_name, struct dm_target *ti, if (result != VDO_SUCCESS) return -EINVAL; - uds_log_info("preparing to modify device '%s'", device_name); + vdo_log_info("preparing to modify device '%s'", device_name); result = prepare_to_modify(ti, config, vdo); if (result != VDO_SUCCESS) { free_device_config(config); @@ -1935,12 +1935,12 @@ static void vdo_dtr(struct dm_target *ti) vdo_register_allocating_thread(&allocating_thread, NULL); device_name = vdo_get_device_name(ti); - uds_log_info("stopping device '%s'", device_name); + vdo_log_info("stopping device '%s'", device_name); if (vdo->dump_on_shutdown) vdo_dump_all(vdo, "device shutdown"); vdo_destroy(vdo_forget(vdo)); - uds_log_info("device '%s' stopped", device_name); + vdo_log_info("device '%s' stopped", device_name); vdo_unregister_thread_device_id(); 
vdo_unregister_allocating_thread(); release_instance(instance); @@ -2102,7 +2102,7 @@ static void vdo_postsuspend(struct dm_target *ti) vdo_register_thread_device_id(&instance_thread, &vdo->instance); device_name = vdo_get_device_name(vdo->device_config->owning_target); - uds_log_info("suspending device '%s'", device_name); + vdo_log_info("suspending device '%s'", device_name); /* * It's important to note any error here does not actually stop device-mapper from @@ -2116,12 +2116,12 @@ static void vdo_postsuspend(struct dm_target *ti) * Treat VDO_READ_ONLY as a success since a read-only suspension still leaves the * VDO suspended. */ - uds_log_info("device '%s' suspended", device_name); + vdo_log_info("device '%s' suspended", device_name); } else if (result == VDO_INVALID_ADMIN_STATE) { - uds_log_error("Suspend invoked while in unexpected state: %s", + vdo_log_error("Suspend invoked while in unexpected state: %s", vdo_get_admin_state(vdo)->name); } else { - uds_log_error_strerror(result, "Suspend of device '%s' failed", + vdo_log_error_strerror(result, "Suspend of device '%s' failed", device_name); } @@ -2293,13 +2293,13 @@ static void handle_load_error(struct vdo_completion *completion) if (vdo_state_requires_read_only_rebuild(vdo->load_state) && (vdo->admin.phase == LOAD_PHASE_MAKE_DIRTY)) { - uds_log_error_strerror(completion->result, "aborting load"); + vdo_log_error_strerror(completion->result, "aborting load"); vdo->admin.phase = LOAD_PHASE_DRAIN_JOURNAL; load_callback(vdo_forget(completion)); return; } - uds_log_error_strerror(completion->result, + vdo_log_error_strerror(completion->result, "Entering read-only mode due to load error"); vdo->admin.phase = LOAD_PHASE_WAIT_FOR_READ_ONLY; vdo_enter_read_only_mode(vdo, completion->result); @@ -2391,7 +2391,7 @@ static void resume_callback(struct vdo_completion *completion) if (enable != was_enabled) WRITE_ONCE(vdo->compressing, enable); - uds_log_info("compression is %s", (enable ? "enabled" : "disabled")); + vdo_log_info("compression is %s", (enable ? 
"enabled" : "disabled")); vdo_resume_packer(vdo->packer, completion); return; @@ -2431,7 +2431,7 @@ static void grow_logical_callback(struct vdo_completion *completion) switch (advance_phase(vdo)) { case GROW_LOGICAL_PHASE_START: if (vdo_is_read_only(vdo)) { - uds_log_error_strerror(VDO_READ_ONLY, + vdo_log_error_strerror(VDO_READ_ONLY, "Can't grow logical size of a read-only VDO"); vdo_set_completion_result(completion, VDO_READ_ONLY); break; @@ -2510,7 +2510,7 @@ static int perform_grow_logical(struct vdo *vdo, block_count_t new_logical_block return VDO_SUCCESS; } - uds_log_info("Resizing logical to %llu", + vdo_log_info("Resizing logical to %llu", (unsigned long long) new_logical_blocks); if (vdo->block_map->next_entry_count != new_logical_blocks) return VDO_PARAMETER_MISMATCH; @@ -2521,7 +2521,7 @@ static int perform_grow_logical(struct vdo *vdo, block_count_t new_logical_block if (result != VDO_SUCCESS) return result; - uds_log_info("Logical blocks now %llu", (unsigned long long) new_logical_blocks); + vdo_log_info("Logical blocks now %llu", (unsigned long long) new_logical_blocks); return VDO_SUCCESS; } @@ -2581,7 +2581,7 @@ static void grow_physical_callback(struct vdo_completion *completion) switch (advance_phase(vdo)) { case GROW_PHYSICAL_PHASE_START: if (vdo_is_read_only(vdo)) { - uds_log_error_strerror(VDO_READ_ONLY, + vdo_log_error_strerror(VDO_READ_ONLY, "Can't grow physical size of a read-only VDO"); vdo_set_completion_result(completion, VDO_READ_ONLY); break; @@ -2690,7 +2690,7 @@ static int perform_grow_physical(struct vdo *vdo, block_count_t new_physical_blo if (result != VDO_SUCCESS) return result; - uds_log_info("Physical block count was %llu, now %llu", + vdo_log_info("Physical block count was %llu, now %llu", (unsigned long long) old_physical_blocks, (unsigned long long) new_physical_blocks); return VDO_SUCCESS; @@ -2712,13 +2712,13 @@ static int __must_check apply_new_vdo_configuration(struct vdo *vdo, result = perform_grow_logical(vdo, config->logical_blocks); if (result != VDO_SUCCESS) { - uds_log_error("grow logical operation failed, result = %d", result); + vdo_log_error("grow logical operation failed, result = %d", result); return result; } result = perform_grow_physical(vdo, config->physical_blocks); if (result != VDO_SUCCESS) - uds_log_error("resize operation failed, result = %d", result); + vdo_log_error("resize operation failed, result = %d", result); return result; } @@ -2733,14 +2733,14 @@ static int vdo_preresume_registered(struct dm_target *ti, struct vdo *vdo) backing_blocks = get_underlying_device_block_count(vdo); if (backing_blocks < config->physical_blocks) { /* FIXME: can this still happen? */ - uds_log_error("resume of device '%s' failed: backing device has %llu blocks but VDO physical size is %llu blocks", + vdo_log_error("resume of device '%s' failed: backing device has %llu blocks but VDO physical size is %llu blocks", device_name, (unsigned long long) backing_blocks, (unsigned long long) config->physical_blocks); return -EINVAL; } if (vdo_get_admin_state(vdo) == VDO_ADMIN_STATE_PRE_LOADED) { - uds_log_info("starting device '%s'", device_name); + vdo_log_info("starting device '%s'", device_name); result = perform_admin_operation(vdo, LOAD_PHASE_START, load_callback, handle_load_error, "load"); if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) { @@ -2748,7 +2748,7 @@ static int vdo_preresume_registered(struct dm_target *ti, struct vdo *vdo) * Something has gone very wrong. 
Make sure everything has drained and * leave the device in an unresumable state. */ - uds_log_error_strerror(result, + vdo_log_error_strerror(result, "Start failed, could not load VDO metadata"); vdo->suspend_type = VDO_ADMIN_STATE_STOPPING; perform_admin_operation(vdo, SUSPEND_PHASE_START, @@ -2758,10 +2758,10 @@ static int vdo_preresume_registered(struct dm_target *ti, struct vdo *vdo) } /* Even if the VDO is read-only, it is now able to handle read requests. */ - uds_log_info("device '%s' started", device_name); + vdo_log_info("device '%s' started", device_name); } - uds_log_info("resuming device '%s'", device_name); + vdo_log_info("resuming device '%s'", device_name); /* If this fails, the VDO was not in a state to be resumed. This should never happen. */ result = apply_new_vdo_configuration(vdo, config); @@ -2779,7 +2779,7 @@ static int vdo_preresume_registered(struct dm_target *ti, struct vdo *vdo) * written to disk. */ if (result != VDO_SUCCESS) { - uds_log_error_strerror(result, + vdo_log_error_strerror(result, "Commit of modifications to device '%s' failed", device_name); vdo_enter_read_only_mode(vdo, result); @@ -2800,7 +2800,7 @@ static int vdo_preresume_registered(struct dm_target *ti, struct vdo *vdo) } if (result != VDO_SUCCESS) - uds_log_error("resume of device '%s' failed with error: %d", device_name, + vdo_log_error("resume of device '%s' failed with error: %d", device_name, result); return result; @@ -2826,7 +2826,7 @@ static void vdo_resume(struct dm_target *ti) vdo_register_thread_device_id(&instance_thread, &get_vdo_for_target(ti)->instance); - uds_log_info("device '%s' resumed", vdo_get_device_name(ti)); + vdo_log_info("device '%s' resumed", vdo_get_device_name(ti)); vdo_unregister_thread_device_id(); } @@ -2857,7 +2857,7 @@ static bool dm_registered; static void vdo_module_destroy(void) { - uds_log_debug("unloading"); + vdo_log_debug("unloading"); if (dm_registered) dm_unregister_target(&vdo_target_bio); @@ -2868,7 +2868,7 @@ static void vdo_module_destroy(void) vdo_free(instances.words); memset(&instances, 0, sizeof(struct instance_tracker)); - uds_log_info("unloaded version %s", CURRENT_VERSION); + vdo_log_info("unloaded version %s", CURRENT_VERSION); } static int __init vdo_init(void) @@ -2882,19 +2882,19 @@ static int __init vdo_init(void) vdo_initialize_thread_device_registry(); vdo_initialize_device_registry_once(); - uds_log_info("loaded version %s", CURRENT_VERSION); + vdo_log_info("loaded version %s", CURRENT_VERSION); /* Add VDO errors to the already existing set of errors in UDS. 
*/ result = vdo_register_status_codes(); if (result != VDO_SUCCESS) { - uds_log_error("vdo_register_status_codes failed %d", result); + vdo_log_error("vdo_register_status_codes failed %d", result); vdo_module_destroy(); return result; } result = dm_register_target(&vdo_target_bio); if (result < 0) { - uds_log_error("dm_register_target failed %d", result); + vdo_log_error("dm_register_target failed %d", result); vdo_module_destroy(); return result; } diff --git a/drivers/md/dm-vdo/dump.c b/drivers/md/dm-vdo/dump.c index f5cef6d80c9e37..ad1d58899832c5 100644 --- a/drivers/md/dm-vdo/dump.c +++ b/drivers/md/dm-vdo/dump.c @@ -58,12 +58,12 @@ static void do_dump(struct vdo *vdo, unsigned int dump_options_requested, u32 active, maximum; s64 outstanding; - uds_log_info("%s dump triggered via %s", UDS_LOGGING_MODULE_NAME, why); + vdo_log_info("%s dump triggered via %s", VDO_LOGGING_MODULE_NAME, why); active = get_data_vio_pool_active_requests(vdo->data_vio_pool); maximum = get_data_vio_pool_maximum_requests(vdo->data_vio_pool); outstanding = (atomic64_read(&vdo->stats.bios_submitted) - atomic64_read(&vdo->stats.bios_completed)); - uds_log_info("%u device requests outstanding (max %u), %lld bio requests outstanding, device '%s'", + vdo_log_info("%u device requests outstanding (max %u), %lld bio requests outstanding, device '%s'", active, maximum, outstanding, vdo_get_device_name(vdo->device_config->owning_target)); if (((dump_options_requested & FLAG_SHOW_QUEUES) != 0) && (vdo->threads != NULL)) { @@ -80,7 +80,7 @@ static void do_dump(struct vdo *vdo, unsigned int dump_options_requested, vdo_dump_status(vdo); vdo_report_memory_usage(); - uds_log_info("end of %s dump", UDS_LOGGING_MODULE_NAME); + vdo_log_info("end of %s dump", VDO_LOGGING_MODULE_NAME); } static int parse_dump_options(unsigned int argc, char *const *argv, @@ -114,7 +114,7 @@ static int parse_dump_options(unsigned int argc, char *const *argv, } } if (j == ARRAY_SIZE(option_names)) { - uds_log_warning("dump option name '%s' unknown", argv[i]); + vdo_log_warning("dump option name '%s' unknown", argv[i]); options_okay = false; } } @@ -159,13 +159,13 @@ static void dump_vio_waiters(struct vdo_wait_queue *waitq, char *wait_on) data_vio = vdo_waiter_as_data_vio(first); - uds_log_info(" %s is locked. Waited on by: vio %px pbn %llu lbn %llu d-pbn %llu lastOp %s", + vdo_log_info(" %s is locked. Waited on by: vio %px pbn %llu lbn %llu d-pbn %llu lastOp %s", wait_on, data_vio, data_vio->allocation.pbn, data_vio->logical.lbn, data_vio->duplicate.pbn, get_data_vio_operation_name(data_vio)); for (waiter = first->next_waiter; waiter != first; waiter = waiter->next_waiter) { data_vio = vdo_waiter_as_data_vio(waiter); - uds_log_info(" ... and : vio %px pbn %llu lbn %llu d-pbn %llu lastOp %s", + vdo_log_info(" ... 
and : vio %px pbn %llu lbn %llu d-pbn %llu lastOp %s", data_vio, data_vio->allocation.pbn, data_vio->logical.lbn, data_vio->duplicate.pbn, get_data_vio_operation_name(data_vio)); @@ -261,7 +261,7 @@ void dump_data_vio(void *data) encode_vio_dump_flags(data_vio, flags_dump_buffer); - uds_log_info(" vio %px %s%s %s %s%s", data_vio, + vdo_log_info(" vio %px %s%s %s %s%s", data_vio, vio_block_number_dump_buffer, vio_flush_generation_buffer, get_data_vio_operation_name(data_vio), diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c index 1f0fae2a4e8a26..42ffd793deebaf 100644 --- a/drivers/md/dm-vdo/encodings.c +++ b/drivers/md/dm-vdo/encodings.c @@ -148,7 +148,7 @@ static int __must_check validate_version(struct version_number expected_version, const char *component_name) { if (!vdo_are_same_version(expected_version, actual_version)) { - return uds_log_error_strerror(VDO_UNSUPPORTED_VERSION, + return vdo_log_error_strerror(VDO_UNSUPPORTED_VERSION, "%s version mismatch, expected %d.%d, got %d.%d", component_name, expected_version.major_version, @@ -181,7 +181,7 @@ int vdo_validate_header(const struct header *expected_header, int result; if (expected_header->id != actual_header->id) { - return uds_log_error_strerror(VDO_INCORRECT_COMPONENT, + return vdo_log_error_strerror(VDO_INCORRECT_COMPONENT, "%s ID mismatch, expected %d, got %d", name, expected_header->id, actual_header->id); @@ -194,7 +194,7 @@ int vdo_validate_header(const struct header *expected_header, if ((expected_header->size > actual_header->size) || (exact_size && (expected_header->size < actual_header->size))) { - return uds_log_error_strerror(VDO_UNSUPPORTED_VERSION, + return vdo_log_error_strerror(VDO_UNSUPPORTED_VERSION, "%s size mismatch, expected %zu, got %zu", name, expected_header->size, actual_header->size); @@ -655,7 +655,7 @@ int vdo_configure_slab_depot(const struct partition *partition, physical_block_number_t last_block; block_count_t slab_size = slab_config.slab_blocks; - uds_log_debug("slabDepot %s(block_count=%llu, first_block=%llu, slab_size=%llu, zone_count=%u)", + vdo_log_debug("slabDepot %s(block_count=%llu, first_block=%llu, slab_size=%llu, zone_count=%u)", __func__, (unsigned long long) partition->count, (unsigned long long) partition->offset, (unsigned long long) slab_size, zone_count); @@ -679,7 +679,7 @@ int vdo_configure_slab_depot(const struct partition *partition, .zone_count = zone_count, }; - uds_log_debug("slab_depot last_block=%llu, total_data_blocks=%llu, slab_count=%zu, left_over=%llu", + vdo_log_debug("slab_depot last_block=%llu, total_data_blocks=%llu, slab_count=%zu, left_over=%llu", (unsigned long long) last_block, (unsigned long long) total_data_blocks, slab_count, (unsigned long long) (partition->count - (last_block - partition->offset))); @@ -877,7 +877,7 @@ int vdo_initialize_layout(block_count_t size, physical_block_number_t offset, (offset + block_map_blocks + journal_blocks + summary_blocks); if (necessary_size > size) - return uds_log_error_strerror(VDO_NO_SPACE, + return vdo_log_error_strerror(VDO_NO_SPACE, "Not enough space to make a VDO"); *layout = (struct layout) { @@ -1047,7 +1047,7 @@ static int decode_layout(u8 *buffer, size_t *offset, physical_block_number_t sta layout->num_partitions = layout_header.partition_count; if (layout->num_partitions > VDO_PARTITION_COUNT) { - return uds_log_error_strerror(VDO_UNKNOWN_PARTITION, + return vdo_log_error_strerror(VDO_UNKNOWN_PARTITION, "layout has extra partitions"); } @@ -1072,7 +1072,7 @@ static int 
decode_layout(u8 *buffer, size_t *offset, physical_block_number_t sta result = vdo_get_partition(layout, REQUIRED_PARTITIONS[i], &partition); if (result != VDO_SUCCESS) { vdo_uninitialize_layout(layout); - return uds_log_error_strerror(result, + return vdo_log_error_strerror(result, "layout is missing required partition %u", REQUIRED_PARTITIONS[i]); } @@ -1082,7 +1082,7 @@ static int decode_layout(u8 *buffer, size_t *offset, physical_block_number_t sta if (start != size) { vdo_uninitialize_layout(layout); - return uds_log_error_strerror(UDS_BAD_STATE, + return vdo_log_error_strerror(UDS_BAD_STATE, "partitions do not cover the layout"); } @@ -1255,7 +1255,7 @@ int vdo_validate_config(const struct vdo_config *config, return VDO_OUT_OF_RANGE; if (physical_block_count != config->physical_blocks) { - uds_log_error("A physical size of %llu blocks was specified, not the %llu blocks configured in the vdo super block", + vdo_log_error("A physical size of %llu blocks was specified, not the %llu blocks configured in the vdo super block", (unsigned long long) physical_block_count, (unsigned long long) config->physical_blocks); return VDO_PARAMETER_MISMATCH; @@ -1268,7 +1268,7 @@ int vdo_validate_config(const struct vdo_config *config, return result; if (logical_block_count != config->logical_blocks) { - uds_log_error("A logical size of %llu blocks was specified, but that differs from the %llu blocks configured in the vdo super block", + vdo_log_error("A logical size of %llu blocks was specified, but that differs from the %llu blocks configured in the vdo super block", (unsigned long long) logical_block_count, (unsigned long long) config->logical_blocks); return VDO_PARAMETER_MISMATCH; @@ -1392,7 +1392,7 @@ int vdo_validate_component_states(struct vdo_component_states *states, block_count_t logical_size) { if (geometry_nonce != states->vdo.nonce) { - return uds_log_error_strerror(VDO_BAD_NONCE, + return vdo_log_error_strerror(VDO_BAD_NONCE, "Geometry nonce %llu does not match superblock nonce %llu", (unsigned long long) geometry_nonce, (unsigned long long) states->vdo.nonce); @@ -1465,7 +1465,7 @@ int vdo_decode_super_block(u8 *buffer) * We can't check release version or checksum until we know the content size, so we * have to assume a version mismatch on unexpected values. */ - return uds_log_error_strerror(VDO_UNSUPPORTED_VERSION, + return vdo_log_error_strerror(VDO_UNSUPPORTED_VERSION, "super block contents too large: %zu", header.size); } diff --git a/drivers/md/dm-vdo/errors.c b/drivers/md/dm-vdo/errors.c index 2da614d714c076..62c76b9852d84f 100644 --- a/drivers/md/dm-vdo/errors.c +++ b/drivers/md/dm-vdo/errors.c @@ -221,8 +221,8 @@ const char *uds_string_error_name(int errnum, char *buf, size_t buflen) */ int uds_status_to_errno(int error) { - char error_name[UDS_MAX_ERROR_NAME_SIZE]; - char error_message[UDS_MAX_ERROR_MESSAGE_SIZE]; + char error_name[VDO_MAX_ERROR_NAME_SIZE]; + char error_message[VDO_MAX_ERROR_MESSAGE_SIZE]; /* 0 is success, and negative values are already system error codes. */ if (likely(error <= 0)) @@ -254,7 +254,7 @@ int uds_status_to_errno(int error) default: /* Translate an unexpected error into something generic. 
*/ - uds_log_info("%s: mapping status code %d (%s: %s) to -EIO", + vdo_log_info("%s: mapping status code %d (%s: %s) to -EIO", __func__, error, uds_string_error_name(error, error_name, sizeof(error_name)), diff --git a/drivers/md/dm-vdo/errors.h b/drivers/md/dm-vdo/errors.h index cf15d7243204a0..1f1636c556d98d 100644 --- a/drivers/md/dm-vdo/errors.h +++ b/drivers/md/dm-vdo/errors.h @@ -58,8 +58,8 @@ enum uds_status_codes { }; enum { - UDS_MAX_ERROR_NAME_SIZE = 80, - UDS_MAX_ERROR_MESSAGE_SIZE = 128, + VDO_MAX_ERROR_NAME_SIZE = 80, + VDO_MAX_ERROR_MESSAGE_SIZE = 128, }; struct error_info { diff --git a/drivers/md/dm-vdo/flush.c b/drivers/md/dm-vdo/flush.c index 18d18e9a95576b..e4be450aa12ce3 100644 --- a/drivers/md/dm-vdo/flush.c +++ b/drivers/md/dm-vdo/flush.c @@ -108,7 +108,7 @@ static void *allocate_flush(gfp_t gfp_mask, void *pool_data) int result = vdo_allocate(1, struct vdo_flush, __func__, &flush); if (result != VDO_SUCCESS) - uds_log_error_strerror(result, "failed to allocate spare flush"); + vdo_log_error_strerror(result, "failed to allocate spare flush"); } if (flush != NULL) { @@ -349,11 +349,11 @@ void vdo_complete_flushes(struct flusher *flusher) */ void vdo_dump_flusher(const struct flusher *flusher) { - uds_log_info("struct flusher"); - uds_log_info(" flush_generation=%llu first_unacknowledged_generation=%llu", + vdo_log_info("struct flusher"); + vdo_log_info(" flush_generation=%llu first_unacknowledged_generation=%llu", (unsigned long long) flusher->flush_generation, (unsigned long long) flusher->first_unacknowledged_generation); - uds_log_info(" notifiers queue is %s; pending_flushes queue is %s", + vdo_log_info(" notifiers queue is %s; pending_flushes queue is %s", (vdo_waitq_has_waiters(&flusher->notifiers) ? "not empty" : "empty"), (vdo_waitq_has_waiters(&flusher->pending_flushes) ? "not empty" : "empty")); } diff --git a/drivers/md/dm-vdo/funnel-workqueue.c b/drivers/md/dm-vdo/funnel-workqueue.c index cf04cdef07500e..ae11941c90a92b 100644 --- a/drivers/md/dm-vdo/funnel-workqueue.c +++ b/drivers/md/dm-vdo/funnel-workqueue.c @@ -485,7 +485,7 @@ static void dump_simple_work_queue(struct simple_work_queue *queue) thread_status = atomic_read(&queue->idle) ? "idle" : "running"; } - uds_log_info("workQ %px (%s) %s (%c)", &queue->common, queue->common.name, + vdo_log_info("workQ %px (%s) %s (%c)", &queue->common, queue->common.name, thread_status, task_state_report); /* ->waiting_worker_threads wait queue status? anyone waiting? 
*/ diff --git a/drivers/md/dm-vdo/indexer/chapter-index.c b/drivers/md/dm-vdo/indexer/chapter-index.c index 2caba57c83cca6..5a0b96bd956730 100644 --- a/drivers/md/dm-vdo/indexer/chapter-index.c +++ b/drivers/md/dm-vdo/indexer/chapter-index.c @@ -165,7 +165,7 @@ int uds_pack_open_chapter_index_page(struct open_chapter_index *chapter_index, if (removals == 0) { uds_get_delta_index_stats(delta_index, &stats); - uds_log_warning("The chapter index for chapter %llu contains %llu entries with %llu collisions", + vdo_log_warning("The chapter index for chapter %llu contains %llu entries with %llu collisions", (unsigned long long) chapter_number, (unsigned long long) stats.record_count, (unsigned long long) stats.collision_count); @@ -197,7 +197,7 @@ int uds_pack_open_chapter_index_page(struct open_chapter_index *chapter_index, } if (removals > 0) { - uds_log_warning("To avoid chapter index page overflow in chapter %llu, %u entries were removed from the chapter index", + vdo_log_warning("To avoid chapter index page overflow in chapter %llu, %u entries were removed from the chapter index", (unsigned long long) chapter_number, removals); } diff --git a/drivers/md/dm-vdo/indexer/config.c b/drivers/md/dm-vdo/indexer/config.c index 5df57961856be4..260993ce1944c9 100644 --- a/drivers/md/dm-vdo/indexer/config.c +++ b/drivers/md/dm-vdo/indexer/config.c @@ -35,54 +35,54 @@ static bool are_matching_configurations(struct uds_configuration *saved_config, bool result = true; if (saved_geometry->record_pages_per_chapter != geometry->record_pages_per_chapter) { - uds_log_error("Record pages per chapter (%u) does not match (%u)", + vdo_log_error("Record pages per chapter (%u) does not match (%u)", saved_geometry->record_pages_per_chapter, geometry->record_pages_per_chapter); result = false; } if (saved_geometry->chapters_per_volume != geometry->chapters_per_volume) { - uds_log_error("Chapter count (%u) does not match (%u)", + vdo_log_error("Chapter count (%u) does not match (%u)", saved_geometry->chapters_per_volume, geometry->chapters_per_volume); result = false; } if (saved_geometry->sparse_chapters_per_volume != geometry->sparse_chapters_per_volume) { - uds_log_error("Sparse chapter count (%u) does not match (%u)", + vdo_log_error("Sparse chapter count (%u) does not match (%u)", saved_geometry->sparse_chapters_per_volume, geometry->sparse_chapters_per_volume); result = false; } if (saved_config->cache_chapters != user->cache_chapters) { - uds_log_error("Cache size (%u) does not match (%u)", + vdo_log_error("Cache size (%u) does not match (%u)", saved_config->cache_chapters, user->cache_chapters); result = false; } if (saved_config->volume_index_mean_delta != user->volume_index_mean_delta) { - uds_log_error("Volume index mean delta (%u) does not match (%u)", + vdo_log_error("Volume index mean delta (%u) does not match (%u)", saved_config->volume_index_mean_delta, user->volume_index_mean_delta); result = false; } if (saved_geometry->bytes_per_page != geometry->bytes_per_page) { - uds_log_error("Bytes per page value (%zu) does not match (%zu)", + vdo_log_error("Bytes per page value (%zu) does not match (%zu)", saved_geometry->bytes_per_page, geometry->bytes_per_page); result = false; } if (saved_config->sparse_sample_rate != user->sparse_sample_rate) { - uds_log_error("Sparse sample rate (%u) does not match (%u)", + vdo_log_error("Sparse sample rate (%u) does not match (%u)", saved_config->sparse_sample_rate, user->sparse_sample_rate); result = false; } if (saved_config->nonce != user->nonce) { - 
uds_log_error("Nonce (%llu) does not match (%llu)", + vdo_log_error("Nonce (%llu) does not match (%llu)", (unsigned long long) saved_config->nonce, (unsigned long long) user->nonce); result = false; @@ -111,11 +111,11 @@ int uds_validate_config_contents(struct buffered_reader *reader, result = uds_read_from_buffered_reader(reader, version_buffer, INDEX_CONFIG_VERSION_LENGTH); if (result != UDS_SUCCESS) - return uds_log_error_strerror(result, "cannot read index config version"); + return vdo_log_error_strerror(result, "cannot read index config version"); if (!is_version(INDEX_CONFIG_VERSION_6_02, version_buffer) && !is_version(INDEX_CONFIG_VERSION_8_02, version_buffer)) { - return uds_log_error_strerror(UDS_CORRUPT_DATA, + return vdo_log_error_strerror(UDS_CORRUPT_DATA, "unsupported configuration version: '%.*s'", INDEX_CONFIG_VERSION_LENGTH, version_buffer); @@ -123,7 +123,7 @@ int uds_validate_config_contents(struct buffered_reader *reader, result = uds_read_from_buffered_reader(reader, buffer, sizeof(buffer)); if (result != UDS_SUCCESS) - return uds_log_error_strerror(result, "cannot read config data"); + return vdo_log_error_strerror(result, "cannot read config data"); decode_u32_le(buffer, &offset, &geometry.record_pages_per_chapter); decode_u32_le(buffer, &offset, &geometry.chapters_per_volume); @@ -151,7 +151,7 @@ int uds_validate_config_contents(struct buffered_reader *reader, result = uds_read_from_buffered_reader(reader, remapping, sizeof(remapping)); if (result != UDS_SUCCESS) - return uds_log_error_strerror(result, "cannot read converted config"); + return vdo_log_error_strerror(result, "cannot read converted config"); offset = 0; decode_u64_le(remapping, &offset, @@ -161,7 +161,7 @@ int uds_validate_config_contents(struct buffered_reader *reader, } if (!are_matching_configurations(&config, &geometry, user_config)) { - uds_log_warning("Supplied configuration does not match save"); + vdo_log_warning("Supplied configuration does not match save"); return UDS_NO_INDEX; } @@ -265,7 +265,7 @@ static int compute_memory_sizes(uds_memory_config_size_t mem_gb, bool sparse, DEFAULT_CHAPTERS_PER_VOLUME); *record_pages_per_chapter = DEFAULT_RECORD_PAGES_PER_CHAPTER; } else { - uds_log_error("received invalid memory size"); + vdo_log_error("received invalid memory size"); return -EINVAL; } @@ -294,7 +294,7 @@ static unsigned int __must_check normalize_zone_count(unsigned int requested) if (zone_count > MAX_ZONES) zone_count = MAX_ZONES; - uds_log_info("Using %u indexing zone%s for concurrency.", + vdo_log_info("Using %u indexing zone%s for concurrency.", zone_count, zone_count == 1 ? 
"" : "s"); return zone_count; } @@ -366,13 +366,13 @@ void uds_log_configuration(struct uds_configuration *config) { struct index_geometry *geometry = config->geometry; - uds_log_debug("Configuration:"); - uds_log_debug(" Record pages per chapter: %10u", geometry->record_pages_per_chapter); - uds_log_debug(" Chapters per volume: %10u", geometry->chapters_per_volume); - uds_log_debug(" Sparse chapters per volume: %10u", geometry->sparse_chapters_per_volume); - uds_log_debug(" Cache size (chapters): %10u", config->cache_chapters); - uds_log_debug(" Volume index mean delta: %10u", config->volume_index_mean_delta); - uds_log_debug(" Bytes per page: %10zu", geometry->bytes_per_page); - uds_log_debug(" Sparse sample rate: %10u", config->sparse_sample_rate); - uds_log_debug(" Nonce: %llu", (unsigned long long) config->nonce); + vdo_log_debug("Configuration:"); + vdo_log_debug(" Record pages per chapter: %10u", geometry->record_pages_per_chapter); + vdo_log_debug(" Chapters per volume: %10u", geometry->chapters_per_volume); + vdo_log_debug(" Sparse chapters per volume: %10u", geometry->sparse_chapters_per_volume); + vdo_log_debug(" Cache size (chapters): %10u", config->cache_chapters); + vdo_log_debug(" Volume index mean delta: %10u", config->volume_index_mean_delta); + vdo_log_debug(" Bytes per page: %10zu", geometry->bytes_per_page); + vdo_log_debug(" Sparse sample rate: %10u", config->sparse_sample_rate); + vdo_log_debug(" Nonce: %llu", (unsigned long long) config->nonce); } diff --git a/drivers/md/dm-vdo/indexer/delta-index.c b/drivers/md/dm-vdo/indexer/delta-index.c index 6448c891f80765..4923207d9eecc2 100644 --- a/drivers/md/dm-vdo/indexer/delta-index.c +++ b/drivers/md/dm-vdo/indexer/delta-index.c @@ -393,7 +393,7 @@ int uds_initialize_delta_index(struct delta_index *delta_index, unsigned int zon */ if (delta_index->list_count <= first_list_in_zone) { uds_uninitialize_delta_index(delta_index); - return uds_log_error_strerror(UDS_INVALID_ARGUMENT, + return vdo_log_error_strerror(UDS_INVALID_ARGUMENT, "%u delta lists not enough for %u zones", list_count, zone_count); } @@ -750,7 +750,7 @@ int uds_pack_delta_index_page(const struct delta_index *delta_index, u64 header_ free_bits -= GUARD_BITS; if (free_bits < IMMUTABLE_HEADER_SIZE) { /* This page is too small to store any delta lists. 
*/ - return uds_log_error_strerror(UDS_OVERFLOW, + return vdo_log_error_strerror(UDS_OVERFLOW, "Chapter Index Page of %zu bytes is too small", memory_size); } @@ -861,7 +861,7 @@ int uds_start_restoring_delta_index(struct delta_index *delta_index, result = uds_read_from_buffered_reader(buffered_readers[z], buffer, sizeof(buffer)); if (result != UDS_SUCCESS) { - return uds_log_warning_strerror(result, + return vdo_log_warning_strerror(result, "failed to read delta index header"); } @@ -878,23 +878,23 @@ int uds_start_restoring_delta_index(struct delta_index *delta_index, "%zu bytes decoded of %zu expected", offset, sizeof(struct delta_index_header)); if (result != UDS_SUCCESS) { - return uds_log_warning_strerror(result, + return vdo_log_warning_strerror(result, "failed to read delta index header"); } if (memcmp(header.magic, DELTA_INDEX_MAGIC, MAGIC_SIZE) != 0) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, "delta index file has bad magic number"); } if (zone_count != header.zone_count) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, "delta index files contain mismatched zone counts (%u,%u)", zone_count, header.zone_count); } if (header.zone_number != z) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, "delta index zone %u found in slot %u", header.zone_number, z); } @@ -905,7 +905,7 @@ int uds_start_restoring_delta_index(struct delta_index *delta_index, collision_count += header.collision_count; if (first_list[z] != list_next) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, "delta index file for zone %u starts with list %u instead of list %u", z, first_list[z], list_next); } @@ -914,13 +914,13 @@ int uds_start_restoring_delta_index(struct delta_index *delta_index, } if (list_next != delta_index->list_count) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, "delta index files contain %u delta lists instead of %u delta lists", list_next, delta_index->list_count); } if (collision_count > record_count) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, "delta index files contain %llu collisions and %llu records", (unsigned long long) collision_count, (unsigned long long) record_count); @@ -945,7 +945,7 @@ int uds_start_restoring_delta_index(struct delta_index *delta_index, size_data, sizeof(size_data)); if (result != UDS_SUCCESS) { - return uds_log_warning_strerror(result, + return vdo_log_warning_strerror(result, "failed to read delta index size"); } @@ -978,7 +978,7 @@ static int restore_delta_list_to_zone(struct delta_zone *delta_zone, u32 list_number = save_info->index - delta_zone->first_list; if (list_number >= delta_zone->list_count) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, "invalid delta list number %u not in range [%u,%u)", save_info->index, delta_zone->first_list, delta_zone->first_list + delta_zone->list_count); @@ -986,7 +986,7 @@ static int restore_delta_list_to_zone(struct delta_zone *delta_zone, delta_list = &delta_zone->delta_lists[list_number + 1]; if (delta_list->size == 0) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, "unexpected delta list number %u", save_info->index); } @@ -994,7 +994,7 @@ static int 
restore_delta_list_to_zone(struct delta_zone *delta_zone, bit_count = delta_list->size + save_info->bit_offset; byte_count = BITS_TO_BYTES(bit_count); if (save_info->byte_count != byte_count) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, "unexpected delta list size %u != %u", save_info->byte_count, byte_count); } @@ -1014,7 +1014,7 @@ static int restore_delta_list_data(struct delta_index *delta_index, unsigned int result = uds_read_from_buffered_reader(buffered_reader, buffer, sizeof(buffer)); if (result != UDS_SUCCESS) { - return uds_log_warning_strerror(result, + return vdo_log_warning_strerror(result, "failed to read delta list data"); } @@ -1027,7 +1027,7 @@ static int restore_delta_list_data(struct delta_index *delta_index, unsigned int if ((save_info.bit_offset >= BITS_PER_BYTE) || (save_info.byte_count > DELTA_LIST_MAX_BYTE_COUNT)) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, "corrupt delta list data"); } @@ -1036,7 +1036,7 @@ static int restore_delta_list_data(struct delta_index *delta_index, unsigned int return UDS_CORRUPT_DATA; if (save_info.index >= delta_index->list_count) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, "invalid delta list number %u of %u", save_info.index, delta_index->list_count); @@ -1045,7 +1045,7 @@ static int restore_delta_list_data(struct delta_index *delta_index, unsigned int result = uds_read_from_buffered_reader(buffered_reader, data, save_info.byte_count); if (result != UDS_SUCCESS) { - return uds_log_warning_strerror(result, + return vdo_log_warning_strerror(result, "failed to read delta list data"); } @@ -1120,7 +1120,7 @@ static int flush_delta_list(struct delta_zone *zone, u32 flush_index) result = uds_write_to_buffered_writer(zone->buffered_writer, buffer, sizeof(buffer)); if (result != UDS_SUCCESS) { - uds_log_warning_strerror(result, "failed to write delta list memory"); + vdo_log_warning_strerror(result, "failed to write delta list memory"); return result; } @@ -1128,7 +1128,7 @@ static int flush_delta_list(struct delta_zone *zone, u32 flush_index) zone->memory + get_delta_list_byte_start(delta_list), get_delta_list_byte_size(delta_list)); if (result != UDS_SUCCESS) - uds_log_warning_strerror(result, "failed to write delta list memory"); + vdo_log_warning_strerror(result, "failed to write delta list memory"); return result; } @@ -1162,7 +1162,7 @@ int uds_start_saving_delta_index(const struct delta_index *delta_index, result = uds_write_to_buffered_writer(buffered_writer, buffer, offset); if (result != UDS_SUCCESS) - return uds_log_warning_strerror(result, + return vdo_log_warning_strerror(result, "failed to write delta index header"); for (i = 0; i < delta_zone->list_count; i++) { @@ -1174,7 +1174,7 @@ int uds_start_saving_delta_index(const struct delta_index *delta_index, result = uds_write_to_buffered_writer(buffered_writer, data, sizeof(data)); if (result != UDS_SUCCESS) - return uds_log_warning_strerror(result, + return vdo_log_warning_strerror(result, "failed to write delta list size"); } @@ -1215,7 +1215,7 @@ int uds_write_guard_delta_list(struct buffered_writer *buffered_writer) result = uds_write_to_buffered_writer(buffered_writer, buffer, sizeof(buffer)); if (result != UDS_SUCCESS) - uds_log_warning_strerror(result, "failed to write guard delta list"); + vdo_log_warning_strerror(result, "failed to write guard delta list"); return UDS_SUCCESS; } @@ -1396,7 
+1396,7 @@ noinline int uds_next_delta_index_entry(struct delta_index_entry *delta_entry) * This is not an assertion because uds_validate_chapter_index_page() wants to * handle this error. */ - uds_log_warning("Decoded past the end of the delta list"); + vdo_log_warning("Decoded past the end of the delta list"); return UDS_CORRUPT_DATA; } @@ -1977,7 +1977,7 @@ u32 uds_get_delta_index_page_count(u32 entry_count, u32 list_count, u32 mean_del void uds_log_delta_index_entry(struct delta_index_entry *delta_entry) { - uds_log_ratelimit(uds_log_info, + vdo_log_ratelimit(vdo_log_info, "List 0x%X Key 0x%X Offset 0x%X%s%s List_size 0x%X%s", delta_entry->list_number, delta_entry->key, delta_entry->offset, delta_entry->at_end ? " end" : "", diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c index 3e380206c2f594..1453fddaa656d1 100644 --- a/drivers/md/dm-vdo/indexer/index-layout.c +++ b/drivers/md/dm-vdo/indexer/index-layout.c @@ -237,7 +237,7 @@ static int __must_check compute_sizes(const struct uds_configuration *config, result = uds_compute_volume_index_save_blocks(config, sls->block_size, &sls->volume_index_blocks); if (result != UDS_SUCCESS) - return uds_log_error_strerror(result, "cannot compute index save size"); + return vdo_log_error_strerror(result, "cannot compute index save size"); sls->page_map_blocks = DIV_ROUND_UP(uds_compute_index_page_map_save_size(geometry), @@ -261,13 +261,13 @@ int uds_compute_index_size(const struct uds_parameters *parameters, u64 *index_s struct save_layout_sizes sizes; if (index_size == NULL) { - uds_log_error("Missing output size pointer"); + vdo_log_error("Missing output size pointer"); return -EINVAL; } result = uds_make_configuration(parameters, &index_config); if (result != UDS_SUCCESS) { - uds_log_error_strerror(result, "cannot compute index size"); + vdo_log_error_strerror(result, "cannot compute index size"); return uds_status_to_errno(result); } @@ -654,7 +654,7 @@ static int discard_index_state_data(struct index_layout *layout) } if (saved_result != UDS_SUCCESS) { - return uds_log_error_strerror(result, + return vdo_log_error_strerror(result, "%s: cannot destroy all index saves", __func__); } @@ -761,18 +761,18 @@ static int __must_check write_uds_index_config(struct index_layout *layout, result = open_layout_writer(layout, &layout->config, offset, &writer); if (result != UDS_SUCCESS) - return uds_log_error_strerror(result, "failed to open config region"); + return vdo_log_error_strerror(result, "failed to open config region"); result = uds_write_config_contents(writer, config, layout->super.version); if (result != UDS_SUCCESS) { vdo_free_buffered_writer(writer); - return uds_log_error_strerror(result, "failed to write config region"); + return vdo_log_error_strerror(result, "failed to write config region"); } result = uds_flush_buffered_writer(writer); if (result != UDS_SUCCESS) { vdo_free_buffered_writer(writer); - return uds_log_error_strerror(result, "cannot flush config writer"); + return vdo_log_error_strerror(result, "cannot flush config writer"); } vdo_free_buffered_writer(writer); @@ -878,7 +878,7 @@ static int find_latest_uds_index_save_slot(struct index_layout *layout, } if (latest == NULL) { - uds_log_error("No valid index save found"); + vdo_log_error("No valid index save found"); return UDS_INDEX_NOT_SAVED_CLEANLY; } @@ -1150,7 +1150,7 @@ static int __must_check load_region_table(struct buffered_reader *reader, result = uds_read_from_buffered_reader(reader, buffer, sizeof(buffer)); if (result 
!= UDS_SUCCESS) - return uds_log_error_strerror(result, "cannot read region table header"); + return vdo_log_error_strerror(result, "cannot read region table header"); decode_u64_le(buffer, &offset, &header.magic); decode_u64_le(buffer, &offset, &header.region_blocks); @@ -1163,7 +1163,7 @@ static int __must_check load_region_table(struct buffered_reader *reader, return UDS_NO_INDEX; if (header.version != 1) { - return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION, + return vdo_log_error_strerror(UDS_UNSUPPORTED_VERSION, "unknown region table version %hu", header.version); } @@ -1183,7 +1183,7 @@ static int __must_check load_region_table(struct buffered_reader *reader, sizeof(region_buffer)); if (result != UDS_SUCCESS) { vdo_free(table); - return uds_log_error_strerror(UDS_CORRUPT_DATA, + return vdo_log_error_strerror(UDS_CORRUPT_DATA, "cannot read region table layouts"); } @@ -1214,7 +1214,7 @@ static int __must_check read_super_block_data(struct buffered_reader *reader, result = uds_read_from_buffered_reader(reader, buffer, saved_size); if (result != UDS_SUCCESS) { vdo_free(buffer); - return uds_log_error_strerror(result, "cannot read region table header"); + return vdo_log_error_strerror(result, "cannot read region table header"); } memcpy(&super->magic_label, buffer, MAGIC_SIZE); @@ -1241,19 +1241,19 @@ static int __must_check read_super_block_data(struct buffered_reader *reader, vdo_free(buffer); if (memcmp(super->magic_label, LAYOUT_MAGIC, MAGIC_SIZE) != 0) - return uds_log_error_strerror(UDS_CORRUPT_DATA, + return vdo_log_error_strerror(UDS_CORRUPT_DATA, "unknown superblock magic label"); if ((super->version < SUPER_VERSION_MINIMUM) || (super->version == 4) || (super->version == 5) || (super->version == 6) || (super->version > SUPER_VERSION_MAXIMUM)) { - return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION, + return vdo_log_error_strerror(UDS_UNSUPPORTED_VERSION, "unknown superblock version number %u", super->version); } if (super->volume_offset < super->start_offset) { - return uds_log_error_strerror(UDS_CORRUPT_DATA, + return vdo_log_error_strerror(UDS_CORRUPT_DATA, "inconsistent offsets (start %llu, volume %llu)", (unsigned long long) super->start_offset, (unsigned long long) super->volume_offset); @@ -1261,13 +1261,13 @@ static int __must_check read_super_block_data(struct buffered_reader *reader, /* Sub-indexes are no longer used but the layout retains this field. 
*/ if (super->index_count != 1) { - return uds_log_error_strerror(UDS_CORRUPT_DATA, + return vdo_log_error_strerror(UDS_CORRUPT_DATA, "invalid subindex count %u", super->index_count); } if (generate_primary_nonce(super->nonce_info, sizeof(super->nonce_info)) != super->nonce) { - return uds_log_error_strerror(UDS_CORRUPT_DATA, + return vdo_log_error_strerror(UDS_CORRUPT_DATA, "inconsistent superblock nonce"); } @@ -1278,15 +1278,15 @@ static int __must_check verify_region(struct layout_region *lr, u64 start_block, enum region_kind kind, unsigned int instance) { if (lr->start_block != start_block) - return uds_log_error_strerror(UDS_CORRUPT_DATA, + return vdo_log_error_strerror(UDS_CORRUPT_DATA, "incorrect layout region offset"); if (lr->kind != kind) - return uds_log_error_strerror(UDS_CORRUPT_DATA, + return vdo_log_error_strerror(UDS_CORRUPT_DATA, "incorrect layout region kind"); if (lr->instance != instance) { - return uds_log_error_strerror(UDS_CORRUPT_DATA, + return vdo_log_error_strerror(UDS_CORRUPT_DATA, "incorrect layout region instance"); } @@ -1328,7 +1328,7 @@ static int __must_check verify_sub_index(struct index_layout *layout, u64 start_ next_block -= layout->super.volume_offset; if (next_block != start_block + sil->sub_index.block_count) { - return uds_log_error_strerror(UDS_CORRUPT_DATA, + return vdo_log_error_strerror(UDS_CORRUPT_DATA, "sub index region does not span all saves"); } @@ -1373,7 +1373,7 @@ static int __must_check reconstitute_layout(struct index_layout *layout, return result; if (++next_block != (first_block + layout->total_blocks)) { - return uds_log_error_strerror(UDS_CORRUPT_DATA, + return vdo_log_error_strerror(UDS_CORRUPT_DATA, "layout table does not span total blocks"); } @@ -1393,19 +1393,19 @@ static int __must_check load_super_block(struct index_layout *layout, size_t blo if (table->header.type != RH_TYPE_SUPER) { vdo_free(table); - return uds_log_error_strerror(UDS_CORRUPT_DATA, + return vdo_log_error_strerror(UDS_CORRUPT_DATA, "not a superblock region table"); } result = read_super_block_data(reader, layout, table->header.payload); if (result != UDS_SUCCESS) { vdo_free(table); - return uds_log_error_strerror(result, "unknown superblock format"); + return vdo_log_error_strerror(result, "unknown superblock format"); } if (super->block_size != block_size) { vdo_free(table); - return uds_log_error_strerror(UDS_CORRUPT_DATA, + return vdo_log_error_strerror(UDS_CORRUPT_DATA, "superblock saved block_size %u differs from supplied block_size %zu", super->block_size, block_size); } @@ -1426,14 +1426,14 @@ static int __must_check read_index_save_data(struct buffered_reader *reader, size_t offset = 0; if (saved_size != sizeof(buffer)) { - return uds_log_error_strerror(UDS_CORRUPT_DATA, + return vdo_log_error_strerror(UDS_CORRUPT_DATA, "unexpected index save data size %zu", saved_size); } result = uds_read_from_buffered_reader(reader, buffer, sizeof(buffer)); if (result != UDS_SUCCESS) - return uds_log_error_strerror(result, "cannot read index save data"); + return vdo_log_error_strerror(result, "cannot read index save data"); decode_u64_le(buffer, &offset, &isl->save_data.timestamp); decode_u64_le(buffer, &offset, &isl->save_data.nonce); @@ -1441,7 +1441,7 @@ static int __must_check read_index_save_data(struct buffered_reader *reader, offset += sizeof(u32); if (isl->save_data.version > 1) { - return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION, + return vdo_log_error_strerror(UDS_UNSUPPORTED_VERSION, "unknown index save version number %u", 
isl->save_data.version); } @@ -1451,7 +1451,7 @@ static int __must_check read_index_save_data(struct buffered_reader *reader, if ((file_version.signature != INDEX_STATE_VERSION_301.signature) || (file_version.version_id != INDEX_STATE_VERSION_301.version_id)) { - return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION, + return vdo_log_error_strerror(UDS_UNSUPPORTED_VERSION, "index state version %d,%d is unsupported", file_version.signature, file_version.version_id); @@ -1528,7 +1528,7 @@ static int __must_check reconstruct_index_save(struct index_save_layout *isl, next_block += isl->free_space.block_count; if (next_block != last_block) { - return uds_log_error_strerror(UDS_CORRUPT_DATA, + return vdo_log_error_strerror(UDS_CORRUPT_DATA, "index save layout table incomplete"); } @@ -1544,7 +1544,7 @@ static int __must_check load_index_save(struct index_save_layout *isl, result = load_region_table(reader, &table); if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, "cannot read index save %u header", + return vdo_log_error_strerror(result, "cannot read index save %u header", instance); } @@ -1552,7 +1552,7 @@ static int __must_check load_index_save(struct index_save_layout *isl, u64 region_blocks = table->header.region_blocks; vdo_free(table); - return uds_log_error_strerror(UDS_CORRUPT_DATA, + return vdo_log_error_strerror(UDS_CORRUPT_DATA, "unexpected index save %u region block count %llu", instance, (unsigned long long) region_blocks); @@ -1567,7 +1567,7 @@ static int __must_check load_index_save(struct index_save_layout *isl, if (table->header.type != RH_TYPE_SAVE) { vdo_free(table); - return uds_log_error_strerror(UDS_CORRUPT_DATA, + return vdo_log_error_strerror(UDS_CORRUPT_DATA, "unexpected index save %u header type %u", instance, table->header.type); } @@ -1575,7 +1575,7 @@ static int __must_check load_index_save(struct index_save_layout *isl, result = read_index_save_data(reader, isl, table->header.payload); if (result != UDS_SUCCESS) { vdo_free(table); - return uds_log_error_strerror(result, + return vdo_log_error_strerror(result, "unknown index save %u data format", instance); } @@ -1583,7 +1583,7 @@ static int __must_check load_index_save(struct index_save_layout *isl, result = reconstruct_index_save(isl, table); vdo_free(table); if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, "cannot reconstruct index save %u", + return vdo_log_error_strerror(result, "cannot reconstruct index save %u", instance); } @@ -1602,7 +1602,7 @@ static int __must_check load_sub_index_regions(struct index_layout *layout) result = open_region_reader(layout, &isl->index_save, &reader); if (result != UDS_SUCCESS) { - uds_log_error_strerror(result, + vdo_log_error_strerror(result, "cannot get reader for index 0 save %u", j); return result; @@ -1630,12 +1630,12 @@ static int __must_check verify_uds_index_config(struct index_layout *layout, offset = layout->super.volume_offset - layout->super.start_offset; result = open_layout_reader(layout, &layout->config, offset, &reader); if (result != UDS_SUCCESS) - return uds_log_error_strerror(result, "failed to open config reader"); + return vdo_log_error_strerror(result, "failed to open config reader"); result = uds_validate_config_contents(reader, config); if (result != UDS_SUCCESS) { vdo_free_buffered_reader(reader); - return uds_log_error_strerror(result, "failed to read config region"); + return vdo_log_error_strerror(result, "failed to read config region"); } vdo_free_buffered_reader(reader); @@ -1650,7 +1650,7 @@ static int 
load_index_layout(struct index_layout *layout, struct uds_configurati
 	result = uds_make_buffered_reader(layout->factory,
 					  layout->offset / UDS_BLOCK_SIZE, 1, &reader);
 	if (result != UDS_SUCCESS)
-		return uds_log_error_strerror(result, "unable to read superblock");
+		return vdo_log_error_strerror(result, "unable to read superblock");
 	result = load_super_block(layout, UDS_BLOCK_SIZE,
 				  layout->offset / UDS_BLOCK_SIZE, reader);
@@ -1679,7 +1679,7 @@ static int create_layout_factory(struct index_layout *layout,
 	writable_size = uds_get_writable_size(factory) & -UDS_BLOCK_SIZE;
 	if (writable_size < config->size + config->offset) {
 		uds_put_io_factory(factory);
-		uds_log_error("index storage (%zu) is smaller than the requested size %zu",
+		vdo_log_error("index storage (%zu) is smaller than the requested size %zu",
 			      writable_size, config->size + config->offset);
 		return -ENOSPC;
 	}
@@ -1712,7 +1712,7 @@ int uds_make_index_layout(struct uds_configuration *config, bool new_layout,
 	}
 	if (layout->factory_size < sizes.total_size) {
-		uds_log_error("index storage (%zu) is smaller than the required size %llu",
+		vdo_log_error("index storage (%zu) is smaller than the required size %llu",
 			      layout->factory_size,
 			      (unsigned long long) sizes.total_size);
 		vdo_free_index_layout(layout);
diff --git a/drivers/md/dm-vdo/indexer/index-page-map.c b/drivers/md/dm-vdo/indexer/index-page-map.c
index f2ebcc76eef9cc..51054c8ee55efb 100644
--- a/drivers/md/dm-vdo/indexer/index-page-map.c
+++ b/drivers/md/dm-vdo/indexer/index-page-map.c
@@ -169,7 +169,7 @@ int uds_read_index_page_map(struct index_page_map *map, struct buffered_reader *
 		decode_u16_le(buffer, &offset, &map->entries[i]);
 	vdo_free(buffer);
-	uds_log_debug("read index page map, last update %llu",
+	vdo_log_debug("read index page map, last update %llu",
 		      (unsigned long long) map->last_update);
 	return UDS_SUCCESS;
 }
diff --git a/drivers/md/dm-vdo/indexer/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c
index 8a129d09392acd..1949a25986561d 100644
--- a/drivers/md/dm-vdo/indexer/index-session.c
+++ b/drivers/md/dm-vdo/indexer/index-session.c
@@ -104,7 +104,7 @@ int uds_launch_request(struct uds_request *request)
 	int result;
 	if (request->callback == NULL) {
-		uds_log_error("missing required callback");
+		vdo_log_error("missing required callback");
 		return -EINVAL;
 	}
@@ -116,7 +116,7 @@ int uds_launch_request(struct uds_request *request)
 	case UDS_UPDATE:
 		break;
 	default:
-		uds_log_error("received invalid callback type");
+		vdo_log_error("received invalid callback type");
 		return -EINVAL;
 	}
@@ -244,7 +244,7 @@ static int __must_check make_empty_index_session(struct uds_index_session **inde
 int uds_create_index_session(struct uds_index_session **session)
 {
 	if (session == NULL) {
-		uds_log_error("missing session pointer");
+		vdo_log_error("missing session pointer");
 		return -EINVAL;
 	}
@@ -257,10 +257,10 @@ static int __must_check start_loading_index_session(struct uds_index_session *in
 	mutex_lock(&index_session->request_mutex);
 	if (index_session->state & IS_FLAG_SUSPENDED) {
-		uds_log_info("Index session is suspended");
+		vdo_log_info("Index session is suspended");
 		result = -EBUSY;
 	} else if (index_session->state != 0) {
-		uds_log_info("Index is already loaded");
+		vdo_log_info("Index is already loaded");
 		result = -EBUSY;
 	} else {
 		index_session->state |= IS_FLAG_LOADING;
@@ -290,7 +290,7 @@ static int initialize_index_session(struct uds_index_session *index_session,
 	result = uds_make_configuration(&index_session->parameters, &config);
 	if (result != UDS_SUCCESS) {
-		uds_log_error_strerror(result, "Failed to allocate config");
+		vdo_log_error_strerror(result, "Failed to allocate config");
 		return result;
 	}
@@ -298,7 +298,7 @@ static int initialize_index_session(struct uds_index_session *index_session,
 	result = uds_make_index(config, open_type, &index_session->load_context,
 				enter_callback_stage, &index_session->index);
 	if (result != UDS_SUCCESS)
-		uds_log_error_strerror(result, "Failed to make index");
+		vdo_log_error_strerror(result, "Failed to make index");
 	else
 		uds_log_configuration(config);
@@ -332,15 +332,15 @@ int uds_open_index(enum uds_open_index_type open_type,
 	char name[BDEVNAME_SIZE];
 	if (parameters == NULL) {
-		uds_log_error("missing required parameters");
+		vdo_log_error("missing required parameters");
 		return -EINVAL;
 	}
 	if (parameters->bdev == NULL) {
-		uds_log_error("missing required block device");
+		vdo_log_error("missing required block device");
 		return -EINVAL;
 	}
 	if (session == NULL) {
-		uds_log_error("missing required session pointer");
+		vdo_log_error("missing required session pointer");
 		return -EINVAL;
 	}
@@ -350,11 +350,11 @@ int uds_open_index(enum uds_open_index_type open_type,
 	session->parameters = *parameters;
 	format_dev_t(name, parameters->bdev->bd_dev);
-	uds_log_info("%s: %s", get_open_type_string(open_type), name);
+	vdo_log_info("%s: %s", get_open_type_string(open_type), name);
 	result = initialize_index_session(session, open_type);
 	if (result != UDS_SUCCESS)
-		uds_log_error_strerror(result, "Failed %s",
+		vdo_log_error_strerror(result, "Failed %s",
 				       get_open_type_string(open_type));
 	finish_loading_index_session(session, result);
@@ -426,7 +426,7 @@ int uds_suspend_index_session(struct uds_index_session *session, bool save)
 	if ((session->state & IS_FLAG_WAITING) || (session->state & IS_FLAG_DESTROYING)) {
 		no_work = true;
-		uds_log_info("Index session is already changing state");
+		vdo_log_info("Index session is already changing state");
 		result = -EBUSY;
 	} else if (session->state & IS_FLAG_SUSPENDED) {
 		no_work = true;
@@ -485,7 +485,7 @@ int uds_resume_index_session(struct uds_index_session *session,
 	mutex_lock(&session->request_mutex);
 	if (session->state & IS_FLAG_WAITING) {
-		uds_log_info("Index session is already changing state");
+		vdo_log_info("Index session is already changing state");
 		no_work = true;
 		result = -EBUSY;
 	} else if (!(session->state & IS_FLAG_SUSPENDED)) {
@@ -562,7 +562,7 @@ static int save_and_free_index(struct uds_index_session *index_session)
 		if (!suspended) {
 			result = uds_save_index(index);
 			if (result != UDS_SUCCESS)
-				uds_log_warning_strerror(result,
+				vdo_log_warning_strerror(result,
 							 "ignoring error from save_index");
 		}
 		vdo_free_index(index);
@@ -598,7 +598,7 @@ int uds_close_index(struct uds_index_session *index_session)
 	}
 	if (index_session->state & IS_FLAG_SUSPENDED) {
-		uds_log_info("Index session is suspended");
+		vdo_log_info("Index session is suspended");
 		result = -EBUSY;
 	} else if ((index_session->state & IS_FLAG_DESTROYING) ||
 		   !(index_session->state & IS_FLAG_LOADED)) {
@@ -611,10 +611,10 @@ int uds_close_index(struct uds_index_session *index_session)
 	if (result != UDS_SUCCESS)
 		return uds_status_to_errno(result);
-	uds_log_debug("Closing index");
+	vdo_log_debug("Closing index");
 	wait_for_no_requests_in_progress(index_session);
 	result = save_and_free_index(index_session);
-	uds_log_debug("Closed index");
+	vdo_log_debug("Closed index");
 	mutex_lock(&index_session->request_mutex);
 	index_session->state &= ~IS_FLAG_CLOSING;
@@ -629,7 +629,7 @@ int uds_destroy_index_session(struct uds_index_session *index_session)
 	int result;
 	bool load_pending = false;
-	uds_log_debug("Destroying index session");
+	vdo_log_debug("Destroying index session");
 	/* Wait for any current index state change to complete. */
 	mutex_lock(&index_session->request_mutex);
@@ -641,7 +641,7 @@ int uds_destroy_index_session(struct uds_index_session *index_session)
 	if (index_session->state & IS_FLAG_DESTROYING) {
 		mutex_unlock(&index_session->request_mutex);
-		uds_log_info("Index session is already closing");
+		vdo_log_info("Index session is already closing");
 		return -EBUSY;
 	}
@@ -672,7 +672,7 @@ int uds_destroy_index_session(struct uds_index_session *index_session)
 	result = save_and_free_index(index_session);
 	uds_request_queue_finish(index_session->callback_queue);
 	index_session->callback_queue = NULL;
-	uds_log_debug("Destroyed index session");
+	vdo_log_debug("Destroyed index session");
 	vdo_free(index_session);
 	return uds_status_to_errno(result);
 }
@@ -710,7 +710,7 @@ int uds_get_index_session_stats(struct uds_index_session *index_session,
 				struct uds_index_stats *stats)
 {
 	if (stats == NULL) {
-		uds_log_error("received a NULL index stats pointer");
+		vdo_log_error("received a NULL index stats pointer");
 		return -EINVAL;
 	}
diff --git a/drivers/md/dm-vdo/indexer/index.c b/drivers/md/dm-vdo/indexer/index.c
index 45bc163f65277b..bd2405738c5077 100644
--- a/drivers/md/dm-vdo/indexer/index.c
+++ b/drivers/md/dm-vdo/indexer/index.c
@@ -186,7 +186,7 @@ static int finish_previous_chapter(struct uds_index *index, u64 current_chapter_
 	mutex_unlock(&writer->mutex);
 	if (result != UDS_SUCCESS)
-		return uds_log_error_strerror(result,
+		return vdo_log_error_strerror(result,
 					      "Writing of previous open chapter failed");
 	return UDS_SUCCESS;
@@ -256,7 +256,7 @@ static int open_next_chapter(struct index_zone *zone)
 	unsigned int finished_zones;
 	u32 expire_chapters;
-	uds_log_debug("closing chapter %llu of zone %u after %u entries (%u short)",
+	vdo_log_debug("closing chapter %llu of zone %u after %u entries (%u short)",
 		      (unsigned long long) zone->newest_virtual_chapter, zone->id,
 		      zone->open_chapter->size,
 		      zone->open_chapter->capacity - zone->open_chapter->size);
@@ -313,7 +313,7 @@ static int dispatch_index_zone_control_request(struct uds_request *request)
 		return handle_chapter_closed(zone, message->virtual_chapter);
 	default:
-		uds_log_error("invalid message type: %d", message->type);
+		vdo_log_error("invalid message type: %d", message->type);
 		return UDS_INVALID_ARGUMENT;
 	}
 }
@@ -598,7 +598,7 @@ static int dispatch_index_request(struct uds_index *index, struct uds_request *r
 		break;
 	default:
-		result = uds_log_warning_strerror(UDS_INVALID_ARGUMENT,
+		result = vdo_log_warning_strerror(UDS_INVALID_ARGUMENT,
 						  "invalid request type: %d",
 						  request->type);
 		break;
@@ -616,7 +616,7 @@ static void execute_zone_request(struct uds_request *request)
 	if (request->zone_message.type != UDS_MESSAGE_NONE) {
 		result = dispatch_index_zone_control_request(request);
 		if (result != UDS_SUCCESS) {
-			uds_log_error_strerror(result, "error executing message: %d",
+			vdo_log_error_strerror(result, "error executing message: %d",
 					       request->zone_message.type);
 		}
@@ -676,7 +676,7 @@ static void close_chapters(void *arg)
 	struct chapter_writer *writer = arg;
 	struct uds_index *index = writer->index;
-	uds_log_debug("chapter writer starting");
+	vdo_log_debug("chapter writer starting");
 	mutex_lock(&writer->mutex);
 	for (;;) {
 		while (writer->zones_to_write < index->zone_count) {
@@ -686,7 +686,7 @@ static void close_chapters(void *arg)
 				 * open chapter, so we can exit now.
 				 */
 				mutex_unlock(&writer->mutex);
-				uds_log_debug("chapter writer stopping");
+				vdo_log_debug("chapter writer stopping");
 				return;
 			}
 			uds_wait_cond(&writer->cond, &writer->mutex);
@@ -709,7 +709,7 @@ static void close_chapters(void *arg)
 			index->has_saved_open_chapter = false;
 			result = uds_discard_open_chapter(index->layout);
 			if (result == UDS_SUCCESS)
-				uds_log_debug("Discarding saved open chapter");
+				vdo_log_debug("Discarding saved open chapter");
 		}
 		result = uds_close_open_chapter(writer->chapters, index->zone_count,
@@ -816,7 +816,7 @@ static int load_index(struct uds_index *index)
 	last_save_chapter = ((index->last_save != NO_LAST_SAVE) ? index->last_save : 0);
-	uds_log_info("loaded index from chapter %llu through chapter %llu",
+	vdo_log_info("loaded index from chapter %llu through chapter %llu",
 		     (unsigned long long) index->oldest_virtual_chapter,
 		     (unsigned long long) last_save_chapter);
@@ -841,7 +841,7 @@ static int rebuild_index_page_map(struct uds_index *index, u64 vcn)
 						   index_page_number, &chapter_index_page);
 		if (result != UDS_SUCCESS) {
-			return uds_log_error_strerror(result,
+			return vdo_log_error_strerror(result,
 						      "failed to read index page %u in chapter %u",
 						      index_page_number, chapter);
 		}
@@ -849,7 +849,7 @@ static int rebuild_index_page_map(struct uds_index *index, u64 vcn)
 		lowest_delta_list = chapter_index_page->lowest_list_number;
 		highest_delta_list = chapter_index_page->highest_list_number;
 		if (lowest_delta_list != expected_list_number) {
-			return uds_log_error_strerror(UDS_CORRUPT_DATA,
+			return vdo_log_error_strerror(UDS_CORRUPT_DATA,
 						      "chapter %u index page %u is corrupt",
 						      chapter, index_page_number);
 		}
@@ -978,7 +978,7 @@ static int replay_chapter(struct uds_index *index, u64 virtual, bool sparse)
 	u32 physical_chapter;
 	if (check_for_suspend(index)) {
-		uds_log_info("Replay interrupted by index shutdown at chapter %llu",
+		vdo_log_info("Replay interrupted by index shutdown at chapter %llu",
 			     (unsigned long long) virtual);
 		return -EBUSY;
 	}
@@ -990,7 +990,7 @@ static int replay_chapter(struct uds_index *index, u64 virtual, bool sparse)
 	result = rebuild_index_page_map(index, virtual);
 	if (result != UDS_SUCCESS) {
-		return uds_log_error_strerror(result,
+		return vdo_log_error_strerror(result,
 					      "could not rebuild index page map for chapter %u",
 					      physical_chapter);
 	}
@@ -1003,7 +1003,7 @@ static int replay_chapter(struct uds_index *index, u64 virtual, bool sparse)
 		result = uds_get_volume_record_page(index->volume, physical_chapter,
 						    record_page_number, &record_page);
 		if (result != UDS_SUCCESS) {
-			return uds_log_error_strerror(result, "could not get page %d",
+			return vdo_log_error_strerror(result, "could not get page %d",
 						      record_page_number);
 		}
@@ -1032,7 +1032,7 @@ static int replay_volume(struct uds_index *index)
 	u64 upto_virtual = index->newest_virtual_chapter;
 	bool will_be_sparse;
-	uds_log_info("Replaying volume from chapter %llu through chapter %llu",
+	vdo_log_info("Replaying volume from chapter %llu through chapter %llu",
 		     (unsigned long long) from_virtual,
 		     (unsigned long long) upto_virtual);
@@ -1062,7 +1062,7 @@ static int replay_volume(struct uds_index *index)
 	new_map_update = index->volume->index_page_map->last_update;
 	if (new_map_update != old_map_update) {
-		uds_log_info("replay changed index page map update from %llu to %llu",
+		vdo_log_info("replay changed index page map update from %llu to %llu",
 			     (unsigned long long) old_map_update,
 			     (unsigned long long) new_map_update);
 	}
@@ -1082,7 +1082,7 @@ static int rebuild_index(struct uds_index *index)
 	result = uds_find_volume_chapter_boundaries(index->volume, &lowest, &highest,
 						    &is_empty);
 	if (result != UDS_SUCCESS) {
-		return uds_log_fatal_strerror(result,
+		return vdo_log_fatal_strerror(result,
 					      "cannot rebuild index: unknown volume chapter boundaries");
 	}
@@ -1192,7 +1192,7 @@ int uds_make_index(struct uds_configuration *config, enum uds_open_index_type op
 		result = make_index_zone(index, z);
 		if (result != UDS_SUCCESS) {
 			vdo_free_index(index);
-			return uds_log_error_strerror(result,
+			return vdo_log_error_strerror(result,
 						      "Could not create index zone");
 		}
 	}
@@ -1201,7 +1201,7 @@ int uds_make_index(struct uds_configuration *config, enum uds_open_index_type op
 	result = uds_make_volume_index(config, nonce, &index->volume_index);
 	if (result != UDS_SUCCESS) {
 		vdo_free_index(index);
-		return uds_log_error_strerror(result, "could not make volume index");
+		return vdo_log_error_strerror(result, "could not make volume index");
 	}
 	index->load_context = load_context;
@@ -1227,14 +1227,14 @@ int uds_make_index(struct uds_configuration *config, enum uds_open_index_type op
 			break;
 		case -ENOMEM:
 			/* We should not try a rebuild for this error. */
-			uds_log_error_strerror(result, "index could not be loaded");
+			vdo_log_error_strerror(result, "index could not be loaded");
 			break;
 		default:
-			uds_log_error_strerror(result, "index could not be loaded");
+			vdo_log_error_strerror(result, "index could not be loaded");
 			if (open_type == UDS_LOAD) {
 				result = rebuild_index(index);
 				if (result != UDS_SUCCESS) {
-					uds_log_error_strerror(result,
+					vdo_log_error_strerror(result,
 							       "index could not be rebuilt");
 				}
 			}
@@ -1244,7 +1244,7 @@ int uds_make_index(struct uds_configuration *config, enum uds_open_index_type op
 	if (result != UDS_SUCCESS) {
 		vdo_free_index(index);
-		return uds_log_error_strerror(result, "fatal error in %s()", __func__);
+		return vdo_log_error_strerror(result, "fatal error in %s()", __func__);
 	}
 	for (z = 0; z < index->zone_count; z++) {
@@ -1318,16 +1318,16 @@ int uds_save_index(struct uds_index *index)
 	index->prev_save = index->last_save;
 	index->last_save = ((index->newest_virtual_chapter == 0) ?
 			    NO_LAST_SAVE : index->newest_virtual_chapter - 1);
-	uds_log_info("beginning save (vcn %llu)", (unsigned long long) index->last_save);
+	vdo_log_info("beginning save (vcn %llu)", (unsigned long long) index->last_save);
 	result = uds_save_index_state(index->layout, index);
 	if (result != UDS_SUCCESS) {
-		uds_log_info("save index failed");
+		vdo_log_info("save index failed");
 		index->last_save = index->prev_save;
 	} else {
 		index->has_saved_open_chapter = true;
 		index->need_to_save = false;
-		uds_log_info("finished save (vcn %llu)",
+		vdo_log_info("finished save (vcn %llu)",
 			     (unsigned long long) index->last_save);
 	}
diff --git a/drivers/md/dm-vdo/indexer/io-factory.c b/drivers/md/dm-vdo/indexer/io-factory.c
index 8fe7c0b2802dbb..61104d5ccd61a6 100644
--- a/drivers/md/dm-vdo/indexer/io-factory.c
+++ b/drivers/md/dm-vdo/indexer/io-factory.c
@@ -365,7 +365,7 @@ void vdo_free_buffered_writer(struct buffered_writer *writer)
 	flush_previous_buffer(writer);
 	result = -dm_bufio_write_dirty_buffers(writer->client);
 	if (result != UDS_SUCCESS)
-		uds_log_warning_strerror(result, "%s: failed to sync storage", __func__);
+		vdo_log_warning_strerror(result, "%s: failed to sync storage", __func__);
 	dm_bufio_client_destroy(writer->client);
 	uds_put_io_factory(writer->factory);
diff --git a/drivers/md/dm-vdo/indexer/open-chapter.c b/drivers/md/dm-vdo/indexer/open-chapter.c
index 989b1946e55daf..298c9c88ae5aa3 100644
--- a/drivers/md/dm-vdo/indexer/open-chapter.c
+++ b/drivers/md/dm-vdo/indexer/open-chapter.c
@@ -261,14 +261,14 @@ static int fill_delta_chapter_index(struct open_chapter_zone **chapter_zones,
 			overflow_count++;
 			break;
 		default:
-			uds_log_error_strerror(result,
+			vdo_log_error_strerror(result,
 					       "failed to build open chapter index");
 			return result;
 		}
 	}
 	if (overflow_count > 0)
-		uds_log_warning("Failed to add %d entries to chapter index",
+		vdo_log_warning("Failed to add %d entries to chapter index",
 				overflow_count);
 	return UDS_SUCCESS;
@@ -419,7 +419,7 @@ int uds_load_open_chapter(struct uds_index *index, struct buffered_reader *reade
 		return result;
 	if (memcmp(OPEN_CHAPTER_VERSION, version, sizeof(version)) != 0) {
-		return uds_log_error_strerror(UDS_CORRUPT_DATA,
+		return vdo_log_error_strerror(UDS_CORRUPT_DATA,
 					      "Invalid open chapter version: %.*s",
 					      (int) sizeof(version), version);
 	}
diff --git a/drivers/md/dm-vdo/indexer/volume-index.c b/drivers/md/dm-vdo/indexer/volume-index.c
index 762607974f3587..be15c9e5568b7b 100644
--- a/drivers/md/dm-vdo/indexer/volume-index.c
+++ b/drivers/md/dm-vdo/indexer/volume-index.c
@@ -223,13 +223,13 @@ static int compute_volume_sub_index_parameters(const struct uds_configuration *c
 	params->address_bits = bits_per(address_count - 1);
 	params->chapter_bits = bits_per(rounded_chapters - 1);
 	if ((u32) params->list_count != params->list_count) {
-		return uds_log_warning_strerror(UDS_INVALID_ARGUMENT,
+		return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT,
 						"cannot initialize volume index with %llu delta lists",
 						(unsigned long long) params->list_count);
 	}
 	if (params->address_bits > 31) {
-		return uds_log_warning_strerror(UDS_INVALID_ARGUMENT,
+		return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT,
 						"cannot initialize volume index with %u address bits",
 						params->address_bits);
 	}
@@ -566,7 +566,7 @@ int uds_put_volume_index_record(struct volume_index_record *record, u64 virtual_
 		u64 low = get_zone_for_record(record)->virtual_chapter_low;
 		u64 high = get_zone_for_record(record)->virtual_chapter_high;
-		return uds_log_warning_strerror(UDS_INVALID_ARGUMENT,
+		return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT,
 						"cannot put record into chapter number %llu that is out of the valid range %llu to %llu",
 						(unsigned long long) virtual_chapter,
 						(unsigned long long) low,
@@ -588,7 +588,7 @@ int uds_put_volume_index_record(struct volume_index_record *record, u64 virtual_
 		record->is_found = true;
 		break;
 	case UDS_OVERFLOW:
-		uds_log_ratelimit(uds_log_warning_strerror, UDS_OVERFLOW,
+		vdo_log_ratelimit(vdo_log_warning_strerror, UDS_OVERFLOW,
 				  "Volume index entry dropped due to overflow condition");
 		uds_log_delta_index_entry(&record->delta_entry);
 		break;
@@ -604,7 +604,7 @@ int uds_remove_volume_index_record(struct volume_index_record *record)
 	int result;
 	if (!record->is_found)
-		return uds_log_warning_strerror(UDS_BAD_STATE,
+		return vdo_log_warning_strerror(UDS_BAD_STATE,
 						"illegal operation on new record");
 	/* Mark the record so that it cannot be used again */
@@ -642,7 +642,7 @@ static void set_volume_sub_index_zone_open_chapter(struct volume_sub_index *sub_
 			1 + (used_bits - sub_index->max_zone_bits) / sub_index->chapter_zone_bits;
 		if (expire_count == 1) {
-			uds_log_ratelimit(uds_log_info,
+			vdo_log_ratelimit(vdo_log_info,
 					  "zone %u: At chapter %llu, expiring chapter %llu early",
 					  zone_number,
 					  (unsigned long long) virtual_chapter,
@@ -660,7 +660,7 @@ static void set_volume_sub_index_zone_open_chapter(struct volume_sub_index *sub_
 					zone->virtual_chapter_high - zone->virtual_chapter_low;
 				zone->virtual_chapter_low = zone->virtual_chapter_high;
 			}
-			uds_log_ratelimit(uds_log_info,
+			vdo_log_ratelimit(vdo_log_info,
 					  "zone %u: At chapter %llu, expiring chapters %llu to %llu early",
 					  zone_number,
 					  (unsigned long long) virtual_chapter,
@@ -711,14 +711,14 @@ int uds_set_volume_index_record_chapter(struct volume_index_record *record,
 	int result;
 	if (!record->is_found)
-		return uds_log_warning_strerror(UDS_BAD_STATE,
+		return vdo_log_warning_strerror(UDS_BAD_STATE,
 						"illegal operation on new record");
 	if (!is_virtual_chapter_indexed(record, virtual_chapter)) {
 		u64 low = get_zone_for_record(record)->virtual_chapter_low;
 		u64 high = get_zone_for_record(record)->virtual_chapter_high;
-		return uds_log_warning_strerror(UDS_INVALID_ARGUMENT,
+		return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT,
 						"cannot set chapter number %llu that is out of the valid range %llu to %llu",
 						(unsigned long long) virtual_chapter,
 						(unsigned long long) low,
@@ -818,7 +818,7 @@ static int start_restoring_volume_sub_index(struct volume_sub_index *sub_index,
 		result = uds_read_from_buffered_reader(readers[i], buffer,
 						       sizeof(buffer));
 		if (result != UDS_SUCCESS) {
-			return uds_log_warning_strerror(result,
+			return vdo_log_warning_strerror(result,
 							"failed to read volume index header");
 		}
@@ -837,14 +837,14 @@ static int start_restoring_volume_sub_index(struct volume_sub_index *sub_index,
 			result = UDS_CORRUPT_DATA;
 		if (memcmp(header.magic, MAGIC_START_5, MAGIC_SIZE) != 0) {
-			return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
 							"volume index file had bad magic number");
 		}
 		if (sub_index->volume_nonce == 0) {
 			sub_index->volume_nonce = header.volume_nonce;
 		} else if (header.volume_nonce != sub_index->volume_nonce) {
-			return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
 							"volume index volume nonce incorrect");
 		}
@@ -855,7 +855,7 @@ static int start_restoring_volume_sub_index(struct volume_sub_index *sub_index,
 			u64 low = header.virtual_chapter_low;
 			u64 high = header.virtual_chapter_high;
-			return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
 							"Inconsistent volume index zone files: Chapter range is [%llu,%llu], chapter range %d is [%llu,%llu]",
 							(unsigned long long) virtual_chapter_low,
 							(unsigned long long) virtual_chapter_high,
@@ -871,7 +871,7 @@ static int start_restoring_volume_sub_index(struct volume_sub_index *sub_index,
 			result = uds_read_from_buffered_reader(readers[i], decoded,
 							       sizeof(u64));
 			if (result != UDS_SUCCESS) {
-				return uds_log_warning_strerror(result,
+				return vdo_log_warning_strerror(result,
 								"failed to read volume index flush ranges");
 			}
@@ -889,7 +889,7 @@ static int start_restoring_volume_sub_index(struct volume_sub_index *sub_index,
 	result = uds_start_restoring_delta_index(&sub_index->delta_index, readers,
 						 reader_count);
 	if (result != UDS_SUCCESS)
-		return uds_log_warning_strerror(result, "restoring delta index failed");
+		return vdo_log_warning_strerror(result, "restoring delta index failed");
 	return UDS_SUCCESS;
 }
@@ -914,7 +914,7 @@ static int start_restoring_volume_index(struct volume_index *volume_index,
 		result = uds_read_from_buffered_reader(buffered_readers[i], buffer,
 						       sizeof(buffer));
 		if (result != UDS_SUCCESS) {
-			return uds_log_warning_strerror(result,
+			return vdo_log_warning_strerror(result,
 							"failed to read volume index header");
 		}
@@ -929,13 +929,13 @@ static int start_restoring_volume_index(struct volume_index *volume_index,
 		result = UDS_CORRUPT_DATA;
 	if (memcmp(header.magic, MAGIC_START_6, MAGIC_SIZE) != 0)
-		return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
 						"volume index file had bad magic number");
 	if (i == 0) {
 		volume_index->sparse_sample_rate = header.sparse_sample_rate;
 	} else if (volume_index->sparse_sample_rate != header.sparse_sample_rate) {
-		uds_log_warning_strerror(UDS_CORRUPT_DATA,
+		vdo_log_warning_strerror(UDS_CORRUPT_DATA,
 					 "Inconsistent sparse sample rate in delta index zone files: %u vs. %u",
 					 volume_index->sparse_sample_rate,
 					 header.sparse_sample_rate);
@@ -1029,7 +1029,7 @@ static int start_saving_volume_sub_index(const struct volume_sub_inde
 	result = uds_write_to_buffered_writer(buffered_writer, buffer, offset);
 	if (result != UDS_SUCCESS)
-		return uds_log_warning_strerror(result,
+		return vdo_log_warning_strerror(result,
 						"failed to write volume index header");
 	for (i = 0; i < list_count; i++) {
@@ -1039,7 +1039,7 @@ static int start_saving_volume_sub_index(const struct volume_sub_inde
 		result = uds_write_to_buffered_writer(buffered_writer, encoded,
 						      sizeof(u64));
 		if (result != UDS_SUCCESS) {
-			return uds_log_warning_strerror(result,
+			return vdo_log_warning_strerror(result,
 							"failed to write volume index flush ranges");
 		}
 	}
@@ -1072,7 +1072,7 @@ static int start_saving_volume_index(const struct volume_index *volume_index,
 	result = uds_write_to_buffered_writer(writer, buffer, offset);
 	if (result != UDS_SUCCESS) {
-		uds_log_warning_strerror(result, "failed to write volume index header");
+		vdo_log_warning_strerror(result, "failed to write volume index header");
 		return result;
 	}
@@ -1262,7 +1262,7 @@ int uds_make_volume_index(const struct uds_configuration *config, u64 volume_non
 					     &volume_index->vi_non_hook);
 	if (result != UDS_SUCCESS) {
 		vdo_free_volume_index(volume_index);
-		return uds_log_error_strerror(result,
+		return vdo_log_error_strerror(result,
 					      "Error creating non hook volume index");
 	}
@@ -1270,7 +1270,7 @@ int uds_make_volume_index(const struct uds_configuration *config, u64 volume_non
 					     &volume_index->vi_hook);
 	if (result != UDS_SUCCESS) {
 		vdo_free_volume_index(volume_index);
-		return uds_log_error_strerror(result,
+		return vdo_log_error_strerror(result,
 					      "Error creating hook volume index");
 	}
diff --git a/drivers/md/dm-vdo/indexer/volume.c b/drivers/md/dm-vdo/indexer/volume.c
index 7996b026a58b01..8b21ec93f3bc31 100644
--- a/drivers/md/dm-vdo/indexer/volume.c
+++ b/drivers/md/dm-vdo/indexer/volume.c
@@ -359,7 +359,7 @@ static void enqueue_page_read(struct volume *volume, struct uds_request *request
 {
 	/* Mark the page as queued, so that chapter invalidation knows to cancel a read. */
 	while (!enqueue_read(&volume->page_cache, request, physical_page)) {
-		uds_log_debug("Read queue full, waiting for reads to finish");
+		vdo_log_debug("Read queue full, waiting for reads to finish");
 		uds_wait_cond(&volume->read_threads_read_done_cond,
 			      &volume->read_threads_mutex);
 	}
@@ -433,7 +433,7 @@ static int init_chapter_index_page(const struct volume *volume, u8 *index_page,
 		return result;
 	if (result != UDS_SUCCESS) {
-		return uds_log_error_strerror(result,
+		return vdo_log_error_strerror(result,
 					      "Reading chapter index page for chapter %u page %u",
 					      chapter, index_page_number);
 	}
@@ -447,14 +447,14 @@ static int init_chapter_index_page(const struct volume *volume, u8 *index_page,
 	    (highest_list == chapter_index_page->highest_list_number))
 		return UDS_SUCCESS;
-	uds_log_warning("Index page map updated to %llu",
+	vdo_log_warning("Index page map updated to %llu",
 			(unsigned long long) volume->index_page_map->last_update);
-	uds_log_warning("Page map expects that chapter %u page %u has range %u to %u, but chapter index page has chapter %llu with range %u to %u",
+	vdo_log_warning("Page map expects that chapter %u page %u has range %u to %u, but chapter index page has chapter %llu with range %u to %u",
 			chapter, index_page_number, lowest_list, highest_list,
 			(unsigned long long) ci_virtual,
 			chapter_index_page->lowest_list_number,
 			chapter_index_page->highest_list_number);
-	return uds_log_error_strerror(UDS_CORRUPT_DATA,
+	return vdo_log_error_strerror(UDS_CORRUPT_DATA,
 				      "index page map mismatch with chapter index");
 }
@@ -549,7 +549,7 @@ static int process_entry(struct volume *volume, struct queued_read *entry)
 	int result;
 	if (entry->invalid) {
-		uds_log_debug("Requeuing requests for invalid page");
+		vdo_log_debug("Requeuing requests for invalid page");
 		return UDS_SUCCESS;
 	}
@@ -560,7 +560,7 @@ static int process_entry(struct volume *volume, struct queued_read *entry)
 	mutex_lock(&volume->read_threads_mutex);
 	if (IS_ERR(page_data)) {
 		result = -PTR_ERR(page_data);
-		uds_log_warning_strerror(result,
+		vdo_log_warning_strerror(result,
 					 "error reading physical page %u from volume",
 					 page_number);
 		cancel_page_in_cache(&volume->page_cache, page_number, page);
 		return result;
 	}
	if (entry->invalid) {
-		uds_log_warning("Page %u invalidated after read", page_number);
+		vdo_log_warning("Page %u invalidated after read", page_number);
 		cancel_page_in_cache(&volume->page_cache, page_number, page);
 		return UDS_SUCCESS;
 	}
@@ -576,7 +576,7 @@ static int process_entry(struct volume *volume, struct queued_read *entry)
 	if (!is_record_page(volume->geometry, page_number)) {
 		result = initialize_index_page(volume, page_number, page);
 		if (result != UDS_SUCCESS) {
-			uds_log_warning("Error initializing chapter index page");
+			vdo_log_warning("Error initializing chapter index page");
 			cancel_page_in_cache(&volume->page_cache, page_number, page);
 			return result;
 		}
 	}
@@ -584,7 +584,7 @@ static int process_entry(struct volume *volume, struct queued_read *entry)
 	result = put_page_in_cache(&volume->page_cache, page_number, page);
 	if (result != UDS_SUCCESS) {
-		uds_log_warning("Error putting page %u in cache", page_number);
+		vdo_log_warning("Error putting page %u in cache", page_number);
 		cancel_page_in_cache(&volume->page_cache, page_number, page);
 		return result;
 	}
@@ -626,7 +626,7 @@ static void read_thread_function(void *arg)
 {
 	struct volume *volume = arg;
-	uds_log_debug("reader starting");
+	vdo_log_debug("reader starting");
 	mutex_lock(&volume->read_threads_mutex);
 	while (true) {
 		struct queued_read *queue_entry;
@@ -640,7 +640,7 @@ static void read_thread_function(void *arg)
 		release_queued_requests(volume, queue_entry, result);
 	}
 	mutex_unlock(&volume->read_threads_mutex);
-	uds_log_debug("reader done");
+	vdo_log_debug("reader done");
 }
 static void get_page_and_index(struct page_cache *cache, u32 physical_page,
@@ -703,7 +703,7 @@ static int read_page_locked(struct volume *volume, u32 physical_page,
 	page_data = dm_bufio_read(volume->client, physical_page, &page->buffer);
 	if (IS_ERR(page_data)) {
 		result = -PTR_ERR(page_data);
-		uds_log_warning_strerror(result,
+		vdo_log_warning_strerror(result,
 					 "error reading physical page %u from volume",
 					 physical_page);
 		cancel_page_in_cache(&volume->page_cache, physical_page, page);
@@ -714,7 +714,7 @@ static int read_page_locked(struct volume *volume, u32 physical_page,
 		result = initialize_index_page(volume, physical_page, page);
 		if (result != UDS_SUCCESS) {
 			if (volume->lookup_mode != LOOKUP_FOR_REBUILD)
-				uds_log_warning("Corrupt index page %u", physical_page);
+				vdo_log_warning("Corrupt index page %u", physical_page);
 			cancel_page_in_cache(&volume->page_cache, physical_page, page);
 			return result;
 		}
@@ -722,7 +722,7 @@ static int read_page_locked(struct volume *volume, u32 physical_page,
 	result = put_page_in_cache(&volume->page_cache, physical_page, page);
 	if (result != UDS_SUCCESS) {
-		uds_log_warning("Error putting page %u in cache", physical_page);
+		vdo_log_warning("Error putting page %u in cache", physical_page);
 		cancel_page_in_cache(&volume->page_cache, physical_page, page);
 		return result;
 	}
@@ -949,7 +949,7 @@ int uds_read_chapter_index_from_volume(const struct volume *volume, u64 virtual_
 					       &volume_buffers[i]);
 		if (IS_ERR(index_page)) {
 			result = -PTR_ERR(index_page);
-			uds_log_warning_strerror(result,
+			vdo_log_warning_strerror(result,
 						 "error reading physical page %u",
 						 physical_page);
 			return result;
@@ -1041,7 +1041,7 @@ static void invalidate_page(struct page_cache *cache, u32 physical_page)
 		wait_for_pending_searches(cache, page->physical_page);
 		clear_cache_page(cache, page);
 	} else if (queue_index > -1) {
-		uds_log_debug("setting pending read to invalid");
+		vdo_log_debug("setting pending read to invalid");
 		cache->read_queue[queue_index].invalid = true;
 	}
 }
@@ -1053,7 +1053,7 @@ void vdo_forget_chapter(struct volume *volume, u64 virtual_chapter)
 	u32 first_page = map_to_physical_page(volume->geometry, physical_chapter, 0);
 	u32 i;
-	uds_log_debug("forgetting chapter %llu", (unsigned long long) virtual_chapter);
+	vdo_log_debug("forgetting chapter %llu", (unsigned long long) virtual_chapter);
 	mutex_lock(&volume->read_threads_mutex);
 	for (i = 0; i < volume->geometry->pages_per_chapter; i++)
 		invalidate_page(&volume->page_cache, first_page + i);
@@ -1079,14 +1079,14 @@ static int donate_index_page_locked(struct volume *volume, u32 physical_chapter,
 					 physical_chapter, index_page_number,
 					 &page->index_page);
 	if (result != UDS_SUCCESS) {
-		uds_log_warning("Error initialize chapter index page");
+		vdo_log_warning("Error initialize chapter index page");
 		cancel_page_in_cache(&volume->page_cache, physical_page, page);
 		return result;
 	}
 	result = put_page_in_cache(&volume->page_cache, physical_page, page);
 	if (result != UDS_SUCCESS) {
-		uds_log_warning("Error putting page %u in cache", physical_page);
+		vdo_log_warning("Error putting page %u in cache", physical_page);
 		cancel_page_in_cache(&volume->page_cache, physical_page, page);
 		return result;
 	}
@@ -1114,7 +1114,7 @@ static int write_index_pages(struct volume *volume, u32 physical_chapter_number,
 		page_data = dm_bufio_new(volume->client, physical_page, &page_buffer);
 		if (IS_ERR(page_data)) {
-			return uds_log_warning_strerror(-PTR_ERR(page_data),
+			return vdo_log_warning_strerror(-PTR_ERR(page_data),
 							"failed to prepare index page");
 		}
@@ -1124,14 +1124,14 @@ static int write_index_pages(struct volume *volume, u32 physical_chapter_number,
 						 &lists_packed);
 		if (result != UDS_SUCCESS) {
 			dm_bufio_release(page_buffer);
-			return uds_log_warning_strerror(result,
+			return vdo_log_warning_strerror(result,
 							"failed to pack index page");
 		}
 		dm_bufio_mark_buffer_dirty(page_buffer);
 		if (lists_packed == 0) {
-			uds_log_debug("no delta lists packed on chapter %u page %u",
+			vdo_log_debug("no delta lists packed on chapter %u page %u",
 				      physical_chapter_number, index_page_number);
 		} else {
 			delta_list_number += lists_packed;
@@ -1223,14 +1223,14 @@ static int write_record_pages(struct volume *volume, u32 physical_chapter_number
 		page_data = dm_bufio_new(volume->client, physical_page, &page_buffer);
 		if (IS_ERR(page_data)) {
-			return uds_log_warning_strerror(-PTR_ERR(page_data),
+			return vdo_log_warning_strerror(-PTR_ERR(page_data),
 							"failed to prepare record page");
 		}
 		result = encode_record_page(volume, next_record, page_data);
 		if (result != UDS_SUCCESS) {
 			dm_bufio_release(page_buffer);
-			return uds_log_warning_strerror(result,
+			return vdo_log_warning_strerror(result,
 							"failed to encode record page %u",
 							record_page_number);
 		}
@@ -1261,7 +1261,7 @@ int uds_write_chapter(struct volume *volume, struct open_chapter_index *chapter_
 	result = -dm_bufio_write_dirty_buffers(volume->client);
 	if (result != UDS_SUCCESS)
-		uds_log_error_strerror(result, "cannot sync chapter to volume");
+		vdo_log_error_strerror(result, "cannot sync chapter to volume");
 	return result;
 }
@@ -1288,7 +1288,7 @@ static void probe_chapter(struct volume *volume, u32 chapter_number,
 			return;
 		if (page->virtual_chapter_number == BAD_CHAPTER) {
-			uds_log_error("corrupt index page in chapter %u",
+			vdo_log_error("corrupt index page in chapter %u",
 				      chapter_number);
 			return;
 		}
@@ -1296,14 +1296,14 @@ static void probe_chapter(struct volume *volume, u32 chapter_number,
 		if (vcn == BAD_CHAPTER) {
 			vcn = page->virtual_chapter_number;
 		} else if (page->virtual_chapter_number != vcn) {
-			uds_log_error("inconsistent chapter %u index page %u: expected vcn %llu, got vcn %llu",
+			vdo_log_error("inconsistent chapter %u index page %u: expected vcn %llu, got vcn %llu",
 				      chapter_number, i, (unsigned long long) vcn,
 				      (unsigned long long) page->virtual_chapter_number);
 			return;
 		}
 		if (expected_list_number != page->lowest_list_number) {
-			uds_log_error("inconsistent chapter %u index page %u: expected list number %u, got list number %u",
+			vdo_log_error("inconsistent chapter %u index page %u: expected list number %u, got list number %u",
 				      chapter_number, i, expected_list_number,
 				      page->lowest_list_number);
 			return;
@@ -1316,7 +1316,7 @@ static void probe_chapter(struct volume *volume, u32 chapter_number,
 	}
 	if (chapter_number != uds_map_to_physical_chapter(geometry, vcn)) {
-		uds_log_error("chapter %u vcn %llu is out of phase (%u)", chapter_number,
+		vdo_log_error("chapter %u vcn %llu is out of phase (%u)", chapter_number,
 			      (unsigned long long) vcn, geometry->chapters_per_volume);
 		return;
 	}
@@ -1433,7 +1433,7 @@ static int find_chapter_limits(struct volume *volume, u32 chapter_limit, u64 *lo
 			probe_chapter(volume, right_chapter, &highest);
 		if (bad_chapters++ >= MAX_BAD_CHAPTERS) {
-			uds_log_error("too many bad chapters in volume: %u",
+			vdo_log_error("too many bad chapters in volume: %u",
 				      bad_chapters);
 			return UDS_CORRUPT_DATA;
 		}
@@ -1557,7 +1557,7 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout
 	result = uds_copy_index_geometry(config->geometry, &volume->geometry);
 	if (result != UDS_SUCCESS) {
 		vdo_free_volume(volume);
-		return uds_log_warning_strerror(result,
+		return vdo_log_warning_strerror(result,
 						"failed to allocate geometry: error");
 	}
 	geometry = volume->geometry;
diff --git a/drivers/md/dm-vdo/int-map.c b/drivers/md/dm-vdo/int-map.c
index 1bdd83a1dc2bb9..8919cda22d58bd 100644
--- a/drivers/md/dm-vdo/int-map.c
+++ b/drivers/md/dm-vdo/int-map.c
@@ -383,7 +383,7 @@ static int resize_buckets(struct int_map *map)
 	/* Re-initialize the map to be empty and 50% larger. */
 	size_t new_capacity = map->capacity / 2 * 3;
-	uds_log_info("%s: attempting resize from %zu to %zu, current size=%zu",
+	vdo_log_info("%s: attempting resize from %zu to %zu, current size=%zu",
 		     __func__, map->capacity, new_capacity, map->size);
 	result = allocate_buckets(map, new_capacity);
 	if (result != VDO_SUCCESS) {
diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c
index 61bb48068c3a28..9a3716bb3c05ec 100644
--- a/drivers/md/dm-vdo/io-submitter.c
+++ b/drivers/md/dm-vdo/io-submitter.c
@@ -408,7 +408,7 @@ int vdo_make_io_submitter(unsigned int thread_count, unsigned int rotation_inter
 			 * Clean up the partially initialized bio-queue entirely and indicate that
 			 * initialization failed.
 			 */
-			uds_log_error("bio map initialization failed %d", result);
+			vdo_log_error("bio map initialization failed %d", result);
 			vdo_cleanup_io_submitter(io_submitter);
 			vdo_free_io_submitter(io_submitter);
 			return result;
@@ -423,7 +423,7 @@ int vdo_make_io_submitter(unsigned int thread_count, unsigned int rotation_inter
 			 * initialization failed.
 			 */
 			vdo_int_map_free(vdo_forget(bio_queue_data->map));
-			uds_log_error("bio queue initialization failed %d", result);
+			vdo_log_error("bio queue initialization failed %d", result);
 			vdo_cleanup_io_submitter(io_submitter);
 			vdo_free_io_submitter(io_submitter);
 			return result;
diff --git a/drivers/md/dm-vdo/logger.c b/drivers/md/dm-vdo/logger.c
index aaab2f1f0c53d2..c516b634d09ac2 100644
--- a/drivers/md/dm-vdo/logger.c
+++ b/drivers/md/dm-vdo/logger.c
@@ -16,14 +16,14 @@
 #include "thread-device.h"
 #include "thread-utils.h"
-int log_level = UDS_LOG_DEFAULT;
+int log_level = VDO_LOG_DEFAULT;
-int uds_get_log_level(void)
+int vdo_get_log_level(void)
 {
 	int log_level_latch = READ_ONCE(log_level);
-	if (unlikely(log_level_latch > UDS_LOG_MAX)) {
-		log_level_latch = UDS_LOG_DEFAULT;
+	if (unlikely(log_level_latch > VDO_LOG_MAX)) {
+		log_level_latch = VDO_LOG_DEFAULT;
 		WRITE_ONCE(log_level, log_level_latch);
 	}
 	return log_level_latch;
@@ -54,7 +54,7 @@ static void emit_log_message_to_kernel(int priority, const char *fmt, ...)
 	va_list args;
 	struct va_format vaf;
-	if (priority > uds_get_log_level())
+	if (priority > vdo_get_log_level())
 		return;
 	va_start(args, fmt);
@@ -62,22 +62,22 @@
 	vaf.va = &args;
 	switch (priority) {
-	case UDS_LOG_EMERG:
-	case UDS_LOG_ALERT:
-	case UDS_LOG_CRIT:
+	case VDO_LOG_EMERG:
+	case VDO_LOG_ALERT:
+	case VDO_LOG_CRIT:
 		pr_crit("%pV", &vaf);
 		break;
-	case UDS_LOG_ERR:
+	case VDO_LOG_ERR:
 		pr_err("%pV", &vaf);
 		break;
-	case UDS_LOG_WARNING:
+	case VDO_LOG_WARNING:
 		pr_warn("%pV", &vaf);
 		break;
-	case UDS_LOG_NOTICE:
-	case UDS_LOG_INFO:
+	case VDO_LOG_NOTICE:
+	case VDO_LOG_INFO:
 		pr_info("%pV", &vaf);
 		break;
-	case UDS_LOG_DEBUG:
+	case VDO_LOG_DEBUG:
 		pr_debug("%pV", &vaf);
 		break;
 	default:
@@ -150,7 +150,7 @@ static void emit_log_message(int priority, const char *module, const char *prefi
 }
 /*
- * uds_log_embedded_message() - Log a message embedded within another message.
+ * vdo_log_embedded_message() - Log a message embedded within another message.
  * @priority: the priority at which to log the message
  * @module: the name of the module doing the logging
  * @prefix: optional string prefix to message, may be NULL
@@ -158,7 +158,7 @@ static void emit_log_message(int priority, const char *module, const char *prefi
  * @args1: arguments for message first part (required)
  * @fmt2: format of message second part
  */
-void uds_log_embedded_message(int priority, const char *module, const char *prefix,
+void vdo_log_embedded_message(int priority, const char *module, const char *prefix,
 			      const char *fmt1, va_list args1, const char *fmt2, ...)
 {
 	va_list args1_copy;
@@ -168,7 +168,7 @@ void uds_log_embedded_message(int priority, const char *module, const char *pref
 	va_start(args2, fmt2);
 	if (module == NULL)
-		module = UDS_LOGGING_MODULE_NAME;
+		module = VDO_LOGGING_MODULE_NAME;
 	if (prefix == NULL)
 		prefix = "";
@@ -191,41 +191,41 @@ void uds_log_embedded_message(int priority, const char *module, const char *pref
 	va_end(args2);
 }
-int uds_vlog_strerror(int priority, int errnum, const char *module, const char *format,
+int vdo_vlog_strerror(int priority, int errnum, const char *module, const char *format,
 		      va_list args)
 {
-	char errbuf[UDS_MAX_ERROR_MESSAGE_SIZE];
+	char errbuf[VDO_MAX_ERROR_MESSAGE_SIZE];
 	const char *message = uds_string_error(errnum, errbuf, sizeof(errbuf));
-	uds_log_embedded_message(priority, module, NULL, format, args, ": %s (%d)",
+	vdo_log_embedded_message(priority, module, NULL, format, args, ": %s (%d)",
 				 message, errnum);
 	return errnum;
 }
-int __uds_log_strerror(int priority, int errnum, const char *module, const char *format, ...)
+int __vdo_log_strerror(int priority, int errnum, const char *module, const char *format, ...)
 {
 	va_list args;
 	va_start(args, format);
-	uds_vlog_strerror(priority, errnum, module, format, args);
+	vdo_vlog_strerror(priority, errnum, module, format, args);
 	va_end(args);
 	return errnum;
 }
-void uds_log_backtrace(int priority)
+void vdo_log_backtrace(int priority)
 {
-	if (priority > uds_get_log_level())
+	if (priority > vdo_get_log_level())
 		return;
 	dump_stack();
 }
-void __uds_log_message(int priority, const char *module, const char *format, ...)
+void __vdo_log_message(int priority, const char *module, const char *format, ...)
 {
 	va_list args;
 	va_start(args, format);
-	uds_log_embedded_message(priority, module, NULL, format, args, "%s", "");
+	vdo_log_embedded_message(priority, module, NULL, format, args, "%s", "");
 	va_end(args);
 }
@@ -233,7 +233,7 @@ void __uds_log_message(int priority, const char *module, const char *format, ...
  * Sleep or delay a few milliseconds in an attempt to allow the log buffers to be flushed lest they
  * be overrun.
  */
-void uds_pause_for_logger(void)
+void vdo_pause_for_logger(void)
 {
 	fsleep(4000);
 }
diff --git a/drivers/md/dm-vdo/logger.h b/drivers/md/dm-vdo/logger.h
index 2e6e921c8d6311..ab94c3a4ab094d 100644
--- a/drivers/md/dm-vdo/logger.h
+++ b/drivers/md/dm-vdo/logger.h
@@ -3,8 +3,8 @@
  * Copyright 2023 Red Hat
  */
-#ifndef UDS_LOGGER_H
-#define UDS_LOGGER_H
+#ifndef VDO_LOGGER_H
+#define VDO_LOGGER_H
 #include
 #include
@@ -14,26 +14,26 @@
 /* Custom logging utilities for UDS */
 enum {
-	UDS_LOG_EMERG = LOGLEVEL_EMERG,
-	UDS_LOG_ALERT = LOGLEVEL_ALERT,
-	UDS_LOG_CRIT = LOGLEVEL_CRIT,
-	UDS_LOG_ERR = LOGLEVEL_ERR,
-	UDS_LOG_WARNING = LOGLEVEL_WARNING,
-	UDS_LOG_NOTICE = LOGLEVEL_NOTICE,
-	UDS_LOG_INFO = LOGLEVEL_INFO,
-	UDS_LOG_DEBUG = LOGLEVEL_DEBUG,
-
-	UDS_LOG_MAX = UDS_LOG_DEBUG,
-	UDS_LOG_DEFAULT = UDS_LOG_INFO,
+	VDO_LOG_EMERG = LOGLEVEL_EMERG,
+	VDO_LOG_ALERT = LOGLEVEL_ALERT,
+	VDO_LOG_CRIT = LOGLEVEL_CRIT,
+	VDO_LOG_ERR = LOGLEVEL_ERR,
+	VDO_LOG_WARNING = LOGLEVEL_WARNING,
+	VDO_LOG_NOTICE = LOGLEVEL_NOTICE,
+	VDO_LOG_INFO = LOGLEVEL_INFO,
+	VDO_LOG_DEBUG = LOGLEVEL_DEBUG,
+
+	VDO_LOG_MAX = VDO_LOG_DEBUG,
+	VDO_LOG_DEFAULT = VDO_LOG_INFO,
 };
 extern int log_level;
 #define DM_MSG_PREFIX "vdo"
-#define UDS_LOGGING_MODULE_NAME DM_NAME ": " DM_MSG_PREFIX
+#define VDO_LOGGING_MODULE_NAME DM_NAME ": " DM_MSG_PREFIX
 /* Apply a rate limiter to a log method call. */
-#define uds_log_ratelimit(log_fn, ...) \
+#define vdo_log_ratelimit(log_fn, ...) \
 	do { \
 		static DEFINE_RATELIMIT_STATE(_rs, \
 					      DEFAULT_RATELIMIT_INTERVAL, \
@@ -43,58 +43,59 @@ extern int log_level;
 		} \
 	} while (0)
-int uds_get_log_level(void);
+int vdo_get_log_level(void);
-void uds_log_embedded_message(int priority, const char *module, const char *prefix,
+void vdo_log_embedded_message(int priority, const char *module, const char *prefix,
 			      const char *fmt1, va_list args1, const char *fmt2, ...)
 	__printf(4, 0) __printf(6, 7);
-void uds_log_backtrace(int priority);
+void vdo_log_backtrace(int priority);
 /* All log functions will preserve the caller's value of errno. */
-#define uds_log_strerror(priority, errnum, ...) \
-	__uds_log_strerror(priority, errnum, UDS_LOGGING_MODULE_NAME, __VA_ARGS__)
+#define vdo_log_strerror(priority, errnum, ...) \
+	__vdo_log_strerror(priority, errnum, VDO_LOGGING_MODULE_NAME, __VA_ARGS__)
-int __uds_log_strerror(int priority, int errnum, const char *module,
+int __vdo_log_strerror(int priority, int errnum, const char *module,
 		       const char *format, ...) __printf(4, 5);
-int uds_vlog_strerror(int priority, int errnum, const char *module, const char *format,
+int vdo_vlog_strerror(int priority, int errnum, const char *module, const char *format,
 		      va_list args) __printf(4, 0);
 /* Log an error prefixed with the string associated with the errnum. */
-#define uds_log_error_strerror(errnum, ...) \
-	uds_log_strerror(UDS_LOG_ERR, errnum, __VA_ARGS__)
+#define vdo_log_error_strerror(errnum, ...) \
+	vdo_log_strerror(VDO_LOG_ERR, errnum, __VA_ARGS__)
-#define uds_log_debug_strerror(errnum, ...) \
-	uds_log_strerror(UDS_LOG_DEBUG, errnum, __VA_ARGS__)
+#define vdo_log_debug_strerror(errnum, ...) \
+	vdo_log_strerror(VDO_LOG_DEBUG, errnum, __VA_ARGS__)
-#define uds_log_info_strerror(errnum, ...) \
-	uds_log_strerror(UDS_LOG_INFO, errnum, __VA_ARGS__)
+#define vdo_log_info_strerror(errnum, ...) \
+	vdo_log_strerror(VDO_LOG_INFO, errnum, __VA_ARGS__)
-#define uds_log_warning_strerror(errnum, ...) \
-	uds_log_strerror(UDS_LOG_WARNING, errnum, __VA_ARGS__)
+#define vdo_log_warning_strerror(errnum, ...) \
+	vdo_log_strerror(VDO_LOG_WARNING, errnum, __VA_ARGS__)
-#define uds_log_fatal_strerror(errnum, ...) \
-	uds_log_strerror(UDS_LOG_CRIT, errnum, __VA_ARGS__)
+#define vdo_log_fatal_strerror(errnum, ...) \
+	vdo_log_strerror(VDO_LOG_CRIT, errnum, __VA_ARGS__)
-#define uds_log_message(priority, ...) \
-	__uds_log_message(priority, UDS_LOGGING_MODULE_NAME, __VA_ARGS__)
+#define vdo_log_message(priority, ...) \
+	__vdo_log_message(priority, VDO_LOGGING_MODULE_NAME, __VA_ARGS__)
-void __uds_log_message(int priority, const char *module, const char *format, ...)
+void __vdo_log_message(int priority, const char *module, const char *format, ...)
 	__printf(3, 4);
-#define uds_log_debug(...) uds_log_message(UDS_LOG_DEBUG, __VA_ARGS__)
+#define vdo_log_debug(...) vdo_log_message(VDO_LOG_DEBUG, __VA_ARGS__)
-#define uds_log_info(...) uds_log_message(UDS_LOG_INFO, __VA_ARGS__)
+#define vdo_log_info(...) vdo_log_message(VDO_LOG_INFO, __VA_ARGS__)
-#define uds_log_warning(...) uds_log_message(UDS_LOG_WARNING, __VA_ARGS__)
+#define vdo_log_warning(...) vdo_log_message(VDO_LOG_WARNING, __VA_ARGS__)
-#define uds_log_error(...) uds_log_message(UDS_LOG_ERR, __VA_ARGS__)
+#define vdo_log_error(...) vdo_log_message(VDO_LOG_ERR, __VA_ARGS__)
-#define uds_log_fatal(...) uds_log_message(UDS_LOG_CRIT, __VA_ARGS__)
+#define vdo_log_fatal(...) vdo_log_message(VDO_LOG_CRIT, __VA_ARGS__)
-void uds_pause_for_logger(void);
-#endif /* UDS_LOGGER_H */
+void vdo_pause_for_logger(void);
+
+#endif /* VDO_LOGGER_H */
diff --git a/drivers/md/dm-vdo/logical-zone.c b/drivers/md/dm-vdo/logical-zone.c
index 52aa9c48dcf807..336c3d3ec5e78c 100644
--- a/drivers/md/dm-vdo/logical-zone.c
+++ b/drivers/md/dm-vdo/logical-zone.c
@@ -365,8 +365,8 @@ struct physical_zone *vdo_get_next_allocation_zone(struct logical_zone *zone)
 */
 void vdo_dump_logical_zone(const struct logical_zone *zone)
 {
-	uds_log_info("logical_zone %u", zone->zone_number);
-	uds_log_info("  flush_generation=%llu oldest_active_generation=%llu notification_generation=%llu notifying=%s ios_in_flush_generation=%llu",
+	vdo_log_info("logical_zone %u", zone->zone_number);
+	vdo_log_info("  flush_generation=%llu oldest_active_generation=%llu notification_generation=%llu notifying=%s ios_in_flush_generation=%llu",
 		     (unsigned long long) READ_ONCE(zone->flush_generation),
 		     (unsigned long long) READ_ONCE(zone->oldest_active_generation),
 		     (unsigned long long) READ_ONCE(zone->notification_generation),
diff --git a/drivers/md/dm-vdo/memory-alloc.c b/drivers/md/dm-vdo/memory-alloc.c
index d2095516af282f..271fab11cfdec5 100644
--- a/drivers/md/dm-vdo/memory-alloc.c
+++ b/drivers/md/dm-vdo/memory-alloc.c
@@ -150,7 +150,7 @@ static void remove_vmalloc_block(void *ptr)
 	if (block != NULL)
 		vdo_free(block);
 	else
-		uds_log_info("attempting to remove ptr %px not found in vmalloc list", ptr);
+		vdo_log_info("attempting to remove ptr %px not found in vmalloc list", ptr);
 }
 /*
@@ -284,7 +284,7 @@ int vdo_allocate_memory(size_t size, size_t align, const char *what, void *ptr)
 	memalloc_noio_restore(noio_flags);
 	if (unlikely(p == NULL)) {
-		uds_log_error("Could not allocate %zu bytes for %s in %u msecs",
+		vdo_log_error("Could not allocate %zu bytes for %s in %u msecs",
 			      size, what, jiffies_to_msecs(jiffies - start_time));
 		return -ENOMEM;
 	}
@@ -391,7 +391,7 @@ void vdo_memory_exit(void)
 	VDO_ASSERT_LOG_ONLY(memory_stats.vmalloc_bytes == 0,
 			    "vmalloc memory used (%zd bytes in %zd blocks) is returned to the kernel",
 			    memory_stats.vmalloc_bytes, memory_stats.vmalloc_blocks);
-	uds_log_debug("peak usage %zd bytes", memory_stats.peak_bytes);
+	vdo_log_debug("peak usage %zd bytes", memory_stats.peak_bytes);
 }
 void vdo_get_memory_stats(u64 *bytes_used, u64 *peak_bytes_used)
@@ -426,13 +426,13 @@ void vdo_report_memory_usage(void)
 	peak_usage = memory_stats.peak_bytes;
 	spin_unlock_irqrestore(&memory_stats.lock, flags);
 	total_bytes = kmalloc_bytes + vmalloc_bytes;
-	uds_log_info("current module memory tracking (actual allocation sizes, not requested):");
-	uds_log_info("  %llu bytes in %llu kmalloc blocks",
+	vdo_log_info("current module memory tracking (actual allocation sizes, not requested):");
+	vdo_log_info("  %llu bytes in %llu kmalloc blocks",
 		     (unsigned long long) kmalloc_bytes,
 		     (unsigned long long) kmalloc_blocks);
-	uds_log_info("  %llu bytes in %llu vmalloc blocks",
+	vdo_log_info("  %llu bytes in %llu vmalloc blocks",
 		     (unsigned long long) vmalloc_bytes,
 		     (unsigned long long) vmalloc_blocks);
-	uds_log_info("  total %llu bytes, peak usage %llu bytes",
+	vdo_log_info("  total %llu bytes, peak usage %llu bytes",
 		     (unsigned long long) total_bytes, (unsigned long long) peak_usage);
 }
diff --git a/drivers/md/dm-vdo/packer.c b/drivers/md/dm-vdo/packer.c
index e849b4ad691f89..f661ddfe463a59 100644
--- a/drivers/md/dm-vdo/packer.c
+++ b/drivers/md/dm-vdo/packer.c
@@ -752,7 +752,7 @@ static void dump_packer_bin(const struct packer_bin *bin, bool canceled)
 		/* Don't dump empty bins. */
 		return;
-	uds_log_info("    %sBin slots_used=%u free_space=%zu",
+	vdo_log_info("    %sBin slots_used=%u free_space=%zu",
 		     (canceled ? "Canceled" : ""), bin->slots_used,
 		     bin->free_space);
 	/*
@@ -771,8 +771,8 @@ void vdo_dump_packer(const struct packer *packer)
 {
 	struct packer_bin *bin;
-	uds_log_info("packer");
-	uds_log_info("  flushGeneration=%llu state %s packer_bin_count=%llu",
+	vdo_log_info("packer");
+	vdo_log_info("  flushGeneration=%llu state %s packer_bin_count=%llu",
 		     (unsigned long long) packer->flush_generation,
 		     vdo_get_admin_state_code(&packer->state)->name,
 		     (unsigned long long) packer->size);
diff --git a/drivers/md/dm-vdo/permassert.c b/drivers/md/dm-vdo/permassert.c
index 3fa752ba006109..bf9eccea1cb339 100644
--- a/drivers/md/dm-vdo/permassert.c
+++ b/drivers/md/dm-vdo/permassert.c
@@ -8,17 +8,17 @@
 #include "errors.h"
 #include "logger.h"
-int uds_assertion_failed(const char *expression_string, const char *file_name,
+int vdo_assertion_failed(const char *expression_string, const char *file_name,
 			 int line_number, const char *format, ...)
 {
 	va_list args;
 	va_start(args, format);
-	uds_log_embedded_message(UDS_LOG_ERR, UDS_LOGGING_MODULE_NAME, "assertion \"",
+	vdo_log_embedded_message(VDO_LOG_ERR, VDO_LOGGING_MODULE_NAME, "assertion \"",
 				 format, args, "\" (%s) failed at %s:%d",
 				 expression_string, file_name, line_number);
-	uds_log_backtrace(UDS_LOG_ERR);
+	vdo_log_backtrace(VDO_LOG_ERR);
 	va_end(args);
diff --git a/drivers/md/dm-vdo/permassert.h b/drivers/md/dm-vdo/permassert.h
index 21e7e2dfd24c4a..26e2d19faecbd9 100644
--- a/drivers/md/dm-vdo/permassert.h
+++ b/drivers/md/dm-vdo/permassert.h
@@ -39,10 +39,10 @@ static inline int __must_check vdo_must_use(int value)
 #define __VDO_ASSERT(expr, ...) \
 	(likely(expr) ? VDO_SUCCESS \
-		      : uds_assertion_failed(STRINGIFY(expr), __FILE__, __LINE__, __VA_ARGS__))
+		      : vdo_assertion_failed(STRINGIFY(expr), __FILE__, __LINE__, __VA_ARGS__))
 /* Log an assertion failure message. */
-int uds_assertion_failed(const char *expression_string, const char *file_name,
+int vdo_assertion_failed(const char *expression_string, const char *file_name,
 			 int line_number, const char *format, ...)
 	__printf(4, 5);
diff --git a/drivers/md/dm-vdo/physical-zone.c b/drivers/md/dm-vdo/physical-zone.c
index 389e5ed2a0a1a2..b80c4d3a9bdef9 100644
--- a/drivers/md/dm-vdo/physical-zone.c
+++ b/drivers/md/dm-vdo/physical-zone.c
@@ -165,7 +165,7 @@ static void release_pbn_lock_provisional_reference(struct pbn_lock *lock,
 		result = vdo_release_block_reference(allocator, locked_pbn);
 		if (result != VDO_SUCCESS) {
-			uds_log_error_strerror(result,
+			vdo_log_error_strerror(result,
 					       "Failed to release reference to %s physical block %llu",
 					       lock->implementation->release_reason,
 					       (unsigned long long) locked_pbn);
@@ -296,7 +296,7 @@ static int __must_check borrow_pbn_lock_from_pool(struct pbn_lock_pool *pool,
 	idle_pbn_lock *idle;
 	if (pool->borrowed >= pool->capacity)
-		return uds_log_error_strerror(VDO_LOCK_ERROR,
+		return vdo_log_error_strerror(VDO_LOCK_ERROR,
 					      "no free PBN locks left to borrow");
 	pool->borrowed += 1;
@@ -501,7 +501,7 @@ static int allocate_and_lock_block(struct allocation *allocation)
 	if (lock->holder_count > 0) {
 		/* This block is already locked, which should be impossible. */
-		return uds_log_error_strerror(VDO_LOCK_ERROR,
+		return vdo_log_error_strerror(VDO_LOCK_ERROR,
 					      "Newly allocated block %llu was spuriously locked (holder_count=%u)",
 					      (unsigned long long) allocation->pbn,
 					      lock->holder_count);
diff --git a/drivers/md/dm-vdo/recovery-journal.c b/drivers/md/dm-vdo/recovery-journal.c
index 271d172360773d..cda01864a5fab3 100644
--- a/drivers/md/dm-vdo/recovery-journal.c
+++ b/drivers/md/dm-vdo/recovery-journal.c
@@ -806,7 +806,7 @@ void vdo_free_recovery_journal(struct recovery_journal *journal)
 			    "journal being freed has no active tail blocks");
 	} else if (!vdo_is_state_saved(&journal->state) &&
 		   !list_empty(&journal->active_tail_blocks)) {
-		uds_log_warning("journal being freed has uncommitted entries");
+		vdo_log_warning("journal being freed has uncommitted entries");
 	}
 	for (i = 0; i < RECOVERY_JOURNAL_RESERVED_BLOCKS; i++) {
@@ -1307,7 +1307,7 @@ static void handle_write_error(struct vdo_completion *completion)
 	struct recovery_journal *journal = block->journal;
 	vio_record_metadata_io_error(as_vio(completion));
-	uds_log_error_strerror(completion->result,
+	vdo_log_error_strerror(completion->result,
 			       "cannot write recovery journal block %llu",
 			       (unsigned long long) block->sequence_number);
 	enter_journal_read_only_mode(journal, completion->result);
@@ -1721,7 +1721,7 @@ vdo_get_recovery_journal_statistics(const struct recovery_journal *journal)
 */
 static void dump_recovery_block(const struct recovery_journal_block *block)
 {
-	uds_log_info("    sequence number %llu; entries %u; %s; %zu entry waiters; %zu commit waiters",
+	vdo_log_info("    sequence number %llu; entries %u; %s; %zu entry waiters; %zu commit waiters",
 		     (unsigned long long) block->sequence_number, block->entry_count,
 		     (block->committing ? "committing" : "waiting"),
 		     vdo_waitq_num_waiters(&block->entry_waiters),
@@ -1738,8 +1738,8 @@ void vdo_dump_recovery_journal_statistics(const struct recovery_journal *journal
 	const struct recovery_journal_block *block;
 	struct recovery_journal_statistics stats = vdo_get_recovery_journal_statistics(journal);
-	uds_log_info("Recovery Journal");
-	uds_log_info("  block_map_head=%llu slab_journal_head=%llu last_write_acknowledged=%llu tail=%llu block_map_reap_head=%llu slab_journal_reap_head=%llu disk_full=%llu slab_journal_commits_requested=%llu entry_waiters=%zu",
+	vdo_log_info("Recovery Journal");
+	vdo_log_info("  block_map_head=%llu slab_journal_head=%llu last_write_acknowledged=%llu tail=%llu block_map_reap_head=%llu slab_journal_reap_head=%llu disk_full=%llu slab_journal_commits_requested=%llu entry_waiters=%zu",
 		     (unsigned long long) journal->block_map_head,
 		     (unsigned long long) journal->slab_journal_head,
 		     (unsigned long long) journal->last_write_acknowledged,
@@ -1749,16 +1749,16 @@ void vdo_dump_recovery_journal_statistics(const struct recovery_journal *journal
 		     (unsigned long long) stats.disk_full,
 		     (unsigned long long) stats.slab_journal_commits_requested,
 		     vdo_waitq_num_waiters(&journal->entry_waiters));
-	uds_log_info("  entries: started=%llu written=%llu committed=%llu",
+	vdo_log_info("  entries: started=%llu written=%llu committed=%llu",
 		     (unsigned long long) stats.entries.started,
 		     (unsigned long long) stats.entries.written,
 		     (unsigned long long) stats.entries.committed);
-	uds_log_info("  blocks: started=%llu written=%llu committed=%llu",
+	vdo_log_info("  blocks: started=%llu written=%llu committed=%llu",
 		     (unsigned long long) stats.blocks.started,
 		     (unsigned long long) stats.blocks.written,
 		     (unsigned long long) stats.blocks.committed);
-	uds_log_info("  active blocks:");
+	vdo_log_info("  active blocks:");
 	list_for_each_entry(block, &journal->active_tail_blocks, list_node)
 		dump_recovery_block(block);
 }
diff --git a/drivers/md/dm-vdo/repair.c b/drivers/md/dm-vdo/repair.c
index ce6f78d281f33f..f1e60f3c2ae70f 100644
--- a/drivers/md/dm-vdo/repair.c
+++ b/drivers/md/dm-vdo/repair.c
@@ -265,13 +265,13 @@ static void finish_repair(struct vdo_completion *completion)
 	free_repair_completion(vdo_forget(repair));
 	if (vdo_state_requires_read_only_rebuild(vdo->load_state)) {
-		uds_log_info("Read-only rebuild complete");
+		vdo_log_info("Read-only rebuild complete");
 		vdo_launch_completion(parent);
 		return;
 	}
 	/* FIXME: shouldn't this say either "recovery" or "repair"? */
-	uds_log_info("Rebuild complete");
+	vdo_log_info("Rebuild complete");
 	/*
 	 * Now that we've freed the repair completion and its vast array of journal entries, we
@@ -291,9 +291,9 @@ static void abort_repair(struct vdo_completion *completion)
 	struct repair_completion *repair = as_repair_completion(completion);
 	if (vdo_state_requires_read_only_rebuild(completion->vdo->load_state))
-		uds_log_info("Read-only rebuild aborted");
+		vdo_log_info("Read-only rebuild aborted");
 	else
-		uds_log_warning("Recovery aborted");
+		vdo_log_warning("Recovery aborted");
 	free_repair_completion(vdo_forget(repair));
 	vdo_continue_completion(parent, result);
@@ -329,10 +329,10 @@ static void drain_slab_depot(struct vdo_completion *completion)
 	prepare_repair_completion(repair, finish_repair, VDO_ZONE_TYPE_ADMIN);
 	if (vdo_state_requires_read_only_rebuild(vdo->load_state)) {
-		uds_log_info("Saving rebuilt state");
+		vdo_log_info("Saving rebuilt state");
 		operation = VDO_ADMIN_STATE_REBUILDING;
 	} else {
-		uds_log_info("Replayed %zu journal entries into slab journals",
+		vdo_log_info("Replayed %zu journal entries into slab journals",
 			     repair->entries_added_to_slab_journals);
 		operation = VDO_ADMIN_STATE_RECOVERING;
 	}
@@ -350,7 +350,7 @@ static void flush_block_map_updates(struct vdo_completion *completion)
 {
 	vdo_assert_on_admin_thread(completion->vdo, __func__);
-	uds_log_info("Flushing block map changes");
+	vdo_log_info("Flushing block map changes");
 	prepare_repair_completion(as_repair_completion(completion), drain_slab_depot,
 				  VDO_ZONE_TYPE_ADMIN);
 	vdo_drain_block_map(completion->vdo->block_map, VDO_ADMIN_STATE_RECOVERING,
@@ -449,7 +449,7 @@ static bool process_slot(struct block_map_page *page, struct vdo_completion *com
 	if (result == VDO_SUCCESS)
 		return true;
-	uds_log_error_strerror(result,
+	vdo_log_error_strerror(result,
 			       "Could not adjust reference count for PBN %llu, slot %u mapped to PBN %llu",
 			       (unsigned long long) vdo_get_block_map_page_pbn(page),
 			       slot, (unsigned long long) mapping.pbn);
@@ -615,7 +615,7 @@ static int process_entry(physical_block_number_t pbn, struct vdo_completion *com
 	int result;
 	if ((pbn == VDO_ZERO_BLOCK) || !vdo_is_physical_data_block(depot, pbn)) {
-		return uds_log_error_strerror(VDO_BAD_CONFIGURATION,
+		return vdo_log_error_strerror(VDO_BAD_CONFIGURATION,
 					      "PBN %llu out of range",
 					      (unsigned long long) pbn);
 	}
@@ -623,7 +623,7 @@ static int process_entry(physical_block_number_t pbn, struct vdo_completion *com
 	result = vdo_adjust_reference_count_for_rebuild(depot, pbn,
 							VDO_JOURNAL_BLOCK_MAP_REMAPPING);
 	if (result != VDO_SUCCESS) {
-		return uds_log_error_strerror(result,
+		return vdo_log_error_strerror(result,
 					      "Could not adjust reference count for block map tree PBN %llu",
 					      (unsigned long long) pbn);
 	}
@@ -758,7 +758,7 @@ static int validate_recovery_journal_entry(const struct vdo *vdo,
 	    !vdo_is_valid_location(&entry->unmapping) ||
 	    !vdo_is_physical_data_block(vdo->depot, entry->mapping.pbn) ||
 	    !vdo_is_physical_data_block(vdo->depot, entry->unmapping.pbn)) {
-		return uds_log_error_strerror(VDO_CORRUPT_JOURNAL,
+		return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
 					      "Invalid entry: %s (%llu, %u) from %llu to %llu is not within bounds",
 					      vdo_get_journal_operation_name(entry->operation),
 					      (unsigned long long) entry->slot.pbn,
@@ -772,7 +772,7 @@ static int validate_recovery_journal_entry(const struct vdo *vdo,
 	     (entry->mapping.pbn == VDO_ZERO_BLOCK) ||
 	     (entry->unmapping.state != VDO_MAPPING_STATE_UNMAPPED) ||
 	     (entry->unmapping.pbn != VDO_ZERO_BLOCK))) {
-		return uds_log_error_strerror(VDO_CORRUPT_JOURNAL,
+		return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
 					      "Invalid entry: %s (%llu, %u) from %llu to %llu is not a valid tree mapping",
 					      vdo_get_journal_operation_name(entry->operation),
 					      (unsigned long long) entry->slot.pbn,
@@ -875,7 +875,7 @@ void vdo_replay_into_slab_journals(struct block_allocator *allocator, void *cont
 		.entry_count = 0,
 	};
-	uds_log_info("Replaying entries into slab journals for zone %u",
+	vdo_log_info("Replaying entries into slab journals for zone %u",
 		     allocator->zone_number);
 	completion->parent = repair;
 	add_slab_journal_entries(completion);
@@ -907,7 +907,7 @@ static void flush_block_map(struct vdo_completion *completion)
 	vdo_assert_on_admin_thread(completion->vdo, __func__);
-	uds_log_info("Flushing block map changes");
+	vdo_log_info("Flushing block map changes");
 	prepare_repair_completion(repair, load_slab_depot, VDO_ZONE_TYPE_ADMIN);
 	operation = (vdo_state_requires_read_only_rebuild(completion->vdo->load_state) ?
 		     VDO_ADMIN_STATE_REBUILDING :
@@ -1107,7 +1107,7 @@ static void recover_block_map(struct vdo_completion *completion)
 		vdo_state_requires_read_only_rebuild(vdo->load_state);
 	if (repair->block_map_entry_count == 0) {
-		uds_log_info("Replaying 0 recovery entries into block map");
+		vdo_log_info("Replaying 0 recovery entries into block map");
 		vdo_free(vdo_forget(repair->journal_data));
 		launch_repair_completion(repair, load_slab_depot, VDO_ZONE_TYPE_ADMIN);
 		return;
@@ -1124,7 +1124,7 @@ static void recover_block_map(struct vdo_completion *completion)
 	};
 	min_heapify_all(&repair->replay_heap, &repair_min_heap);
-	uds_log_info("Replaying %zu recovery entries into block map",
+	vdo_log_info("Replaying %zu recovery entries into block map",
 		     repair->block_map_entry_count);
 	repair->current_entry = &repair->entries[repair->block_map_entry_count - 1];
@@ -1437,7 +1437,7 @@ static int validate_heads(struct repair_completion *repair)
 		return VDO_SUCCESS;
-	return uds_log_error_strerror(VDO_CORRUPT_JOURNAL,
+	return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
 				      "Journal tail too early. block map head: %llu, slab journal head: %llu, tail: %llu",
 				      (unsigned long long) repair->block_map_head,
 				      (unsigned long long) repair->slab_journal_head,
@@ -1571,7 +1571,7 @@ static int parse_journal_for_recovery(struct repair_completion *repair)
 		header = get_recovery_journal_block_header(journal, repair->journal_data, i);
 		if (header.metadata_type == VDO_METADATA_RECOVERY_JOURNAL) {
 			/* This is an old format block, so we need to upgrade */
-			uds_log_error_strerror(VDO_UNSUPPORTED_VERSION,
+			vdo_log_error_strerror(VDO_UNSUPPORTED_VERSION,
 					       "Recovery journal is in the old format, a read-only rebuild is required.");
 			vdo_enter_read_only_mode(repair->completion.vdo,
 						 VDO_UNSUPPORTED_VERSION);
@@ -1628,7 +1628,7 @@ static int parse_journal_for_recovery(struct repair_completion *repair)
 	if (result != VDO_SUCCESS)
 		return result;
-	uds_log_info("Highest-numbered recovery journal block has sequence number %llu, and the highest-numbered usable block is %llu",
+	vdo_log_info("Highest-numbered recovery journal block has sequence number %llu, and the highest-numbered usable block is %llu",
 		     (unsigned long long) repair->highest_tail,
 		     (unsigned long long) repair->tail);
@@ -1656,7 +1656,7 @@ static void finish_journal_load(struct vdo_completion *completion)
 	if (++repair->vios_complete != repair->vio_count)
 		return;
-	uds_log_info("Finished reading recovery journal");
+	vdo_log_info("Finished reading recovery journal");
 	uninitialize_vios(repair);
 	prepare_repair_completion(repair, recover_block_map, VDO_ZONE_TYPE_LOGICAL);
 	vdo_continue_completion(&repair->completion, parse_journal(repair));
@@ -1701,12 +1701,12 @@ void vdo_repair(struct vdo_completion *parent)
 	vdo_assert_on_admin_thread(vdo, __func__);
 	if (vdo->load_state == VDO_FORCE_REBUILD) {
-		uds_log_warning("Rebuilding reference counts to clear read-only mode");
+		vdo_log_warning("Rebuilding reference counts to clear read-only mode");
 		vdo->states.vdo.read_only_recoveries++;
 	} else if (vdo->load_state == VDO_REBUILD_FOR_UPGRADE) {
-		uds_log_warning("Rebuilding reference counts for upgrade");
+		vdo_log_warning("Rebuilding reference counts for upgrade");
 	} else {
-		uds_log_warning("Device was dirty, rebuilding reference counts");
+		vdo_log_warning("Device was dirty, rebuilding reference counts");
 	}
 	result = vdo_allocate_extended(struct repair_completion, page_count,
diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c
index 2c273b82336379..9749bd9fea4eae 100644
--- a/drivers/md/dm-vdo/slab-depot.c
+++ b/drivers/md/dm-vdo/slab-depot.c
@@ -564,7 +564,7 @@ static void release_journal_locks(struct vdo_waiter *waiter, void *context)
 			 * Don't bother logging what might be lots of errors if we are already in
 			 * read-only mode.
*/ - uds_log_error_strerror(result, "failed slab summary update %llu", + vdo_log_error_strerror(result, "failed slab summary update %llu", (unsigned long long) journal->summarized); } @@ -698,7 +698,7 @@ static void complete_write(struct vdo_completion *completion) if (result != VDO_SUCCESS) { vio_record_metadata_io_error(as_vio(completion)); - uds_log_error_strerror(result, "cannot write slab journal block %llu", + vdo_log_error_strerror(result, "cannot write slab journal block %llu", (unsigned long long) committed); vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result); check_if_slab_drained(journal->slab); @@ -1015,7 +1015,7 @@ static void finish_summary_update(struct vdo_waiter *waiter, void *context) slab->active_count--; if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) { - uds_log_error_strerror(result, "failed to update slab summary"); + vdo_log_error_strerror(result, "failed to update slab summary"); vdo_enter_read_only_mode(slab->allocator->depot->vdo, result); } @@ -1434,7 +1434,7 @@ static int increment_for_data(struct vdo_slab *slab, struct reference_block *blo default: /* Single or shared */ if (*counter_ptr >= MAXIMUM_REFERENCE_COUNT) { - return uds_log_error_strerror(VDO_REF_COUNT_INVALID, + return vdo_log_error_strerror(VDO_REF_COUNT_INVALID, "Incrementing a block already having 254 references (slab %u, offset %u)", slab->slab_number, block_number); } @@ -1467,7 +1467,7 @@ static int decrement_for_data(struct vdo_slab *slab, struct reference_block *blo { switch (old_status) { case RS_FREE: - return uds_log_error_strerror(VDO_REF_COUNT_INVALID, + return vdo_log_error_strerror(VDO_REF_COUNT_INVALID, "Decrementing free block at offset %u in slab %u", block_number, slab->slab_number); @@ -1531,7 +1531,7 @@ static int increment_for_block_map(struct vdo_slab *slab, struct reference_block switch (old_status) { case RS_FREE: if (normal_operation) { - return uds_log_error_strerror(VDO_REF_COUNT_INVALID, + return vdo_log_error_strerror(VDO_REF_COUNT_INVALID, "Incrementing unallocated block map block (slab %u, offset %u)", slab->slab_number, block_number); } @@ -1546,7 +1546,7 @@ static int increment_for_block_map(struct vdo_slab *slab, struct reference_block case RS_PROVISIONAL: if (!normal_operation) - return uds_log_error_strerror(VDO_REF_COUNT_INVALID, + return vdo_log_error_strerror(VDO_REF_COUNT_INVALID, "Block map block had provisional reference during replay (slab %u, offset %u)", slab->slab_number, block_number); @@ -1556,7 +1556,7 @@ static int increment_for_block_map(struct vdo_slab *slab, struct reference_block return VDO_SUCCESS; default: - return uds_log_error_strerror(VDO_REF_COUNT_INVALID, + return vdo_log_error_strerror(VDO_REF_COUNT_INVALID, "Incrementing a block map block which is already referenced %u times (slab %u, offset %u)", *counter_ptr, slab->slab_number, block_number); @@ -2213,7 +2213,7 @@ static void unpack_reference_block(struct packed_reference_block *packed, block->commit_points[i])) { size_t block_index = block - block->slab->reference_blocks; - uds_log_warning("Torn write detected in sector %u of reference block %zu of slab %u", + vdo_log_warning("Torn write detected in sector %u of reference block %zu of slab %u", i, block_index, block->slab->slab_number); } } @@ -2692,9 +2692,9 @@ static void finish_scrubbing(struct slab_scrubber *scrubber, int result) * thread does not yet know about. 
*/ if (prior_state == VDO_DIRTY) - uds_log_info("VDO commencing normal operation"); + vdo_log_info("VDO commencing normal operation"); else if (prior_state == VDO_RECOVERING) - uds_log_info("Exiting recovery mode"); + vdo_log_info("Exiting recovery mode"); } /* @@ -2784,7 +2784,7 @@ static int apply_block_entries(struct packed_slab_journal_block *block, if (entry.sbn > max_sbn) { /* This entry is out of bounds. */ - return uds_log_error_strerror(VDO_CORRUPT_JOURNAL, + return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL, "vdo_slab journal entry (%llu, %u) had invalid offset %u in slab (size %u blocks)", (unsigned long long) block_number, entry_point.entry_count, @@ -2793,7 +2793,7 @@ static int apply_block_entries(struct packed_slab_journal_block *block, result = replay_reference_count_change(slab, &entry_point, entry); if (result != VDO_SUCCESS) { - uds_log_error_strerror(result, + vdo_log_error_strerror(result, "vdo_slab journal entry (%llu, %u) (%s of offset %u) could not be applied in slab %u", (unsigned long long) block_number, entry_point.entry_count, @@ -2851,7 +2851,7 @@ static void apply_journal_entries(struct vdo_completion *completion) (header.has_block_map_increments && (header.entry_count > journal->full_entries_per_block))) { /* The block is not what we expect it to be. */ - uds_log_error("vdo_slab journal block for slab %u was invalid", + vdo_log_error("vdo_slab journal block for slab %u was invalid", slab->slab_number); abort_scrubbing(scrubber, VDO_CORRUPT_JOURNAL); return; @@ -3574,22 +3574,22 @@ void vdo_dump_block_allocator(const struct block_allocator *allocator) struct slab_iterator iterator = get_slab_iterator(allocator); const struct slab_scrubber *scrubber = &allocator->scrubber; - uds_log_info("block_allocator zone %u", allocator->zone_number); + vdo_log_info("block_allocator zone %u", allocator->zone_number); while (iterator.next != NULL) { struct vdo_slab *slab = next_slab(&iterator); struct slab_journal *journal = &slab->journal; if (slab->reference_blocks != NULL) { /* Terse because there are a lot of slabs to dump and syslog is lossy. */ - uds_log_info("slab %u: P%u, %llu free", slab->slab_number, + vdo_log_info("slab %u: P%u, %llu free", slab->slab_number, slab->priority, (unsigned long long) slab->free_blocks); } else { - uds_log_info("slab %u: status %s", slab->slab_number, + vdo_log_info("slab %u: status %s", slab->slab_number, status_to_string(slab->status)); } - uds_log_info(" slab journal: entry_waiters=%zu waiting_to_commit=%s updating_slab_summary=%s head=%llu unreapable=%llu tail=%llu next_commit=%llu summarized=%llu last_summarized=%llu recovery_lock=%llu dirty=%s", + vdo_log_info(" slab journal: entry_waiters=%zu waiting_to_commit=%s updating_slab_summary=%s head=%llu unreapable=%llu tail=%llu next_commit=%llu summarized=%llu last_summarized=%llu recovery_lock=%llu dirty=%s", vdo_waitq_num_waiters(&journal->entry_waiters), uds_bool_to_string(journal->waiting_to_commit), uds_bool_to_string(journal->updating_slab_summary), @@ -3608,7 +3608,7 @@ void vdo_dump_block_allocator(const struct block_allocator *allocator) if (slab->counters != NULL) { /* Terse because there are a lot of slabs to dump and syslog is lossy. 
*/ - uds_log_info(" slab: free=%u/%u blocks=%u dirty=%zu active=%zu journal@(%llu,%u)", + vdo_log_info(" slab: free=%u/%u blocks=%u dirty=%zu active=%zu journal@(%llu,%u)", slab->free_blocks, slab->block_count, slab->reference_block_count, vdo_waitq_num_waiters(&slab->dirty_blocks), @@ -3616,7 +3616,7 @@ void vdo_dump_block_allocator(const struct block_allocator *allocator) (unsigned long long) slab->slab_journal_point.sequence_number, slab->slab_journal_point.entry_count); } else { - uds_log_info(" no counters"); + vdo_log_info(" no counters"); } /* @@ -3625,11 +3625,11 @@ void vdo_dump_block_allocator(const struct block_allocator *allocator) */ if (pause_counter++ == 31) { pause_counter = 0; - uds_pause_for_logger(); + vdo_pause_for_logger(); } } - uds_log_info("slab_scrubber slab_count %u waiters %zu %s%s", + vdo_log_info("slab_scrubber slab_count %u waiters %zu %s%s", READ_ONCE(scrubber->slab_count), vdo_waitq_num_waiters(&scrubber->waiters), vdo_get_admin_state_code(&scrubber->admin_state)->name, @@ -4103,7 +4103,7 @@ static int allocate_components(struct slab_depot *depot, slab_count = vdo_compute_slab_count(depot->first_block, depot->last_block, depot->slab_size_shift); if (thread_config->physical_zone_count > slab_count) { - return uds_log_error_strerror(VDO_BAD_CONFIGURATION, + return vdo_log_error_strerror(VDO_BAD_CONFIGURATION, "%u physical zones exceeds slab count %u", thread_config->physical_zone_count, slab_count); @@ -4161,7 +4161,7 @@ int vdo_decode_slab_depot(struct slab_depot_state_2_0 state, struct vdo *vdo, block_count_t slab_size = state.slab_config.slab_blocks; if (!is_power_of_2(slab_size)) { - return uds_log_error_strerror(UDS_INVALID_ARGUMENT, + return vdo_log_error_strerror(UDS_INVALID_ARGUMENT, "slab size must be a power of two"); } slab_size_shift = ilog2(slab_size); @@ -4670,7 +4670,7 @@ int vdo_prepare_to_grow_slab_depot(struct slab_depot *depot, new_state.last_block, depot->slab_size_shift); if (new_slab_count <= depot->slab_count) - return uds_log_error_strerror(VDO_INCREMENT_TOO_SMALL, + return vdo_log_error_strerror(VDO_INCREMENT_TOO_SMALL, "Depot can only grow"); if (new_slab_count == depot->new_slab_count) { /* Check it out, we've already got all the new slabs allocated! 
*/ @@ -5086,8 +5086,8 @@ void vdo_get_slab_depot_statistics(const struct slab_depot *depot, */ void vdo_dump_slab_depot(const struct slab_depot *depot) { - uds_log_info("vdo slab depot"); - uds_log_info(" zone_count=%u old_zone_count=%u slabCount=%u active_release_request=%llu new_release_request=%llu", + vdo_log_info("vdo slab depot"); + vdo_log_info(" zone_count=%u old_zone_count=%u slabCount=%u active_release_request=%llu new_release_request=%llu", (unsigned int) depot->zone_count, (unsigned int) depot->old_zone_count, READ_ONCE(depot->slab_count), (unsigned long long) depot->active_release_request, diff --git a/drivers/md/dm-vdo/status-codes.c b/drivers/md/dm-vdo/status-codes.c index efba1ead0acaef..25f76e97b792c9 100644 --- a/drivers/md/dm-vdo/status-codes.c +++ b/drivers/md/dm-vdo/status-codes.c @@ -98,8 +98,8 @@ int vdo_register_status_codes(void) */ int vdo_status_to_errno(int error) { - char error_name[UDS_MAX_ERROR_NAME_SIZE]; - char error_message[UDS_MAX_ERROR_MESSAGE_SIZE]; + char error_name[VDO_MAX_ERROR_NAME_SIZE]; + char error_message[VDO_MAX_ERROR_MESSAGE_SIZE]; /* 0 is success, negative a system error code */ if (likely(error <= 0)) @@ -114,7 +114,7 @@ int vdo_status_to_errno(int error) case VDO_READ_ONLY: return -EIO; default: - uds_log_info("%s: mapping internal status code %d (%s: %s) to EIO", + vdo_log_info("%s: mapping internal status code %d (%s: %s) to EIO", __func__, error, uds_string_error_name(error, error_name, sizeof(error_name)), uds_string_error(error, error_message, sizeof(error_message))); diff --git a/drivers/md/dm-vdo/thread-utils.c b/drivers/md/dm-vdo/thread-utils.c index a6cea9544d9a1e..5e9204c0df7210 100644 --- a/drivers/md/dm-vdo/thread-utils.c +++ b/drivers/md/dm-vdo/thread-utils.c @@ -86,7 +86,7 @@ int vdo_create_thread(void (*thread_function)(void *), void *thread_data, result = vdo_allocate(1, struct thread, __func__, &thread); if (result != VDO_SUCCESS) { - uds_log_warning("Error allocating memory for %s", name); + vdo_log_warning("Error allocating memory for %s", name); return result; } diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c index 5fbdeccf3fc606..23a4f1d1b6dcfc 100644 --- a/drivers/md/dm-vdo/vdo.c +++ b/drivers/md/dm-vdo/vdo.c @@ -307,7 +307,7 @@ static int __must_check read_geometry_block(struct vdo *vdo) result = blk_status_to_errno(vio->bio->bi_status); free_vio(vdo_forget(vio)); if (result != 0) { - uds_log_error_strerror(result, "synchronous read failed"); + vdo_log_error_strerror(result, "synchronous read failed"); vdo_free(block); return -EIO; } @@ -496,7 +496,7 @@ static int initialize_vdo(struct vdo *vdo, struct device_config *config, return result; } - uds_log_info("zones: %d logical, %d physical, %d hash; total threads: %d", + vdo_log_info("zones: %d logical, %d physical, %d hash; total threads: %d", config->thread_counts.logical_zones, config->thread_counts.physical_zones, config->thread_counts.hash_zones, vdo->thread_config.thread_count); @@ -843,7 +843,7 @@ int vdo_synchronous_flush(struct vdo *vdo) atomic64_inc(&vdo->stats.flush_out); if (result != 0) { - uds_log_error_strerror(result, "synchronous flush failed"); + vdo_log_error_strerror(result, "synchronous flush failed"); result = -EIO; } @@ -930,7 +930,7 @@ static void handle_save_error(struct vdo_completion *completion) container_of(as_vio(completion), struct vdo_super_block, vio); vio_record_metadata_io_error(&super_block->vio); - uds_log_error_strerror(completion->result, "super block save failed"); + vdo_log_error_strerror(completion->result, "super 
block save failed"); /* * Mark the super block as unwritable so that we won't attempt to write it again. This * avoids the case where a growth attempt fails writing the super block with the new size, @@ -1156,7 +1156,7 @@ static void make_thread_read_only(struct vdo_completion *completion) thread->is_read_only = true; listener = thread->listeners; if (thread_id == 0) - uds_log_error_strerror(READ_ONCE(notifier->read_only_error), + vdo_log_error_strerror(READ_ONCE(notifier->read_only_error), "Unrecoverable error, entering read-only mode"); } else { /* We've just finished notifying a listener */ @@ -1331,7 +1331,7 @@ void vdo_enter_recovery_mode(struct vdo *vdo) if (vdo_in_read_only_mode(vdo)) return; - uds_log_info("Entering recovery mode"); + vdo_log_info("Entering recovery mode"); vdo_set_state(vdo, VDO_RECOVERING); } @@ -1384,7 +1384,7 @@ static void set_compression_callback(struct vdo_completion *completion) } } - uds_log_info("compression is %s", (*enable ? "enabled" : "disabled")); + vdo_log_info("compression is %s", (*enable ? "enabled" : "disabled")); *enable = was_enabled; complete_synchronous_action(completion); } diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index edcb010ab125c6..b045c325549b07 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -132,7 +132,7 @@ int create_multi_block_metadata_vio(struct vdo *vdo, enum vio_type vio_type, */ result = vdo_allocate(1, struct vio, __func__, &vio); if (result != VDO_SUCCESS) { - uds_log_error("metadata vio allocation failure %d", result); + vdo_log_error("metadata vio allocation failure %d", result); return result; } @@ -226,7 +226,7 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, bytes_added = bio_add_page(bio, page, bytes, offset); if (bytes_added != bytes) { - return uds_log_error_strerror(VDO_BIO_CREATION_FAILED, + return vdo_log_error_strerror(VDO_BIO_CREATION_FAILED, "Could only add %i bytes to bio", bytes_added); } @@ -259,18 +259,18 @@ void update_vio_error_stats(struct vio *vio, const char *format, ...) 
case VDO_NO_SPACE: atomic64_inc(&vdo->stats.no_space_error_count); - priority = UDS_LOG_DEBUG; + priority = VDO_LOG_DEBUG; break; default: - priority = UDS_LOG_ERR; + priority = VDO_LOG_ERR; } if (!__ratelimit(&error_limiter)) return; va_start(args, format); - uds_vlog_strerror(priority, vio->completion.result, UDS_LOGGING_MODULE_NAME, + vdo_vlog_strerror(priority, vio->completion.result, VDO_LOGGING_MODULE_NAME, format, args); va_end(args); } From 5a75256887719526832934e5918414fe678c60f0 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 14 Feb 2024 09:34:46 -0600 Subject: [PATCH 0918/1406] dm vdo string-utils: change from uds_ to vdo_ namespace Signed-off-by: Mike Snitzer --- drivers/md/dm-vdo/dm-vdo-target.c | 2 +- drivers/md/dm-vdo/errors.c | 16 ++++++++-------- drivers/md/dm-vdo/logical-zone.c | 2 +- drivers/md/dm-vdo/slab-depot.c | 6 +++--- drivers/md/dm-vdo/string-utils.c | 2 +- drivers/md/dm-vdo/string-utils.h | 10 +++++----- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c index 240bfa0aa3da7e..89d00be9f07598 100644 --- a/drivers/md/dm-vdo/dm-vdo-target.c +++ b/drivers/md/dm-vdo/dm-vdo-target.c @@ -344,7 +344,7 @@ static int join_strings(char **substring_array, size_t array_length, char separa current_position = &output[0]; for (i = 0; (i < array_length) && (substring_array[i] != NULL); i++) { - current_position = uds_append_to_buffer(current_position, + current_position = vdo_append_to_buffer(current_position, output + string_length, "%s", substring_array[i]); *current_position = separator; diff --git a/drivers/md/dm-vdo/errors.c b/drivers/md/dm-vdo/errors.c index 62c76b9852d84f..1f685a9955281d 100644 --- a/drivers/md/dm-vdo/errors.c +++ b/drivers/md/dm-vdo/errors.c @@ -160,19 +160,19 @@ const char *uds_string_error(int errnum, char *buf, size_t buflen) block_name = get_error_info(errnum, &info); if (block_name != NULL) { if (info != NULL) { - buffer = uds_append_to_buffer(buffer, buf_end, "%s: %s", + buffer = vdo_append_to_buffer(buffer, buf_end, "%s: %s", block_name, info->message); } else { - buffer = uds_append_to_buffer(buffer, buf_end, "Unknown %s %d", + buffer = vdo_append_to_buffer(buffer, buf_end, "Unknown %s %d", block_name, errnum); } } else if (info != NULL) { - buffer = uds_append_to_buffer(buffer, buf_end, "%s", info->message); + buffer = vdo_append_to_buffer(buffer, buf_end, "%s", info->message); } else { const char *tmp = system_string_error(errnum, buffer, buf_end - buffer); if (tmp != buffer) - buffer = uds_append_to_buffer(buffer, buf_end, "%s", tmp); + buffer = vdo_append_to_buffer(buffer, buf_end, "%s", tmp); else buffer += strlen(tmp); } @@ -194,19 +194,19 @@ const char *uds_string_error_name(int errnum, char *buf, size_t buflen) block_name = get_error_info(errnum, &info); if (block_name != NULL) { if (info != NULL) { - buffer = uds_append_to_buffer(buffer, buf_end, "%s", info->name); + buffer = vdo_append_to_buffer(buffer, buf_end, "%s", info->name); } else { - buffer = uds_append_to_buffer(buffer, buf_end, "%s %d", + buffer = vdo_append_to_buffer(buffer, buf_end, "%s %d", block_name, errnum); } } else if (info != NULL) { - buffer = uds_append_to_buffer(buffer, buf_end, "%s", info->name); + buffer = vdo_append_to_buffer(buffer, buf_end, "%s", info->name); } else { const char *tmp; tmp = system_string_error(errnum, buffer, buf_end - buffer); if (tmp != buffer) - buffer = uds_append_to_buffer(buffer, buf_end, "%s", tmp); + buffer = vdo_append_to_buffer(buffer, buf_end, 
"%s", tmp); else buffer += strlen(tmp); } diff --git a/drivers/md/dm-vdo/logical-zone.c b/drivers/md/dm-vdo/logical-zone.c index 336c3d3ec5e78c..a040fe9c07797a 100644 --- a/drivers/md/dm-vdo/logical-zone.c +++ b/drivers/md/dm-vdo/logical-zone.c @@ -370,6 +370,6 @@ void vdo_dump_logical_zone(const struct logical_zone *zone) (unsigned long long) READ_ONCE(zone->flush_generation), (unsigned long long) READ_ONCE(zone->oldest_active_generation), (unsigned long long) READ_ONCE(zone->notification_generation), - uds_bool_to_string(READ_ONCE(zone->notifying)), + vdo_bool_to_string(READ_ONCE(zone->notifying)), (unsigned long long) READ_ONCE(zone->ios_in_flush_generation)); } diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index 9749bd9fea4eae..b11972a8a08b3c 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -3591,8 +3591,8 @@ void vdo_dump_block_allocator(const struct block_allocator *allocator) vdo_log_info(" slab journal: entry_waiters=%zu waiting_to_commit=%s updating_slab_summary=%s head=%llu unreapable=%llu tail=%llu next_commit=%llu summarized=%llu last_summarized=%llu recovery_lock=%llu dirty=%s", vdo_waitq_num_waiters(&journal->entry_waiters), - uds_bool_to_string(journal->waiting_to_commit), - uds_bool_to_string(journal->updating_slab_summary), + vdo_bool_to_string(journal->waiting_to_commit), + vdo_bool_to_string(journal->updating_slab_summary), (unsigned long long) journal->head, (unsigned long long) journal->unreapable, (unsigned long long) journal->tail, @@ -3600,7 +3600,7 @@ void vdo_dump_block_allocator(const struct block_allocator *allocator) (unsigned long long) journal->summarized, (unsigned long long) journal->last_summarized, (unsigned long long) journal->recovery_lock, - uds_bool_to_string(journal->recovery_lock != 0)); + vdo_bool_to_string(journal->recovery_lock != 0)); /* * Given the frequency with which the locks are just a tiny bit off, it might be * worth dumping all the locks, but that might be too much logging. diff --git a/drivers/md/dm-vdo/string-utils.c b/drivers/md/dm-vdo/string-utils.c index 6cdf018cdaf094..71e44b4683eafa 100644 --- a/drivers/md/dm-vdo/string-utils.c +++ b/drivers/md/dm-vdo/string-utils.c @@ -5,7 +5,7 @@ #include "string-utils.h" -char *uds_append_to_buffer(char *buffer, char *buf_end, const char *fmt, ...) +char *vdo_append_to_buffer(char *buffer, char *buf_end, const char *fmt, ...) { va_list args; size_t n; diff --git a/drivers/md/dm-vdo/string-utils.h b/drivers/md/dm-vdo/string-utils.h index 8275af582cf7a2..96eecd38b1c222 100644 --- a/drivers/md/dm-vdo/string-utils.h +++ b/drivers/md/dm-vdo/string-utils.h @@ -3,21 +3,21 @@ * Copyright 2023 Red Hat */ -#ifndef UDS_STRING_UTILS_H -#define UDS_STRING_UTILS_H +#ifndef VDO_STRING_UTILS_H +#define VDO_STRING_UTILS_H #include #include /* Utilities related to string manipulation */ -static inline const char *uds_bool_to_string(bool value) +static inline const char *vdo_bool_to_string(bool value) { return value ? "true" : "false"; } /* Append a formatted string to the end of a buffer. */ -char *uds_append_to_buffer(char *buffer, char *buf_end, const char *fmt, ...) +char *vdo_append_to_buffer(char *buffer, char *buf_end, const char *fmt, ...) 
__printf(3, 4); -#endif /* UDS_STRING_UTILS_H */ +#endif /* VDO_STRING_UTILS_H */ From 76912dcbabc64f2a8b405edc9ea7cc00e14bd63d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 17 Feb 2024 08:30:41 +0100 Subject: [PATCH 0919/1406] Bluetooth: btbcm: Use strreplace() Use strreplace() instead of hand-writing it. It is less verbose. Signed-off-by: Christophe JAILLET Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btbcm.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c index 0a5445ac5e1b7c..01d2343b4978a2 100644 --- a/drivers/bluetooth/btbcm.c +++ b/drivers/bluetooth/btbcm.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -544,7 +545,6 @@ static const char *btbcm_get_board_name(struct device *dev) char *board_type; const char *tmp; int len; - int i; root = of_find_node_by_path("/"); if (!root) @@ -557,10 +557,7 @@ static const char *btbcm_get_board_name(struct device *dev) len = strlen(tmp) + 1; board_type = devm_kzalloc(dev, len, GFP_KERNEL); strscpy(board_type, tmp, len); - for (i = 0; i < len; i++) { - if (board_type[i] == '/') - board_type[i] = '-'; - } + strreplace(board_type, '/', '-'); of_node_put(root); return board_type; From d095e06a156945f8461510341f3a488b08372018 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 17 Feb 2024 08:30:42 +0100 Subject: [PATCH 0920/1406] Bluetooth: btbcm: Use devm_kstrdup() Use devm_kstrdup() instead of hand-writing it. It is less verbose. Signed-off-by: Christophe JAILLET Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btbcm.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c index 01d2343b4978a2..f9a7c790d7e2ec 100644 --- a/drivers/bluetooth/btbcm.c +++ b/drivers/bluetooth/btbcm.c @@ -544,7 +544,6 @@ static const char *btbcm_get_board_name(struct device *dev) struct device_node *root; char *board_type; const char *tmp; - int len; root = of_find_node_by_path("/"); if (!root) @@ -554,9 +553,7 @@ static const char *btbcm_get_board_name(struct device *dev) return NULL; /* get rid of any '/' in the compatible string */ - len = strlen(tmp) + 1; - board_type = devm_kzalloc(dev, len, GFP_KERNEL); - strscpy(board_type, tmp, len); + board_type = devm_kstrdup(dev, tmp, GFP_KERNEL); strreplace(board_type, '/', '-'); of_node_put(root); From 8a496378559fc8029ddcb05813067e9fe51890f6 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Mon, 19 Feb 2024 16:46:57 -0300 Subject: [PATCH 0921/1406] Bluetooth: constify the struct device_type usage Since commit aed65af1cc2f ("drivers: make device_type const"), the driver core can properly handle constant struct device_type. Move the bt_type and bnep_type variables to be constant structures as well, placing them into read-only memory, which cannot be modified at runtime. Cc: Greg Kroah-Hartman Signed-off-by: Ricardo B.
Marliere Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/6lowpan.c | 2 +- net/bluetooth/bnep/core.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 715cbafbf6631c..27520a8a486f3c 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -572,7 +572,7 @@ static void netdev_setup(struct net_device *dev) dev->needs_free_netdev = true; } -static struct device_type bt_type = { +static const struct device_type bt_type = { .name = "bluetooth", }; diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index 5a6a49885ab66d..8c3f8d0c03588b 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -549,7 +549,7 @@ static struct device *bnep_get_device(struct bnep_session *session) return &conn->hcon->dev; } -static struct device_type bnep_type = { +static const struct device_type bnep_type = { .name = "bluetooth", }; From 55a7c10e71ae464444303a66128e6c272b295a8c Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 16 Feb 2024 15:29:55 -0500 Subject: [PATCH 0922/1406] Bluetooth: hci_sync: Fix UAF on hci_abort_conn_sync Fixes the following trace where hci_acl_create_conn_sync attempts to call hci_abort_conn_sync after timeout: BUG: KASAN: slab-use-after-free in hci_abort_conn_sync (net/bluetooth/hci_sync.c:5439) Read of size 2 at addr ffff88800322c032 by task kworker/u3:2/36 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-1.fc38 04/01/2014 Workqueue: hci0 hci_cmd_sync_work Call Trace: dump_stack_lvl (./arch/x86/include/asm/irqflags.h:26 ./arch/x86/include/asm/irqflags.h:67 ./arch/x86/include/asm/irqflags.h:127 lib/dump_stack.c:107) print_report (mm/kasan/report.c:378 mm/kasan/report.c:488) ? preempt_count_sub (kernel/sched/core.c:5889) ? __virt_addr_valid (./arch/x86/include/asm/preempt.h:103 (discriminator 1) ./include/linux/rcupdate.h:865 (discriminator 1) ./include/linux/mmzone.h:2026 (discriminator 1) arch/x86/mm/physaddr.c:65 (discriminator 1)) ? hci_abort_conn_sync (net/bluetooth/hci_sync.c:5439) kasan_report (mm/kasan/report.c:603) ? hci_abort_conn_sync (net/bluetooth/hci_sync.c:5439) hci_abort_conn_sync (net/bluetooth/hci_sync.c:5439) ? 
__pfx_hci_abort_conn_sync (net/bluetooth/hci_sync.c:5433) hci_acl_create_conn_sync (net/bluetooth/hci_sync.c:6681) Fixes: 456561ba8e49 ("Bluetooth: hci_conn: Only do ACL connections sequentially") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index b7d8e99e2a30e0..4a28aef2f01f4f 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -6670,15 +6670,10 @@ static int hci_acl_create_conn_sync(struct hci_dev *hdev, void *data) else cp.role_switch = 0x00; - err = __hci_cmd_sync_status_sk(hdev, HCI_OP_CREATE_CONN, - sizeof(cp), &cp, - HCI_EV_CONN_COMPLETE, - conn->conn_timeout, NULL); - - if (err == -ETIMEDOUT) - hci_abort_conn_sync(hdev, conn, HCI_ERROR_LOCAL_HOST_TERM); - - return err; + return __hci_cmd_sync_status_sk(hdev, HCI_OP_CREATE_CONN, + sizeof(cp), &cp, + HCI_EV_CONN_COMPLETE, + conn->conn_timeout, NULL); } int hci_connect_acl_sync(struct hci_dev *hdev, struct hci_conn *conn) From df90bef42e65d3a3ea2a449515167a786f4d64b8 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 16 Feb 2024 16:20:11 -0500 Subject: [PATCH 0923/1406] Bluetooth: hci_sync: Fix overwriting request callback In a few cases the stack may generate commands as responses to events which would happen to overwrite the sent_cmd, so this attempts to store the request in req_skb so that, even if sent_cmd is replaced with a new command, the pending request will remain stored in req_skb. Fixes: 6a98e3836fa2 ("Bluetooth: Add helper for serialized HCI command execution") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 2 +- net/bluetooth/hci_core.c | 46 +++++++++++++++++++++++++++------------ net/bluetooth/hci_event.c | 18 +++++++-------- net/bluetooth/hci_sync.c | 15 +++++++++++++ 4 files changed, 57 insertions(+), 24 deletions(-) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 21e0b4064d05d6..1acc75a49bb04f 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -2893,7 +2893,7 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason) * connection can be in BT_CONNECT at time.
*/ if (conn->state == BT_CONNECT && hdev->req_status == HCI_REQ_PEND) { - switch (hci_skb_event(hdev->sent_cmd)) { + switch (hci_skb_event(hdev->req_skb)) { case HCI_EV_CONN_COMPLETE: case HCI_EV_LE_CONN_COMPLETE: case HCI_EV_LE_ENHANCED_CONN_COMPLETE: diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 34c8dca2069f6b..6ca4c0df9f9c44 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1522,8 +1522,8 @@ static void hci_cmd_timeout(struct work_struct *work) struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_timer.work); - if (hdev->sent_cmd) { - u16 opcode = hci_skb_opcode(hdev->sent_cmd); + if (hdev->req_skb) { + u16 opcode = hci_skb_opcode(hdev->req_skb); bt_dev_err(hdev, "command 0x%4.4x tx timeout", opcode); @@ -2828,6 +2828,7 @@ void hci_release_dev(struct hci_dev *hdev) ida_destroy(&hdev->unset_handle_ida); ida_free(&hci_index_ida, hdev->id); kfree_skb(hdev->sent_cmd); + kfree_skb(hdev->req_skb); kfree_skb(hdev->recv_event); kfree(hdev); } @@ -3157,21 +3158,33 @@ int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen, EXPORT_SYMBOL(__hci_cmd_send); /* Get data from the previously sent command */ -void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode) +static void *hci_cmd_data(struct sk_buff *skb, __u16 opcode) { struct hci_command_hdr *hdr; - if (!hdev->sent_cmd) + if (!skb || skb->len < HCI_COMMAND_HDR_SIZE) return NULL; - hdr = (void *) hdev->sent_cmd->data; + hdr = (void *)skb->data; if (hdr->opcode != cpu_to_le16(opcode)) return NULL; - BT_DBG("%s opcode 0x%4.4x", hdev->name, opcode); + return skb->data + HCI_COMMAND_HDR_SIZE; +} - return hdev->sent_cmd->data + HCI_COMMAND_HDR_SIZE; +/* Get data from the previously sent command */ +void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode) +{ + void *data; + + /* Check if opcode matches last sent command */ + data = hci_cmd_data(hdev->sent_cmd, opcode); + if (!data) + /* Check if opcode matches last request */ + data = hci_cmd_data(hdev->req_skb, opcode); + + return data; } /* Get data from last received event */ @@ -4072,17 +4085,19 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, if (!status && !hci_req_is_complete(hdev)) return; + skb = hdev->req_skb; + /* If this was the last command in a request the complete - * callback would be found in hdev->sent_cmd instead of the + * callback would be found in hdev->req_skb instead of the * command queue (hdev->cmd_q). 
*/ - if (bt_cb(hdev->sent_cmd)->hci.req_flags & HCI_REQ_SKB) { - *req_complete_skb = bt_cb(hdev->sent_cmd)->hci.req_complete_skb; + if (skb && bt_cb(skb)->hci.req_flags & HCI_REQ_SKB) { + *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; return; } - if (bt_cb(hdev->sent_cmd)->hci.req_complete) { - *req_complete = bt_cb(hdev->sent_cmd)->hci.req_complete; + if (skb && bt_cb(skb)->hci.req_complete) { + *req_complete = bt_cb(skb)->hci.req_complete; return; } @@ -4199,8 +4214,11 @@ static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) return; } - if (hci_req_status_pend(hdev)) - hci_dev_set_flag(hdev, HCI_CMD_PENDING); + if (hci_req_status_pend(hdev) && + !hci_dev_test_and_set_flag(hdev, HCI_CMD_PENDING)) { + kfree_skb(hdev->req_skb); + hdev->req_skb = skb_clone(skb, GFP_KERNEL); + } atomic_dec(&hdev->cmd_cnt); } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 6071a1226e1b4c..bffd2c7ff6087b 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4368,7 +4368,7 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, void *data, * (since for this kind of commands there will not be a command * complete event). */ - if (ev->status || (hdev->sent_cmd && !hci_skb_event(hdev->sent_cmd))) { + if (ev->status || (hdev->req_skb && !hci_skb_event(hdev->req_skb))) { hci_req_cmd_complete(hdev, *opcode, ev->status, req_complete, req_complete_skb); if (hci_dev_test_flag(hdev, HCI_CMD_PENDING)) { @@ -7170,10 +7170,10 @@ static void hci_le_meta_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "subevent 0x%2.2x", ev->subevent); /* Only match event if command OGF is for LE */ - if (hdev->sent_cmd && - hci_opcode_ogf(hci_skb_opcode(hdev->sent_cmd)) == 0x08 && - hci_skb_event(hdev->sent_cmd) == ev->subevent) { - *opcode = hci_skb_opcode(hdev->sent_cmd); + if (hdev->req_skb && + hci_opcode_ogf(hci_skb_opcode(hdev->req_skb)) == 0x08 && + hci_skb_event(hdev->req_skb) == ev->subevent) { + *opcode = hci_skb_opcode(hdev->req_skb); hci_req_cmd_complete(hdev, *opcode, 0x00, req_complete, req_complete_skb); } @@ -7541,10 +7541,10 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) } /* Only match event if command OGF is not for LE */ - if (hdev->sent_cmd && - hci_opcode_ogf(hci_skb_opcode(hdev->sent_cmd)) != 0x08 && - hci_skb_event(hdev->sent_cmd) == event) { - hci_req_cmd_complete(hdev, hci_skb_opcode(hdev->sent_cmd), + if (hdev->req_skb && + hci_opcode_ogf(hci_skb_opcode(hdev->req_skb)) != 0x08 && + hci_skb_event(hdev->req_skb) == event) { + hci_req_cmd_complete(hdev, hci_skb_opcode(hdev->req_skb), status, &req_complete, &req_complete_skb); req_evt = event; } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 4a28aef2f01f4f..7306020ccda3c5 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -32,6 +32,10 @@ static void hci_cmd_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode, hdev->req_result = result; hdev->req_status = HCI_REQ_DONE; + /* Free the request command so it is not used as response */ + kfree_skb(hdev->req_skb); + hdev->req_skb = NULL; + if (skb) { struct sock *sk = hci_skb_sk(skb); @@ -4952,6 +4956,11 @@ int hci_dev_open_sync(struct hci_dev *hdev) hdev->sent_cmd = NULL; } + if (hdev->req_skb) { + kfree_skb(hdev->req_skb); + hdev->req_skb = NULL; + } + clear_bit(HCI_RUNNING, &hdev->flags); hci_sock_dev_event(hdev, HCI_DEV_CLOSE); @@ -5112,6 +5121,12 @@ int hci_dev_close_sync(struct hci_dev *hdev) hdev->sent_cmd = NULL; } + /* Drop last request */ + if (hdev->req_skb) { + 
kfree_skb(hdev->req_skb); + hdev->req_skb = NULL; + } + clear_bit(HCI_RUNNING, &hdev->flags); hci_sock_dev_event(hdev, HCI_DEV_CLOSE); From 25956d989c603a269aeb04336164acef82da50c5 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 20 Feb 2024 13:10:47 -0500 Subject: [PATCH 0924/1406] Bluetooth: hci_sync: Fix UAF on create_le_conn_complete While waiting for hci_dev_lock the hci_conn object may be cleaned up, causing the following trace: BUG: KASAN: slab-use-after-free in hci_connect_le_scan_cleanup+0x29/0x350 Read of size 8 at addr ffff888001a50a30 by task kworker/u3:1/111 CPU: 0 PID: 111 Comm: kworker/u3:1 Not tainted 6.8.0-rc2-00701-g8179b15ab3fd-dirty #6418 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-1.fc38 04/01/2014 Workqueue: hci0 hci_cmd_sync_work Call Trace: dump_stack_lvl+0x21/0x70 print_report+0xce/0x620 ? preempt_count_sub+0x13/0xc0 ? __virt_addr_valid+0x15f/0x310 ? hci_connect_le_scan_cleanup+0x29/0x350 kasan_report+0xdf/0x110 ? hci_connect_le_scan_cleanup+0x29/0x350 hci_connect_le_scan_cleanup+0x29/0x350 create_le_conn_complete+0x25c/0x2c0 Fixes: 96fb2aab16bf ("Bluetooth: hci_sync: Attempt to dequeue connection attempt") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 7306020ccda3c5..669099cf9b1a00 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -6708,6 +6708,9 @@ static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err) hci_dev_lock(hdev); + if (!hci_conn_valid(hdev, conn)) + goto done; + if (!err) { hci_connect_le_scan_cleanup(conn, 0x00); goto done; From 78b6f8e7379b5399d1804f0852bb2ddabd049019 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 13 Feb 2024 13:34:30 -0600 Subject: [PATCH 0925/1406] dtc: Enable dtc interrupt_provider check Now that all the interrupt warnings have been fixed, enable the 'interrupt_provider' check by default. This will also enable the 'interrupt_map' check.
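As a usage note (assuming the usual kbuild mapping of the W= switch onto KBUILD_EXTRA_WARN, which the Makefile.lib hunk below tests): these warnings previously appeared only in the extra-warnings build, e.g.

        make W=2 dtbs

whereas after this change a plain dtbs build reports them, and standalone dtc accepts the same check name via -Winterrupt_provider.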
Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20240213-arm-dt-cleanups-v1-6-f2dee1292525@kernel.org Signed-off-by: Arnd Bergmann --- scripts/Makefile.lib | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index cd5b181060f151..fce35e4657f547 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -340,7 +340,7 @@ quiet_cmd_gzip = GZIP $@ # DTC # --------------------------------------------------------------------------- DTC ?= $(objtree)/scripts/dtc/dtc -DTC_FLAGS += -Wno-interrupt_provider \ +DTC_FLAGS += \ -Wno-unique_unit_address # Disable noisy checks by default @@ -358,7 +358,6 @@ endif ifneq ($(findstring 2,$(KBUILD_EXTRA_WARN)),) DTC_FLAGS += -Wnode_name_chars_strict \ -Wproperty_name_chars_strict \ - -Winterrupt_provider \ -Wunique_unit_address endif From 37156e9b997483f1d08db52ef298b9878394eef0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 20 Feb 2024 22:06:29 +0100 Subject: [PATCH 0926/1406] soc: document merges Signed-off-by: Arnd Bergmann --- arch/arm/arm-soc-for-next-contents.txt | 39 ++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 arch/arm/arm-soc-for-next-contents.txt diff --git a/arch/arm/arm-soc-for-next-contents.txt b/arch/arm/arm-soc-for-next-contents.txt new file mode 100644 index 00000000000000..96073b99a9ad56 --- /dev/null +++ b/arch/arm/arm-soc-for-next-contents.txt @@ -0,0 +1,39 @@ +soc/arm + +soc/dt + +soc/drivers + mediatek/soc-drivers + https://git.kernel.org/pub/scm/linux/kernel/git/mediatek/linux tags/mtk-soc-for-v6.9 + renesas/drivers + git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel tags/renesas-drivers-for-v6.9-tag1 + samsung/drivers + https://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux tags/samsung-drivers-6.9 + drivers/memory + https://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-mem-ctrl tags/memory-controller-drv-6.9 + +soc/defconfig + patch + ARM: multi_v7_defconfig: Add more TI Keystone support + ARM: multi_v7_defconfig: Enable BACKLIGHT_CLASS_DEVICE + arm64: config: disable new platforms in virt.config + renesas/defconfig + git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel tags/renesas-arm-defconfig-for-v6.9-tag1 + +soc/late + +arm/fixes + patch + ARM: ep93xx: Add terminator to gpiod_lookup_table + (7bca405c986075c99b9f729d3587b5c45db39d01) + git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux tags/imx-fixes-6.8 + (c22d03a95b0d815cd186302fdd93f74d99f1c914) + git://git.kernel.org/pub/scm/linux/kernel/git/mmind/linux-rockchip tags/v6.8-rockchip-dtsfixes1 + patch + arm64: dts: freescale: Disable interrupt_map check + arm: dts: Fix dtc interrupt_provider warnings + arm64: dts: Fix dtc interrupt_provider warnings + arm: dts: Fix dtc interrupt_map warnings + arm64: dts: qcom: Fix interrupt-map cell sizes + dtc: Enable dtc interrupt_provider check + From 6032f99a90c86f6be061baa00394cccb652a1457 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 25 Jan 2024 08:51:27 +0000 Subject: [PATCH 0927/1406] mm: zswap: fix missing folio cleanup in writeback race path In zswap_writeback_entry(), after we get a folio from __read_swap_cache_async(), we grab the tree lock again to check that the swap entry was not invalidated and recycled. If it was, we delete the folio we just added to the swap cache and exit. However, __read_swap_cache_async() returns the folio locked when it is newly allocated, which is always true for this path, and the folio is ref'd. 
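On the invalidated-entry branch, then, the cleanup has to drop both the lock and the reference before bailing out; condensed from the hunk below, the corrected sequence is:

        spin_unlock(&tree->lock);
        delete_from_swap_cache(folio);
        folio_unlock(folio);
        folio_put(folio);
        return -ENOMEM;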
Make sure to unlock and put the folio before returning. This was discovered by code inspection, probably because this path handles a race condition that should not happen often, and the bug would not crash the system; it would only strand the folio indefinitely. Link: https://lkml.kernel.org/r/20240125085127.1327013-1-yosryahmed@google.com Fixes: 04fc7816089c ("mm: fix zswap writeback race condition") Signed-off-by: Yosry Ahmed Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Domenico Cerasuolo Cc: Signed-off-by: Andrew Morton --- mm/zswap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/zswap.c b/mm/zswap.c index 350dd2fc815994..d2423247acfd64 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1440,6 +1440,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry, if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { spin_unlock(&tree->lock); delete_from_swap_cache(folio); + folio_unlock(folio); + folio_put(folio); return -ENOMEM; } spin_unlock(&tree->lock); From 099eee8c5530ba3c9cdf9f9539c3bfa3fe43d2f1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Feb 2024 12:13:06 -0800 Subject: [PATCH 0928/1406] mm/damon/core: check apply interval in damon_do_apply_schemes() kdamond_apply_schemes() checks the apply intervals of schemes and avoids further applying any schemes if no scheme has passed its apply interval. However, the subsequent scheme-applying function, damon_do_apply_schemes(), iterates over all schemes without the apply interval check. As a result, the shortest apply interval is applied to all schemes. Fix the problem by checking the apply interval in damon_do_apply_schemes(). Link: https://lkml.kernel.org/r/20240205201306.88562-1-sj@kernel.org Fixes: 42f994b71404 ("mm/damon/core: implement scheme-specific apply interval") Signed-off-by: SeongJae Park Cc: [6.7.x] Signed-off-by: Andrew Morton --- mm/damon/core.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 36f6f1d21ff069..5b325749fc1259 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1026,6 +1026,9 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; + if (c->passed_sample_intervals != s->next_apply_sis) + continue; + if (!s->wmarks.activated) continue; @@ -1176,10 +1179,6 @@ static void kdamond_apply_schemes(struct damon_ctx *c) if (c->passed_sample_intervals != s->next_apply_sis) continue; - s->next_apply_sis += - (s->apply_interval_us ? s->apply_interval_us : - c->attrs.aggr_interval) / sample_interval; - if (!s->wmarks.activated) continue; @@ -1195,6 +1194,14 @@ static void kdamond_apply_schemes(struct damon_ctx *c) damon_for_each_region_safe(r, next_r, t) damon_do_apply_schemes(c, t, r); } + + damon_for_each_scheme(s, c) { + if (c->passed_sample_intervals != s->next_apply_sis) + continue; + s->next_apply_sis += + (s->apply_interval_us ? s->apply_interval_us : + c->attrs.aggr_interval) / sample_interval; + } } /* From cbab2e62929ece35d3558b5457dc26217d2df132 Mon Sep 17 00:00:00 2001 From: Terry Tritton Date: Mon, 5 Feb 2024 14:50:56 +0000 Subject: [PATCH 0929/1406] selftests/mm: uffd-unit-test check if huge page size is 0 If HUGETLBFS is not enabled, then the default_huge_page_size() function will return 0 and cause a divide-by-0 error. Add a check to see if the huge page size is 0 and skip the hugetlb tests if it is.
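For context, default_huge_page_size() comes from the selftests' shared vm_util helpers and, to the best of my knowledge, derives the size from the Hugepagesize field of /proc/meminfo, which is why 0 is the natural "not available" value. A self-contained sketch of that derivation (the helper name and buffer size here are illustrative, not the actual vm_util code):

        #include <stdio.h>

        /* Return the default huge page size in bytes, or 0 when the
         * kernel exposes no hugetlb support (no Hugepagesize field).
         */
        static unsigned long huge_page_size_or_zero(void)
        {
                char buf[128];
                unsigned long kb = 0;
                FILE *f = fopen("/proc/meminfo", "r");

                if (!f)
                        return 0;
                while (fgets(buf, sizeof(buf), f)) {
                        if (sscanf(buf, "Hugepagesize: %lu kB", &kb) == 1)
                                break;
                }
                fclose(f);
                return kb * 1024;
        }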
Link: https://lkml.kernel.org/r/20240205145055.3545806-2-terry.tritton@linaro.org Fixes: 16a45b57cbf2 ("selftests/mm: add framework for uffd-unit-test") Signed-off-by: Terry Tritton Cc: Peter Griffin Cc: Shuah Khan Cc: Peter Xu Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-unit-tests.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index cce90a10515ad2..2b9f8cc52639d1 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -1517,6 +1517,12 @@ int main(int argc, char *argv[]) continue; uffd_test_start("%s on %s", test->name, mem_type->name); + if ((mem_type->mem_flag == MEM_HUGETLB || + mem_type->mem_flag == MEM_HUGETLB_PRIVATE) && + (default_huge_page_size() == 0)) { + uffd_test_skip("huge page size is 0, feature missing?"); + continue; + } if (!uffd_feature_supported(test)) { uffd_test_skip("feature missing"); continue; From 54883afaf2be5af5a5cd446e623c8de9689b5787 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Jan 2024 11:07:01 +0100 Subject: [PATCH 0930/1406] stackdepot: use variable size records for non-evictable entries With the introduction of stack depot evictions, each stack record is now fixed size, so that future reuse after an eviction can safely store differently sized stack traces. In all cases that do not make use of evictions, this wastes lots of space. Fix it by re-introducing variable size stack records (up to the max allowed size) for entries that will never be evicted. We know an entry will never be evicted if the flag STACK_DEPOT_FLAG_GET is not provided, since a later stack_depot_put() attempt is undefined behavior. With my current kernel config that enables KASAN and also SLUB owner tracking, I observe (after a kernel boot) a whopping reduction of 296 stack depot pools, which translates into 4736 KiB saved. The savings here are from SLUB owner tracking only, because KASAN generic mode still uses refcounting.
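The saving is easiest to see as arithmetic. Condensed from depot_stack_record_size() in the diff below (locals shortened, otherwise the same computation), a record now only occupies:

        /* Trim the unused tail of the flexible entries[] array,
         * keeping the depot's record alignment.
         */
        used   = flex_array_size(s, entries, nr_entries);
        unused = sizeof(s->entries) - used;
        size   = ALIGN(sizeof(struct stack_record) - unused,
                       1 << DEPOT_STACK_ALIGN);

Only records saved with STACK_DEPOT_FLAG_GET still reserve the full CONFIG_STACKDEPOT_MAX_FRAMES worth of entries, so that a slot freed by eviction can be reused for a differently sized trace.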
Before: pools: 893 allocations: 29841 frees: 6524 in_use: 23317 freelist_size: 3454 After: pools: 597 refcounted_allocations: 17547 refcounted_frees: 6477 refcounted_in_use: 11070 freelist_size: 3497 persistent_count: 12163 persistent_bytes: 1717008 Link: https://lkml.kernel.org/r/20240129100708.39460-1-elver@google.com Link: https://lore.kernel.org/all/CABXGCsOzpRPZGg23QqJAzKnqkZPKzvieeg=W7sgjgi3q0pBo0g@mail.gmail.com/ Fixes: 108be8def46e ("lib/stackdepot: allow users to evict stack traces") Signed-off-by: Marco Elver Reviewed-by: Andrey Konovalov Tested-by: Mikhail Gavrilov Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Vincenzo Frascino Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/poison.h | 3 + lib/stackdepot.c | 250 +++++++++++++++++++++-------------------- 2 files changed, 130 insertions(+), 123 deletions(-) diff --git a/include/linux/poison.h b/include/linux/poison.h index 27a7dad17eefb8..1f0ee2459f2aa2 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -92,4 +92,7 @@ /********** VFS **********/ #define VFS_PTR_POISON ((void *)(0xF5 + POISON_POINTER_DELTA)) +/********** lib/stackdepot.c **********/ +#define STACK_DEPOT_POISON ((void *)(0xD390 + POISON_POINTER_DELTA)) + #endif diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 5caa1f56655384..8f3b2c84ec2db3 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -43,17 +44,7 @@ #define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN) #define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \ STACK_DEPOT_EXTRA_BITS) -#if IS_ENABLED(CONFIG_KMSAN) && CONFIG_STACKDEPOT_MAX_FRAMES >= 32 -/* - * KMSAN is frequently used in fuzzing scenarios and thus saves a lot of stack - * traces. As KMSAN does not support evicting stack traces from the stack - * depot, the stack depot capacity might be reached quickly with large stack - * records. Adjust the maximum number of stack depot pools for this case. - */ -#define DEPOT_POOLS_CAP (8192 * (CONFIG_STACKDEPOT_MAX_FRAMES / 16)) -#else #define DEPOT_POOLS_CAP 8192 -#endif #define DEPOT_MAX_POOLS \ (((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \ (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP) @@ -93,9 +84,6 @@ struct stack_record { }; }; -#define DEPOT_STACK_RECORD_SIZE \ - ALIGN(sizeof(struct stack_record), 1 << DEPOT_STACK_ALIGN) - static bool stack_depot_disabled; static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); static bool __stack_depot_early_init_passed __initdata; @@ -121,32 +109,31 @@ static void *stack_pools[DEPOT_MAX_POOLS]; static void *new_pool; /* Number of pools in stack_pools. */ static int pools_num; +/* Offset to the unused space in the currently used pool. */ +static size_t pool_offset = DEPOT_POOL_SIZE; /* Freelist of stack records within stack_pools. */ static LIST_HEAD(free_stacks); -/* - * Stack depot tries to keep an extra pool allocated even before it runs out - * of space in the currently used pool. This flag marks whether this extra pool - * needs to be allocated. It has the value 0 when either an extra pool is not - * yet allocated or if the limit on the number of pools is reached. - */ -static bool new_pool_required = true; /* The lock must be held when performing pool or freelist modifications. */ static DEFINE_RAW_SPINLOCK(pool_lock); /* Statistics counters for debugfs. 
*/ enum depot_counter_id { - DEPOT_COUNTER_ALLOCS, - DEPOT_COUNTER_FREES, - DEPOT_COUNTER_INUSE, + DEPOT_COUNTER_REFD_ALLOCS, + DEPOT_COUNTER_REFD_FREES, + DEPOT_COUNTER_REFD_INUSE, DEPOT_COUNTER_FREELIST_SIZE, + DEPOT_COUNTER_PERSIST_COUNT, + DEPOT_COUNTER_PERSIST_BYTES, DEPOT_COUNTER_COUNT, }; static long counters[DEPOT_COUNTER_COUNT]; static const char *const counter_names[] = { - [DEPOT_COUNTER_ALLOCS] = "allocations", - [DEPOT_COUNTER_FREES] = "frees", - [DEPOT_COUNTER_INUSE] = "in_use", + [DEPOT_COUNTER_REFD_ALLOCS] = "refcounted_allocations", + [DEPOT_COUNTER_REFD_FREES] = "refcounted_frees", + [DEPOT_COUNTER_REFD_INUSE] = "refcounted_in_use", [DEPOT_COUNTER_FREELIST_SIZE] = "freelist_size", + [DEPOT_COUNTER_PERSIST_COUNT] = "persistent_count", + [DEPOT_COUNTER_PERSIST_BYTES] = "persistent_bytes", }; static_assert(ARRAY_SIZE(counter_names) == DEPOT_COUNTER_COUNT); @@ -294,48 +281,52 @@ int stack_depot_init(void) EXPORT_SYMBOL_GPL(stack_depot_init); /* - * Initializes new stack depot @pool, release all its entries to the freelist, - * and update the list of pools. + * Initializes new stack pool, and updates the list of pools. */ -static void depot_init_pool(void *pool) +static bool depot_init_pool(void **prealloc) { - int offset; - lockdep_assert_held(&pool_lock); - /* Initialize handles and link stack records into the freelist. */ - for (offset = 0; offset <= DEPOT_POOL_SIZE - DEPOT_STACK_RECORD_SIZE; - offset += DEPOT_STACK_RECORD_SIZE) { - struct stack_record *stack = pool + offset; - - stack->handle.pool_index = pools_num; - stack->handle.offset = offset >> DEPOT_STACK_ALIGN; - stack->handle.extra = 0; - - /* - * Stack traces of size 0 are never saved, and we can simply use - * the size field as an indicator if this is a new unused stack - * record in the freelist. - */ - stack->size = 0; + if (unlikely(pools_num >= DEPOT_MAX_POOLS)) { + /* Bail out if we reached the pool limit. */ + WARN_ON_ONCE(pools_num > DEPOT_MAX_POOLS); /* should never happen */ + WARN_ON_ONCE(!new_pool); /* to avoid unnecessary pre-allocation */ + WARN_ONCE(1, "Stack depot reached limit capacity"); + return false; + } - INIT_LIST_HEAD(&stack->hash_list); - /* - * Add to the freelist front to prioritize never-used entries: - * required in case there are entries in the freelist, but their - * RCU cookie still belongs to the current RCU grace period - * (there can still be concurrent readers). - */ - list_add(&stack->free_list, &free_stacks); - counters[DEPOT_COUNTER_FREELIST_SIZE]++; + if (!new_pool && *prealloc) { + /* We have preallocated memory, use it. */ + WRITE_ONCE(new_pool, *prealloc); + *prealloc = NULL; } + if (!new_pool) + return false; /* new_pool and *prealloc are NULL */ + /* Save reference to the pool to be used by depot_fetch_stack(). */ - stack_pools[pools_num] = pool; + stack_pools[pools_num] = new_pool; + + /* + * Stack depot tries to keep an extra pool allocated even before it runs + * out of space in the currently used pool. + * + * To indicate that a new preallocation is needed new_pool is reset to + * NULL; do not reset to NULL if we have reached the maximum number of + * pools. + */ + if (pools_num < DEPOT_MAX_POOLS) + WRITE_ONCE(new_pool, NULL); + else + WRITE_ONCE(new_pool, STACK_DEPOT_POISON); /* Pairs with concurrent READ_ONCE() in depot_fetch_stack(). */ WRITE_ONCE(pools_num, pools_num + 1); ASSERT_EXCLUSIVE_WRITER(pools_num); + + pool_offset = 0; + + return true; } /* Keeps the preallocated memory to be used for a new stack depot pool. 
*/ @@ -347,63 +338,51 @@ static void depot_keep_new_pool(void **prealloc) * If a new pool is already saved or the maximum number of * pools is reached, do not use the preallocated memory. */ - if (!new_pool_required) + if (new_pool) return; - /* - * Use the preallocated memory for the new pool - * as long as we do not exceed the maximum number of pools. - */ - if (pools_num < DEPOT_MAX_POOLS) { - new_pool = *prealloc; - *prealloc = NULL; - } - - /* - * At this point, either a new pool is kept or the maximum - * number of pools is reached. In either case, take note that - * keeping another pool is not required. - */ - WRITE_ONCE(new_pool_required, false); + WRITE_ONCE(new_pool, *prealloc); + *prealloc = NULL; } /* - * Try to initialize a new stack depot pool from either a previous or the - * current pre-allocation, and release all its entries to the freelist. + * Try to initialize a new stack record from the current pool, a cached pool, or + * the current pre-allocation. */ -static bool depot_try_init_pool(void **prealloc) +static struct stack_record *depot_pop_free_pool(void **prealloc, size_t size) { + struct stack_record *stack; + void *current_pool; + u32 pool_index; + lockdep_assert_held(&pool_lock); - /* Check if we have a new pool saved and use it. */ - if (new_pool) { - depot_init_pool(new_pool); - new_pool = NULL; + if (pool_offset + size > DEPOT_POOL_SIZE) { + if (!depot_init_pool(prealloc)) + return NULL; + } - /* Take note that we might need a new new_pool. */ - if (pools_num < DEPOT_MAX_POOLS) - WRITE_ONCE(new_pool_required, true); + if (WARN_ON_ONCE(pools_num < 1)) + return NULL; + pool_index = pools_num - 1; + current_pool = stack_pools[pool_index]; + if (WARN_ON_ONCE(!current_pool)) + return NULL; - return true; - } + stack = current_pool + pool_offset; - /* Bail out if we reached the pool limit. */ - if (unlikely(pools_num >= DEPOT_MAX_POOLS)) { - WARN_ONCE(1, "Stack depot reached limit capacity"); - return false; - } + /* Pre-initialize handle once. */ + stack->handle.pool_index = pool_index; + stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN; + stack->handle.extra = 0; + INIT_LIST_HEAD(&stack->hash_list); - /* Check if we have preallocated memory and use it. */ - if (*prealloc) { - depot_init_pool(*prealloc); - *prealloc = NULL; - return true; - } + pool_offset += size; - return false; + return stack; } -/* Try to find next free usable entry. */ +/* Try to find next free usable entry from the freelist. */ static struct stack_record *depot_pop_free(void) { struct stack_record *stack; @@ -420,7 +399,7 @@ static struct stack_record *depot_pop_free(void) * check the first entry. */ stack = list_first_entry(&free_stacks, struct stack_record, free_list); - if (stack->size && !poll_state_synchronize_rcu(stack->rcu_state)) + if (!poll_state_synchronize_rcu(stack->rcu_state)) return NULL; list_del(&stack->free_list); @@ -429,48 +408,73 @@ static struct stack_record *depot_pop_free(void) return stack; } +static inline size_t depot_stack_record_size(struct stack_record *s, unsigned int nr_entries) +{ + const size_t used = flex_array_size(s, entries, nr_entries); + const size_t unused = sizeof(s->entries) - used; + + WARN_ON_ONCE(sizeof(s->entries) < used); + + return ALIGN(sizeof(struct stack_record) - unused, 1 << DEPOT_STACK_ALIGN); +} + /* Allocates a new stack in a stack depot pool. 
*/ static struct stack_record * -depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) +depot_alloc_stack(unsigned long *entries, int nr_entries, u32 hash, depot_flags_t flags, void **prealloc) { - struct stack_record *stack; + struct stack_record *stack = NULL; + size_t record_size; lockdep_assert_held(&pool_lock); /* This should already be checked by public API entry points. */ - if (WARN_ON_ONCE(!size)) + if (WARN_ON_ONCE(!nr_entries)) return NULL; - /* Check if we have a stack record to save the stack trace. */ - stack = depot_pop_free(); - if (!stack) { - /* No usable entries on the freelist - try to refill the freelist. */ - if (!depot_try_init_pool(prealloc)) - return NULL; + /* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */ + if (nr_entries > CONFIG_STACKDEPOT_MAX_FRAMES) + nr_entries = CONFIG_STACKDEPOT_MAX_FRAMES; + + if (flags & STACK_DEPOT_FLAG_GET) { + /* + * Evictable entries have to allocate the max. size so they may + * safely be re-used by differently sized allocations. + */ + record_size = depot_stack_record_size(stack, CONFIG_STACKDEPOT_MAX_FRAMES); stack = depot_pop_free(); - if (WARN_ON(!stack)) - return NULL; + } else { + record_size = depot_stack_record_size(stack, nr_entries); } - /* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */ - if (size > CONFIG_STACKDEPOT_MAX_FRAMES) - size = CONFIG_STACKDEPOT_MAX_FRAMES; + if (!stack) { + stack = depot_pop_free_pool(prealloc, record_size); + if (!stack) + return NULL; + } /* Save the stack trace. */ stack->hash = hash; - stack->size = size; - /* stack->handle is already filled in by depot_init_pool(). */ - refcount_set(&stack->count, 1); - memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); + stack->size = nr_entries; + /* stack->handle is already filled in by depot_pop_free_pool(). */ + memcpy(stack->entries, entries, flex_array_size(stack, entries, nr_entries)); + + if (flags & STACK_DEPOT_FLAG_GET) { + refcount_set(&stack->count, 1); + counters[DEPOT_COUNTER_REFD_ALLOCS]++; + counters[DEPOT_COUNTER_REFD_INUSE]++; + } else { + /* Warn on attempts to switch to refcounting this entry. */ + refcount_set(&stack->count, REFCOUNT_SATURATED); + counters[DEPOT_COUNTER_PERSIST_COUNT]++; + counters[DEPOT_COUNTER_PERSIST_BYTES] += record_size; + } /* * Let KMSAN know the stored stack record is initialized. This shall * prevent false positive reports if instrumented code accesses it. */ - kmsan_unpoison_memory(stack, DEPOT_STACK_RECORD_SIZE); + kmsan_unpoison_memory(stack, record_size); - counters[DEPOT_COUNTER_ALLOCS]++; - counters[DEPOT_COUNTER_INUSE]++; return stack; } @@ -538,8 +542,8 @@ static void depot_free_stack(struct stack_record *stack) list_add_tail(&stack->free_list, &free_stacks); counters[DEPOT_COUNTER_FREELIST_SIZE]++; - counters[DEPOT_COUNTER_FREES]++; - counters[DEPOT_COUNTER_INUSE]--; + counters[DEPOT_COUNTER_REFD_FREES]++; + counters[DEPOT_COUNTER_REFD_INUSE]--; printk_deferred_exit(); raw_spin_unlock_irqrestore(&pool_lock, flags); @@ -660,7 +664,7 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, * Allocate memory for a new pool if required now: * we won't be able to do that under the lock. */ - if (unlikely(can_alloc && READ_ONCE(new_pool_required))) { + if (unlikely(can_alloc && !READ_ONCE(new_pool))) { /* * Zero out zone modifiers, as we don't have specific zone * requirements. 
Keep the flags related to allocation in atomic @@ -681,7 +685,7 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, found = find_stack(bucket, entries, nr_entries, hash, depot_flags); if (!found) { struct stack_record *new = - depot_alloc_stack(entries, nr_entries, hash, &prealloc); + depot_alloc_stack(entries, nr_entries, hash, depot_flags, &prealloc); if (new) { /* From df7b661e49cfb74cbe510e1c47b1f4f4810a84b3 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 1 Feb 2024 10:04:30 +0100 Subject: [PATCH 0931/1406] stackdepot: fix -Wstringop-overflow warning Since 113a61863ecb ("Makefile: Enable -Wstringop-overflow globally") string overflow checking is enabled by default. Within stackdepot, the compiler (GCC 13.2.0) assumes that a multiplication overflow may be possible and flex_array_size() can return SIZE_MAX (4294967295 on 32-bit), resulting in this warning: In function 'depot_alloc_stack', inlined from 'stack_depot_save_flags' at lib/stackdepot.c:688:4: arch/x86/include/asm/string_32.h:150:25: error: '__builtin_memcpy' specified bound 4294967295 exceeds maximum object size 2147483647 [-Werror=stringop-overflow=] 150 | #define memcpy(t, f, n) __builtin_memcpy(t, f, n) | ^~~~~~~~~~~~~~~~~~~~~~~~~ lib/stackdepot.c:459:9: note: in expansion of macro 'memcpy' 459 | memcpy(stack->entries, entries, flex_array_size(stack, entries, nr_entries)); | ^~~~~~ cc1: all warnings being treated as errors This is due to depot_alloc_stack() accepting an 'int nr_entries' which could be negative without deeper analysis of callers. The call to depot_alloc_stack() from stack_depot_save_flags(), however, only passes in its nr_entries which is unsigned int. Fix the warning by switching depot_alloc_stack()'s nr_entries to also be unsigned. Link: https://lore.kernel.org/all/20240201135747.18eca98e@canb.auug.org.au/ Link: https://lkml.kernel.org/r/20240201090434.1762340-1-elver@google.com Link: https://lore.kernel.org/all/CABXGCsOzpRPZGg23QqJAzKnqkZPKzvieeg=W7sgjgi3q0pBo0g@mail.gmail.com/ Fixes: d869d3fb362c ("stackdepot: use variable size records for non-evictable entries") Reported-by: Stephen Rothwell Signed-off-by: Marco Elver Reviewed-by: Andrey Konovalov Signed-off-by: Andrew Morton --- lib/stackdepot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 8f3b2c84ec2db3..4a7055a63d9f8a 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -420,7 +420,7 @@ static inline size_t depot_stack_record_size(struct stack_record *s, unsigned in /* Allocates a new stack in a stack depot pool. */ static struct stack_record * -depot_alloc_stack(unsigned long *entries, int nr_entries, u32 hash, depot_flags_t flags, void **prealloc) +depot_alloc_stack(unsigned long *entries, unsigned int nr_entries, u32 hash, depot_flags_t flags, void **prealloc) { struct stack_record *stack = NULL; size_t record_size; From 5609186a13529c7fa20f246a24663684e2b3d112 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Jan 2024 11:07:02 +0100 Subject: [PATCH 0932/1406] kasan: revert eviction of stack traces in generic mode This partially reverts commits cc478e0b6bdf, 63b85ac56a64, 08d7c94d9635, a414d4286f34, and 773688a6cb24 to make use of variable-sized stack depot records, since eviction of stack entries from stack depot forces fixed- sized stack records. Care was taken to retain the code cleanups by the above commits. 
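To see why eviction forces fixed-size records: a freed slot can be handed to an arbitrary new trace only if every record occupies the same number of bytes. A simplified model of the two sizing strategies (not the kernel code; types and names reduced for brevity):

    /* A pool is a byte arena carved into records. */
    struct rec {
            unsigned int size;              /* number of saved frames */
            unsigned long entries[];        /* variable-length tail */
    };

    /* Variable-size: each record consumes only what its trace needs, so
     * the arena packs tightly, but the hole left by a freed record can
     * only be reused by a trace no larger than the original. */
    static size_t rec_bytes(unsigned int nr_entries)
    {
            return sizeof(struct rec) + nr_entries * sizeof(unsigned long);
    }

    /* Fixed-size: every record reserves room for the maximum trace, so
     * any freed slot fits any new trace (the property eviction relies
     * on), at the cost of wasted tail space for short traces. */
    #define REC_MAX_FRAMES  64
    #define REC_FIXED_BYTES (sizeof(struct rec) + \
                             REC_MAX_FRAMES * sizeof(unsigned long))

Persistent (non-evictable) records leave no holes to refill, so each one can be trimmed to its actual trace length; that is where the reduction in pool counts reported below comes from.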
Eviction was added to generic KASAN as a response to alleviating the additional memory usage from fixed-sized stack records, but this still uses more memory than previously. With the re-introduction of variable-sized records for stack depot, we can just switch back to non-evictable stack records again, and return back to the previous performance and memory usage baseline. Before (observed after a KASAN kernel boot): pools: 597 refcounted_allocations: 17547 refcounted_frees: 6477 refcounted_in_use: 11070 freelist_size: 3497 persistent_count: 12163 persistent_bytes: 1717008 After: pools: 319 refcounted_allocations: 0 refcounted_frees: 0 refcounted_in_use: 0 freelist_size: 0 persistent_count: 29397 persistent_bytes: 5183536 As can be seen from the counters, with a generic KASAN config, refcounted allocations and evictions are no longer used. Due to using variable-sized records, I observe a reduction of 278 stack depot pools (saving 4448 KiB) with my test setup. Link: https://lkml.kernel.org/r/20240129100708.39460-2-elver@google.com Fixes: cc478e0b6bdf ("kasan: avoid resetting aux_lock") Fixes: 63b85ac56a64 ("kasan: stop leaking stack trace handles") Fixes: 08d7c94d9635 ("kasan: memset free track in qlink_free") Fixes: a414d4286f34 ("kasan: handle concurrent kasan_record_aux_stack calls") Fixes: 773688a6cb24 ("kasan: use stack_depot_put for Generic mode") Signed-off-by: Marco Elver Reviewed-by: Andrey Konovalov Tested-by: Mikhail Gavrilov Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Vincenzo Frascino Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/kasan/common.c | 8 ++--- mm/kasan/generic.c | 68 +++++-------------------------------------- mm/kasan/kasan.h | 10 ------- mm/kasan/quarantine.c | 5 +++- 4 files changed, 14 insertions(+), 77 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 610efae9122094..6ca63e8dda741b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -65,8 +65,7 @@ void kasan_save_track(struct kasan_track *track, gfp_t flags) { depot_stack_handle_t stack; - stack = kasan_save_stack(flags, - STACK_DEPOT_FLAG_CAN_ALLOC | STACK_DEPOT_FLAG_GET); + stack = kasan_save_stack(flags, STACK_DEPOT_FLAG_CAN_ALLOC); kasan_set_track(track, stack); } @@ -266,10 +265,9 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, return true; /* - * If the object is not put into quarantine, it will likely be quickly - * reallocated. Thus, release its metadata now. + * Note: Keep per-object metadata to allow KASAN print stack traces for + * use-after-free-before-realloc bugs. */ - kasan_release_object_meta(cache, object); /* Let slab put the object onto the freelist. */ return false; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index df6627f62402c0..fc9cf1860efb34 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -485,16 +485,6 @@ void kasan_init_object_meta(struct kmem_cache *cache, const void *object) if (alloc_meta) { /* Zero out alloc meta to mark it as invalid. */ __memset(alloc_meta, 0, sizeof(*alloc_meta)); - - /* - * Prepare the lock for saving auxiliary stack traces. - * Temporarily disable KASAN bug reporting to allow instrumented - * raw_spin_lock_init to access aux_lock, which resides inside - * of a redzone. 
- */ - kasan_disable_current(); - raw_spin_lock_init(&alloc_meta->aux_lock); - kasan_enable_current(); } /* @@ -506,18 +496,8 @@ void kasan_init_object_meta(struct kmem_cache *cache, const void *object) static void release_alloc_meta(struct kasan_alloc_meta *meta) { - /* Evict the stack traces from stack depot. */ - stack_depot_put(meta->alloc_track.stack); - stack_depot_put(meta->aux_stack[0]); - stack_depot_put(meta->aux_stack[1]); - - /* - * Zero out alloc meta to mark it as invalid but keep aux_lock - * initialized to avoid having to reinitialize it when another object - * is allocated in the same slot. - */ - __memset(&meta->alloc_track, 0, sizeof(meta->alloc_track)); - __memset(meta->aux_stack, 0, sizeof(meta->aux_stack)); + /* Zero out alloc meta to mark it as invalid. */ + __memset(meta, 0, sizeof(*meta)); } static void release_free_meta(const void *object, struct kasan_free_meta *meta) @@ -526,27 +506,10 @@ static void release_free_meta(const void *object, struct kasan_free_meta *meta) if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREE_META) return; - /* Evict the stack trace from the stack depot. */ - stack_depot_put(meta->free_track.stack); - /* Mark free meta as invalid. */ *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREE; } -void kasan_release_object_meta(struct kmem_cache *cache, const void *object) -{ - struct kasan_alloc_meta *alloc_meta; - struct kasan_free_meta *free_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) - release_alloc_meta(alloc_meta); - - free_meta = kasan_get_free_meta(cache, object); - if (free_meta) - release_free_meta(object, free_meta); -} - size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object) { struct kasan_cache *info = &cache->kasan_info; @@ -571,8 +534,6 @@ static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags) struct kmem_cache *cache; struct kasan_alloc_meta *alloc_meta; void *object; - depot_stack_handle_t new_handle, old_handle; - unsigned long flags; if (is_kfence_address(addr) || !slab) return; @@ -583,33 +544,18 @@ static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags) if (!alloc_meta) return; - new_handle = kasan_save_stack(0, depot_flags); - - /* - * Temporarily disable KASAN bug reporting to allow instrumented - * spinlock functions to access aux_lock, which resides inside of a - * redzone. - */ - kasan_disable_current(); - raw_spin_lock_irqsave(&alloc_meta->aux_lock, flags); - old_handle = alloc_meta->aux_stack[1]; alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0]; - alloc_meta->aux_stack[0] = new_handle; - raw_spin_unlock_irqrestore(&alloc_meta->aux_lock, flags); - kasan_enable_current(); - - stack_depot_put(old_handle); + alloc_meta->aux_stack[0] = kasan_save_stack(0, depot_flags); } void kasan_record_aux_stack(void *addr) { - return __kasan_record_aux_stack(addr, - STACK_DEPOT_FLAG_CAN_ALLOC | STACK_DEPOT_FLAG_GET); + return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_CAN_ALLOC); } void kasan_record_aux_stack_noalloc(void *addr) { - return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_GET); + return __kasan_record_aux_stack(addr, 0); } void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) @@ -620,7 +566,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) if (!alloc_meta) return; - /* Evict previous stack traces (might exist for krealloc or mempool). */ + /* Invalidate previous stack traces (might exist for krealloc or mempool). 
*/ release_alloc_meta(alloc_meta); kasan_save_track(&alloc_meta->alloc_track, flags); @@ -634,7 +580,7 @@ void kasan_save_free_info(struct kmem_cache *cache, void *object) if (!free_meta) return; - /* Evict previous stack trace (might exist for mempool). */ + /* Invalidate previous stack trace (might exist for mempool). */ release_free_meta(object, free_meta); kasan_save_track(&free_meta->free_track, 0); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index d0f172f2b9783f..fb2b9ac0659a7a 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) @@ -265,13 +264,6 @@ struct kasan_global { struct kasan_alloc_meta { struct kasan_track alloc_track; /* Free track is stored in kasan_free_meta. */ - /* - * aux_lock protects aux_stack from accesses from concurrent - * kasan_record_aux_stack calls. It is a raw spinlock to avoid sleeping - * on RT kernels, as kasan_record_aux_stack_noalloc can be called from - * non-sleepable contexts. - */ - raw_spinlock_t aux_lock; depot_stack_handle_t aux_stack[2]; }; @@ -398,10 +390,8 @@ struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, const void *object); void kasan_init_object_meta(struct kmem_cache *cache, const void *object); -void kasan_release_object_meta(struct kmem_cache *cache, const void *object); #else static inline void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { } -static inline void kasan_release_object_meta(struct kmem_cache *cache, const void *object) { } #endif depot_stack_handle_t kasan_save_stack(gfp_t flags, depot_flags_t depot_flags); diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 3ba02efb952aac..6958aa713c67ee 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -145,7 +145,10 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) void *object = qlink_to_object(qlink, cache); struct kasan_free_meta *free_meta = kasan_get_free_meta(cache, object); - kasan_release_object_meta(cache, object); + /* + * Note: Keep per-object metadata to allow KASAN print stack traces for + * use-after-free-before-realloc bugs. + */ /* * If init_on_free is enabled and KASAN's free metadata is stored in From 294c14634205e1a28009439c9753a15e830629a7 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Mon, 5 Feb 2024 15:24:42 -0800 Subject: [PATCH 0933/1406] mm/swap_state: update zswap LRU's protection range with the folio locked When a folio is swapped in, the protection size of the corresponding zswap LRU is incremented, so that the zswap shrinker is more conservative with its reclaiming action. This field is embedded within the struct lruvec, so updating it requires looking up the folio's memcg and lruvec. However, currently this lookup can happen after the folio is unlocked, for instance if a new folio is allocated, and swap_read_folio() unlocks the folio before returning. In this scenario, there is no stability guarantee for the binding between a folio and its memcg and lruvec: * A folio's memcg and lruvec can be freed between the lookup and the update, leading to a UAF. * Folio migration can clear the now-unlocked folio's memcg_data, which directs the zswap LRU protection size update towards the root memcg instead of the original memcg. This was recently picked up by the syzbot thanks to a warning in the inlined folio_lruvec() call. 
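A condensed view of the old, unsafe ordering in swap_cluster_readahead(), as visible on the removed side of the diff below:

    folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
                                    &page_allocated, false);
    if (unlikely(page_allocated))
            swap_read_folio(folio, false, NULL);    /* unlocks the folio */
    zswap_folio_swapin(folio);                      /* folio_lruvec() on an
                                                     * unlocked folio: its
                                                     * memcg binding is no
                                                     * longer stable */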
Move the zswap LRU protection range update above the swap_read_folio() call, and only when a new page is allocated, to prevent this. Link: https://lkml.kernel.org/r/20240205232442.3240571-1-nphamcs@gmail.com Fixes: b5ba474f3f51 ("zswap: shrink zswap pool based on memory pressure") Reported-by: syzbot+17a611d10af7d18a7092@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000ae47f90610803260@google.com/ Signed-off-by: Nhat Pham Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/swap_state.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index e671266ad77241..7255c01a1e4e16 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -680,9 +680,10 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, /* The page was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); - if (unlikely(page_allocated)) + if (unlikely(page_allocated)) { + zswap_folio_swapin(folio); swap_read_folio(folio, false, NULL); - zswap_folio_swapin(folio); + } return folio; } @@ -855,9 +856,10 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, /* The folio was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, &page_allocated, false); - if (unlikely(page_allocated)) + if (unlikely(page_allocated)) { + zswap_folio_swapin(folio); swap_read_folio(folio, false, NULL); - zswap_folio_swapin(folio); + } return folio; } From 97b552747a65042c526bc462f8c79f9fdf14f9ba Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Tue, 6 Feb 2024 10:08:55 -0800 Subject: [PATCH 0934/1406] mm/swap_state: update zswap LRU's protection range with the folio locked add VM_WARN_ON_ONCE() to zswap_folio_swapin() Link: https://lkml.kernel.org/r/20240206180855.3987204-1-nphamcs@gmail.com Reported-by: syzbot+17a611d10af7d18a7092@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000ae47f90610803260@google.com/ Fixes: b5ba474f3f51 ("zswap: shrink zswap pool based on memory pressure") Signed-off-by: Nhat Pham Cc: Chengming Zhou Cc: Johannes Weiner Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/zswap.c b/mm/zswap.c index d2423247acfd64..9735e34dfca128 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -378,6 +378,7 @@ void zswap_folio_swapin(struct folio *folio) struct lruvec *lruvec; if (folio) { + VM_WARN_ON_ONCE(!folio_test_locked(folio)); lruvec = folio_lruvec(folio); atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); } From 878baa40677e3694c8381fd3dfab3407f0db8e3f Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Tue, 6 Feb 2024 11:13:55 -0800 Subject: [PATCH 0935/1406] mm/swap_state: update zswap LRU's protection range with the folio locked (fix) The if (folio) checks inside zswap_folio_swapin() is no longer needed. 
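Since every remaining caller now passes a freshly allocated, locked folio, the helper keeps only the VM_WARN_ON_ONCE() assertion of that contract plus the counter update. For readers outside the kernel tree, the once-per-callsite warning pattern can be modeled in plain C along these lines (an illustrative userspace analogue, not a kernel API):

    #include <stdio.h>

    /* Fire at most once per call site, like WARN_ON_ONCE(). */
    #define warn_on_once(cond) do {                                  \
            static int warned;                                       \
            if ((cond) && !warned) {                                 \
                    warned = 1;                                      \
                    fprintf(stderr, "warning: %s at %s:%d\n",        \
                            #cond, __FILE__, __LINE__);              \
            }                                                        \
    } while (0)

VM_WARN_ON_ONCE() additionally compiles down to almost nothing unless CONFIG_DEBUG_VM is set, so the assertion costs nothing in production builds.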
Link: https://lkml.kernel.org/r/20240206191355.83755-1-nphamcs@gmail.com Suggested-by: Johannes Weiner Signed-off-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 9735e34dfca128..36903d938c15e2 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -377,11 +377,9 @@ void zswap_folio_swapin(struct folio *folio) { struct lruvec *lruvec; - if (folio) { - VM_WARN_ON_ONCE(!folio_test_locked(folio)); - lruvec = folio_lruvec(folio); - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); - } + VM_WARN_ON_ONCE(!folio_test_locked(folio)); + lruvec = folio_lruvec(folio); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); } /********************************* From 1bbdf326c5f9f10b2e184abea02f565269746e17 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 7 Feb 2024 02:25:59 +0800 Subject: [PATCH 0936/1406] mm/swap: fix race when skipping swapcache When skipping swapcache for SWP_SYNCHRONOUS_IO, if two or more threads swapin the same entry at the same time, they get different pages (A, B). Before one thread (T0) finishes the swapin and installs page (A) to the PTE, another thread (T1) could finish swapin of page (B), swap_free the entry, then swap out the possibly modified page reusing the same entry. It breaks the pte_same check in (T0) because the PTE value is unchanged, causing an ABA problem. Thread (T0) will install a stale page (A) into the PTE and cause data corruption. One possible callstack is like this: CPU0 CPU1 ---- ---- do_swap_page() do_swap_page() with same entry swap_read_folio() <- read to page A swap_read_folio() <- read to page B ... set_pte_at() swap_free() <- entry is free pte_same() <- Check pass, PTE seems unchanged, but page A is stale! swap_free() <- page B content lost! set_pte_at() <- stale page A installed! And besides, for ZRAM, swap_free() allows the swap device to discard the entry content, so even if page (B) is not modified, if swap_read_folio() on CPU0 happens later than swap_free() on CPU1, it may also cause data loss. To fix this, reuse swapcache_prepare which will pin the swap entry using the cache flag, and allow only one thread to swap it in, also preventing any parallel code from putting the entry in the cache. Release the pin after the page table is unlocked. Racers just loop and wait since it's a rare and very short event. A schedule_timeout_uninterruptible(1) call is added to avoid repeated page faults wasting too much CPU, causing livelock or adding too much noise to perf statistics. A similar livelock issue was described in commit 029c4628b2eb ("mm: swap: get rid of livelock in swapin readahead") Reproducer: This race issue can be triggered easily using a well-constructed reproducer and patched brd (with a delay in read path) [1]: With the latest 6.8 mainline, data loss caused by this race can be observed easily: $ gcc -g -lpthread test-thread-swap-race.c && ./a.out Polulating 32MB of memory region... Keep swapping out... Starting round 0... Spawning 65536 workers... 32746 workers spawned, wait for done... Round 0: Error on 0x5aa00, expected 32746, got 32743, 3 data loss! Round 0: Error on 0x395200, expected 32746, got 32743, 3 data loss! Round 0: Error on 0x3fd000, expected 32746, got 32737, 9 data loss! Round 0 Failed, 15 data loss! This reproducer spawns multiple threads sharing the same memory region using a small swap device.
Every two threads updates mapped pages one by one in opposite direction trying to create a race, with one dedicated thread keep swapping out the data out using madvise. The reproducer created a reproduce rate of about once every 5 minutes, so the race should be totally possible in production. After this patch, I ran the reproducer for over a few hundred rounds and no data loss observed. Performance overhead is minimal, microbenchmark swapin 10G from 32G zram: Before: 10934698 us After: 11157121 us Cached: 13155355 us (Dropping SWP_SYNCHRONOUS_IO flag) Link: https://lkml.kernel.org/r/20240206182559.32264-1-ryncsn@gmail.com Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of synchronous device") Reported-by: "Huang, Ying" Closes: https://lore.kernel.org/lkml/87bk92gqpx.fsf_-_@yhuang6-desk2.ccr.corp.intel.com/ Link: https://github.com/ryncsn/emm-test-project/tree/master/swap-stress-race [1] Signed-off-by: Kairui Song Reviewed-by: "Huang, Ying" Acked-by: Yu Zhao Acked-by: David Hildenbrand Acked-by: Chris Li Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Minchan Kim Cc: Yosry Ahmed Cc: Yu Zhao Cc: Barry Song <21cnbao@gmail.com> Cc: SeongJae Park Cc: Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 +++++ mm/memory.c | 15 +++++++++++++++ mm/swap.h | 5 +++++ mm/swapfile.c | 13 +++++++++++++ 4 files changed, 38 insertions(+) diff --git a/include/linux/swap.h b/include/linux/swap.h index 4db00ddad26169..8d28f6091a320e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -549,6 +549,11 @@ static inline int swap_duplicate(swp_entry_t swp) return 0; } +static inline int swapcache_prepare(swp_entry_t swp) +{ + return 0; +} + static inline void swap_free(swp_entry_t swp) { } diff --git a/mm/memory.c b/mm/memory.c index 15f8b10ea17c4f..64d5ee2aabd23e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3867,6 +3867,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) { + /* + * Prevent parallel swapin from proceeding with + * the cache flag. Otherwise, another thread may + * finish swapin first, free the entry, and swapout + * reusing the same entry. It's undetectable as + * pte_same() returns true due to entry reuse. 
+ */ + if (swapcache_prepare(entry)) + goto out; + /* skip swapcache */ folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address, false); @@ -4116,6 +4126,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); + /* Clear the swap cache pin for direct swapin after PTL unlock */ + if (folio && !swapcache) + swapcache_clear(si, entry); out: if (si) put_swap_device(si); @@ -4124,6 +4137,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: + if (!swapcache) + swapcache_clear(si, entry); folio_unlock(folio); out_release: folio_put(folio); diff --git a/mm/swap.h b/mm/swap.h index 758c46ca671ed1..fc2f6ade7f80b3 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -41,6 +41,7 @@ void __delete_from_swap_cache(struct folio *folio, void delete_from_swap_cache(struct folio *folio); void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); +void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry); struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); struct folio *filemap_get_incore_folio(struct address_space *mapping, @@ -97,6 +98,10 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc) return 0; } +static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry) +{ +} + static inline struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 556ff7347d5f04..746aa9da530255 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3365,6 +3365,19 @@ int swapcache_prepare(swp_entry_t entry) return __swap_duplicate(entry, SWAP_HAS_CACHE); } +void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry) +{ + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + unsigned char usage; + + ci = lock_cluster_or_swap_info(si, offset); + usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE); + unlock_cluster_or_swap_info(si, ci); + if (!usage) + free_swap_slot(entry); +} + struct swap_info_struct *swp_swap_info(swp_entry_t entry) { return swap_type_to_swap_info(swp_type(entry)); From e55acdfcca6be072331e2656469308b8eb4f9c60 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 19 Feb 2024 16:20:40 +0800 Subject: [PATCH 0937/1406] mm-swap-fix-race-when-skipping-swapcache-v4 Add a schedule() if raced to prevent repeated page faults wasting CPU and add noise to perf statistics. Use a bool to state the special case instead of reusing existing variables fixing error handling [Minchan Kim]. Use schedule_timeout_uninterruptible(1) for now instead of schedule() to prevent the busy faulting task holds CPU and livelocks [Huang, Ying]. 
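Taken together with the previous patch, the fault path now follows a trylock-like shape: pin the entry via the swap cache flag, and if another thread already holds the pin, back off for a tick and let the fault retry. Condensed from the two diffs (error paths trimmed):

    if (swapcache_prepare(entry)) {
            /* Another thread is swapping the entry in; relax a bit
             * to prevent rapid repeated page faults, then retry. */
            schedule_timeout_uninterruptible(1);
            goto out;
    }
    need_clear_cache = true;

    /* ... read the page and install the PTE under the PTL ... */

    out:
            /* Clear the swap cache pin only after the PTL is unlocked. */
            if (need_clear_cache)
                    swapcache_clear(si, entry);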
Link: https://lkml.kernel.org/r/20240219082040.7495-1-ryncsn@gmail.com Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of synchronous device") Link: https://github.com/ryncsn/emm-test-project/tree/master/swap-stress-race [1] Reported-by: "Huang, Ying" Closes: https://lore.kernel.org/lkml/87bk92gqpx.fsf_-_@yhuang6-desk2.ccr.corp.intel.com/ Signed-off-by: Kairui Song Reviewed-by: "Huang, Ying" Acked-by: David Hildenbrand Acked-by: Chris Li Cc: Yu Zhao Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Minchan Kim Cc: Yosry Ahmed Cc: Yu Zhao Cc: Barry Song <21cnbao@gmail.com> Cc: SeongJae Park Signed-off-by: Andrew Morton --- mm/memory.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 64d5ee2aabd23e..0bfc8b007c01a3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3799,6 +3799,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) struct page *page; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; + bool need_clear_cache = false; bool exclusive = false; swp_entry_t entry; pte_t pte; @@ -3874,8 +3875,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * reusing the same entry. It's undetectable as * pte_same() returns true due to entry reuse. */ - if (swapcache_prepare(entry)) + if (swapcache_prepare(entry)) { + /* Relax a bit to prevent rapid repeated page faults */ + schedule_timeout_uninterruptible(1); goto out; + } + need_clear_cache = true; /* skip swapcache */ folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, @@ -4126,10 +4131,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); +out: /* Clear the swap cache pin for direct swapin after PTL unlock */ - if (folio && !swapcache) + if (need_clear_cache) swapcache_clear(si, entry); -out: if (si) put_swap_device(si); return ret; @@ -4137,8 +4142,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: - if (!swapcache) - swapcache_clear(si, entry); folio_unlock(folio); out_release: folio_put(folio); @@ -4146,6 +4149,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) folio_unlock(swapcache); folio_put(swapcache); } + if (need_clear_cache) + swapcache_clear(si, entry); if (si) put_swap_device(si); return ret; From fda5ab1c59d19b02de769748db7a370203b3696f Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 8 Feb 2024 07:30:10 -0800 Subject: [PATCH 0938/1406] lib/Kconfig.debug: TEST_IOV_ITER depends on MMU Trying to run the iov_iter unit test on a nommu system such as the qemu kc705-nommu emulation results in a crash. KTAP version 1 # Subtest: iov_iter # module: kunit_iov_iter 1..9 BUG: failure at mm/nommu.c:318/vmap()! Kernel panic - not syncing: BUG! The test calls vmap() directly, but vmap() is not supported on nommu systems, causing the crash. TEST_IOV_ITER therefore needs to depend on MMU. 
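The fix makes the whole test depend on MMU, which is the simplest correct option here. The alternative, compiling the vmap() usage out on nommu kernels, would look roughly like this sketch (not what the patch does):

    #include <linux/vmalloc.h>

    static void *map_pages_contig(struct page **pages, unsigned int nr)
    {
    #ifdef CONFIG_MMU
            /* Build a virtually contiguous view of the pages. */
            return vmap(pages, nr, VM_MAP, PAGE_KERNEL);
    #else
            /* Without an MMU, vmap() is a BUG() stub (see the oops
             * above); callers must handle NULL or use the pages
             * individually. */
            return NULL;
    #endif
    }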
Link: https://lkml.kernel.org/r/20240208153010.1439753-1-linux@roeck-us.net Fixes: 2d71340ff1d4 ("iov_iter: Kunit tests for copying to/from an iterator") Signed-off-by: Guenter Roeck Cc: David Howells Cc: Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 975a07f9f1cc08..ef36b829ae1f55 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2235,6 +2235,7 @@ config TEST_DIV64 config TEST_IOV_ITER tristate "Test iov_iter operation" if !KUNIT_ALL_TESTS depends on KUNIT + depends on MMU default KUNIT_ALL_TESTS help Enable this to turn on testing of the operation of the I/O iterator From 2ea8f48cd4f4a286a1b25c35a4017cb6381f7fd4 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 8 Feb 2024 02:32:54 +0000 Subject: [PATCH 0939/1406] mm/zswap: invalidate duplicate entry when !zswap_enabled We have to invalidate any duplicate entry even when !zswap_enabled since zswap can be disabled anytime. If the folio was stored successfully before, then gets dirtied again while zswap is disabled, we won't invalidate the old duplicate entry in zswap_store(), so later LRU writeback may overwrite the new data in the swapfile. Link: https://lkml.kernel.org/r/20240208023254.3873823-1-chengming.zhou@linux.dev Fixes: 42c06a0e8ebe ("mm: kill frontswap") Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Cc: Nhat Pham Cc: Yosry Ahmed Cc: Signed-off-by: Andrew Morton --- mm/zswap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index 36903d938c15e2..db4625af65fb7f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1518,7 +1518,7 @@ bool zswap_store(struct folio *folio) if (folio_test_large(folio)) return false; - if (!zswap_enabled || !tree) + if (!tree) return false; /* @@ -1533,6 +1533,10 @@ bool zswap_store(struct folio *folio) zswap_invalidate_entry(tree, dupentry); } spin_unlock(&tree->lock); + + if (!zswap_enabled) + return false; + objcg = get_obj_cgroup_from_folio(folio); if (objcg && !obj_cgroup_may_zswap(objcg)) { memcg = get_mem_cgroup_from_objcg(objcg); From 648385d1e2225b9eb7be194df1e5548a96244dca Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 9 Feb 2024 08:39:12 +0530 Subject: [PATCH 0940/1406] mm/memblock: add MEMBLOCK_RSRV_NOINIT into flagname[] array The commit 77e6c43e137c ("memblock: introduce MEMBLOCK_RSRV_NOINIT flag") skipped adding this newly introduced memblock flag into the flagname[] array, thus preventing correct memblock flags output for applicable memblock regions.
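flagname[] maps each flag to its string by bit position, so a flag missing from the table silently prints nothing. A sketch of the pattern, with a compile-time check that would catch future omissions (the flag names are made up and the assertion is illustrative, not part of the patch):

    enum {
            F_HOTPLUG       = 0x1,
            F_MIRROR        = 0x2,
            F_NOMAP         = 0x4,
            F_KNOWN_BITS    = 3,    /* number of defined flag bits */
    };

    static const char * const flagname[] = {
            [ilog2(F_HOTPLUG)]      = "HOTPLUG",
            [ilog2(F_MIRROR)]       = "MIRROR",
            [ilog2(F_NOMAP)]        = "NOMAP",
    };

    /* Fails the build when a flag bit lacks a name entry. */
    static_assert(ARRAY_SIZE(flagname) == F_KNOWN_BITS);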
Link: https://lkml.kernel.org/r/20240209030912.1382251-1-anshuman.khandual@arm.com Fixes: 77e6c43e137c ("memblock: introduce MEMBLOCK_RSRV_NOINIT flag") Signed-off-by: Anshuman Khandual Reviewed-by: Mike Rapoport Cc: Signed-off-by: Andrew Morton --- mm/memblock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/memblock.c b/mm/memblock.c index 4dcb2ee35eca85..d9f4b82cbffeb8 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2249,6 +2249,7 @@ static const char * const flagname[] = { [ilog2(MEMBLOCK_MIRROR)] = "MIRROR", [ilog2(MEMBLOCK_NOMAP)] = "NOMAP", [ilog2(MEMBLOCK_DRIVER_MANAGED)] = "DRV_MNG", + [ilog2(MEMBLOCK_RSRV_NOINIT)] = "RSV_NIT", }; static int memblock_debug_show(struct seq_file *m, void *private) From 87cb524a6a3420684a8939e0c172c400784479ec Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 13 Feb 2024 03:16:34 -0500 Subject: [PATCH 0941/1406] mm: memcontrol: clarify swapaccount=0 deprecation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The swapaccount deprecation warning is throwing false positives. Since we deprecated the knob and defaulted to enabling, the only reports we've been getting are from folks that set swapaccount=1. While this is a nice affirmation that always-enabling was the right choice, we certainly don't want to warn when users request the supported mode. Only warn when disabling is requested, and clarify the warning. Link: https://lkml.kernel.org/r/20240213081634.3652326-1-hannes@cmpxchg.org Fixes: b25806dcd3d5 ("mm: memcontrol: deprecate swapaccounting=0 mode") Cc: stable@vger.kernel.org Reported-by: "Jonas Schäfer" Reported-by: Narcis Garcia Suggested-by: Yosry Ahmed Signed-off-by: Johannes Weiner Reviewed-by: Yosry Ahmed Acked-by: Michal Hocko Acked-by: Shakeel Butt Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1ed40f9d3a277e..107ec5d36819bf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7971,9 +7971,13 @@ bool mem_cgroup_swap_full(struct folio *folio) static int __init setup_swap_account(char *s) { - pr_warn_once("The swapaccount= commandline option is deprecated. " - "Please report your usecase to linux-mm@kvack.org if you " - "depend on this functionality.\n"); + bool res; + + if (!kstrtobool(s, &res) && !res) + pr_warn_once("The swapaccount=0 commdandline option is deprecated " + "in favor of configuring swap control via cgroupfs. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); return 1; } __setup("swapaccount=", setup_swap_account); From 28c20622ca5954b1c19a77083a4919f86014d4e6 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 15 Feb 2024 09:05:44 +0000 Subject: [PATCH 0942/1406] mm: Fix spelling mistake "commdandline" -> "commandline" There is a spelling mistake in a pr_warn_once message. Fix it. 
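For reference, kstrtobool() returns 0 on success and understands more spellings than "0" and "1": at least y/Y/1 for true and n/N/0 for false, plus "on" and "off" (per lib/kstrtox.c). Typical usage mirrors the handler above (apply_setting() and arg are placeholders):

    bool val;

    if (!kstrtobool(arg, &val))     /* 0 means parsed successfully */
            apply_setting(val);     /* "1", "y", "on"  -> true
                                     * "0", "n", "off" -> false */
    else
            pr_err("invalid boolean '%s'\n", arg);

Hence the deprecation warning fires for swapaccount=0, =n and =off alike.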
Link: https://lkml.kernel.org/r/20240215090544.1649201-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 107ec5d36819bf..61932c9215e773 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7974,7 +7974,7 @@ static int __init setup_swap_account(char *s) bool res; if (!kstrtobool(s, &res) && !res) - pr_warn_once("The swapaccount=0 commdandline option is deprecated " + pr_warn_once("The swapaccount=0 commandline option is deprecated " "in favor of configuring swap control via cgroupfs. " "Please report your usecase to linux-mm@kvack.org if you " "depend on this functionality.\n"); From 0193bcaa0ca74746bc80ebb6ffd12869680efb28 Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Tue, 13 Feb 2024 14:39:58 +1100 Subject: [PATCH 0943/1406] kasan: guard release_free_meta() shadow access with kasan_arch_is_ready() release_free_meta() accesses the shadow directly through the path kasan_slab_free __kasan_slab_free kasan_release_object_meta release_free_meta kasan_mem_to_shadow There are no kasan_arch_is_ready() guards here, allowing an oops when the shadow is not initialized. The oops can be seen on a Power8 KVM guest. This patch adds the guard to release_free_meta(), as it's the first level that specifically requires the shadow. It is safe to put the guard at the start of this function, before the stack put: only kasan_save_free_info() can initialize the saved stack, which itself is guarded with kasan_arch_is_ready() by its caller poison_slab_object(). If the arch becomes ready before release_free_meta() then we will not observe KASAN_SLAB_FREE_META in the object's shadow, so we will not put an uninitialized stack either. Link: https://lkml.kernel.org/r/20240213033958.139383-1-bgray@linux.ibm.com Fixes: 63b85ac56a64 ("kasan: stop leaking stack trace handles") Signed-off-by: Benjamin Gray Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Michael Ellerman Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/generic.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index fc9cf1860efb34..1900f857603456 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -502,6 +502,9 @@ static void release_alloc_meta(struct kasan_alloc_meta *meta) static void release_free_meta(const void *object, struct kasan_free_meta *meta) { + if (!kasan_arch_is_ready()) + return; + /* Check if free meta is valid. */ if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREE_META) return; From 17cdc59c7a70e0ff66c034c938b2a76a3b5f4c87 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 12 Feb 2024 18:36:32 -0800 Subject: [PATCH 0944/1406] mm/damon/sysfs-schemes: handle schemes sysfs dir removal before commit_schemes_quota_goals 'commit_schemes_quota_goals' command handler, damos_sysfs_set_quota_scores() assumes the number of schemes sysfs directory will be same to the number of schemes of the DAMON context. The assumption is wrong since users can remove schemes sysfs directories while DAMON is running. In the case, illegal memory accesses can happen. Fix it by checking the case. 
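The general rule behind the fix: when walking two loosely synchronized containers in parallel (here, the context's scheme list and the sysfs directory array), re-check the bound of the side that can shrink on every iteration instead of assuming equal lengths. The change in the diff below has exactly this shape; update_quota_score() stands in for the real damos_sysfs_set_quota_score() call:

    i = 0;
    damon_for_each_scheme(scheme, ctx) {
            /* The user may have removed scheme sysfs dirs meanwhile. */
            if (i >= sysfs_schemes->nr)
                    break;
            update_quota_score(sysfs_schemes->schemes_arr[i], scheme);
            i++;
    }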
Link: https://lkml.kernel.org/r/20240213023633.124928-1-sj@kernel.org Fixes: d91beaa505a0 ("mm/damon/sysfs-schemes: implement a command for scheme quota goals only commit") Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index dd2fb512700920..ae0f0b314f3a9a 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1905,6 +1905,10 @@ void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, damon_for_each_scheme(scheme, ctx) { struct damon_sysfs_scheme *sysfs_scheme; + /* user could have removed the scheme sysfs dir */ + if (i >= sysfs_schemes->nr) + break; + sysfs_scheme = sysfs_schemes->schemes_arr[i]; damos_sysfs_set_quota_score(sysfs_scheme->quotas->goals, &scheme->quota); From 04a4c28633193d8dfbb0f69e4f4c10e068de254b Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 19 Feb 2024 12:50:50 -0800 Subject: [PATCH 0945/1406] MAINTAINERS: mailmap: update Shakeel's email address Moving to linux.dev based email for kernel work. Link: https://lkml.kernel.org/r/20240219205050.887810-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Signed-off-by: Andrew Morton --- .mailmap | 1 + MAINTAINERS | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index b99a238ee3bde1..08f28f2999f0dc 100644 --- a/.mailmap +++ b/.mailmap @@ -553,6 +553,7 @@ Senthilkumar N L Serge Hallyn Serge Hallyn Seth Forshee +Shakeel Butt Shannon Nelson Shannon Nelson Shannon Nelson diff --git a/MAINTAINERS b/MAINTAINERS index 9ed4d38685394d..c3c9cf33595cc7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5378,7 +5378,7 @@ CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG) M: Johannes Weiner M: Michal Hocko M: Roman Gushchin -M: Shakeel Butt +M: Shakeel Butt R: Muchun Song L: cgroups@vger.kernel.org L: linux-mm@kvack.org From c5210ef03786ee2144f25a66ba595133adae2ffa Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Feb 2024 11:40:24 -0800 Subject: [PATCH 0946/1406] mm/damon/reclaim: fix quota status loss due to online tunings Patch series "mm/damon: fix quota status loss due to online tunings". DAMON_RECLAIM and DAMON_LRU_SORT are not preserving internal quota status when applying new user parameters, and hence can cause temporary quota accuracy degradation. Fix it by preserving the status. This patch (of 2): For online parameter changes, DAMON_RECLAIM creates a new scheme based on the latest values of the parameters and replaces the old scheme with the new one. When creating it, the internal status of the quota of the old scheme is not preserved. As a result, charging of the quota starts from zero after the online tuning. The data collected to estimate the throughput of the scheme's action is also reset, so the estimation has to start from scratch again. Because the throughput estimation is used to convert the time quota to the effective size quota, this could result in temporary time quota inaccuracy. It would be recovered over time, though. In short, the quota accuracy could be temporarily degraded after an online parameter update. Fix the problem by checking the case and copying the internal fields for the status.
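The shape of the fix, carrying accumulated runtime state from the object being replaced into its replacement, generalizes beyond DAMON. A minimal sketch with made-up types (the real helper copies the six charge-tracking fields of struct damos_quota, as the diff below shows):

    struct tuner {
            /* user-tunable parameters live here; rebuilt freely */
            u64 charged_sz;         /* runtime accounting: must be    */
            u64 charged_ns;         /* carried over, or the quota     */
                                    /* charging restarts from zero    */
    };

    static void tuner_replace(struct tuner *new, const struct tuner *old)
    {
            new->charged_sz = old->charged_sz;
            new->charged_ns = old->charged_ns;
            /* ...then publish 'new' in place of 'old' under the lock */
    }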
Link: https://lkml.kernel.org/r/20240216194025.9207-1-sj@kernel.org Link: https://lkml.kernel.org/r/20240216194025.9207-2-sj@kernel.org Fixes: e035c280f6df ("mm/damon/reclaim: support online inputs update") Signed-off-by: SeongJae Park Cc: [5.19+] Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index ab974e477d2f28..66e190f0374ac8 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -150,9 +150,20 @@ static struct damos *damon_reclaim_new_scheme(void) &damon_reclaim_wmarks); } +static void damon_reclaim_copy_quota_status(struct damos_quota *dst, + struct damos_quota *src) +{ + dst->total_charged_sz = src->total_charged_sz; + dst->total_charged_ns = src->total_charged_ns; + dst->charged_sz = src->charged_sz; + dst->charged_from = src->charged_from; + dst->charge_target_from = src->charge_target_from; + dst->charge_addr_from = src->charge_addr_from; +} + static int damon_reclaim_apply_parameters(void) { - struct damos *scheme; + struct damos *scheme, *old_scheme; struct damos_filter *filter; int err = 0; @@ -164,6 +175,11 @@ static int damon_reclaim_apply_parameters(void) scheme = damon_reclaim_new_scheme(); if (!scheme) return -ENOMEM; + if (!list_empty(&ctx->schemes)) { + damon_for_each_scheme(old_scheme, ctx) + damon_reclaim_copy_quota_status(&scheme->quota, + &old_scheme->quota); + } if (skip_anon) { filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); if (!filter) { From 304e409769f465dfd93c402135e547309b8da2f8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Feb 2024 11:40:25 -0800 Subject: [PATCH 0947/1406] mm/damon/lru_sort: fix quota status loss due to online tunings For online parameters change, DAMON_LRU_SORT creates new schemes based on latest values of the parameters and replaces the old schemes with the new one. When creating it, the internal status of the quotas of the old schemes is not preserved. As a result, charging of the quota starts from zero after the online tuning. The data that collected to estimate the throughput of the scheme's action is also reset, and therefore the estimation should start from the scratch again. Because the throughput estimation is being used to convert the time quota to the effective size quota, this could result in temporal time quota inaccuracy. It would be recovered over time, though. In short, the quota accuracy could be temporarily degraded after online parameters update. Fix the problem by checking the case and copying the internal fields for the status. 
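Besides the status copy, note the error path this version needs: two schemes are allocated, and if the second allocation fails the first must be unwound before returning, the standard shape for acquiring multiple resources. Abbreviated from the diff below:

    hot_scheme = damon_lru_sort_new_hot_scheme(hot_thres);
    if (!hot_scheme)
            return -ENOMEM;

    cold_scheme = damon_lru_sort_new_cold_scheme(cold_thres);
    if (!cold_scheme) {
            /* Don't leak the first scheme when the second fails. */
            damon_destroy_scheme(hot_scheme);
            return -ENOMEM;
    }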
Link: https://lkml.kernel.org/r/20240216194025.9207-3-sj@kernel.org Fixes: 40e983cca927 ("mm/damon: introduce DAMON-based LRU-lists Sorting") Signed-off-by: SeongJae Park Cc: [6.0+] Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 43 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index f2e5f9431892eb..3de2916a65c38c 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -185,9 +185,21 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO); } +static void damon_lru_sort_copy_quota_status(struct damos_quota *dst, + struct damos_quota *src) +{ + dst->total_charged_sz = src->total_charged_sz; + dst->total_charged_ns = src->total_charged_ns; + dst->charged_sz = src->charged_sz; + dst->charged_from = src->charged_from; + dst->charge_target_from = src->charge_target_from; + dst->charge_addr_from = src->charge_addr_from; +} + static int damon_lru_sort_apply_parameters(void) { - struct damos *scheme; + struct damos *scheme, *hot_scheme, *cold_scheme; + struct damos *old_hot_scheme = NULL, *old_cold_scheme = NULL; unsigned int hot_thres, cold_thres; int err = 0; @@ -195,18 +207,35 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; + damon_for_each_scheme(scheme, ctx) { + if (!old_hot_scheme) { + old_hot_scheme = scheme; + continue; + } + old_cold_scheme = scheme; + } + hot_thres = damon_max_nr_accesses(&damon_lru_sort_mon_attrs) * hot_thres_access_freq / 1000; - scheme = damon_lru_sort_new_hot_scheme(hot_thres); - if (!scheme) + hot_scheme = damon_lru_sort_new_hot_scheme(hot_thres); + if (!hot_scheme) return -ENOMEM; - damon_set_schemes(ctx, &scheme, 1); + if (old_hot_scheme) + damon_lru_sort_copy_quota_status(&hot_scheme->quota, + &old_hot_scheme->quota); cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval; - scheme = damon_lru_sort_new_cold_scheme(cold_thres); - if (!scheme) + cold_scheme = damon_lru_sort_new_cold_scheme(cold_thres); + if (!cold_scheme) { + damon_destroy_scheme(hot_scheme); return -ENOMEM; - damon_add_scheme(ctx, scheme); + } + if (old_cold_scheme) + damon_lru_sort_copy_quota_status(&cold_scheme->quota, + &old_cold_scheme->quota); + + damon_set_schemes(ctx, &hot_scheme, 1); + damon_add_scheme(ctx, cold_scheme); return damon_set_region_biggest_system_ram_default(target, &monitor_region_start, From 0565b4ed581ab00e81c4de21be28f7f5c190d6c8 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 16 Feb 2024 20:15:02 +0900 Subject: [PATCH 0948/1406] mm/vmscan: fix a bug calling wakeup_kswapd() with a wrong zone index With numa balancing on, when a numa system is running where a numa node doesn't have its local memory so it has no managed zones, the following oops has been observed. It's because wakeup_kswapd() is called with a wrong zone index, -1. Fixed it by checking the index before calling wakeup_kswapd(). 
> BUG: unable to handle page fault for address: 00000000000033f3 > #PF: supervisor read access in kernel mode > #PF: error_code(0x0000) - not-present page > PGD 0 P4D 0 > Oops: 0000 [#1] PREEMPT SMP NOPTI > CPU: 2 PID: 895 Comm: masim Not tainted 6.6.0-dirty #255 > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS > rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 > RIP: 0010:wakeup_kswapd (./linux/mm/vmscan.c:7812) > Code: (omitted) > RSP: 0000:ffffc90004257d58 EFLAGS: 00010286 > RAX: ffffffffffffffff RBX: ffff88883fff0480 RCX: 0000000000000003 > RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88883fff0480 > RBP: ffffffffffffffff R08: ff0003ffffffffff R09: ffffffffffffffff > R10: ffff888106c95540 R11: 0000000055555554 R12: 0000000000000003 > R13: 0000000000000000 R14: 0000000000000000 R15: ffff88883fff0940 > FS: 00007fc4b8124740(0000) GS:ffff888827c00000(0000) knlGS:0000000000000000 > CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > CR2: 00000000000033f3 CR3: 000000026cc08004 CR4: 0000000000770ee0 > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 > PKRU: 55555554 > Call Trace: > > ? __die > ? page_fault_oops > ? __pte_offset_map_lock > ? exc_page_fault > ? asm_exc_page_fault > ? wakeup_kswapd > migrate_misplaced_page > __handle_mm_fault > handle_mm_fault > do_user_addr_fault > exc_page_fault > asm_exc_page_fault > RIP: 0033:0x55b897ba0808 > Code: (omitted) > RSP: 002b:00007ffeefa821a0 EFLAGS: 00010287 > RAX: 000055b89983acd0 RBX: 00007ffeefa823f8 RCX: 000055b89983acd0 > RDX: 00007fc2f8122010 RSI: 0000000000020000 RDI: 000055b89983acd0 > RBP: 00007ffeefa821a0 R08: 0000000000000037 R09: 0000000000000075 > R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000000 > R13: 00007ffeefa82410 R14: 000055b897ba5dd8 R15: 00007fc4b8340000 > Link: https://lkml.kernel.org/r/20240216111502.79759-1-byungchul@sk.com Signed-off-by: Byungchul Park Reported-by: Hyeongtak Ji Fixes: c574bbe917036 ("NUMA balancing: optimize page placement for memory tiering system") Reviewed-by: Oscar Salvador Cc: Baolin Wang Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton --- mm/migrate.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/migrate.c b/mm/migrate.c index cc9f2bcd73b492..c27b1f8097d4a7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2519,6 +2519,14 @@ static int numamigrate_isolate_folio(pg_data_t *pgdat, struct folio *folio) if (managed_zone(pgdat->node_zones + z)) break; } + + /* + * If there are no managed zones, it should not proceed + * further. + */ + if (z < 0) + return 0; + wakeup_kswapd(pgdat->node_zones + z, 0, folio_order(folio), ZONE_MOVABLE); return 0; From d1e64f82cdbb85411b303523cb5bc02b52b02ae4 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 20 Feb 2024 06:44:10 +0000 Subject: [PATCH 0949/1406] MAINTAINERS: add memory mapping entry with reviewers Recently there have been a number of patches which have affected various aspects of the memory mapping logic as implemented in mm/mmap.c where it would have been useful for regular contributors to have been notified. Add an entry for this part of mm in particular with regular contributors tagged as reviewers. Link: https://lkml.kernel.org/r/20240220064410.4639-1-lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Acked-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- MAINTAINERS | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index c3c9cf33595cc7..f7c81cea9b69e5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14111,6 +14111,17 @@ F: mm/ F: tools/mm/ F: tools/testing/selftests/mm/ +MEMORY MAPPING +M: Andrew Morton +R: Liam R. Howlett +R: Vlastimil Babka +R: Lorenzo Stoakes +L: linux-mm@kvack.org +S: Maintained +W: http://www.linux-mm.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: mm/mmap.c + MEMORY TECHNOLOGY DEVICES (MTD) M: Miquel Raynal M: Richard Weinberger From 25daab033dbd6da41a58f044318c18e5fc8171e8 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Mon, 19 Feb 2024 19:01:21 -0800 Subject: [PATCH 0950/1406] mm: cachestat: fix folio read-after-free in cache walk In cachestat, we access the folio from the page cache's xarray to compute its page offset, and check for its dirty and writeback flags. However, we do not hold a reference to the folio before performing these actions, which means the folio can concurrently be released and reused as another folio/page/slab. Get around this altogether by just using xarray's existing machinery for the folio page offsets and dirty/writeback states. This changes behavior for tmpfs files to now always report zeroes in their dirty and writeback counters. This is okay as tmpfs doesn't follow conventional writeback cache behavior: its pages get "cleaned" during swapout, after which they're no longer resident etc. Link: https://lkml.kernel.org/r/20240220153409.GA216065@cmpxchg.org Fixes: cf264e1329fb ("cachestat: implement cachestat syscall") Reported-by: Jann Horn Suggested-by: Matthew Wilcox Signed-off-by: Nhat Pham Signed-off-by: Johannes Weiner Tested-by: Jann Horn Cc: [6.4+] Signed-off-by: Andrew Morton --- mm/filemap.c | 51 ++++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 750e779c23db74..4a30de98a8c75d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -4111,28 +4111,40 @@ static void filemap_cachestat(struct address_space *mapping, rcu_read_lock(); xas_for_each(&xas, folio, last_index) { + int order; unsigned long nr_pages; pgoff_t folio_first_index, folio_last_index; + /* + * Don't deref the folio. It is not pinned, and might + * get freed (and reused) underneath us. + * + * We *could* pin it, but that would be expensive for + * what should be a fast and lightweight syscall. + * + * Instead, derive all information of interest from + * the rcu-protected xarray. 
+ */ + if (xas_retry(&xas, folio)) continue; + order = xa_get_order(xas.xa, xas.xa_index); + nr_pages = 1 << order; + folio_first_index = round_down(xas.xa_index, 1 << order); + folio_last_index = folio_first_index + nr_pages - 1; + + /* Folios might straddle the range boundaries, only count covered pages */ + if (folio_first_index < first_index) + nr_pages -= first_index - folio_first_index; + + if (folio_last_index > last_index) + nr_pages -= folio_last_index - last_index; + if (xa_is_value(folio)) { /* page is evicted */ void *shadow = (void *)folio; bool workingset; /* not used */ - int order = xa_get_order(xas.xa, xas.xa_index); - - nr_pages = 1 << order; - folio_first_index = round_down(xas.xa_index, 1 << order); - folio_last_index = folio_first_index + nr_pages - 1; - - /* Folios might straddle the range boundaries, only count covered pages */ - if (folio_first_index < first_index) - nr_pages -= first_index - folio_first_index; - - if (folio_last_index > last_index) - nr_pages -= folio_last_index - last_index; cs->nr_evicted += nr_pages; @@ -4150,24 +4162,13 @@ static void filemap_cachestat(struct address_space *mapping, goto resched; } - nr_pages = folio_nr_pages(folio); - folio_first_index = folio_pgoff(folio); - folio_last_index = folio_first_index + nr_pages - 1; - - /* Folios might straddle the range boundaries, only count covered pages */ - if (folio_first_index < first_index) - nr_pages -= first_index - folio_first_index; - - if (folio_last_index > last_index) - nr_pages -= folio_last_index - last_index; - /* page is in cache */ cs->nr_cache += nr_pages; - if (folio_test_dirty(folio)) + if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY)) cs->nr_dirty += nr_pages; - if (folio_test_writeback(folio)) + if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK)) cs->nr_writeback += nr_pages; resched: From 0eb702ab51ac8e631795cd92f2c672ae40864b21 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V (IBM)" Date: Mon, 29 Jan 2024 11:30:22 +0530 Subject: [PATCH 0951/1406] mm/debug_vm_pgtable: fix BUG_ON with pud advanced test Architectures like powerpc add debug checks to ensure we find only devmap PUD pte entries. These debug checks are only done with CONFIG_DEBUG_VM. This patch marks the ptes used for the PUD advanced test as devmap pte entries so that we don't hit the debug checks on architectures like ppc64, as shown below. WARNING: CPU: 2 PID: 1 at arch/powerpc/mm/book3s64/radix_pgtable.c:1382 radix__pud_hugepage_update+0x38/0x138 .... NIP [c0000000000a7004] radix__pud_hugepage_update+0x38/0x138 LR [c0000000000a77a8] radix__pudp_huge_get_and_clear+0x28/0x60 Call Trace: [c000000004a2f950] [c000000004a2f9a0] 0xc000000004a2f9a0 (unreliable) [c000000004a2f980] [000d34c100000000] 0xd34c100000000 [c000000004a2f9a0] [c00000000206ba98] pud_advanced_tests+0x118/0x334 [c000000004a2fa40] [c00000000206db34] debug_vm_pgtable+0xcbc/0x1c48 [c000000004a2fc10] [c00000000000fd28] do_one_initcall+0x60/0x388 Also kernel BUG at arch/powerpc/mm/book3s64/pgtable.c:202! ....
NIP [c000000000096510] pudp_huge_get_and_clear_full+0x98/0x174 LR [c00000000206bb34] pud_advanced_tests+0x1b4/0x334 Call Trace: [c000000004a2f950] [000d34c100000000] 0xd34c100000000 (unreliable) [c000000004a2f9a0] [c00000000206bb34] pud_advanced_tests+0x1b4/0x334 [c000000004a2fa40] [c00000000206db34] debug_vm_pgtable+0xcbc/0x1c48 [c000000004a2fc10] [c00000000000fd28] do_one_initcall+0x60/0x388 Link: https://lkml.kernel.org/r/20240129060022.68044-1-aneesh.kumar@kernel.org Fixes: 27af67f35631 ("powerpc/book3s64/mm: enable transparent pud hugepage") Signed-off-by: Aneesh Kumar K.V (IBM) Cc: Anshuman Khandual Cc: Michael Ellerman Cc: Signed-off-by: Andrew Morton --- mm/debug_vm_pgtable.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 5662e29fe25335..65c19025da3dfe 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -362,6 +362,12 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) vaddr &= HPAGE_PUD_MASK; pud = pfn_pud(args->pud_pfn, args->page_prot); + /* + * Some architectures have debug checks to make sure + * huge pud mappings are only found with devmap entries. + * For now, test with only devmap entries. + */ + pud = pud_mkdevmap(pud); set_pud_at(args->mm, vaddr, args->pudp, pud); flush_dcache_page(page); pudp_set_wrprotect(args->mm, vaddr, args->pudp); @@ -374,6 +380,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) WARN_ON(!pud_none(pud)); #endif /* __PAGETABLE_PMD_FOLDED */ pud = pfn_pud(args->pud_pfn, args->page_prot); + pud = pud_mkdevmap(pud); pud = pud_wrprotect(pud); pud = pud_mkclean(pud); set_pud_at(args->mm, vaddr, args->pudp, pud); @@ -391,6 +398,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) #endif /* __PAGETABLE_PMD_FOLDED */ pud = pfn_pud(args->pud_pfn, args->page_prot); + pud = pud_mkdevmap(pud); pud = pud_mkyoung(pud); set_pud_at(args->mm, vaddr, args->pudp, pud); flush_dcache_page(page); From d482a1af03e3c62514fed7a20c256702ee204b64 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 9 Jan 2024 17:22:33 -0800 Subject: [PATCH 0952/1406] mm/cma: fix placement of trace_cma_alloc_start/finish The current placement of trace_cma_alloc_start/finish misses the fail cases: !cma || !cma->count || !cma->bitmap. trace_cma_alloc_finish is also not emitted for the failure case where bitmap_count > bitmap_maxno. Fix these missed cases by moving the start event before the failure checks and moving the finish event to the out label. Link: https://lkml.kernel.org/r/20240110012234.3793639-1-kaleshsingh@google.com Fixes: 7bc1aec5e287 ("mm: cma: add trace events for CMA alloc perf testing") Signed-off-by: Kalesh Singh Cc: Minchan Kim Cc: Liam Mark Signed-off-by: Andrew Morton --- mm/cma.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index 7c09c47e530bf6..e12cf41d83549a 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -436,6 +436,9 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned long i; struct page *page = NULL; int ret = -ENOMEM; + const char *name = cma ?
cma->name : NULL; + + trace_cma_alloc_start(name, count, align); if (!cma || !cma->count || !cma->bitmap) goto out; @@ -446,8 +449,6 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, if (!count) goto out; - trace_cma_alloc_start(cma->name, count, align); - mask = cma_bitmap_aligned_mask(cma, align); offset = cma_bitmap_aligned_offset(cma, align); bitmap_maxno = cma_bitmap_maxno(cma); @@ -496,8 +497,6 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, start = bitmap_no + mask + 1; } - trace_cma_alloc_finish(cma->name, pfn, page, count, align, ret); - /* * CMA can allocate multiple page blocks, which results in different * blocks being marked with different tags. Reset the tags to ignore @@ -516,6 +515,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, pr_debug("%s(): returned %p\n", __func__, page); out: + trace_cma_alloc_finish(name, pfn, page, count, align, ret); if (page) { count_vm_event(CMA_ALLOC_SUCCESS); cma_sysfs_account_success_pages(cma, count); From 087bef999e8194705da56ac2c95e4c01e405b973 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 9 Jan 2024 14:31:19 -0800 Subject: [PATCH 0953/1406] maple_tree: fix comment describing mas_node_count_gfp() The function description comment for mas_node_count_gfp() mistakenly refers to the function as mas_node_count(). Change it to refer to the correct function. Link: https://lkml.kernel.org/r/20240109223119.162357-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Liam R. Howlett Cc: Peng Zhang Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 6f241bb3879920..7b161802860bdb 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1307,8 +1307,8 @@ static inline void mas_free(struct ma_state *mas, struct maple_enode *used) } /* - * mas_node_count() - Check if enough nodes are allocated and request more if - * there is not enough nodes. + * mas_node_count_gfp() - Check if enough nodes are allocated and request more + * if there is not enough nodes. * @mas: The maple state * @count: The number of nodes needed * @gfp: the gfp flags From da6464dff5c85f123a35ad8ab0ecf73e13c85af0 Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Mon, 8 Jan 2024 14:27:43 +0100 Subject: [PATCH 0954/1406] mm/memory_hotplug: introduce MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers Patch series "implement "memmap on memory" feature on s390". This series provides "memmap on memory" support on the s390 platform. "memmap on memory" allows the struct pages array to be allocated from the hotplugged memory range instead of allocating it from main system memory. s390 currently preallocates the struct pages array for all potentially possible memory, which ensures memory onlining always succeeds, but at the cost of significant memory consumption from the available system memory during boottime. In certain extreme configurations, this could lead to ipl failure. "memmap on memory" ensures the struct pages array is populated from the self-contained hotplugged memory range instead of depleting the available system memory, and this could eliminate ipl failure on the s390 platform. On other platforms, the system might go OOM when the physically hotplugged memory depletes the available memory before it is onlined. Hence, the "memmap on memory" feature was introduced as described in commit a08a2ae34613 ("mm,memory_hotplug: allocate memmap from the added memory range").
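As an illustrative aside (a hedged sketch, not code from this series): a hotplug provider opts into "memmap on memory" by passing the MHP_MEMMAP_ON_MEMORY flag when adding a block; nid and start below are placeholders.

	/* Sketch: ask the hotplug core to carve the struct pages array
	 * ("memmap") out of the hotplugged range itself instead of
	 * allocating it from main system memory.
	 */
	rc = add_memory(nid, start, memory_block_size_bytes(),
			MHP_MEMMAP_ON_MEMORY);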
Unlike other architectures, s390 memory blocks are not physically accessible until they are online. To make them physically accessible, two new memory notifiers, MEM_PREPARE_ONLINE / MEM_FINISH_OFFLINE, are added; they let the hypervisor be informed that the memory should be made physically accessible. This allows for "memmap on memory" initialization during the memory hotplug onlining phase, which is performed before the MEM_GOING_ONLINE notifier is called. Patch 1 introduces MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers to prepare the transition of memory to and from a physically accessible state. The new mhp_flag MHP_OFFLINE_INACCESSIBLE is introduced to ensure the altmap cannot be written when adding memory - before it is set online. This enhancement is crucial for implementing the "memmap on memory" feature for s390 in a subsequent patch. Patch 2 allocates vmemmap pages from the self-contained memory range for s390. It allocates the memory map (struct pages array) from the hotplugged memory range, rather than using system memory, by passing the altmap to the vmemmap functions. Patch 3 removes unhandled memory notifier types on s390. Patch 4 implements the MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers on s390. The MEM_PREPARE_ONLINE memory notifier makes the memory block physically accessible via the sclp assign command. The notifier ensures self-contained memory maps are accessible and hence enables "memmap on memory" on s390. The MEM_FINISH_OFFLINE memory notifier shifts the memory block to an inaccessible state via the sclp unassign command. Patch 5 finally enables MHP_MEMMAP_ON_MEMORY on s390. This patch (of 5): Introduce MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers to prepare the transition of memory to and from a physically accessible state. This enhancement is crucial for implementing the "memmap on memory" feature for s390 in a subsequent patch. Platforms such as x86 can support physical memory hotplug via ACPI. When there is physical memory hotplug, an ACPI event leads to the memory addition with the following callchain: acpi_memory_device_add() -> acpi_memory_enable_device() -> __add_memory() After this, the hotplugged memory is physically accessible, and altmap support is prepared, before the "memmap on memory" initialization in memory_block_online() is called. On s390, memory hotplug works in a different way. The available hotplug memory has to be defined upfront in the hypervisor, but it is made physically accessible only when the user sets it online via sysfs, currently in the MEM_GOING_ONLINE notifier. This is too late, as "memmap on memory" initialization is performed before the MEM_GOING_ONLINE notifier is called. During the memory hotplug addition phase, altmap support is prepared, and during the memory onlining phase s390 requires the memory to be physically accessible before it can subsequently initiate the "memmap on memory" initialization process. The memory provider will handle the new MEM_PREPARE_ONLINE / MEM_FINISH_OFFLINE notifications and make the memory accessible. The mhp_flag MHP_OFFLINE_INACCESSIBLE is introduced and is relevant when used along with MHP_MEMMAP_ON_MEMORY, because the altmap cannot be written (e.g., poisoned) when adding memory -- before it is set online. This allows for adding memory with an altmap that is not currently made available by a hypervisor. When onlining that memory, the hypervisor can be instructed to make that memory accessible via the new notifiers, and the onlining phase will not require any memory allocations, which is helpful in low-memory situations.
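To make the flow concrete, here is a minimal provider-side notifier sketch; it is illustrative only, and my_make_accessible()/my_make_inaccessible() stand in for hypothetical hypervisor calls, not functions from this series:

	static int my_mem_notifier(struct notifier_block *nb,
				   unsigned long action, void *data)
	{
		struct memory_notify *arg = data;
		int rc = 0;

		switch (action) {
		case MEM_PREPARE_ONLINE:
			/* Make the block, including the altmap range at
			 * arg->altmap_start_pfn / arg->altmap_nr_pages,
			 * physically accessible before the memmap is set up.
			 */
			rc = my_make_accessible(arg);
			break;
		case MEM_FINISH_OFFLINE:
			/* The block is fully offline; it may be made
			 * inaccessible again.
			 */
			my_make_inaccessible(arg);
			break;
		}
		return notifier_from_errno(rc);
	}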
All architectures ignore unknown memory notifiers. Therefore, the introduction of these new notifiers does not result in any functional modifications across architectures. Link: https://lkml.kernel.org/r/20240108132747.3238763-1-sumanthk@linux.ibm.com Link: https://lkml.kernel.org/r/20240108132747.3238763-2-sumanthk@linux.ibm.com Signed-off-by: Sumanth Korikkar Suggested-by: Gerald Schaefer Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Heiko Carstens Cc: Michal Hocko Cc: Oscar Salvador Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- drivers/base/memory.c | 23 ++++++++++++++++++++++- include/linux/memory.h | 9 +++++++++ include/linux/memory_hotplug.h | 18 +++++++++++++++++- include/linux/memremap.h | 1 + mm/memory_hotplug.c | 17 ++++++++++++++--- mm/sparse.c | 3 ++- 6 files changed, 65 insertions(+), 6 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 14f964a7719bd0..c0436f46cfb701 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -188,6 +188,7 @@ static int memory_block_online(struct memory_block *mem) unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; unsigned long nr_vmemmap_pages = 0; + struct memory_notify arg; struct zone *zone; int ret; @@ -207,9 +208,19 @@ static int memory_block_online(struct memory_block *mem) if (mem->altmap) nr_vmemmap_pages = mem->altmap->free; + arg.altmap_start_pfn = start_pfn; + arg.altmap_nr_pages = nr_vmemmap_pages; + arg.start_pfn = start_pfn + nr_vmemmap_pages; + arg.nr_pages = nr_pages - nr_vmemmap_pages; mem_hotplug_begin(); + ret = memory_notify(MEM_PREPARE_ONLINE, &arg); + ret = notifier_to_errno(ret); + if (ret) + goto out_notifier; + if (nr_vmemmap_pages) { - ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone); + ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, + zone, mem->altmap->inaccessible); if (ret) goto out; } @@ -231,7 +242,11 @@ static int memory_block_online(struct memory_block *mem) nr_vmemmap_pages); mem->zone = zone; + mem_hotplug_done(); + return ret; out: + memory_notify(MEM_FINISH_OFFLINE, &arg); +out_notifier: mem_hotplug_done(); return ret; } @@ -244,6 +259,7 @@ static int memory_block_offline(struct memory_block *mem) unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; unsigned long nr_vmemmap_pages = 0; + struct memory_notify arg; int ret; if (!mem->zone) @@ -275,6 +291,11 @@ static int memory_block_offline(struct memory_block *mem) mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); mem->zone = NULL; + arg.altmap_start_pfn = start_pfn; + arg.altmap_nr_pages = nr_vmemmap_pages; + arg.start_pfn = start_pfn + nr_vmemmap_pages; + arg.nr_pages = nr_pages - nr_vmemmap_pages; + memory_notify(MEM_FINISH_OFFLINE, &arg); out: mem_hotplug_done(); return ret; diff --git a/include/linux/memory.h b/include/linux/memory.h index f53cfdaaaa4166..939a16bd5cea15 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -96,8 +96,17 @@ int set_memory_block_size_order(unsigned int order); #define MEM_GOING_ONLINE (1<<3) #define MEM_CANCEL_ONLINE (1<<4) #define MEM_CANCEL_OFFLINE (1<<5) +#define MEM_PREPARE_ONLINE (1<<6) +#define MEM_FINISH_OFFLINE (1<<7) struct memory_notify { + /* + * The altmap_start_pfn and altmap_nr_pages fields are designated for + * specifying the altmap range and are exclusively intended for use 
in + * MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers. + */ + unsigned long altmap_start_pfn; + unsigned long altmap_nr_pages; unsigned long start_pfn; unsigned long nr_pages; int status_change_nid_normal; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 7d207658349416..ee00015575aab3 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -106,6 +106,22 @@ typedef int __bitwise mhp_t; * implies the node id (nid). */ #define MHP_NID_IS_MGID ((__force mhp_t)BIT(2)) +/* + * The hotplugged memory is completely inaccessible while the memory is + * offline. The memory provider will handle MEM_PREPARE_ONLINE / + * MEM_FINISH_OFFLINE notifications and make the memory accessible. + * + * This flag is only relevant when used along with MHP_MEMMAP_ON_MEMORY, + * because the altmap cannot be written (e.g., poisoned) when adding + * memory -- before it is set online. + * + * This allows for adding memory with an altmap that is not currently + * made available by a hypervisor. When onlining that memory, the + * hypervisor can be instructed to make that memory available, and + * the onlining phase will not require any memory allocations, which is + * helpful in low-memory situations. + */ +#define MHP_OFFLINE_INACCESSIBLE ((__force mhp_t)BIT(3)) /* * Extended parameters for memory hotplug: @@ -154,7 +170,7 @@ extern void adjust_present_page_count(struct page *page, long nr_pages); /* VM interface that may be used by firmware interface */ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, - struct zone *zone); + struct zone *zone, bool mhp_off_inaccessible); extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 744c830f4b132c..9837f3e6fb9582 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -25,6 +25,7 @@ struct vmem_altmap { unsigned long free; unsigned long align; unsigned long alloc; + bool inaccessible; }; /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 21890994c1d3cc..707027f691503f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1087,7 +1087,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group, } int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, - struct zone *zone) + struct zone *zone, bool mhp_off_inaccessible) { unsigned long end_pfn = pfn + nr_pages; int ret, i; @@ -1096,6 +1096,15 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, if (ret) return ret; + /* + * Memory block is accessible at this stage and hence poison the struct + * pages now. If the memory block is accessible during memory hotplug + * addition phase, then page poisoning is already performed in + * sparse_add_section().
+ */ + if (mhp_off_inaccessible) + page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages); + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); for (i = 0; i < nr_pages; i++) @@ -1415,7 +1424,7 @@ static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size) } static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, - u64 start, u64 size) + u64 start, u64 size, mhp_t mhp_flags) { unsigned long memblock_size = memory_block_size_bytes(); u64 cur_start; @@ -1431,6 +1440,8 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, }; mhp_altmap.free = memory_block_memmap_on_memory_pages(); + if (mhp_flags & MHP_OFFLINE_INACCESSIBLE) + mhp_altmap.inaccessible = true; params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap), GFP_KERNEL); if (!params.altmap) { @@ -1516,7 +1527,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) */ if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) && mhp_supports_memmap_on_memory(memory_block_size_bytes())) { - ret = create_altmaps_and_memory_blocks(nid, group, start, size); + ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags); if (ret) goto error; } else { diff --git a/mm/sparse.c b/mm/sparse.c index 338cf946dee8de..aed0951b87fa04 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -908,7 +908,8 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, * Poison uninitialized struct pages in order to catch invalid flags * combinations. */ - page_init_poison(memmap, sizeof(struct page) * nr_pages); + if (!altmap || !altmap->inaccessible) + page_init_poison(memmap, sizeof(struct page) * nr_pages); ms = __nr_to_section(section_nr); set_section_nid(section_nr, nid); From d3fbafdfb8a568121c3f8ee52a22d9a5873f566d Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Mon, 8 Jan 2024 14:27:44 +0100 Subject: [PATCH 0955/1406] s390/mm: allocate vmemmap pages from self-contained memory range Allocate memory map (struct pages array) from the hotplugged memory range, rather than using system memory. The change addresses the issue where standby memory, when configured to be much larger than online memory, could potentially lead to ipl failure due to memory map allocation from online memory. For example, 16MB of memory map allocation is needed for a memory block size of 1GB and when standby memory is configured much larger than online memory, this could lead to ipl failure. To address this issue, the solution involves introducing "memmap on memory" using the vmem_altmap structure on s390. Architectures that want to implement it should pass the altmap to the vmemmap_populate() function and its associated callchain. This enhancement is discussed in commit 4b94ffdc4163 ("x86, mm: introduce vmem_altmap to augment vmemmap_populate()") Provide "memmap on memory" support for s390 by passing the altmap in vmemmap_populate() and its callchain. The allocation path is described as follows: * When altmap is NULL in vmemmap_populate(), memory map allocation occurs using the existing vmemmap_alloc_block_buf(). * When altmap is not NULL in vmemmap_populate(), memory map allocation still uses vmemmap_alloc_block_buf(), but this function internally calls altmap_alloc_block_buf(). For deallocation, the process is outlined as follows: * When altmap is NULL in vmemmap_free(), memory map deallocation happens through free_pages(). * When altmap is not NULL in vmemmap_free(), memory map deallocation occurs via vmem_altmap_free(). 
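Condensed, the allocation/deallocation pattern described above reduces to the following hedged sketch (memmap_alloc()/memmap_free() are illustrative names, not functions from this patch; the real code is in the diff below):

	static void *memmap_alloc(unsigned long size, struct vmem_altmap *altmap)
	{
		/* With a non-NULL altmap, vmemmap_alloc_block_buf() draws the
		 * memory map from the hotplugged range via
		 * altmap_alloc_block_buf(); otherwise from system memory.
		 */
		return vmemmap_alloc_block_buf(size, NUMA_NO_NODE, altmap);
	}

	static void memmap_free(unsigned long addr, int order, struct vmem_altmap *altmap)
	{
		if (altmap)
			vmem_altmap_free(altmap, 1 << order);
		else
			free_pages(addr, order);
	}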
While memory map allocation is primarily handled through the self-contained memory map range, there might still be a small amount of system memory allocation required for vmemmap pagetables. To mitigate this impact, this feature will be limited to machines with EDAT1 support. Link: https://lkml.kernel.org/r/20240108132747.3238763-3-sumanthk@linux.ibm.com Reviewed-by: Gerald Schaefer Signed-off-by: Sumanth Korikkar Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: David Hildenbrand Cc: Heiko Carstens Cc: Michal Hocko Cc: Oscar Salvador Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/s390/mm/init.c | 3 --- arch/s390/mm/vmem.c | 62 +++++++++++++++++++++++++-------------------- 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 43e612bc2bcd34..8d9a60ccb7771a 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -281,9 +281,6 @@ int arch_add_memory(int nid, u64 start, u64 size, unsigned long size_pages = PFN_DOWN(size); int rc; - if (WARN_ON_ONCE(params->altmap)) - return -EINVAL; - if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot)) return -EINVAL; diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 186a020857cf6a..eb100479f7bec4 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -33,8 +33,12 @@ static void __ref *vmem_alloc_pages(unsigned int order) return memblock_alloc(size, size); } -static void vmem_free_pages(unsigned long addr, int order) +static void vmem_free_pages(unsigned long addr, int order, struct vmem_altmap *altmap) { + if (altmap) { + vmem_altmap_free(altmap, 1 << order); + return; + } /* We don't expect boot memory to be removed ever. */ if (!slab_is_available() || WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr)))) @@ -156,7 +160,8 @@ static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end) /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, - unsigned long end, bool add, bool direct) + unsigned long end, bool add, bool direct, + struct vmem_altmap *altmap) { unsigned long prot, pages = 0; int ret = -ENOMEM; @@ -172,11 +177,11 @@ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, if (pte_none(*pte)) continue; if (!direct) - vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0); + vmem_free_pages((unsigned long)pfn_to_virt(pte_pfn(*pte)), get_order(PAGE_SIZE), altmap); pte_clear(&init_mm, addr, pte); } else if (pte_none(*pte)) { if (!direct) { - void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); + void *new_page = vmemmap_alloc_block_buf(PAGE_SIZE, NUMA_NO_NODE, altmap); if (!new_page) goto out; @@ -213,7 +218,8 @@ static void try_free_pte_table(pmd_t *pmd, unsigned long start) /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, - unsigned long end, bool add, bool direct) + unsigned long end, bool add, bool direct, + struct vmem_altmap *altmap) { unsigned long next, prot, pages = 0; int ret = -ENOMEM; @@ -234,11 +240,11 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE)) { if (!direct) - vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap); pmd_clear(pmd); pages++; } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) { - vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + 
vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap); pmd_clear(pmd); } continue; @@ -261,7 +267,7 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, * page tables since vmemmap_populate gets * called for each section separately. */ - new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE); + new_page = vmemmap_alloc_block_buf(PMD_SIZE, NUMA_NO_NODE, altmap); if (new_page) { set_pmd(pmd, __pmd(__pa(new_page) | prot)); if (!IS_ALIGNED(addr, PMD_SIZE) || @@ -280,7 +286,7 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, vmemmap_use_sub_pmd(addr, next); continue; } - ret = modify_pte_table(pmd, addr, next, add, direct); + ret = modify_pte_table(pmd, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -302,12 +308,12 @@ static void try_free_pmd_table(pud_t *pud, unsigned long start) for (i = 0; i < PTRS_PER_PMD; i++, pmd++) if (!pmd_none(*pmd)) return; - vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER); + vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER, NULL); pud_clear(pud); } static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, - bool add, bool direct) + bool add, bool direct, struct vmem_altmap *altmap) { unsigned long next, prot, pages = 0; int ret = -ENOMEM; @@ -347,7 +353,7 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, } else if (pud_large(*pud)) { continue; } - ret = modify_pmd_table(pud, addr, next, add, direct); + ret = modify_pmd_table(pud, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -370,12 +376,12 @@ static void try_free_pud_table(p4d_t *p4d, unsigned long start) if (!pud_none(*pud)) return; } - vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER); + vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER, NULL); p4d_clear(p4d); } static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, - bool add, bool direct) + bool add, bool direct, struct vmem_altmap *altmap) { unsigned long next; int ret = -ENOMEM; @@ -394,7 +400,7 @@ static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, goto out; p4d_populate(&init_mm, p4d, pud); } - ret = modify_pud_table(p4d, addr, next, add, direct); + ret = modify_pud_table(p4d, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -415,12 +421,12 @@ static void try_free_p4d_table(pgd_t *pgd, unsigned long start) if (!p4d_none(*p4d)) return; } - vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER); + vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER, NULL); pgd_clear(pgd); } static int modify_pagetable(unsigned long start, unsigned long end, bool add, - bool direct) + bool direct, struct vmem_altmap *altmap) { unsigned long addr, next; int ret = -ENOMEM; @@ -445,7 +451,7 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add, goto out; pgd_populate(&init_mm, pgd, p4d); } - ret = modify_p4d_table(pgd, addr, next, add, direct); + ret = modify_p4d_table(pgd, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -458,14 +464,16 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add, return ret; } -static int add_pagetable(unsigned long start, unsigned long end, bool direct) +static int add_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) { - return modify_pagetable(start, end, true, direct); + return modify_pagetable(start, end, true, direct, altmap); } -static int remove_pagetable(unsigned long start, unsigned long end, bool direct) +static int 
remove_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) { - return modify_pagetable(start, end, false, direct); + return modify_pagetable(start, end, false, direct, altmap); } /* @@ -474,7 +482,7 @@ static int remove_pagetable(unsigned long start, unsigned long end, bool direct) static int vmem_add_range(unsigned long start, unsigned long size) { start = (unsigned long)__va(start); - return add_pagetable(start, start + size, true); + return add_pagetable(start, start + size, true, NULL); } /* @@ -483,7 +491,7 @@ static int vmem_add_range(unsigned long start, unsigned long size) static void vmem_remove_range(unsigned long start, unsigned long size) { start = (unsigned long)__va(start); - remove_pagetable(start, start + size, true); + remove_pagetable(start, start + size, true, NULL); } /* @@ -496,9 +504,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, mutex_lock(&vmem_mutex); /* We don't care about the node, just use NUMA_NO_NODE on allocations */ - ret = add_pagetable(start, end, false); + ret = add_pagetable(start, end, false, altmap); if (ret) - remove_pagetable(start, end, false); + remove_pagetable(start, end, false, altmap); mutex_unlock(&vmem_mutex); return ret; } @@ -509,7 +517,7 @@ void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap) { mutex_lock(&vmem_mutex); - remove_pagetable(start, end, false); + remove_pagetable(start, end, false, altmap); mutex_unlock(&vmem_mutex); } From 21f72825f10143645cc0150ac0b3dffbf82c695e Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Mon, 8 Jan 2024 14:27:45 +0100 Subject: [PATCH 0956/1406] s390/sclp: remove unhandled memory notifier type Remove memory notifier types which are unhandled by s390. Unhandled memory notifier types are covered by the default case. Link: https://lkml.kernel.org/r/20240108132747.3238763-4-sumanthk@linux.ibm.com Suggested-by: Alexander Gordeev Reviewed-by: David Hildenbrand Signed-off-by: Sumanth Korikkar Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Michal Hocko Cc: Oscar Salvador Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- drivers/s390/char/sclp_cmd.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c index 11c428f4c7cf9c..355e63e44e9546 100644 --- a/drivers/s390/char/sclp_cmd.c +++ b/drivers/s390/char/sclp_cmd.c @@ -340,9 +340,6 @@ static int sclp_mem_notifier(struct notifier_block *nb, if (contains_standby_increment(start, start + size)) rc = -EPERM; break; - case MEM_ONLINE: - case MEM_CANCEL_OFFLINE: - break; case MEM_GOING_ONLINE: rc = sclp_mem_change_state(start, size, 1); break; From cfd81eacea892aaac02f4535bc3fa51f2efa7dbc Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Mon, 8 Jan 2024 14:27:46 +0100 Subject: [PATCH 0957/1406] s390/mm: implement MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers The MEM_PREPARE_ONLINE memory notifier makes the memory block physically accessible via the sclp assign command. The notifier ensures self-contained memory maps are accessible and hence enables "memmap on memory" on s390. The MEM_FINISH_OFFLINE memory notifier shifts the memory block to an inaccessible state via the sclp unassign command. Implementation considerations: * When MHP_MEMMAP_ON_MEMORY is disabled, the system retains the old behavior. This means the memory map is allocated from default memory. * If MACHINE_HAS_EDAT1 is unavailable, MHP_MEMMAP_ON_MEMORY is automatically disabled.
This ensures that vmemmap pagetables do not consume additional memory from the default memory allocator. * The MEM_GOING_ONLINE notifier has been modified to perform no operation, as MEM_PREPARE_ONLINE already executes the sclp assign command. * The MEM_CANCEL_ONLINE/MEM_OFFLINE notifier now performs no operation, as MEM_FINISH_OFFLINE already executes the sclp unassign command. Link: https://lkml.kernel.org/r/20240108132747.3238763-5-sumanthk@linux.ibm.com Reviewed-by: Gerald Schaefer Reviewed-by: David Hildenbrand Signed-off-by: Sumanth Korikkar Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Heiko Carstens Cc: Michal Hocko Cc: Oscar Salvador Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- drivers/s390/char/sclp_cmd.c | 41 ++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c index 355e63e44e9546..7815e9bea69a13 100644 --- a/drivers/s390/char/sclp_cmd.c +++ b/drivers/s390/char/sclp_cmd.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,7 @@ #include #include #include +#include #include "sclp.h" @@ -340,13 +342,38 @@ static int sclp_mem_notifier(struct notifier_block *nb, if (contains_standby_increment(start, start + size)) rc = -EPERM; break; - case MEM_GOING_ONLINE: + case MEM_PREPARE_ONLINE: + /* + * Access the altmap_start_pfn and altmap_nr_pages fields + * within the struct memory_notify specifically when dealing + * with only MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers. + * + * When altmap is in use, take the specified memory range + * online, which includes the altmap. + */ + if (arg->altmap_nr_pages) { + start = PFN_PHYS(arg->altmap_start_pfn); + size += PFN_PHYS(arg->altmap_nr_pages); + } rc = sclp_mem_change_state(start, size, 1); + if (rc || !arg->altmap_nr_pages) + break; + /* + * Set CMMA state to nodat here, since the struct page memory + * at the beginning of the memory block will not go through the + * buddy allocator later. + */ + __arch_set_page_nodat((void *)__va(start), arg->altmap_nr_pages); break; - case MEM_CANCEL_ONLINE: - sclp_mem_change_state(start, size, 0); - break; - case MEM_OFFLINE: + case MEM_FINISH_OFFLINE: + /* + * When altmap is in use, take the specified memory range + * offline, which includes the altmap. + */ + if (arg->altmap_nr_pages) { + start = PFN_PHYS(arg->altmap_start_pfn); + size += PFN_PHYS(arg->altmap_nr_pages); + } sclp_mem_change_state(start, size, 0); break; default: @@ -397,7 +424,9 @@ static void __init add_memory_merged(u16 rn) if (!size) goto skip_add; for (addr = start; addr < start + size; addr += block_size) - add_memory(0, addr, block_size, MHP_NONE); + add_memory(0, addr, block_size, + MACHINE_HAS_EDAT1 ? + MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE); skip_add: first_rn = rn; num = 1; From a53614278c79577fff1405716c76018633442344 Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Mon, 8 Jan 2024 14:27:47 +0100 Subject: [PATCH 0958/1406] s390: enable MHP_MEMMAP_ON_MEMORY Enable MHP_MEMMAP_ON_MEMORY to support "memmap on memory". memory_hotplug.memmap_on_memory=true kernel parameter should be set in kernel boot option to enable the feature. 
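As an illustrative usage sketch (in the same style as the hugepage sysfs example earlier in this series), the feature is requested on the kernel command line and can be verified after boot:

	# add to the kernel command line:
	memory_hotplug.memmap_on_memory=true
	# verify after boot:
	cat /sys/module/memory_hotplug/parameters/memmap_on_memory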
Link: https://lkml.kernel.org/r/20240108132747.3238763-6-sumanthk@linux.ibm.com Reviewed-by: Gerald Schaefer Signed-off-by: Sumanth Korikkar Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: David Hildenbrand Cc: Heiko Carstens Cc: Michal Hocko Cc: Oscar Salvador Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/s390/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index fe565f3a3a917d..a1d6dcbc89654c 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -113,6 +113,7 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_BH select ARCH_INLINE_WRITE_UNLOCK_IRQ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC From 678d4d5c8d38a41990a949f36470b6d24c9af742 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Mon, 8 Jan 2024 12:48:15 +0800 Subject: [PATCH 0959/1406] mm/filemap: avoid type conversion The return type of folio_test_hugetlb() is bool, so there is no need to assign it to an integer type. Link: https://lkml.kernel.org/r/20240108044815.3291487-1-lihongbo22@huawei.com Signed-off-by: Hongbo Li Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 4a30de98a8c75d..c7e67b22cc94ca 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -843,7 +843,7 @@ noinline int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) { XA_STATE(xas, &mapping->i_pages, index); - int huge = folio_test_hugetlb(folio); + bool huge = folio_test_hugetlb(folio); bool charged = false; long nr = 1; From c3f1ab8b996bd983d429ad5c873ad506f5266498 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Fri, 5 Jan 2024 12:24:01 -0800 Subject: [PATCH 0960/1406] selftests/mm/ksm_functional: prevent unmapping undefined address Replace some goto statements with return statements so that unmap() is not called on an undefined address. This change is made so that unmap() can only be reached after mmap() is called (and the address mentioned is defined). Returning MAP_FAILED seems acceptable since client code checks for this value. Link: https://lkml.kernel.org/r/20240105202401.28851-1-inwardvessel@gmail.com Fixes: 42096aa24b82 ("selftest/mm: ksm_functional_tests: test in mmap_and_merge_range() if anything got merged") Signed-off-by: JP Kobryn Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/ksm_functional_tests.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index fbff0dd09191f1..d615767e396bec 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -155,12 +155,12 @@ static char *mmap_and_merge_range(char val, unsigned long size, int prot, /* Stabilize accounting by disabling KSM completely.
*/ if (ksm_unmerge()) { ksft_test_result_fail("Disabling (unmerging) KSM failed\n"); - goto unmap; + return MAP_FAILED; } if (get_my_merging_pages() > 0) { ksft_test_result_fail("Still pages merged\n"); - goto unmap; + return MAP_FAILED; } map = mmap(NULL, size, PROT_READ|PROT_WRITE, From 80dee6252c0a36207677f071f10b2390c751742c Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 5 Jan 2024 07:54:19 -0800 Subject: [PATCH 0961/1406] selftests/mm: new test that steals pages This test stresses the race between madvise(DONTNEED), a page fault, and a parallel huge page mmap, which should fail due to the lack of an available page for mapping. This test case must run on a system with one and only one huge page available. # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages During setup, the test allocates the only available page, and starts three threads: - thread 1: * madvise(MADV_DONTNEED) on the allocated huge page - thread 2: * Write to the allocated huge page - thread 3: * Tries to allocate (steal) an extra huge page (which is not available) thread 3 should never succeed in the allocation, since the only huge page was never unmapped, and should be reserved. Touching the old page after thread3's allocation will raise a SIGBUS. Link: https://lkml.kernel.org/r/20240105155419.1939484-2-leitao@debian.org Signed-off-by: Breno Leitao Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Rik van Riel Cc: Shuah Khan Cc: Vegard Nossum Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 1 + tools/testing/selftests/mm/Makefile | 1 + .../selftests/mm/hugetlb_madv_vs_map.c | 124 ++++++++++++++++++ 3 files changed, 126 insertions(+) create mode 100644 tools/testing/selftests/mm/hugetlb_madv_vs_map.c diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index 4ff10ea6146179..d26e962f2ac490 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -46,3 +46,4 @@ gup_longterm mkdirty va_high_addr_switch hugetlb_fault_after_madv +hugetlb_madv_vs_map diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 2453add65d12f8..990e9bb112c507 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -70,6 +70,7 @@ TEST_GEN_FILES += ksm_tests TEST_GEN_FILES += ksm_functional_tests TEST_GEN_FILES += mdwe_test TEST_GEN_FILES += hugetlb_fault_after_madv +TEST_GEN_FILES += hugetlb_madv_vs_map ifneq ($(ARCH),arm64) TEST_GEN_FILES += soft-dirty diff --git a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c new file mode 100644 index 00000000000000..d01e8d4901d0b5 --- /dev/null +++ b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test case that must run on a system with one and only one huge page available. + * # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + * + * During setup, the test allocates the only available page, and starts three threads: + * - thread1: + * * madvise(MADV_DONTNEED) on the allocated huge page + * - thread 2: + * * Write to the allocated huge page + * - thread 3: + * * Try to allocate an extra huge page (which must not be available) + * + * The test fails if thread3 is able to allocate a page.
+ * + * Touching the first page after thread3's allocation will raise a SIGBUS + * + * Author: Breno Leitao + */ +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/mman.h> +#include <string.h> +#include <pthread.h> + +#include "vm_util.h" +#include "../kselftest.h" + +#define MMAP_SIZE (1 << 21) +#define INLOOP_ITER 100 + +char *huge_ptr; + +/* Touch the memory while it is being madvised() */ +void *touch(void *unused) +{ + for (int i = 0; i < INLOOP_ITER; i++) + huge_ptr[0] = '.'; + + return NULL; +} + +void *madv(void *unused) +{ + for (int i = 0; i < INLOOP_ITER; i++) + madvise(huge_ptr, MMAP_SIZE, MADV_DONTNEED); + + return NULL; +} + +/* + * We got here, and there must be no huge page available for mapping. + * The other hugepage should be flipping from used <-> reserved, because + * of madvise(DONTNEED). + */ +void *map_extra(void *unused) +{ + void *ptr; + + for (int i = 0; i < INLOOP_ITER; i++) { + ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + + if ((long)ptr != -1) { + /* Touching the other page now will cause a SIGBUS + * huge_ptr[0] = '1'; + */ + return ptr; + } + } + + return NULL; +} + +int main(void) +{ + pthread_t thread1, thread2, thread3; + unsigned long free_hugepages; + void *ret; + + /* + * On kernel 6.7, we are able to reproduce the problem with ~10 + * iterations + */ + int max = 10; + + free_hugepages = get_free_hugepages(); + + if (free_hugepages != 1) { + ksft_exit_skip("This test needs one and only one page to execute. Got %lu\n", + free_hugepages); + } + + while (max--) { + huge_ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + + if ((unsigned long)huge_ptr == -1) { + ksft_exit_skip("Failed to allocate huge page\n"); + return KSFT_SKIP; + } + + pthread_create(&thread1, NULL, madv, NULL); + pthread_create(&thread2, NULL, touch, NULL); + pthread_create(&thread3, NULL, map_extra, NULL); + + pthread_join(thread1, NULL); + pthread_join(thread2, NULL); + pthread_join(thread3, &ret); + + if (ret) { + ksft_test_result_fail("Unexpected huge page allocation\n"); + return KSFT_FAIL; + } + + /* Unmap and restart */ + munmap(huge_ptr, MMAP_SIZE); + } + + return KSFT_PASS; +} From 20cbe2fe8f5eff651b82c2c69c820458c296f546 Mon Sep 17 00:00:00 2001 From: Kuan-Ying Lee Date: Wed, 7 Feb 2024 16:58:51 +0800 Subject: [PATCH 0962/1406] scripts/gdb/vmalloc: fix vmallocinfo error The patch series "Mitigate a vmap lock contention" removes vmap_area_list, which will break the gdb vmallocinfo command: (gdb) lx-vmallocinfo Python Exception : No symbol "vmap_area_list" in current context. Error occurred in Python: No symbol "vmap_area_list" in current context. So we can instead use vmap_nodes to iterate all vmallocinfo.
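For reference, the kernel-side structure that the updated gdb helper mirrors is sketched below (based on the vmap-node patches later in this series; locking is shown for completeness, the gdb script itself just reads memory):

	unsigned int i;
	struct vmap_area *va;

	for (i = 0; i < nr_vmap_nodes; i++) {
		struct vmap_node *vn = &vmap_nodes[i];

		spin_lock(&vn->busy.lock);
		list_for_each_entry(va, &vn->busy.head, list) {
			/* report va->va_start .. va->va_end, va->vm, ... */
		}
		spin_unlock(&vn->busy.lock);
	}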
Link: https://lkml.kernel.org/r/20240207085856.11190-1-Kuan-Ying.Lee@mediatek.com Signed-off-by: Kuan-Ying Lee Cc: Casper Li Cc: AngeloGioacchino Del Regno Cc: Chinwen Chang Cc: Jan Kiszka Cc: Kieran Bingham Cc: Matthias Brugger Cc: Qun-Wei Lin Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- scripts/gdb/linux/vmalloc.py | 56 +++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/scripts/gdb/linux/vmalloc.py b/scripts/gdb/linux/vmalloc.py index d3c8a0274d1eda..803f1737105289 100644 --- a/scripts/gdb/linux/vmalloc.py +++ b/scripts/gdb/linux/vmalloc.py @@ -29,32 +29,34 @@ def invoke(self, arg, from_tty): if not constants.LX_CONFIG_MMU: raise gdb.GdbError("Requires MMU support") - vmap_area_list = gdb.parse_and_eval('vmap_area_list') - for vmap_area in lists.list_for_each_entry(vmap_area_list, vmap_area_ptr_type, "list"): - if not vmap_area['vm']: - gdb.write("0x%x-0x%x %10d vm_map_ram\n" % (vmap_area['va_start'], vmap_area['va_end'], - vmap_area['va_end'] - vmap_area['va_start'])) - continue - v = vmap_area['vm'] - gdb.write("0x%x-0x%x %10d" % (v['addr'], v['addr'] + v['size'], v['size'])) - if v['caller']: - gdb.write(" %s" % str(v['caller']).split(' ')[-1]) - if v['nr_pages']: - gdb.write(" pages=%d" % v['nr_pages']) - if v['phys_addr']: - gdb.write(" phys=0x%x" % v['phys_addr']) - if v['flags'] & constants.LX_VM_IOREMAP: - gdb.write(" ioremap") - if v['flags'] & constants.LX_VM_ALLOC: - gdb.write(" vmalloc") - if v['flags'] & constants.LX_VM_MAP: - gdb.write(" vmap") - if v['flags'] & constants.LX_VM_USERMAP: - gdb.write(" user") - if v['flags'] & constants.LX_VM_DMA_COHERENT: - gdb.write(" dma-coherent") - if is_vmalloc_addr(v['pages']): - gdb.write(" vpages") - gdb.write("\n") + nr_vmap_nodes = gdb.parse_and_eval('nr_vmap_nodes') + for i in range(0, nr_vmap_nodes): + vn = gdb.parse_and_eval('&vmap_nodes[%d]' % i) + for vmap_area in lists.list_for_each_entry(vn['busy']['head'], vmap_area_ptr_type, "list"): + if not vmap_area['vm']: + gdb.write("0x%x-0x%x %10d vm_map_ram\n" % (vmap_area['va_start'], vmap_area['va_end'], + vmap_area['va_end'] - vmap_area['va_start'])) + continue + v = vmap_area['vm'] + gdb.write("0x%x-0x%x %10d" % (v['addr'], v['addr'] + v['size'], v['size'])) + if v['caller']: + gdb.write(" %s" % str(v['caller']).split(' ')[-1]) + if v['nr_pages']: + gdb.write(" pages=%d" % v['nr_pages']) + if v['phys_addr']: + gdb.write(" phys=0x%x" % v['phys_addr']) + if v['flags'] & constants.LX_VM_IOREMAP: + gdb.write(" ioremap") + if v['flags'] & constants.LX_VM_ALLOC: + gdb.write(" vmalloc") + if v['flags'] & constants.LX_VM_MAP: + gdb.write(" vmap") + if v['flags'] & constants.LX_VM_USERMAP: + gdb.write(" user") + if v['flags'] & constants.LX_VM_DMA_COHERENT: + gdb.write(" dma-coherent") + if is_vmalloc_addr(v['pages']): + gdb.write(" vpages") + gdb.write("\n") LxVmallocInfo() From b371c7a772d5e6ce4ed901c9ab2c46bd7910be6e Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:23 +0100 Subject: [PATCH 0963/1406] mm: vmalloc: add va_alloc() helper Patch series "Mitigate a vmap lock contention", v3. 1. Motivation - Offload the global vmap locks, making them scale with the number of CPUs; - If possible and there is an agreement, we can remove the "Per cpu kva allocator" to make the vmap code simpler; - There were complaints from XFS folk that a vmalloc might be contended on their workloads. 2. Design (high level overview) We introduce an effective vmap node logic.
A node behaves as an independent entity to serve an allocation request directly (if possible) from its pool. That way it bypasses the global vmap space that is protected by its own lock. Access to the pools is serialized by CPUs. The number of nodes is equal to the number of CPUs in a system. Please note the high threshold is bound to 128 nodes. Pools are size segregated and populated based on system demand. The maximum alloc request that can be stored into a segregated storage is 256 pages. The lazy drain path decays a pool by 25% as a first step and, as a second, populates it with freshly freed VAs for reuse instead of returning them into the global space. When a VA is obtained (alloc path), it is stored in separate nodes. A va->va_start address is converted into the correct node where it should be placed and reside. Doing so, we balance VAs across the nodes; as a result, access becomes scalable. The addr_to_node() function does the proper address conversion to the correct node. The vmap space is divided into segments with a fixed size of 16 pages. That way any address can be associated with a segment number. The number of segments is equal to num_possible_cpus(), but not greater than 128. The numbering starts from 0. See below how an address is converted: static inline unsigned int addr_to_node_id(unsigned long addr) { return (addr / zone_size) % nr_nodes; } On the free path, a VA can easily be found by converting its "va_start" address to the node where it resides. It is moved from the "busy" data structure to the "lazy" data structure. Later on, as noted earlier, the lazy kworker decays each node pool and populates it with fresh incoming VAs. Please note, a VA is returned to the node that made the alloc request. 3. Test on AMD Ryzen Threadripper 3970X 32-Core Processor sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64 94.41% 0.89% [kernel] [k] _raw_spin_lock 93.35% 93.07% [kernel] [k] native_queued_spin_lock_slowpath 76.13% 0.28% [kernel] [k] __vmalloc_node_range 72.96% 0.81% [kernel] [k] alloc_vmap_area 56.94% 0.00% [kernel] [k] __get_vm_area_node 41.95% 0.00% [kernel] [k] vmalloc 37.15% 0.01% [test_vmalloc] [k] full_fit_alloc_test 35.17% 0.00% [kernel] [k] ret_from_fork_asm 35.17% 0.00% [kernel] [k] ret_from_fork 35.17% 0.00% [kernel] [k] kthread 35.08% 0.00% [test_vmalloc] [k] test_func 34.45% 0.00% [test_vmalloc] [k] fix_size_alloc_test 28.09% 0.01% [test_vmalloc] [k] long_busy_list_alloc_test 23.53% 0.25% [kernel] [k] vfree.part.0 21.72% 0.00% [kernel] [k] remove_vm_area 20.08% 0.21% [kernel] [k] find_unlink_vmap_area 2.34% 0.61% [kernel] [k] free_vmap_area_noflush vs 82.32% 0.22% [test_vmalloc] [k] long_busy_list_alloc_test 63.36% 0.02% [kernel] [k] vmalloc 63.34% 2.64% [kernel] [k] __vmalloc_node_range 30.42% 4.46% [kernel] [k] vfree.part.0 28.98% 2.51% [kernel] [k] __alloc_pages_bulk 27.28% 0.19% [kernel] [k] __get_vm_area_node 26.13% 1.50% [kernel] [k] alloc_vmap_area 21.72% 21.67% [kernel] [k] clear_page_rep 19.51% 2.43% [kernel] [k] _raw_spin_lock 16.61% 16.51% [kernel] [k] native_queued_spin_lock_slowpath 13.40% 2.07% [kernel] [k] free_unref_page 10.62% 0.01% [kernel] [k] remove_vm_area 9.02% 8.73% [kernel] [k] insert_vmap_area 8.94% 0.00% [kernel] [k] ret_from_fork_asm 8.94% 0.00% [kernel] [k] ret_from_fork 8.94% 0.00% [kernel] [k] kthread 8.29% 0.00% [test_vmalloc] [k] test_func 7.81% 0.05% [test_vmalloc] [k] full_fit_alloc_test 5.30% 4.73% [kernel] [k] purge_vmap_node 4.47% 2.65% [kernel] [k] free_vmap_area_noflush confirms that native_queued_spin_lock_slowpath goes down to 16.51% from 93.07%.
The throughput is ~12x higher: urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64 Run the test with following parameters: run_test_mask=7 nr_threads=64 Done. Check the kernel ring buffer to see the summary. real 10m51.271s user 0m0.013s sys 0m0.187s urezki@pc638:~$ urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64 Run the test with following parameters: run_test_mask=7 nr_threads=64 Done. Check the kernel ring buffer to see the summary. real 0m51.301s user 0m0.015s sys 0m0.040s urezki@pc638:~$ This patch (of 11): Currently the __alloc_vmap_area() function contains open-coded logic that finds and adjusts a VA based on the allocation request. Introduce a va_alloc() helper that adjusts the found VA only. There is no functional change as a result of this patch. Link: https://lkml.kernel.org/r/20240102184633.748113-1-urezki@gmail.com Link: https://lkml.kernel.org/r/20240102184633.748113-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Lorenzo Stoakes Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Cc: Kazuhito Hagio Signed-off-by: Andrew Morton --- mm/vmalloc.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d12a17fc0c171c..739401a9eafcfe 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1481,6 +1481,32 @@ adjust_va_to_fit_type(struct rb_root *root, struct list_head *head, return 0; } +static unsigned long +va_alloc(struct vmap_area *va, + struct rb_root *root, struct list_head *head, + unsigned long size, unsigned long align, + unsigned long vstart, unsigned long vend) +{ + unsigned long nva_start_addr; + int ret; + + if (va->va_start > vstart) + nva_start_addr = ALIGN(va->va_start, align); + else + nva_start_addr = ALIGN(vstart, align); + + /* Check the "vend" restriction. */ + if (nva_start_addr + size > vend) + return vend; + + /* Update the free vmap_area. */ + ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size); + if (WARN_ON_ONCE(ret)) + return vend; + + return nva_start_addr; +} + /* * Returns a start address of the newly allocated area, if success. * Otherwise a vend is returned that indicates failure. @@ -1493,7 +1519,6 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head, bool adjust_search_size = true; unsigned long nva_start_addr; struct vmap_area *va; - int ret; /* * Do not adjust when: @@ -1511,18 +1536,8 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head, if (unlikely(!va)) return vend; - if (va->va_start > vstart) - nva_start_addr = ALIGN(va->va_start, align); - else - nva_start_addr = ALIGN(vstart, align); - - /* Check the "vend" restriction. */ - if (nva_start_addr + size > vend) - return vend; - - /* Update the free vmap_area. */ - ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size); - if (WARN_ON_ONCE(ret)) + nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend); + if (nva_start_addr == vend) return vend; #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK From 723f950792eb9423033b24f12a25590433b9896a Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:24 +0100 Subject: [PATCH 0964/1406] mm: vmalloc: rename adjust_va_to_fit_type() function This patch renames the adjust_va_to_fit_type() function to va_clip(), which is shorter and more expressive.
There is no functional change as a result of this patch. Link: https://lkml.kernel.org/r/20240102184633.748113-3-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Lorenzo Stoakes Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 739401a9eafcfe..10f289e865122a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1382,9 +1382,9 @@ classify_va_fit_type(struct vmap_area *va, } static __always_inline int -adjust_va_to_fit_type(struct rb_root *root, struct list_head *head, - struct vmap_area *va, unsigned long nva_start_addr, - unsigned long size) +va_clip(struct rb_root *root, struct list_head *head, + struct vmap_area *va, unsigned long nva_start_addr, + unsigned long size) { struct vmap_area *lva = NULL; enum fit_type type = classify_va_fit_type(va, nva_start_addr, size); @@ -1500,7 +1500,7 @@ va_alloc(struct vmap_area *va, return vend; /* Update the free vmap_area. */ - ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size); + ret = va_clip(root, head, va, nva_start_addr, size); if (WARN_ON_ONCE(ret)) return vend; @@ -4155,9 +4155,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, /* It is a BUG(), but trigger recovery instead. */ goto recovery; - ret = adjust_va_to_fit_type(&free_vmap_area_root, - &free_vmap_area_list, - va, start, size); + ret = va_clip(&free_vmap_area_root, + &free_vmap_area_list, va, start, size); if (WARN_ON_ONCE(unlikely(ret))) /* It is a BUG(), but trigger recovery instead. */ goto recovery; From a97db964e575942a0f0b0ac1b4c1b387b9dd77ed Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:25 +0100 Subject: [PATCH 0965/1406] mm: vmalloc: move vmap_init_free_space() down in vmalloc.c vmap_init_free_space() is a function that sets up the vmap space and is considered part of the initialization phase. Since the main entry point, vmalloc_init(), has been moved down in vmalloc.c, it makes sense to follow the pattern. There is no functional change as a result of this patch. Link: https://lkml.kernel.org/r/20240102184633.748113-4-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Lorenzo Stoakes Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E.
McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 82 ++++++++++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 10f289e865122a..06bd843d18ae99 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2512,47 +2512,6 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align) kasan_populate_early_vm_area_shadow(vm->addr, vm->size); } -static void vmap_init_free_space(void) -{ - unsigned long vmap_start = 1; - const unsigned long vmap_end = ULONG_MAX; - struct vmap_area *busy, *free; - - /* - * B F B B B F - * -|-----|.....|-----|-----|-----|.....|- - * | The KVA space | - * |<--------------------------------->| - */ - list_for_each_entry(busy, &vmap_area_list, list) { - if (busy->va_start - vmap_start > 0) { - free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); - if (!WARN_ON_ONCE(!free)) { - free->va_start = vmap_start; - free->va_end = busy->va_start; - - insert_vmap_area_augment(free, NULL, - &free_vmap_area_root, - &free_vmap_area_list); - } - } - - vmap_start = busy->va_end; - } - - if (vmap_end - vmap_start > 0) { - free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); - if (!WARN_ON_ONCE(!free)) { - free->va_start = vmap_start; - free->va_end = vmap_end; - - insert_vmap_area_augment(free, NULL, - &free_vmap_area_root, - &free_vmap_area_list); - } - } -} - static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { @@ -4465,6 +4424,47 @@ module_init(proc_vmalloc_init); #endif +static void vmap_init_free_space(void) +{ + unsigned long vmap_start = 1; + const unsigned long vmap_end = ULONG_MAX; + struct vmap_area *busy, *free; + + /* + * B F B B B F + * -|-----|.....|-----|-----|-----|.....|- + * | The KVA space | + * |<--------------------------------->| + */ + list_for_each_entry(busy, &vmap_area_list, list) { + if (busy->va_start - vmap_start > 0) { + free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (!WARN_ON_ONCE(!free)) { + free->va_start = vmap_start; + free->va_end = busy->va_start; + + insert_vmap_area_augment(free, NULL, + &free_vmap_area_root, + &free_vmap_area_list); + } + } + + vmap_start = busy->va_end; + } + + if (vmap_end - vmap_start > 0) { + free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (!WARN_ON_ONCE(!free)) { + free->va_start = vmap_start; + free->va_end = vmap_end; + + insert_vmap_area_augment(free, NULL, + &free_vmap_area_root, + &free_vmap_area_list); + } + } +} + void __init vmalloc_init(void) { struct vmap_area *va; From 7d7ce96df4f59824531d74562c69920e7a80142d Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:26 +0100 Subject: [PATCH 0966/1406] mm: vmalloc: remove global vmap_area_root rb-tree Store allocated objects in separate nodes. A va->va_start address is converted into the correct node where it should be placed and reside. An addr_to_node() function is used to do the address conversion and determine the node that contains a VA. Such an approach balances VAs across nodes; as a result, access becomes scalable. The number of nodes in a system depends on the number of CPUs. Please note: 1. As of now allocated VAs are bound to node-0. It means the patch does not make any difference compared with the current behavior; 2. The global vmap_area_lock and vmap_area_root are removed as there is no need for them anymore. The vmap_area_list is still kept and is _empty_. It is exported for kexec only; 3.
The vmallocinfo and vread() have to be reworked to be able to handle multiple nodes. Link: https://lkml.kernel.org/r/20240102184633.748113-5-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 240 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 173 insertions(+), 67 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 06bd843d18ae99..786ecb18ae228b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -728,11 +728,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn); #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 -static DEFINE_SPINLOCK(vmap_area_lock); static DEFINE_SPINLOCK(free_vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); -static struct rb_root vmap_area_root = RB_ROOT; static bool vmap_initialized __read_mostly; static struct rb_root purge_vmap_area_root = RB_ROOT; @@ -772,6 +770,38 @@ static struct rb_root free_vmap_area_root = RB_ROOT; */ static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node); +/* + * An effective vmap-node logic. Users make use of nodes instead + * of a global heap. It allows to balance an access and mitigate + * contention. + */ +struct rb_list { + struct rb_root root; + struct list_head head; + spinlock_t lock; +}; + +static struct vmap_node { + /* Bookkeeping data of this node. */ + struct rb_list busy; +} single; + +static struct vmap_node *vmap_nodes = &single; +static __read_mostly unsigned int nr_vmap_nodes = 1; +static __read_mostly unsigned int vmap_zone_size = 1; + +static inline unsigned int +addr_to_node_id(unsigned long addr) +{ + return (addr / vmap_zone_size) % nr_vmap_nodes; +} + +static inline struct vmap_node * +addr_to_node(unsigned long addr) +{ + return &vmap_nodes[addr_to_node_id(addr)]; +} + static __always_inline unsigned long va_size(struct vmap_area *va) { @@ -803,10 +833,11 @@ unsigned long vmalloc_nr_pages(void) } /* Look up the first VA which satisfies addr < va_end, NULL if none. */ -static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) +static struct vmap_area * +find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root) { struct vmap_area *va = NULL; - struct rb_node *n = vmap_area_root.rb_node; + struct rb_node *n = root->rb_node; addr = (unsigned long)kasan_reset_tag((void *)addr); @@ -1552,12 +1583,14 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head, */ static void free_vmap_area(struct vmap_area *va) { + struct vmap_node *vn = addr_to_node(va->va_start); + /* * Remove from the busy tree/list. */ - spin_lock(&vmap_area_lock); - unlink_va(va, &vmap_area_root); - spin_unlock(&vmap_area_lock); + spin_lock(&vn->busy.lock); + unlink_va(va, &vn->busy.root); + spin_unlock(&vn->busy.lock); /* * Insert/Merge it back to the free tree/list. 
@@ -1600,6 +1633,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, int node, gfp_t gfp_mask, unsigned long va_flags) { + struct vmap_node *vn; struct vmap_area *va; unsigned long freed; unsigned long addr; @@ -1645,9 +1679,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, va->vm = NULL; va->flags = va_flags; - spin_lock(&vmap_area_lock); - insert_vmap_area(va, &vmap_area_root, &vmap_area_list); - spin_unlock(&vmap_area_lock); + vn = addr_to_node(va->va_start); + + spin_lock(&vn->busy.lock); + insert_vmap_area(va, &vn->busy.root, &vn->busy.head); + spin_unlock(&vn->busy.lock); BUG_ON(!IS_ALIGNED(va->va_start, align)); BUG_ON(va->va_start < vstart); @@ -1871,26 +1907,61 @@ static void free_unmap_vmap_area(struct vmap_area *va) struct vmap_area *find_vmap_area(unsigned long addr) { + struct vmap_node *vn; struct vmap_area *va; + int i, j; - spin_lock(&vmap_area_lock); - va = __find_vmap_area(addr, &vmap_area_root); - spin_unlock(&vmap_area_lock); + /* + * An addr_to_node_id(addr) converts an address to a node index + * where a VA is located. If VA spans several zones and passed + * addr is not the same as va->va_start, what is not common, we + * may need to scan an extra nodes. See an example: + * + * <--va--> + * -|-----|-----|-----|-----|- + * 1 2 0 1 + * + * VA resides in node 1 whereas it spans 1 and 2. If passed + * addr is within a second node we should do extra work. We + * should mention that it is rare and is a corner case from + * the other hand it has to be covered. + */ + i = j = addr_to_node_id(addr); + do { + vn = &vmap_nodes[i]; - return va; + spin_lock(&vn->busy.lock); + va = __find_vmap_area(addr, &vn->busy.root); + spin_unlock(&vn->busy.lock); + + if (va) + return va; + } while ((i = (i + 1) % nr_vmap_nodes) != j); + + return NULL; } static struct vmap_area *find_unlink_vmap_area(unsigned long addr) { + struct vmap_node *vn; struct vmap_area *va; + int i, j; - spin_lock(&vmap_area_lock); - va = __find_vmap_area(addr, &vmap_area_root); - if (va) - unlink_va(va, &vmap_area_root); - spin_unlock(&vmap_area_lock); + i = j = addr_to_node_id(addr); + do { + vn = &vmap_nodes[i]; - return va; + spin_lock(&vn->busy.lock); + va = __find_vmap_area(addr, &vn->busy.root); + if (va) + unlink_va(va, &vn->busy.root); + spin_unlock(&vn->busy.lock); + + if (va) + return va; + } while ((i = (i + 1) % nr_vmap_nodes) != j); + + return NULL; } /*** Per cpu kva allocator ***/ @@ -2092,6 +2163,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) static void free_vmap_block(struct vmap_block *vb) { + struct vmap_node *vn; struct vmap_block *tmp; struct xarray *xa; @@ -2099,9 +2171,10 @@ static void free_vmap_block(struct vmap_block *vb) tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start)); BUG_ON(tmp != vb); - spin_lock(&vmap_area_lock); - unlink_va(vb->va, &vmap_area_root); - spin_unlock(&vmap_area_lock); + vn = addr_to_node(vb->va->va_start); + spin_lock(&vn->busy.lock); + unlink_va(vb->va, &vn->busy.root); + spin_unlock(&vn->busy.lock); free_vmap_area_noflush(vb->va); kfree_rcu(vb, rcu_head); @@ -2525,9 +2598,11 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { - spin_lock(&vmap_area_lock); + struct vmap_node *vn = addr_to_node(va->va_start); + + spin_lock(&vn->busy.lock); setup_vmalloc_vm_locked(vm, va, flags, caller); - spin_unlock(&vmap_area_lock); + spin_unlock(&vn->busy.lock); } static void 
clear_vm_uninitialized_flag(struct vm_struct *vm) @@ -3715,6 +3790,7 @@ static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr, */ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) { + struct vmap_node *vn; struct vmap_area *va; struct vm_struct *vm; char *vaddr; @@ -3728,8 +3804,11 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) remains = count; - spin_lock(&vmap_area_lock); - va = find_vmap_area_exceed_addr((unsigned long)addr); + /* Hooked to node_0 so far. */ + vn = addr_to_node(0); + spin_lock(&vn->busy.lock); + + va = find_vmap_area_exceed_addr((unsigned long)addr, &vn->busy.root); if (!va) goto finished_zero; @@ -3737,7 +3816,7 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) if ((unsigned long)addr + remains <= va->va_start) goto finished_zero; - list_for_each_entry_from(va, &vmap_area_list, list) { + list_for_each_entry_from(va, &vn->busy.head, list) { size_t copied; if (remains == 0) @@ -3796,12 +3875,12 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) } finished_zero: - spin_unlock(&vmap_area_lock); + spin_unlock(&vn->busy.lock); /* zero-fill memory holes */ return count - remains + zero_iter(iter, remains); finished: /* Nothing remains, or We couldn't copy/zero everything. */ - spin_unlock(&vmap_area_lock); + spin_unlock(&vn->busy.lock); return count - remains; } @@ -4135,14 +4214,15 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, } /* insert all vm's */ - spin_lock(&vmap_area_lock); for (area = 0; area < nr_vms; area++) { - insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list); + struct vmap_node *vn = addr_to_node(vas[area]->va_start); + spin_lock(&vn->busy.lock); + insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head); setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC, pcpu_get_vm_areas); + spin_unlock(&vn->busy.lock); } - spin_unlock(&vmap_area_lock); /* * Mark allocated areas as accessible. 
Do it now as a best-effort @@ -4253,55 +4333,57 @@ bool vmalloc_dump_obj(void *object) { void *objp = (void *)PAGE_ALIGN((unsigned long)object); const void *caller; - struct vm_struct *vm; struct vmap_area *va; + struct vmap_node *vn; unsigned long addr; unsigned int nr_pages; + bool success = false; - if (!spin_trylock(&vmap_area_lock)) - return false; - va = __find_vmap_area((unsigned long)objp, &vmap_area_root); - if (!va) { - spin_unlock(&vmap_area_lock); - return false; - } + vn = addr_to_node((unsigned long)objp); - vm = va->vm; - if (!vm) { - spin_unlock(&vmap_area_lock); - return false; + if (spin_trylock(&vn->busy.lock)) { + va = __find_vmap_area(addr, &vn->busy.root); + + if (va && va->vm) { + addr = (unsigned long)va->vm->addr; + caller = va->vm->caller; + nr_pages = va->vm->nr_pages; + success = true; + } + + spin_unlock(&vn->busy.lock); } - addr = (unsigned long)vm->addr; - caller = vm->caller; - nr_pages = vm->nr_pages; - spin_unlock(&vmap_area_lock); - pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", - nr_pages, addr, caller); - return true; + + if (success) + pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", + nr_pages, addr, caller); + + return success; } #endif #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) - __acquires(&vmap_purge_lock) - __acquires(&vmap_area_lock) { + struct vmap_node *vn = addr_to_node(0); + mutex_lock(&vmap_purge_lock); - spin_lock(&vmap_area_lock); + spin_lock(&vn->busy.lock); - return seq_list_start(&vmap_area_list, *pos); + return seq_list_start(&vn->busy.head, *pos); } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { - return seq_list_next(p, &vmap_area_list, pos); + struct vmap_node *vn = addr_to_node(0); + return seq_list_next(p, &vn->busy.head, pos); } static void s_stop(struct seq_file *m, void *p) - __releases(&vmap_area_lock) - __releases(&vmap_purge_lock) { - spin_unlock(&vmap_area_lock); + struct vmap_node *vn = addr_to_node(0); + + spin_unlock(&vn->busy.lock); mutex_unlock(&vmap_purge_lock); } @@ -4344,9 +4426,11 @@ static void show_purge_info(struct seq_file *m) static int s_show(struct seq_file *m, void *p) { + struct vmap_node *vn; struct vmap_area *va; struct vm_struct *v; + vn = addr_to_node(0); va = list_entry(p, struct vmap_area, list); if (!va->vm) { @@ -4397,7 +4481,7 @@ static int s_show(struct seq_file *m, void *p) * As a final step, dump "unpurged" areas. 
*/ final: - if (list_is_last(&va->list, &vmap_area_list)) + if (list_is_last(&va->list, &vn->busy.head)) show_purge_info(m); return 0; @@ -4428,7 +4512,8 @@ static void vmap_init_free_space(void) { unsigned long vmap_start = 1; const unsigned long vmap_end = ULONG_MAX; - struct vmap_area *busy, *free; + struct vmap_area *free; + struct vm_struct *busy; /* * B F B B B F @@ -4436,12 +4521,12 @@ static void vmap_init_free_space(void) * | The KVA space | * |<--------------------------------->| */ - list_for_each_entry(busy, &vmap_area_list, list) { - if (busy->va_start - vmap_start > 0) { + for (busy = vmlist; busy; busy = busy->next) { + if ((unsigned long) busy->addr - vmap_start > 0) { free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (!WARN_ON_ONCE(!free)) { free->va_start = vmap_start; - free->va_end = busy->va_start; + free->va_end = (unsigned long) busy->addr; insert_vmap_area_augment(free, NULL, &free_vmap_area_root, @@ -4449,7 +4534,7 @@ static void vmap_init_free_space(void) } } - vmap_start = busy->va_end; + vmap_start = (unsigned long) busy->addr + busy->size; } if (vmap_end - vmap_start > 0) { @@ -4465,9 +4550,23 @@ static void vmap_init_free_space(void) } } +static void vmap_init_nodes(void) +{ + struct vmap_node *vn; + int i; + + for (i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; + vn->busy.root = RB_ROOT; + INIT_LIST_HEAD(&vn->busy.head); + spin_lock_init(&vn->busy.lock); + } +} + void __init vmalloc_init(void) { struct vmap_area *va; + struct vmap_node *vn; struct vm_struct *tmp; int i; @@ -4489,6 +4588,11 @@ void __init vmalloc_init(void) xa_init(&vbq->vmap_blocks); } + /* + * Setup nodes before importing vmlist. + */ + vmap_init_nodes(); + /* Import existing vmlist entries. */ for (tmp = vmlist; tmp; tmp = tmp->next) { va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); @@ -4498,7 +4602,9 @@ void __init vmalloc_init(void) va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; va->vm = tmp; - insert_vmap_area(va, &vmap_area_root, &vmap_area_list); + + vn = addr_to_node(va->va_start); + insert_vmap_area(va, &vn->busy.root, &vn->busy.head); } /* From 66fdefb670445005da9c2320185daa1188635471 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 11 Jan 2024 14:26:28 +0100 Subject: [PATCH 0967/1406] mm: vmalloc: mark vmap_init_free_space() with __init tag vmap_init_free_space() is called only once, therefore tag it with __init. Apart from that, it accesses the "vmlist" variable, which is located in the ".init.data" section. Link: https://lkml.kernel.org/r/20240111132628.299644-1-urezki@gmail.com Fixes: 86817057732a ("mm: vmalloc: remove global vmap_area_root rb-tree") Signed-off-by: Uladzislau Rezki (Sony) Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401112056.I41bELL4-lkp@intel.com/ Reviewed-by: Christoph Hellwig Reviewed-by: Lorenzo Stoakes Reviewed-by: Anshuman Khandual Cc: Baoquan He Cc: Dave Chinner Cc: Liam R.
Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 786ecb18ae228b..666ea8a379f6bb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4508,7 +4508,7 @@ module_init(proc_vmalloc_init); #endif -static void vmap_init_free_space(void) +static void __init vmap_init_free_space(void) { unsigned long vmap_start = 1; const unsigned long vmap_end = ULONG_MAX; From b1d0e7c2b401e4765dcea227172c5bc7eb1723b9 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 11 Jan 2024 13:11:04 +0100 Subject: [PATCH 0968/1406] fix a wrong value passed to __find_vmap_area() There was a typo in the vmalloc_dump_obj() function. Instead of passing the real address, which is "objp", an "addr" was used, which is wrong and not initialized. Link: https://lkml.kernel.org/r/20240111121104.180993-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reported-by: kernel test robot Fixes: 86817057732a ("mm: vmalloc: remove global vmap_area_root rb-tree") Closes: https://lore.kernel.org/oe-kbuild-all/202401111810.TKPIXLCs-lkp@intel.com/ Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nathan Chancellor Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 666ea8a379f6bb..86efebf0e0c8a7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4342,7 +4342,7 @@ bool vmalloc_dump_obj(void *object) vn = addr_to_node((unsigned long)objp); if (spin_trylock(&vn->busy.lock)) { - va = __find_vmap_area(addr, &vn->busy.root); + va = __find_vmap_area((unsigned long)objp, &vn->busy.root); if (va && va->vm) { addr = (unsigned long)va->vm->addr; From 378eb24a0658dd922b29524e0ce35c6c43f56cba Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 2 Jan 2024 19:46:27 +0100 Subject: [PATCH 0969/1406] mm/vmalloc: remove vmap_area_list Earlier, vmap_area_list was exported to vmcoreinfo so that makedumpfile can get the base address of the vmalloc area. Now, vmap_area_list is empty, so export VMALLOC_START to vmcoreinfo instead, and remove vmap_area_list. Link: https://lkml.kernel.org/r/20240102184633.748113-6-urezki@gmail.com Signed-off-by: Baoquan He Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Lorenzo Stoakes Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- Documentation/admin-guide/kdump/vmcoreinfo.rst | 8 ++++---- arch/arm64/kernel/crash_core.c | 1 - arch/riscv/kernel/crash_core.c | 1 - include/linux/vmalloc.h | 1 - kernel/crash_core.c | 4 +--- kernel/kallsyms_selftest.c | 1 - mm/nommu.c | 2 -- mm/vmalloc.c | 2 -- 8 files changed, 5 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index bced9e4b6e0899..0f714fc945acf4 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -65,11 +65,11 @@ Defines the beginning of the text section. In general, _stext indicates the kernel start address. Used to convert a virtual address from the direct kernel map to a physical address. -vmap_area_list -------------- +VMALLOC_START +------------- -Stores the virtual area list.
makedumpfile gets the vmalloc start value -from this variable and its value is necessary for vmalloc translation. +Stores the base address of vmalloc area. makedumpfile gets this value +since is necessary for vmalloc translation. mem_map ------- diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/crash_core.c index 66cde752cd7409..2a24199a9b81e0 100644 --- a/arch/arm64/kernel/crash_core.c +++ b/arch/arm64/kernel/crash_core.c @@ -23,7 +23,6 @@ void arch_crash_save_vmcoreinfo(void) /* Please note VMCOREINFO_NUMBER() uses "%d", not "%x" */ vmcoreinfo_append_str("NUMBER(MODULES_VADDR)=0x%lx\n", MODULES_VADDR); vmcoreinfo_append_str("NUMBER(MODULES_END)=0x%lx\n", MODULES_END); - vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START); vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END); vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START); vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END); diff --git a/arch/riscv/kernel/crash_core.c b/arch/riscv/kernel/crash_core.c index 8706736fd4e2dc..d18d529fd9b984 100644 --- a/arch/riscv/kernel/crash_core.c +++ b/arch/riscv/kernel/crash_core.c @@ -8,7 +8,6 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_NUMBER(phys_ram_base); vmcoreinfo_append_str("NUMBER(PAGE_OFFSET)=0x%lx\n", PAGE_OFFSET); - vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START); vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END); #ifdef CONFIG_MMU VMCOREINFO_NUMBER(VA_BITS); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index c720be70c8ddde..91810b4e95107b 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -253,7 +253,6 @@ extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count); /* * Internals. Don't use.. 
*/ -extern struct list_head vmap_area_list; extern __init void vm_area_add_early(struct vm_struct *vm); extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 75cd6a736d0306..b60de490c1fccb 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -748,7 +748,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir); #endif VMCOREINFO_SYMBOL(_stext); - VMCOREINFO_SYMBOL(vmap_area_list); + vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START); #ifndef CONFIG_NUMA VMCOREINFO_SYMBOL(mem_map); @@ -789,8 +789,6 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(free_area, free_list); VMCOREINFO_OFFSET(list_head, next); VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_OFFSET(vmap_area, va_start); - VMCOREINFO_OFFSET(vmap_area, list); VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS); log_buf_vmcoreinfo_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index b4cac76ea5e989..8a689b4ff4f982 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -89,7 +89,6 @@ static struct test_item test_items[] = { ITEM_DATA(kallsyms_test_var_data_static), ITEM_DATA(kallsyms_test_var_bss), ITEM_DATA(kallsyms_test_var_data), - ITEM_DATA(vmap_area_list), #endif }; diff --git a/mm/nommu.c b/mm/nommu.c index b6dc558d314408..5ec8f44e7ce976 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -131,8 +131,6 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL(follow_pfn); -LIST_HEAD(vmap_area_list); - void vfree(const void *addr) { kfree(addr); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 86efebf0e0c8a7..b5882790da0088 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -729,8 +729,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn); static DEFINE_SPINLOCK(free_vmap_area_lock); -/* Export for kexec only */ -LIST_HEAD(vmap_area_list); static bool vmap_initialized __read_mostly; static struct rb_root purge_vmap_area_root = RB_ROOT; From cf0ae37372e77a3967f274a950f7cb9e8c243e2d Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 11 Jan 2024 20:23:29 +0100 Subject: [PATCH 0970/1406] mm: vmalloc: Fix a warning in the crash_save_vmcoreinfo_init() The vmcoreinfo_append_str() function expects the "long unsigned int" type for its second argument (0x%lx), used to print the beginning of the vmalloc area, which is defined by the VMALLOC_START macro. For some architectures it can be considered an "int" type; for example, m68k generates a compile warning. To fix it, cast the second argument to "unsigned long".
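The class of warning being fixed here can be reproduced with ordinary C. A small userspace sketch, where the constant is hypothetical and stands in for an architecture whose macro does not expand to an unsigned long:

	#include <stdio.h>

	#define EXAMPLE_VMALLOC_START 0x10000000	/* type "int", not "unsigned long" */

	int main(void)
	{
		/* printf("0x%lx\n", EXAMPLE_VMALLOC_START); -- -Wformat: "int" passed for "%lx" */
		printf("0x%lx\n", (unsigned long)EXAMPLE_VMALLOC_START);	/* cast matches the format */
		return 0;
	}

The explicit cast makes the argument type match the format specifier on every architecture, which is exactly what the one-line change below does for VMALLOC_START.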
Link: https://lkml.kernel.org/r/20240111192329.449189-1-urezki@gmail.com Fixes: 9bdb180b2db6 ("mm/vmalloc: remove vmap_area_list") Signed-off-by: Uladzislau Rezki (Sony) Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401120218.y469Puyf-lkp@intel.com/ Acked-by: Baoquan He Reviewed-by: Anshuman Khandual Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- kernel/crash_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index b60de490c1fccb..49b31e59d3ccd1 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -748,7 +748,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir); #endif VMCOREINFO_SYMBOL(_stext); - vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START); + vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", (unsigned long) VMALLOC_START); #ifndef CONFIG_NUMA VMCOREINFO_SYMBOL(mem_map); From 4afdcd0aecea7e157352a66a97768e1a7e9fe755 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:28 +0100 Subject: [PATCH 0971/1406] mm: vmalloc: remove global purge_vmap_area_root rb-tree Similar to busy VAs, a lazily-freed area is stored in the node it belongs to. Such an approach does not require any global locking primitive; instead, access becomes scalable, which mitigates contention. This patch removes the global purge-lock, the global purge-tree and the global purge-list. Link: https://lkml.kernel.org/r/20240102184633.748113-7-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 135 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 82 insertions(+), 53 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b5882790da0088..72822aeff55c22 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -731,10 +731,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn); static DEFINE_SPINLOCK(free_vmap_area_lock); static bool vmap_initialized __read_mostly; -static struct rb_root purge_vmap_area_root = RB_ROOT; -static LIST_HEAD(purge_vmap_area_list); -static DEFINE_SPINLOCK(purge_vmap_area_lock); - /* * This kmem_cache is used for vmap_area objects. Instead of * allocating from slab we reuse an object from this cache to @@ -782,6 +778,12 @@ struct rb_list { static struct vmap_node { /* Bookkeeping data of this node. */ struct rb_list busy; + struct rb_list lazy; + + /* + * Ready-to-free areas. */ } single. hmm
*/ -static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) +static unsigned long +purge_vmap_node(struct vmap_node *vn) { - unsigned long resched_threshold; - unsigned int num_purged_areas = 0; - struct list_head local_purge_list; + unsigned long num_purged_areas = 0; struct vmap_area *va, *n_va; - lockdep_assert_held(&vmap_purge_lock); - - spin_lock(&purge_vmap_area_lock); - purge_vmap_area_root = RB_ROOT; - list_replace_init(&purge_vmap_area_list, &local_purge_list); - spin_unlock(&purge_vmap_area_lock); - - if (unlikely(list_empty(&local_purge_list))) - goto out; - - start = min(start, - list_first_entry(&local_purge_list, - struct vmap_area, list)->va_start); - - end = max(end, - list_last_entry(&local_purge_list, - struct vmap_area, list)->va_end); - - flush_tlb_kernel_range(start, end); - resched_threshold = lazy_max_pages() << 1; + if (list_empty(&vn->purge_list)) + return 0; spin_lock(&free_vmap_area_lock); - list_for_each_entry_safe(va, n_va, &local_purge_list, list) { + list_for_each_entry_safe(va, n_va, &vn->purge_list, list) { unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; unsigned long orig_start = va->va_start; unsigned long orig_end = va->va_end; @@ -1821,13 +1805,55 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) atomic_long_sub(nr, &vmap_lazy_nr); num_purged_areas++; - - if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) - cond_resched_lock(&free_vmap_area_lock); } spin_unlock(&free_vmap_area_lock); -out: + return num_purged_areas; +} + +/* + * Purges all lazily-freed vmap areas. + */ +static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) +{ + unsigned long num_purged_areas = 0; + struct vmap_node *vn; + int i; + + lockdep_assert_held(&vmap_purge_lock); + purge_nodes = CPU_MASK_NONE; + + for (i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; + + INIT_LIST_HEAD(&vn->purge_list); + + if (RB_EMPTY_ROOT(&vn->lazy.root)) + continue; + + spin_lock(&vn->lazy.lock); + WRITE_ONCE(vn->lazy.root.rb_node, NULL); + list_replace_init(&vn->lazy.head, &vn->purge_list); + spin_unlock(&vn->lazy.lock); + + start = min(start, list_first_entry(&vn->purge_list, + struct vmap_area, list)->va_start); + + end = max(end, list_last_entry(&vn->purge_list, + struct vmap_area, list)->va_end); + + cpumask_set_cpu(i, &purge_nodes); + } + + if (cpumask_weight(&purge_nodes) > 0) { + flush_tlb_kernel_range(start, end); + + for_each_cpu(i, &purge_nodes) { + vn = &nodes[i]; + num_purged_areas += purge_vmap_node(vn); + } + } + trace_purge_vmap_area_lazy(start, end, num_purged_areas); return num_purged_areas > 0; } @@ -1846,16 +1872,9 @@ static void reclaim_and_purge_vmap_areas(void) static void drain_vmap_area_work(struct work_struct *work) { - unsigned long nr_lazy; - - do { - mutex_lock(&vmap_purge_lock); - __purge_vmap_area_lazy(ULONG_MAX, 0); - mutex_unlock(&vmap_purge_lock); - - /* Recheck if further work is required. 
*/ - nr_lazy = atomic_long_read(&vmap_lazy_nr); - } while (nr_lazy > lazy_max_pages()); + mutex_lock(&vmap_purge_lock); + __purge_vmap_area_lazy(ULONG_MAX, 0); + mutex_unlock(&vmap_purge_lock); } /* @@ -1865,6 +1884,7 @@ static void drain_vmap_area_work(struct work_struct *work) */ static void free_vmap_area_noflush(struct vmap_area *va) { + struct vmap_node *vn = addr_to_node(va->va_start); unsigned long nr_lazy_max = lazy_max_pages(); unsigned long va_start = va->va_start; unsigned long nr_lazy; @@ -1878,10 +1898,9 @@ static void free_vmap_area_noflush(struct vmap_area *va) /* * Merge or place it to the purge tree/list. */ - spin_lock(&purge_vmap_area_lock); - merge_or_add_vmap_area(va, - &purge_vmap_area_root, &purge_vmap_area_list); - spin_unlock(&purge_vmap_area_lock); + spin_lock(&vn->lazy.lock); + merge_or_add_vmap_area(va, &vn->lazy.root, &vn->lazy.head); + spin_unlock(&vn->lazy.lock); trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max); @@ -4411,15 +4430,21 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) static void show_purge_info(struct seq_file *m) { + struct vmap_node *vn; struct vmap_area *va; + int i; - spin_lock(&purge_vmap_area_lock); - list_for_each_entry(va, &purge_vmap_area_list, list) { - seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", - (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start); + for (i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; + + spin_lock(&vn->lazy.lock); + list_for_each_entry(va, &vn->lazy.head, list) { + seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start); + } + spin_unlock(&vn->lazy.lock); } - spin_unlock(&purge_vmap_area_lock); } static int s_show(struct seq_file *m, void *p) @@ -4558,6 +4583,10 @@ static void vmap_init_nodes(void) vn->busy.root = RB_ROOT; INIT_LIST_HEAD(&vn->busy.head); spin_lock_init(&vn->busy.lock); + + vn->lazy.root = RB_ROOT; + INIT_LIST_HEAD(&vn->lazy.head); + spin_lock_init(&vn->lazy.lock); } } From 128c02e74f744c715a15d531d0e38e47f093712b Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:29 +0100 Subject: [PATCH 0972/1406] mm: vmalloc: offload free_vmap_area_lock lock Concurrent access to the global vmap space is a bottleneck. We can simulate high contention by running a vmalloc test suite. To address it, introduce an effective vmap node logic. Each node behaves as an independent entity. When a node is accessed, it serves a request directly (if possible) from its pool. This model has a size-based pool for requests, i.e. pools are serialized and populated based on object size and real demand. The maximum object size that a pool can handle is set to 256 pages. This technique reduces pressure on the global vmap lock. Link: https://lkml.kernel.org/r/20240102184633.748113-8-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E.
McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 387 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 342 insertions(+), 45 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 72822aeff55c22..e8b9621ea02b46 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -775,7 +775,22 @@ struct rb_list { spinlock_t lock; }; +struct vmap_pool { + struct list_head head; + unsigned long len; +}; + +/* + * A fast size storage contains VAs up to 1M size. + */ +#define MAX_VA_SIZE_PAGES 256 + static struct vmap_node { + /* Simple size segregated storage. */ + struct vmap_pool pool[MAX_VA_SIZE_PAGES]; + spinlock_t pool_lock; + bool skip_populate; + /* Bookkeeping data of this node. */ struct rb_list busy; struct rb_list lazy; @@ -784,6 +799,8 @@ static struct vmap_node { * Ready-to-free areas. */ struct list_head purge_list; + struct work_struct purge_work; + unsigned long nr_purged; } single; static struct vmap_node *vmap_nodes = &single; @@ -802,6 +819,61 @@ addr_to_node(unsigned long addr) return &vmap_nodes[addr_to_node_id(addr)]; } +static inline struct vmap_node * +id_to_node(unsigned int id) +{ + return &vmap_nodes[id % nr_vmap_nodes]; +} + +/* + * We use the value 0 to represent "no node", that is why + * an encoded value will be the node-id incremented by 1. + * It is always greater then 0. A valid node_id which can + * be encoded is [0:nr_vmap_nodes - 1]. If a passed node_id + * is not valid 0 is returned. + */ +static unsigned int +encode_vn_id(unsigned int node_id) +{ + /* Can store U8_MAX [0:254] nodes. */ + if (node_id < nr_vmap_nodes) + return (node_id + 1) << BITS_PER_BYTE; + + /* Warn and no node encoded. */ + WARN_ONCE(1, "Encode wrong node id (%u)\n", node_id); + return 0; +} + +/* + * Returns an encoded node-id, the valid range is within + * [0:nr_vmap_nodes-1] values. Otherwise nr_vmap_nodes is + * returned if extracted data is wrong. + */ +static unsigned int +decode_vn_id(unsigned int val) +{ + unsigned int node_id = (val >> BITS_PER_BYTE) - 1; + + /* Can store U8_MAX [0:254] nodes. */ + if (node_id < nr_vmap_nodes) + return node_id; + + /* If it was _not_ zero, warn. 
*/ + WARN_ONCE(node_id != UINT_MAX, + "Decode wrong node id (%d)\n", node_id); + + return nr_vmap_nodes; +} + +static bool +is_vn_id_valid(unsigned int node_id) +{ + if (node_id < nr_vmap_nodes) + return true; + + return false; +} + static __always_inline unsigned long va_size(struct vmap_area *va) { @@ -1623,6 +1695,104 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) kmem_cache_free(vmap_area_cachep, va); } +static struct vmap_pool * +size_to_va_pool(struct vmap_node *vn, unsigned long size) +{ + unsigned int idx = (size - 1) / PAGE_SIZE; + + if (idx < MAX_VA_SIZE_PAGES) + return &vn->pool[idx]; + + return NULL; +} + +static bool +node_pool_add_va(struct vmap_node *n, struct vmap_area *va) +{ + struct vmap_pool *vp; + + vp = size_to_va_pool(n, va_size(va)); + if (!vp) + return false; + + spin_lock(&n->pool_lock); + list_add(&va->list, &vp->head); + WRITE_ONCE(vp->len, vp->len + 1); + spin_unlock(&n->pool_lock); + + return true; +} + +static struct vmap_area * +node_pool_del_va(struct vmap_node *vn, unsigned long size, + unsigned long align, unsigned long vstart, + unsigned long vend) +{ + struct vmap_area *va = NULL; + struct vmap_pool *vp; + int err = 0; + + vp = size_to_va_pool(vn, size); + if (!vp || list_empty(&vp->head)) + return NULL; + + spin_lock(&vn->pool_lock); + if (!list_empty(&vp->head)) { + va = list_first_entry(&vp->head, struct vmap_area, list); + + if (IS_ALIGNED(va->va_start, align)) { + /* + * Do some sanity check and emit a warning + * if one of below checks detects an error. + */ + err |= (va_size(va) != size); + err |= (va->va_start < vstart); + err |= (va->va_end > vend); + + if (!WARN_ON_ONCE(err)) { + list_del_init(&va->list); + WRITE_ONCE(vp->len, vp->len - 1); + } else { + va = NULL; + } + } else { + list_move_tail(&va->list, &vp->head); + va = NULL; + } + } + spin_unlock(&vn->pool_lock); + + return va; +} + +static struct vmap_area * +node_alloc(unsigned long size, unsigned long align, + unsigned long vstart, unsigned long vend, + unsigned long *addr, unsigned int *vn_id) +{ + struct vmap_area *va; + + *vn_id = 0; + *addr = vend; + + /* + * Fallback to a global heap if not vmalloc or there + * is only one node. + */ + if (vstart != VMALLOC_START || vend != VMALLOC_END || + nr_vmap_nodes == 1) + return NULL; + + *vn_id = raw_smp_processor_id() % nr_vmap_nodes; + va = node_pool_del_va(id_to_node(*vn_id), size, align, vstart, vend); + *vn_id = encode_vn_id(*vn_id); + + if (va) + *addr = va->va_start; + + return va; +} + /* * Allocate a region of KVA of the specified size and alignment, within the * vstart and vend. @@ -1637,6 +1807,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, struct vmap_area *va; unsigned long freed; unsigned long addr; + unsigned int vn_id; int purged = 0; int ret; @@ -1647,11 +1818,23 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, return ERR_PTR(-EBUSY); might_sleep(); - gfp_mask = gfp_mask & GFP_RECLAIM_MASK; - va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); - if (unlikely(!va)) - return ERR_PTR(-ENOMEM); + /* + * If a VA is obtained from a global heap(if it fails here) + * it is anyway marked with this "vn_id" so it is returned + * to this pool's node later. Such way gives a possibility + * to populate pools based on users demand. + * + * On success a ready to go VA is returned. 
+ */ + va = node_alloc(size, align, vstart, vend, &addr, &vn_id); + if (!va) { + gfp_mask = gfp_mask & GFP_RECLAIM_MASK; + + va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); + if (unlikely(!va)) + return ERR_PTR(-ENOMEM); + } /* * Only scan the relevant parts containing pointers to other objects @@ -1660,10 +1843,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); retry: - preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); - addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, - size, align, vstart, vend); - spin_unlock(&free_vmap_area_lock); + if (addr == vend) { + preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); + addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, + size, align, vstart, vend); + spin_unlock(&free_vmap_area_lock); + } trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend); @@ -1677,7 +1862,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, va->va_start = addr; va->va_end = addr + size; va->vm = NULL; - va->flags = va_flags; + va->flags = (va_flags | vn_id); vn = addr_to_node(va->va_start); @@ -1770,63 +1955,135 @@ static DEFINE_MUTEX(vmap_purge_lock); static void purge_fragmented_blocks_allcpus(void); static cpumask_t purge_nodes; -/* - * Purges all lazily-freed vmap areas. - */ -static unsigned long -purge_vmap_node(struct vmap_node *vn) +static void +reclaim_list_global(struct list_head *head) { - unsigned long num_purged_areas = 0; - struct vmap_area *va, *n_va; + struct vmap_area *va, *n; - if (list_empty(&vn->purge_list)) - return 0; + if (list_empty(head)) + return; spin_lock(&free_vmap_area_lock); + list_for_each_entry_safe(va, n, head, list) + merge_or_add_vmap_area_augment(va, + &free_vmap_area_root, &free_vmap_area_list); + spin_unlock(&free_vmap_area_lock); +} + +static void +decay_va_pool_node(struct vmap_node *vn, bool full_decay) +{ + struct vmap_area *va, *nva; + struct list_head decay_list; + struct rb_root decay_root; + unsigned long n_decay; + int i; + + decay_root = RB_ROOT; + INIT_LIST_HEAD(&decay_list); + + for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { + struct list_head tmp_list; + + if (list_empty(&vn->pool[i].head)) + continue; + + INIT_LIST_HEAD(&tmp_list); + + /* Detach the pool, so no-one can access it. */ + spin_lock(&vn->pool_lock); + list_replace_init(&vn->pool[i].head, &tmp_list); + spin_unlock(&vn->pool_lock); + + if (full_decay) + WRITE_ONCE(vn->pool[i].len, 0); + + /* Decay a pool by ~25% out of left objects. */ + n_decay = vn->pool[i].len >> 2; + + list_for_each_entry_safe(va, nva, &tmp_list, list) { + list_del_init(&va->list); + merge_or_add_vmap_area(va, &decay_root, &decay_list); + + if (!full_decay) { + WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1); + + if (!--n_decay) + break; + } + } + + /* Attach the pool back if it has been partly decayed. 
*/ + if (!full_decay && !list_empty(&tmp_list)) { + spin_lock(&vn->pool_lock); + list_replace_init(&tmp_list, &vn->pool[i].head); + spin_unlock(&vn->pool_lock); + } + } + + reclaim_list_global(&decay_list); +} + +static void purge_vmap_node(struct work_struct *work) +{ + struct vmap_node *vn = container_of(work, + struct vmap_node, purge_work); + struct vmap_area *va, *n_va; + LIST_HEAD(local_list); + + vn->nr_purged = 0; + list_for_each_entry_safe(va, n_va, &vn->purge_list, list) { unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; unsigned long orig_start = va->va_start; unsigned long orig_end = va->va_end; + unsigned int vn_id = decode_vn_id(va->flags); - /* - * Finally insert or merge lazily-freed area. It is - * detached and there is no need to "unlink" it from - * anything. - */ - va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root, - &free_vmap_area_list); - - if (!va) - continue; + list_del_init(&va->list); if (is_vmalloc_or_module_addr((void *)orig_start)) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); atomic_long_sub(nr, &vmap_lazy_nr); - num_purged_areas++; + vn->nr_purged++; + + if (is_vn_id_valid(vn_id) && !vn->skip_populate) + if (node_pool_add_va(vn, va)) + continue; + + /* Go back to global. */ + list_add(&va->list, &local_list); } - spin_unlock(&free_vmap_area_lock); - return num_purged_areas; + reclaim_list_global(&local_list); } /* * Purges all lazily-freed vmap areas. */ -static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) +static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end, + bool full_pool_decay) { - unsigned long num_purged_areas = 0; + unsigned long nr_purged_areas = 0; + unsigned int nr_purge_helpers; + unsigned int nr_purge_nodes; struct vmap_node *vn; int i; lockdep_assert_held(&vmap_purge_lock); + + /* + * Use cpumask to mark which node has to be processed. + */ purge_nodes = CPU_MASK_NONE; for (i = 0; i < nr_vmap_nodes; i++) { vn = &vmap_nodes[i]; INIT_LIST_HEAD(&vn->purge_list); + vn->skip_populate = full_pool_decay; + decay_va_pool_node(vn, full_pool_decay); if (RB_EMPTY_ROOT(&vn->lazy.root)) continue; @@ -1845,17 +2102,45 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) cpumask_set_cpu(i, &purge_nodes); } - if (cpumask_weight(&purge_nodes) > 0) { + nr_purge_nodes = cpumask_weight(&purge_nodes); + if (nr_purge_nodes > 0) { flush_tlb_kernel_range(start, end); + /* One extra worker is per a lazy_max_pages() full set minus one. 
*/ + nr_purge_helpers = atomic_long_read(&vmap_lazy_nr) / lazy_max_pages(); + nr_purge_helpers = clamp(nr_purge_helpers, 1U, nr_purge_nodes) - 1; + for_each_cpu(i, &purge_nodes) { - vn = &nodes[i]; - num_purged_areas += purge_vmap_node(vn); + vn = &vmap_nodes[i]; + + if (nr_purge_helpers > 0) { + INIT_WORK(&vn->purge_work, purge_vmap_node); + + if (cpumask_test_cpu(i, cpu_online_mask)) + schedule_work_on(i, &vn->purge_work); + else + schedule_work(&vn->purge_work); + + nr_purge_helpers--; + } else { + vn->purge_work.func = NULL; + purge_vmap_node(&vn->purge_work); + nr_purged_areas += vn->nr_purged; + } + } + + for_each_cpu(i, &purge_nodes) { + vn = &vmap_nodes[i]; + + if (vn->purge_work.func) { + flush_work(&vn->purge_work); + nr_purged_areas += vn->nr_purged; + } } } - trace_purge_vmap_area_lazy(start, end, num_purged_areas); - return num_purged_areas > 0; + trace_purge_vmap_area_lazy(start, end, nr_purged_areas); + return nr_purged_areas > 0; } /* @@ -1866,14 +2151,14 @@ static void reclaim_and_purge_vmap_areas(void) { mutex_lock(&vmap_purge_lock); purge_fragmented_blocks_allcpus(); - __purge_vmap_area_lazy(ULONG_MAX, 0); + __purge_vmap_area_lazy(ULONG_MAX, 0, true); mutex_unlock(&vmap_purge_lock); } static void drain_vmap_area_work(struct work_struct *work) { mutex_lock(&vmap_purge_lock); - __purge_vmap_area_lazy(ULONG_MAX, 0); + __purge_vmap_area_lazy(ULONG_MAX, 0, false); mutex_unlock(&vmap_purge_lock); } @@ -1884,9 +2169,10 @@ static void drain_vmap_area_work(struct work_struct *work) */ static void free_vmap_area_noflush(struct vmap_area *va) { - struct vmap_node *vn = addr_to_node(va->va_start); unsigned long nr_lazy_max = lazy_max_pages(); unsigned long va_start = va->va_start; + unsigned int vn_id = decode_vn_id(va->flags); + struct vmap_node *vn; unsigned long nr_lazy; if (WARN_ON_ONCE(!list_empty(&va->list))) @@ -1896,10 +2182,14 @@ static void free_vmap_area_noflush(struct vmap_area *va) PAGE_SHIFT, &vmap_lazy_nr); /* - * Merge or place it to the purge tree/list. + * If it was request by a certain node we would like to + * return it to that node, i.e. its pool for later reuse. */ + vn = is_vn_id_valid(vn_id) ? 
+ id_to_node(vn_id):addr_to_node(va->va_start); spin_lock(&vn->lazy.lock); - merge_or_add_vmap_area(va, &vn->lazy.root, &vn->lazy.head); + insert_vmap_area(va, &vn->lazy.root, &vn->lazy.head); spin_unlock(&vn->lazy.lock); trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max); @@ -2408,7 +2698,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) } free_purged_blocks(&purge_list); - if (!__purge_vmap_area_lazy(start, end) && flush) + if (!__purge_vmap_area_lazy(start, end, false) && flush) flush_tlb_kernel_range(start, end); mutex_unlock(&vmap_purge_lock); } @@ -4576,7 +4866,7 @@ static void __init vmap_init_free_space(void) static void vmap_init_nodes(void) { struct vmap_node *vn; - int i; + int i, j; for (i = 0; i < nr_vmap_nodes; i++) { vn = &vmap_nodes[i]; @@ -4587,6 +4877,13 @@ static void vmap_init_nodes(void) vn->lazy.root = RB_ROOT; INIT_LIST_HEAD(&vn->lazy.head); spin_lock_init(&vn->lazy.lock); + + for (j = 0; j < MAX_VA_SIZE_PAGES; j++) { + INIT_LIST_HEAD(&vn->pool[j].head); + WRITE_ONCE(vn->pool[j].len, 0); + } + + spin_lock_init(&vn->pool_lock); } } From 7b2f35747de103fded3f8881e11880233f94c1b4 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 2 Feb 2024 20:06:28 +0100 Subject: [PATCH 0973/1406] mm: vmalloc: add a scan area of VA only once Invoke the kmemleak_scan_area() function only for newly allocated objects to add a scan area within that object. There is no reason to add the same scan area (a pointer to the beginning of, or inside, the object) several times. If a VA is obtained from the cache, its scan area has already been associated. Link: https://lkml.kernel.org/r/20240202190628.47806-1-urezki@gmail.com Fixes: 7db166b4aa0d ("mm: vmalloc: offload free_vmap_area_lock lock") Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Lorenzo Stoakes Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e8b9621ea02b46..75e96cf377ef34 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1834,13 +1834,13 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); - } - /* - * Only scan the relevant parts containing pointers to other objects - * to avoid false negatives. - */ - kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); + /* + * Only scan the relevant parts containing pointers to other objects + * to avoid false negatives. + */ + kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); + } retry: if (addr == vend) { From 1ca0cbfe1f489a789cc4f7906fa440acd3f3c882 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:30 +0100 Subject: [PATCH 0974/1406] mm: vmalloc: support multiple nodes in vread_iter Extend vread_iter() to be able to perform a sequential read of VAs which are spread among multiple nodes. This way a data read over /dev/kmem correctly reflects the vmalloc memory layout. Link: https://lkml.kernel.org/r/20240102184633.748113-9-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E.
McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 67 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 53 insertions(+), 14 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 75e96cf377ef34..5a2db5b66333b9 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -906,7 +906,7 @@ unsigned long vmalloc_nr_pages(void) /* Look up the first VA which satisfies addr < va_end, NULL if none. */ static struct vmap_area * -find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root) +__find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root) { struct vmap_area *va = NULL; struct rb_node *n = root->rb_node; @@ -930,6 +930,41 @@ find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root) return va; } +/* + * Returns a node where a first VA, that satisfies addr < va_end, resides. + * If success, a node is locked. A user is responsible to unlock it when a + * VA is no longer needed to be accessed. + * + * Returns NULL if nothing found. + */ +static struct vmap_node * +find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va) +{ + struct vmap_node *vn, *va_node = NULL; + struct vmap_area *va_lowest; + int i; + + for (i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; + + spin_lock(&vn->busy.lock); + va_lowest = __find_vmap_area_exceed_addr(addr, &vn->busy.root); + if (va_lowest) { + if (!va_node || va_lowest->va_start < (*va)->va_start) { + if (va_node) + spin_unlock(&va_node->busy.lock); + + *va = va_lowest; + va_node = vn; + continue; + } + } + spin_unlock(&vn->busy.lock); + } + + return va_node; +} + static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) { struct rb_node *n = root->rb_node; @@ -4102,6 +4137,7 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) struct vm_struct *vm; char *vaddr; size_t n, size, flags, remains; + unsigned long next; addr = kasan_reset_tag(addr); @@ -4111,19 +4147,15 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) remains = count; - /* Hooked to node_0 so far. */ - vn = addr_to_node(0); - spin_lock(&vn->busy.lock); - - va = find_vmap_area_exceed_addr((unsigned long)addr, &vn->busy.root); - if (!va) + vn = find_vmap_area_exceed_addr_lock((unsigned long) addr, &va); + if (!vn) goto finished_zero; /* no intersects with alive vmap_area */ if ((unsigned long)addr + remains <= va->va_start) goto finished_zero; - list_for_each_entry_from(va, &vn->busy.head, list) { + do { size_t copied; if (remains == 0) @@ -4138,10 +4170,10 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) WARN_ON(flags == VMAP_BLOCK); if (!vm && !flags) - continue; + goto next_va; if (vm && (vm->flags & VM_UNINITIALIZED)) - continue; + goto next_va; /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ smp_rmb(); @@ -4150,7 +4182,7 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) size = vm ? 
get_vm_area_size(vm) : va_size(va); if (addr >= vaddr + size) - continue; + goto next_va; if (addr < vaddr) { size_t to_zero = min_t(size_t, vaddr - addr, remains); @@ -4179,15 +4211,22 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) if (copied != n) goto finished; - } + + next_va: + next = va->va_end; + spin_unlock(&vn->busy.lock); + } while ((vn = find_vmap_area_exceed_addr_lock(next, &va))); finished_zero: - spin_unlock(&vn->busy.lock); + if (vn) + spin_unlock(&vn->busy.lock); + /* zero-fill memory holes */ return count - remains + zero_iter(iter, remains); finished: /* Nothing remains, or We couldn't copy/zero everything. */ - spin_unlock(&vn->busy.lock); + if (vn) + spin_unlock(&vn->busy.lock); return count - remains; } From 3c31a619309fc4883e2efb5a6d58219e4df897e1 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:31 +0100 Subject: [PATCH 0975/1406] mm: vmalloc: support multiple nodes in vmallocinfo Allocated areas are spread among nodes, it implies that the scanning has to be performed individually of each node in order to dump all existing VAs. Link: https://lkml.kernel.org/r/20240102184633.748113-10-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 120 ++++++++++++++++++++------------------------------- 1 file changed, 47 insertions(+), 73 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5a2db5b66333b9..41f924a9b52e95 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4709,30 +4709,6 @@ bool vmalloc_dump_obj(void *object) #endif #ifdef CONFIG_PROC_FS -static void *s_start(struct seq_file *m, loff_t *pos) -{ - struct vmap_node *vn = addr_to_node(0); - - mutex_lock(&vmap_purge_lock); - spin_lock(&vn->busy.lock); - - return seq_list_start(&vn->busy.head, *pos); -} - -static void *s_next(struct seq_file *m, void *p, loff_t *pos) -{ - struct vmap_node *vn = addr_to_node(0); - return seq_list_next(p, &vn->busy.head, pos); -} - -static void s_stop(struct seq_file *m, void *p) -{ - struct vmap_node *vn = addr_to_node(0); - - spin_unlock(&vn->busy.lock); - mutex_unlock(&vmap_purge_lock); -} - static void show_numa_info(struct seq_file *m, struct vm_struct *v) { if (IS_ENABLED(CONFIG_NUMA)) { @@ -4776,84 +4752,82 @@ static void show_purge_info(struct seq_file *m) } } -static int s_show(struct seq_file *m, void *p) +static int vmalloc_info_show(struct seq_file *m, void *p) { struct vmap_node *vn; struct vmap_area *va; struct vm_struct *v; + int i; - vn = addr_to_node(0); - va = list_entry(p, struct vmap_area, list); + for (i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; - if (!va->vm) { - if (va->flags & VMAP_RAM) - seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", - (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start); + spin_lock(&vn->busy.lock); + list_for_each_entry(va, &vn->busy.head, list) { + if (!va->vm) { + if (va->flags & VMAP_RAM) + seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start); - goto final; - } + continue; + } - v = va->vm; + v = va->vm; - seq_printf(m, "0x%pK-0x%pK %7ld", - v->addr, v->addr + v->size, v->size); + seq_printf(m, "0x%pK-0x%pK %7ld", + v->addr, v->addr + v->size, v->size); - if (v->caller) - seq_printf(m, " %pS", v->caller); + if 
(v->caller) + seq_printf(m, " %pS", v->caller); - if (v->nr_pages) - seq_printf(m, " pages=%d", v->nr_pages); + if (v->nr_pages) + seq_printf(m, " pages=%d", v->nr_pages); - if (v->phys_addr) - seq_printf(m, " phys=%pa", &v->phys_addr); + if (v->phys_addr) + seq_printf(m, " phys=%pa", &v->phys_addr); - if (v->flags & VM_IOREMAP) - seq_puts(m, " ioremap"); + if (v->flags & VM_IOREMAP) + seq_puts(m, " ioremap"); - if (v->flags & VM_ALLOC) - seq_puts(m, " vmalloc"); + if (v->flags & VM_ALLOC) + seq_puts(m, " vmalloc"); - if (v->flags & VM_MAP) - seq_puts(m, " vmap"); + if (v->flags & VM_MAP) + seq_puts(m, " vmap"); - if (v->flags & VM_USERMAP) - seq_puts(m, " user"); + if (v->flags & VM_USERMAP) + seq_puts(m, " user"); - if (v->flags & VM_DMA_COHERENT) - seq_puts(m, " dma-coherent"); + if (v->flags & VM_DMA_COHERENT) + seq_puts(m, " dma-coherent"); - if (is_vmalloc_addr(v->pages)) - seq_puts(m, " vpages"); + if (is_vmalloc_addr(v->pages)) + seq_puts(m, " vpages"); - show_numa_info(m, v); - seq_putc(m, '\n'); + show_numa_info(m, v); + seq_putc(m, '\n'); + } + spin_unlock(&vn->busy.lock); + } /* * As a final step, dump "unpurged" areas. */ -final: - if (list_is_last(&va->list, &vn->busy.head)) - show_purge_info(m); - + show_purge_info(m); return 0; } -static const struct seq_operations vmalloc_op = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, -}; - static int __init proc_vmalloc_init(void) { + void *priv_data = NULL; + if (IS_ENABLED(CONFIG_NUMA)) - proc_create_seq_private("vmallocinfo", 0400, NULL, - &vmalloc_op, - nr_node_ids * sizeof(unsigned int), NULL); - else - proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op); + priv_data = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); + + proc_create_single_data("vmallocinfo", + 0400, NULL, vmalloc_info_show, priv_data); + return 0; } module_init(proc_vmalloc_init); From 30c80444eeb46618c0083262af119ac5145cba0f Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:32 +0100 Subject: [PATCH 0976/1406] mm: vmalloc: set nr_nodes based on CPUs in a system A number of nodes which are used in the alloc/free paths is set based on num_possible_cpus() in a system. Please note a high limit threshold though is fixed and corresponds to 128 nodes. For 32-bit or single core systems an access to a global vmap heap is not balanced. Such small systems do not suffer from lock contentions due to low number of CPUs. In such case the nr_nodes is equal to 1. 
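As a rough sketch of the scheme (addr_to_node_id() is shown schematically here, not as the exact in-tree helper), the boot-time node count and the address-to-node mapping look like this:

	/* Boot time: one node per possible CPU, capped at 128. */
	n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);

	/* Lookup: nodes own interleaved stripes of vmap_zone_size each. */
	static inline unsigned int addr_to_node_id(unsigned long addr)
	{
		return (addr / vmap_zone_size) % nr_vmap_nodes;
	}

With the 16-page vmap_zone_size set at init, consecutive 64K stripes of the vmalloc space (for 4K pages) are spread round-robin across the nodes.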
Test on AMD Ryzen Threadripper 3970X 32-Core Processor: sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64 94.41% 0.89% [kernel] [k] _raw_spin_lock 93.35% 93.07% [kernel] [k] native_queued_spin_lock_slowpath 76.13% 0.28% [kernel] [k] __vmalloc_node_range 72.96% 0.81% [kernel] [k] alloc_vmap_area 56.94% 0.00% [kernel] [k] __get_vm_area_node 41.95% 0.00% [kernel] [k] vmalloc 37.15% 0.01% [test_vmalloc] [k] full_fit_alloc_test 35.17% 0.00% [kernel] [k] ret_from_fork_asm 35.17% 0.00% [kernel] [k] ret_from_fork 35.17% 0.00% [kernel] [k] kthread 35.08% 0.00% [test_vmalloc] [k] test_func 34.45% 0.00% [test_vmalloc] [k] fix_size_alloc_test 28.09% 0.01% [test_vmalloc] [k] long_busy_list_alloc_test 23.53% 0.25% [kernel] [k] vfree.part.0 21.72% 0.00% [kernel] [k] remove_vm_area 20.08% 0.21% [kernel] [k] find_unlink_vmap_area 2.34% 0.61% [kernel] [k] free_vmap_area_noflush vs 82.32% 0.22% [test_vmalloc] [k] long_busy_list_alloc_test 63.36% 0.02% [kernel] [k] vmalloc 63.34% 2.64% [kernel] [k] __vmalloc_node_range 30.42% 4.46% [kernel] [k] vfree.part.0 28.98% 2.51% [kernel] [k] __alloc_pages_bulk 27.28% 0.19% [kernel] [k] __get_vm_area_node 26.13% 1.50% [kernel] [k] alloc_vmap_area 21.72% 21.67% [kernel] [k] clear_page_rep 19.51% 2.43% [kernel] [k] _raw_spin_lock 16.61% 16.51% [kernel] [k] native_queued_spin_lock_slowpath 13.40% 2.07% [kernel] [k] free_unref_page 10.62% 0.01% [kernel] [k] remove_vm_area 9.02% 8.73% [kernel] [k] insert_vmap_area 8.94% 0.00% [kernel] [k] ret_from_fork_asm 8.94% 0.00% [kernel] [k] ret_from_fork 8.94% 0.00% [kernel] [k] kthread 8.29% 0.00% [test_vmalloc] [k] test_func 7.81% 0.05% [test_vmalloc] [k] full_fit_alloc_test 5.30% 4.73% [kernel] [k] purge_vmap_node 4.47% 2.65% [kernel] [k] free_vmap_area_noflush confirms that a native_queued_spin_lock_slowpath goes down to 16.51% percent from 93.07%. The throughput is ~12x higher: urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64 Run the test with following parameters: run_test_mask=7 nr_threads=64 Done. Check the kernel ring buffer to see the summary. real 10m51.271s user 0m0.013s sys 0m0.187s urezki@pc638:~$ urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64 Run the test with following parameters: run_test_mask=7 nr_threads=64 Done. Check the kernel ring buffer to see the summary. real 0m51.301s user 0m0.015s sys 0m0.040s urezki@pc638:~$ Link: https://lkml.kernel.org/r/20240102184633.748113-11-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 41f924a9b52e95..4d90a6ad83aa7b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4879,10 +4879,27 @@ static void __init vmap_init_free_space(void) static void vmap_init_nodes(void) { struct vmap_node *vn; - int i, j; + int i, n; + +#if BITS_PER_LONG == 64 + /* A high threshold of max nodes is fixed and bound to 128. */ + n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); + + if (n > 1) { + vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN); + if (vn) { + /* Node partition is 16 pages. */ + vmap_zone_size = (1 << 4) * PAGE_SIZE; + nr_vmap_nodes = n; + vmap_nodes = vn; + } else { + pr_err("Failed to allocate an array. 
Disable a node layer\n"); + } + } +#endif - for (i = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; + for (n = 0; n < nr_vmap_nodes; n++) { + vn = &vmap_nodes[n]; vn->busy.root = RB_ROOT; INIT_LIST_HEAD(&vn->busy.head); spin_lock_init(&vn->busy.lock); @@ -4891,9 +4908,9 @@ static void vmap_init_nodes(void) INIT_LIST_HEAD(&vn->lazy.head); spin_lock_init(&vn->lazy.lock); - for (j = 0; j < MAX_VA_SIZE_PAGES; j++) { - INIT_LIST_HEAD(&vn->pool[j].head); - WRITE_ONCE(vn->pool[j].len, 0); + for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { + INIT_LIST_HEAD(&vn->pool[i].head); + WRITE_ONCE(vn->pool[i].len, 0); } spin_lock_init(&vn->pool_lock); From 6fba4f60efdd6500ad3b003daa29780b1256bd52 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:33 +0100 Subject: [PATCH 0977/1406] mm: vmalloc: add a shrinker to drain vmap pools The added shrinker is used to return back current cached VAs into a global vmap space, when a system enters into a low memory mode. Link: https://lkml.kernel.org/r/20240102184633.748113-12-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Kazuhito Hagio Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4d90a6ad83aa7b..6746491de381ef 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4917,8 +4917,37 @@ static void vmap_init_nodes(void) } } +static unsigned long +vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned long count; + struct vmap_node *vn; + int i, j; + + for (count = 0, i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; + + for (j = 0; j < MAX_VA_SIZE_PAGES; j++) + count += READ_ONCE(vn->pool[j].len); + } + + return count ? count : SHRINK_EMPTY; +} + +static unsigned long +vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + int i; + + for (i = 0; i < nr_vmap_nodes; i++) + decay_va_pool_node(&vmap_nodes[i], true); + + return SHRINK_STOP; +} + void __init vmalloc_init(void) { + struct shrinker *vmap_node_shrinker; struct vmap_area *va; struct vmap_node *vn; struct vm_struct *tmp; @@ -4966,4 +4995,14 @@ void __init vmalloc_init(void) */ vmap_init_free_space(); vmap_initialized = true; + + vmap_node_shrinker = shrinker_alloc(0, "vmap-node"); + if (!vmap_node_shrinker) { + pr_err("Failed to allocate vmap-node shrinker!\n"); + return; + } + + vmap_node_shrinker->count_objects = vmap_node_shrink_count; + vmap_node_shrinker->scan_objects = vmap_node_shrink_scan; + shrinker_register(vmap_node_shrinker); } From 55f13907bb8189f90145dc8d449493967ff9f2f3 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 24 Jan 2024 19:09:19 +0100 Subject: [PATCH 0978/1406] mm: vmalloc: improve description of vmap node layer This patch adds extra explanation of recently added vmap node layer based on community feedback. No functional change. 
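To illustrate the storage being documented: with 4K pages, MAX_VA_SIZE_PAGES of 256 gives the 1M upper bound, and a pool slot is selected by the VA size in pages. A minimal sketch, assuming the straightforward indexing (the in-tree helper may differ in detail):

	/* Sketch: size -> pool slot; sizes above 1M bypass the pools. */
	static struct vmap_pool *
	size_to_va_pool(struct vmap_node *vn, unsigned long size)
	{
		unsigned int idx = (size / PAGE_SIZE) - 1;

		return (idx < MAX_VA_SIZE_PAGES) ? &vn->pool[idx] : NULL;
	}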
Link: https://lkml.kernel.org/r/20240124180920.50725-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Lorenzo Stoakes Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 60 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6746491de381ef..568f6c0b1fb5ef 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -765,9 +765,10 @@ static struct rb_root free_vmap_area_root = RB_ROOT; static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node); /* - * An effective vmap-node logic. Users make use of nodes instead - * of a global heap. It allows to balance an access and mitigate - * contention. + * This structure defines a single, solid model where a list and + * rb-tree are part of one entity protected by the lock. Nodes are + * sorted in ascending order, thus for O(1) access to left/right + * neighbors a list is used as well as for sequential traversal. */ struct rb_list { struct rb_root root; @@ -775,16 +776,23 @@ struct rb_list { spinlock_t lock; }; +/* + * A fast size storage contains VAs up to 1M size. A pool consists + * of linked between each other ready to go VAs of certain sizes. + * An index in the pool-array corresponds to number of pages + 1. + */ +#define MAX_VA_SIZE_PAGES 256 + struct vmap_pool { struct list_head head; unsigned long len; }; /* - * A fast size storage contains VAs up to 1M size. + * An effective vmap-node logic. Users make use of nodes instead + * of a global heap. It allows to balance an access and mitigate + * contention. */ -#define MAX_VA_SIZE_PAGES 256 - static struct vmap_node { /* Simple size segregated storage. */ struct vmap_pool pool[MAX_VA_SIZE_PAGES]; @@ -803,6 +811,11 @@ static struct vmap_node { unsigned long nr_purged; } single; +/* + * Initial setup consists of one single node, i.e. a balancing + * is fully disabled. Later on, after vmap is initialized these + * parameters are updated based on a system capacity. + */ static struct vmap_node *vmap_nodes = &single; static __read_mostly unsigned int nr_vmap_nodes = 1; static __read_mostly unsigned int vmap_zone_size = 1; @@ -2048,7 +2061,12 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay) } } - /* Attach the pool back if it has been partly decayed. */ + /* + * Attach the pool back if it has been partly decayed. + * Please note, it is supposed that nobody(other contexts) + * can populate the pool therefore a simple list replace + * operation takes place here. + */ if (!full_decay && !list_empty(&tmp_list)) { spin_lock(&vn->pool_lock); list_replace_init(&tmp_list, &vn->pool[i].head); @@ -2257,16 +2275,14 @@ struct vmap_area *find_vmap_area(unsigned long addr) * An addr_to_node_id(addr) converts an address to a node index * where a VA is located. If VA spans several zones and passed * addr is not the same as va->va_start, what is not common, we - * may need to scan an extra nodes. See an example: + * may need to scan extra nodes. See an example: * - * <--va--> + * <----va----> * -|-----|-----|-----|-----|- * 1 2 0 1 * - * VA resides in node 1 whereas it spans 1 and 2. If passed - * addr is within a second node we should do extra work. We - * should mention that it is rare and is a corner case from - * the other hand it has to be covered. + * VA resides in node 1 whereas it spans 1, 2 an 0. If passed + * addr is within 2 or 0 nodes we should do extra work. 
*/ i = j = addr_to_node_id(addr); do { @@ -2289,6 +2305,9 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr) struct vmap_area *va; int i, j; + /* + * Check the comment in the find_vmap_area() about the loop. + */ i = j = addr_to_node_id(addr); do { vn = &vmap_nodes[i]; @@ -4882,7 +4901,20 @@ static void vmap_init_nodes(void) int i, n; #if BITS_PER_LONG == 64 - /* A high threshold of max nodes is fixed and bound to 128. */ + /* + * A high threshold of max nodes is fixed and bound to 128, + * thus a scale factor is 1 for systems where number of cores + * are less or equal to specified threshold. + * + * As for NUMA-aware notes. For bigger systems, for example + * NUMA with multi-sockets, where we can end-up with thousands + * of cores in total, a "sub-numa-clustering" should be added. + * + * In this case a NUMA domain is considered as a single entity + * with dedicated sub-nodes in it which describe one group or + * set of cores. Therefore a per-domain purging is supposed to + * be added as well as a per-domain balancing. + */ n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); if (n > 1) { From 0d904895048970a1e65a767302a33eb8f99b817c Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 24 Jan 2024 19:09:20 +0100 Subject: [PATCH 0979/1406] mm: vmalloc: refactor vmalloc_dump_obj() function This patch tends to simplify the function in question, by removing an extra stack "objp" variable, returning back to an early exit approach if spin_trylock() fails or VA was not found. Link: https://lkml.kernel.org/r/20240124180920.50725-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Lorenzo Stoakes Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 568f6c0b1fb5ef..25a8df49725544 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4696,34 +4696,35 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) #ifdef CONFIG_PRINTK bool vmalloc_dump_obj(void *object) { - void *objp = (void *)PAGE_ALIGN((unsigned long)object); const void *caller; + struct vm_struct *vm; struct vmap_area *va; struct vmap_node *vn; unsigned long addr; unsigned int nr_pages; - bool success = false; - - vn = addr_to_node((unsigned long)objp); - if (spin_trylock(&vn->busy.lock)) { - va = __find_vmap_area((unsigned long)objp, &vn->busy.root); + addr = PAGE_ALIGN((unsigned long) object); + vn = addr_to_node(addr); - if (va && va->vm) { - addr = (unsigned long)va->vm->addr; - caller = va->vm->caller; - nr_pages = va->vm->nr_pages; - success = true; - } + if (!spin_trylock(&vn->busy.lock)) + return false; + va = __find_vmap_area(addr, &vn->busy.root); + if (!va || !va->vm) { spin_unlock(&vn->busy.lock); + return false; } - if (success) - pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", - nr_pages, addr, caller); + vm = va->vm; + addr = (unsigned long) vm->addr; + caller = vm->caller; + nr_pages = vm->nr_pages; + spin_unlock(&vn->busy.lock); - return success; + pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", + nr_pages, addr, caller); + + return true; } #endif From fee8757fd0a0147dcaa3bd87a64a6e7cea7cc2f8 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Wed, 10 Jan 2024 16:46:22 +0800 Subject: [PATCH 0980/1406] mm/mmap: simplify vma link and unlink The file parameter in the __remove_shared_vm_struct 
is no longer used, remove it. These functions vma_link() and mmap_region() have some of the same code, introduce vma_link_file() helper function to simplify the code. Link: https://lkml.kernel.org/r/20240110084622.2425927-1-yajun.deng@linux.dev Signed-off-by: Yajun Deng Signed-off-by: Andrew Morton --- mm/mmap.c | 44 +++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index d89770eaab6b61..282ed6d0914b07 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -105,7 +105,7 @@ void vma_set_page_prot(struct vm_area_struct *vma) * Requires inode->i_mapping->i_mmap_rwsem */ static void __remove_shared_vm_struct(struct vm_area_struct *vma, - struct file *file, struct address_space *mapping) + struct address_space *mapping) { if (vma_is_shared_maywrite(vma)) mapping_unmap_writable(mapping); @@ -126,7 +126,7 @@ void unlink_file_vma(struct vm_area_struct *vma) if (file) { struct address_space *mapping = file->f_mapping; i_mmap_lock_write(mapping); - __remove_shared_vm_struct(vma, file, mapping); + __remove_shared_vm_struct(vma, mapping); i_mmap_unlock_write(mapping); } } @@ -392,26 +392,30 @@ static void __vma_link_file(struct vm_area_struct *vma, flush_dcache_mmap_unlock(mapping); } +static void vma_link_file(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct address_space *mapping; + + if (file) { + mapping = file->f_mapping; + i_mmap_lock_write(mapping); + __vma_link_file(vma, mapping); + i_mmap_unlock_write(mapping); + } +} + static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { VMA_ITERATOR(vmi, mm, 0); - struct address_space *mapping = NULL; vma_iter_config(&vmi, vma->vm_start, vma->vm_end); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; vma_start_write(vma); - vma_iter_store(&vmi, vma); - - if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; - i_mmap_lock_write(mapping); - __vma_link_file(vma, mapping); - i_mmap_unlock_write(mapping); - } - + vma_link_file(vma); mm->map_count++; validate_mm(mm); return 0; @@ -519,10 +523,9 @@ static inline void vma_complete(struct vma_prepare *vp, } if (vp->remove && vp->file) { - __remove_shared_vm_struct(vp->remove, vp->file, vp->mapping); + __remove_shared_vm_struct(vp->remove, vp->mapping); if (vp->remove2) - __remove_shared_vm_struct(vp->remove2, vp->file, - vp->mapping); + __remove_shared_vm_struct(vp->remove2, vp->mapping); } else if (vp->insert) { /* * split_vma has split insert from vma, and needs @@ -2891,16 +2894,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_start_write(vma); vma_iter_store(&vmi, vma); mm->map_count++; - if (vma->vm_file) { - i_mmap_lock_write(vma->vm_file->f_mapping); - if (vma_is_shared_maywrite(vma)) - mapping_allow_writable(vma->vm_file->f_mapping); - - flush_dcache_mmap_lock(vma->vm_file->f_mapping); - vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap); - flush_dcache_mmap_unlock(vma->vm_file->f_mapping); - i_mmap_unlock_write(vma->vm_file->f_mapping); - } + vma_link_file(vma); /* * vma_merge() calls khugepaged_enter_vma() either, the below From 0d803cf78d4bcd4aaadf80e63e11775328d5ca61 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 29 Dec 2023 16:22:07 +0800 Subject: [PATCH 0981/1406] mm: memory: use nth_page() in clear/copy_subpage() The clear and copy of huge gigantic page has converted to use nth_page() to handle the possible discontinuous struct page(SPARSEMEM without VMEMMAP), but not change for the non-gigantic part, fix it too. 
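For reference, nth_page() degrades to plain pointer arithmetic only when struct pages are guaranteed contiguous, and otherwise goes through the pfn (modulo exact spelling in include/linux/mm.h):

	#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
	#define nth_page(page, n)	pfn_to_page(page_to_pfn((page)) + (n))
	#else
	#define nth_page(page, n)	((page) + (n))
	#endif

so "page + idx" is only correct in the VMEMMAP (or flat) memory models.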
Link: https://lkml.kernel.org/r/20231229082207.60235-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Zi Yan Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/memory.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 0bfc8b007c01a3..a25bc8a370fd0e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6163,7 +6163,7 @@ static int clear_subpage(unsigned long addr, int idx, void *arg) { struct page *page = arg; - clear_user_highpage(page + idx, addr); + clear_user_highpage(nth_page(page, idx), addr); return 0; } @@ -6213,10 +6213,11 @@ struct copy_subpage_arg { static int copy_subpage(unsigned long addr, int idx, void *arg) { struct copy_subpage_arg *copy_arg = arg; + struct page *dst = nth_page(copy_arg->dst, idx); + struct page *src = nth_page(copy_arg->src, idx); - if (copy_mc_user_highpage(copy_arg->dst + idx, copy_arg->src + idx, - addr, copy_arg->vma)) { - memory_failure_queue(page_to_pfn(copy_arg->src + idx), 0); + if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma)) { + memory_failure_queue(page_to_pfn(src), 0); return -EHWPOISON; } return 0; From 8e52f333f2d52062106a6b2e22d2cc50e7e4af50 Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Thu, 28 Dec 2023 06:27:14 +0000 Subject: [PATCH 0982/1406] mm: list_lru: disable memcg_aware when cgroup.memory is set to "nokmem" Actually, when using a boot time kernel option "cgroup.memory=nokmem", all lru items are inserted to list_lru_node. But for those users who invoke list_lru_init_memcg() to initialize list_lru, list_lru_memcg_aware() returns true. And this brings unneeded operations related to memcg. To make things more convenient, let's disable memcg_aware when cgroup.memory is set to "nokmem". Link: https://lkml.kernel.org/r/20231228062715.338672-1-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/list_lru.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/list_lru.c b/mm/list_lru.c index 35b0147542a9de..158781d1d3c215 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -567,6 +567,9 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, lru->shrinker_id = shrinker->id; else lru->shrinker_id = -1; + + if (mem_cgroup_kmem_disabled()) + memcg_aware = false; #endif lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL); From 13c8b215d9e27adf2ec8000e962df6c0b9d92ee7 Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Thu, 28 Dec 2023 06:27:15 +0000 Subject: [PATCH 0983/1406] mm: list_lru: remove unused macro list_lru_init_key() list_lru_init_key() isn't used by anyone, remove it to clean up. 
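This leaves the two initializers below; a minimal usage sketch (lru and shrinker are illustrative variables, error handling elided):

	struct list_lru lru;
	int err;

	/* Not memcg aware: */
	err = list_lru_init(&lru);

	/* Memcg aware, unless booted with cgroup.memory=nokmem: */
	err = list_lru_init_memcg(&lru, shrinker);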
Link: https://lkml.kernel.org/r/20231228062715.338672-2-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 7675a48a070108..c679e6b293c4c4 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -62,8 +62,6 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, #define list_lru_init(lru) \ __list_lru_init((lru), false, NULL, NULL) -#define list_lru_init_key(lru, key) \ - __list_lru_init((lru), false, (key), NULL) #define list_lru_init_memcg(lru, shrinker) \ __list_lru_init((lru), true, NULL, shrinker) From 86a006124a020033e4eaa8d3d610c37641a98e4a Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 20 Dec 2023 22:59:42 -0800 Subject: [PATCH 0984/1406] mm: mmap: no need to call khugepaged_enter_vma() for stack We avoid allocating THP for temporary stack, even though khugepaged_enter_vma() is called for stack VMAs, it actualy returns false. So no need to call it in the first place at all. Link: https://lkml.kernel.org/r/20231221065943.2803551-1-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Yin Fengwei Cc: Christopher Lameter Cc: "Huang, Ying" Cc: Matthew Wilcox (Oracle) Cc: Rik van Riel Cc: kernel test robot Signed-off-by: Andrew Morton --- mm/mmap.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 282ed6d0914b07..66f534ec90a55e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2051,7 +2051,6 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } anon_vma_unlock_write(vma->anon_vma); - khugepaged_enter_vma(vma, vma->vm_flags); mas_destroy(&mas); validate_mm(mm); return error; @@ -2145,7 +2144,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) } } anon_vma_unlock_write(vma->anon_vma); - khugepaged_enter_vma(vma, vma->vm_flags); mas_destroy(&mas); validate_mm(mm); return error; From 3f02bf847d4872069eb338dc2b3a23fef2e536a2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 18:12:16 +0000 Subject: [PATCH 0985/1406] memcg: convert mem_cgroup_move_charge_pte_range() to use a folio Patch series "Convert memcontrol charge moving to use folios". No part of these patches should change behaviour; all the called functions already convert from page to folio, so this ought to simply be a reduction in the number of calls to compound_head(). This patch (of 4): Remove many calls to compound_head() by calling page_folio() once at the start of each stanza which receives a struct page from 'target'. There should be no change in behaviour here as all the called functions start out by converting the page to its folio. 
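The per-stanza pattern is to resolve the folio once and stay with folio operations from then on, e.g. (sketch):

	struct folio *folio = page_folio(target.page);	/* one compound_head() */

	if (folio_isolate_lru(folio)) {
		/* ... move the charge ... */
		folio_putback_lru(folio);
	}
	folio_unlock(folio);
	folio_put(folio);

where previously isolate_lru_page(), putback_lru_page(), unlock_page() and put_page() each repeated the page-to-head conversion internally.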
Link: https://lkml.kernel.org/r/20240111181219.3462852-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240111181219.3462852-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Muchun Song Acked-by: Shakeel Butt Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/memcontrol.c | 49 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 61932c9215e773..d1c1bd1663078c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5965,23 +5965,22 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, } /** - * mem_cgroup_move_account - move account of the page - * @page: the page + * mem_cgroup_move_account - move account of the folio + * @folio: The folio. * @compound: charge the page as compound or small page - * @from: mem_cgroup which the page is moved from. - * @to: mem_cgroup which the page is moved to. @from != @to. + * @from: mem_cgroup which the folio is moved from. + * @to: mem_cgroup which the folio is moved to. @from != @to. * - * The page must be locked and not on the LRU. + * The folio must be locked and not on the LRU. * * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" * from old cgroup. */ -static int mem_cgroup_move_account(struct page *page, +static int mem_cgroup_move_account(struct folio *folio, bool compound, struct mem_cgroup *from, struct mem_cgroup *to) { - struct folio *folio = page_folio(page); struct lruvec *from_vec, *to_vec; struct pglist_data *pgdat; unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1; @@ -6431,7 +6430,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, spinlock_t *ptl; enum mc_target_type target_type; union mc_target target; - struct page *page; + struct folio *folio; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -6441,26 +6440,26 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, } target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); if (target_type == MC_TARGET_PAGE) { - page = target.page; - if (isolate_lru_page(page)) { - if (!mem_cgroup_move_account(page, true, + folio = page_folio(target.page); + if (folio_isolate_lru(folio)) { + if (!mem_cgroup_move_account(folio, true, mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; mc.moved_charge += HPAGE_PMD_NR; } - putback_lru_page(page); + folio_putback_lru(folio); } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } else if (target_type == MC_TARGET_DEVICE) { - page = target.page; - if (!mem_cgroup_move_account(page, true, + folio = page_folio(target.page); + if (!mem_cgroup_move_account(folio, true, mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; mc.moved_charge += HPAGE_PMD_NR; } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } spin_unlock(ptl); return 0; @@ -6483,28 +6482,28 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, device = true; fallthrough; case MC_TARGET_PAGE: - page = target.page; + folio = page_folio(target.page); /* * We can have a part of the split pmd here. Moving it * can be done but it would be too convoluted so simply * ignore such a partial THP and keep it in original * memcg. There should be somebody mapping the head. 
*/ - if (PageTransCompound(page)) + if (folio_test_large(folio)) goto put; - if (!device && !isolate_lru_page(page)) + if (!device && !folio_isolate_lru(folio)) goto put; - if (!mem_cgroup_move_account(page, false, + if (!mem_cgroup_move_account(folio, false, mc.from, mc.to)) { mc.precharge--; /* we uncharge from mc.from later. */ mc.moved_charge++; } if (!device) - putback_lru_page(page); + folio_putback_lru(folio); put: /* get_mctgt_type() gets & locks the page */ - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); break; case MC_TARGET_SWAP: ent = target.ent; From 6f455f20d6e2edd3c37a05ec93d4315556795f93 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 18:12:17 +0000 Subject: [PATCH 0986/1406] memcg: return the folio in union mc_target All users of target.page convert it to the folio, so we can just return the folio directly and save a few calls to compound_head(). Link: https://lkml.kernel.org/r/20240111181219.3462852-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Muchun Song Acked-by: Shakeel Butt Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/memcontrol.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d1c1bd1663078c..f52cbc42359e5e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5873,7 +5873,7 @@ static int mem_cgroup_do_precharge(unsigned long count) } union mc_target { - struct page *page; + struct folio *folio; swp_entry_t ent; }; @@ -6095,7 +6095,7 @@ static int mem_cgroup_move_account(struct folio *folio, * Return: * * MC_TARGET_NONE - If the pte is not a target for move charge. * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for - * move charge. If @target is not NULL, the page is stored in target->page + * move charge. If @target is not NULL, the folio is stored in target->folio * with extra refcnt taken (Caller should release it). * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a * target for charge migration. If @target is not NULL, the entry is @@ -6160,7 +6160,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, is_device_coherent_page(page)) ret = MC_TARGET_DEVICE; if (target) - target->page = page; + target->folio = page_folio(page); } if (!ret || !target) { if (target) @@ -6210,7 +6210,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, put_page(page); return MC_TARGET_NONE; } - target->page = page; + target->folio = page_folio(page); } } return ret; @@ -6440,7 +6440,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, } target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); if (target_type == MC_TARGET_PAGE) { - folio = page_folio(target.page); + folio = target.folio; if (folio_isolate_lru(folio)) { if (!mem_cgroup_move_account(folio, true, mc.from, mc.to)) { @@ -6452,7 +6452,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, folio_unlock(folio); folio_put(folio); } else if (target_type == MC_TARGET_DEVICE) { - folio = page_folio(target.page); + folio = target.folio; if (!mem_cgroup_move_account(folio, true, mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; @@ -6482,7 +6482,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, device = true; fallthrough; case MC_TARGET_PAGE: - folio = page_folio(target.page); + folio = target.folio; /* * We can have a part of the split pmd here. 
Moving it * can be done but it would be too convoluted so simply From 3bd1e4666fc2afd9b2e61a15bba7a52e396729fa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 18:12:18 +0000 Subject: [PATCH 0987/1406] memcg: use a folio in get_mctgt_type Replace seven calls to compound_head() with one. We still use the page as page_mapped() is different from folio_mapped(). Link: https://lkml.kernel.org/r/20240111181219.3462852-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Muchun Song Acked-by: Shakeel Butt Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/memcontrol.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f52cbc42359e5e..a5bc34d44dbd50 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6109,6 +6109,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, union mc_target *target) { struct page *page = NULL; + struct folio *folio; enum mc_target_type ret = MC_TARGET_NONE; swp_entry_t ent = { .val = 0 }; @@ -6123,9 +6124,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, else if (is_swap_pte(ptent)) page = mc_handle_swap_pte(vma, ptent, &ent); + if (page) + folio = page_folio(page); if (target && page) { - if (!trylock_page(page)) { - put_page(page); + if (!folio_trylock(folio)) { + folio_put(folio); return ret; } /* @@ -6140,8 +6143,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, * Alas, skip moving the page in this case. */ if (!pte_present(ptent) && page_mapped(page)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return ret; } } @@ -6154,18 +6157,18 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, * mem_cgroup_move_account() checks the page is valid or * not under LRU exclusion. */ - if (page_memcg(page) == mc.from) { + if (folio_memcg(folio) == mc.from) { ret = MC_TARGET_PAGE; - if (is_device_private_page(page) || - is_device_coherent_page(page)) + if (folio_is_device_private(folio) || + folio_is_device_coherent(folio)) ret = MC_TARGET_DEVICE; if (target) - target->folio = page_folio(page); + target->folio = folio; } if (!ret || !target) { if (target) - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } } /* From 379f3efea46d7a465224c3d132cff29bb4c9d044 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 18:12:19 +0000 Subject: [PATCH 0988/1406] memcg: use a folio in get_mctgt_type_thp Replace five calls to compound_head() with one. 
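The one check left on the page rather than the folio is deliberate; in sketch form:

	if (!pte_present(ptent) && page_mapped(page)) {
		/* this precise page is mapped elsewhere; skip moving it */
		folio_unlock(folio);
		folio_put(folio);
		return ret;
	}

folio_mapped() answers for the folio as a whole (true if any of its pages is referenced by user page tables), which is not what this test wants.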
Link: https://lkml.kernel.org/r/20240111181219.3462852-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Muchun Song Acked-by: Shakeel Butt Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/memcontrol.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a5bc34d44dbd50..db92401257f7d0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6194,6 +6194,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, union mc_target *target) { struct page *page = NULL; + struct folio *folio; enum mc_target_type ret = MC_TARGET_NONE; if (unlikely(is_swap_pmd(pmd))) { @@ -6203,17 +6204,18 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, } page = pmd_page(pmd); VM_BUG_ON_PAGE(!page || !PageHead(page), page); + folio = page_folio(page); if (!(mc.flags & MOVE_ANON)) return ret; - if (page_memcg(page) == mc.from) { + if (folio_memcg(folio) == mc.from) { ret = MC_TARGET_PAGE; if (target) { - get_page(page); - if (!trylock_page(page)) { - put_page(page); + folio_get(folio); + if (!folio_trylock(folio)) { + folio_put(folio); return MC_TARGET_NONE; } - target->folio = page_folio(page); + target->folio = folio; } } return ret; From 68e6e98aee34c94ee58112199a5be8c348619627 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 15:24:20 +0000 Subject: [PATCH 0989/1406] mm: add pfn_swap_entry_folio() Patch series "mm: convert mm counter to take a folio", v3. Make sure all mm_counter() and mm_counter_file() callers have a folio, then convert mm counter functions to take a folio, which saves some compound_head() calls. This patch (of 10): Thanks to the compound_head() hidden inside PageLocked(), this saves a call to compound_head() over calling page_folio(pfn_swap_entry_to_page()) Link: https://lkml.kernel.org/r/20240111152429.3374566-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240111152429.3374566-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Kefeng Wang Signed-off-by: Andrew Morton --- include/linux/swapops.h | 13 +++++++++++++ mm/filemap.c | 2 +- mm/huge_memory.c | 2 +- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index bff1e8d97de0e0..48b700ba1d188a 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -468,6 +468,19 @@ static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) return p; } +static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry) +{ + struct folio *folio = pfn_folio(swp_offset_pfn(entry)); + + /* + * Any use of migration entries may only occur while the + * corresponding folio is locked + */ + BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio)); + + return folio; +} + /* * A pfn swap entry is a special type of swap entry that always has a pfn stored * in the swap offset. 
They are used to represent unaddressable device memory diff --git a/mm/filemap.c b/mm/filemap.c index c7e67b22cc94ca..b7a21551fbc7ca 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1354,7 +1354,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) unsigned long pflags; bool in_thrashing; wait_queue_head_t *q; - struct folio *folio = page_folio(pfn_swap_entry_to_page(entry)); + struct folio *folio = pfn_swap_entry_folio(entry); q = folio_waitqueue(folio); if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 94c958f7ebb50d..5468b2f97cbf70 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2045,7 +2045,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION if (is_swap_pmd(*pmd)) { swp_entry_t entry = pmd_to_swp_entry(*pmd); - struct folio *folio = page_folio(pfn_swap_entry_to_page(entry)); + struct folio *folio = pfn_swap_entry_folio(entry); pmd_t newpmd; VM_BUG_ON(!is_pmd_migration_entry(*pmd)); From 98e1bda7c59a863e6091c3ed95662407cde4745c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 15:24:21 +0000 Subject: [PATCH 0990/1406] proc: use pfn_swap_entry_folio where obvious These callers only pass the result to PageAnon(), so we can save the extra call to compound_head() by using pfn_swap_entry_folio(). Link: https://lkml.kernel.org/r/20240111152429.3374566-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Kefeng Wang Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3f78ebbb795fe2..ac6ea2cc2ee8fe 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1807,7 +1807,7 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, if (p->masks_of_interest & PAGE_IS_FILE) { swp = pte_to_swp_entry(pte); if (is_pfn_swap_entry(swp) && - !PageAnon(pfn_swap_entry_to_page(swp))) + !folio_test_anon(pfn_swap_entry_folio(swp))) categories |= PAGE_IS_FILE; } if (pte_swp_soft_dirty(pte)) @@ -1873,7 +1873,7 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, if (p->masks_of_interest & PAGE_IS_FILE) { swp = pmd_to_swp_entry(pmd); if (is_pfn_swap_entry(swp) && - !PageAnon(pfn_swap_entry_to_page(swp))) + !folio_test_anon(pfn_swap_entry_folio(swp))) categories |= PAGE_IS_FILE; } } From d84b640b4e0f3b9cf6dc73ce09f0cae0c0e92571 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 15:24:22 +0000 Subject: [PATCH 0991/1406] mprotect: use pfn_swap_entry_folio We only want to know whether the folio is anonymous, so use pfn_swap_entry_folio() and save a call to compound_head(). 
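Compared with the previous idiom, the new helper reaches the folio in one step (sketch):

	/* before: page first, then its folio */
	folio = page_folio(pfn_swap_entry_to_page(entry));

	/* after: straight to the folio */
	folio = pfn_swap_entry_folio(entry);

Both resolve the same folio; the second skips the intermediate struct page and the compound_head() hidden in the page-based path.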
Link: https://lkml.kernel.org/r/20240111152429.3374566-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/mprotect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 81991102f7859e..f8a4544b4601db 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -198,13 +198,13 @@ static long change_pte_range(struct mmu_gather *tlb, pte_t newpte; if (is_writable_migration_entry(entry)) { - struct page *page = pfn_swap_entry_to_page(entry); + struct folio *folio = pfn_swap_entry_folio(entry); /* * A protection check is difficult so * just be safe and disable write */ - if (PageAnon(page)) + if (folio_test_anon(folio)) entry = make_readable_exclusive_migration_entry( swp_offset(entry)); else From da27bea9e6d6dc43750a9d6178ee88a503f6ba57 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:23 +0000 Subject: [PATCH 0992/1406] s390: use pfn_swap_entry_folio() in ptep_zap_swap_entry() Call pfn_swap_entry_folio() in ptep_zap_swap_entry() as preparation for converting mm counter functions to take a folio. Link: https://lkml.kernel.org/r/20240111152429.3374566-5-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- arch/s390/mm/pgtable.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 99422926efe1b5..7e5dd4b176642c 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -721,9 +721,9 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) if (!non_swap_entry(entry)) dec_mm_counter(mm, MM_SWAPENTS); else if (is_migration_entry(entry)) { - struct page *page = pfn_swap_entry_to_page(entry); + struct folio *folio = pfn_swap_entry_folio(entry); - dec_mm_counter(mm, mm_counter(page)); + dec_mm_counter(mm, mm_counter(&folio->page)); } free_swap_and_cache(entry); } From 2bf20a5cf4674c2a403981b18e644a730582ee90 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:24 +0000 Subject: [PATCH 0993/1406] mm: use pfn_swap_entry_folio() in __split_huge_pmd_locked() Call pfn_swap_entry_folio() in __split_huge_pmd_locked() as preparation for converting mm counter functions to take a folio. 
Link: https://lkml.kernel.org/r/20240111152429.3374566-6-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5468b2f97cbf70..33b720037ab725 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2442,7 +2442,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t entry; entry = pmd_to_swp_entry(old_pmd); - page = pfn_swap_entry_to_page(entry); + folio = pfn_swap_entry_folio(entry); } else { page = pmd_page(old_pmd); folio = page_folio(page); @@ -2453,7 +2453,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, folio_remove_rmap_pmd(folio, page, vma); folio_put(folio); } - add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); + add_mm_counter(mm, mm_counter_file(&folio->page), -HPAGE_PMD_NR); return; } From a613c7970403d016cc4686bffdf9de06e0ee3710 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:25 +0000 Subject: [PATCH 0994/1406] mm: use pfn_swap_entry_to_folio() in zap_huge_pmd() Call pfn_swap_entry_to_folio() in zap_huge_pmd() as preparation for converting mm counter functions to take a folio. Saves a call to compound_head() embedded inside PageAnon(). Link: https://lkml.kernel.org/r/20240111152429.3374566-7-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/huge_memory.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 33b720037ab725..7a28a7db08ea0d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1905,12 +1905,14 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); } else { - struct page *page = NULL; + struct folio *folio = NULL; int flush_needed = 1; if (pmd_present(orig_pmd)) { - page = pmd_page(orig_pmd); - folio_remove_rmap_pmd(page_folio(page), page, vma); + struct page *page = pmd_page(orig_pmd); + + folio = page_folio(page); + folio_remove_rmap_pmd(folio, page, vma); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); VM_BUG_ON_PAGE(!PageHead(page), page); } else if (thp_migration_supported()) { @@ -1918,23 +1920,24 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); entry = pmd_to_swp_entry(orig_pmd); - page = pfn_swap_entry_to_page(entry); + folio = pfn_swap_entry_folio(entry); flush_needed = 0; } else WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); - if (PageAnon(page)) { + if (folio_test_anon(folio)) { zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); } else { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); - add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR); + add_mm_counter(tlb->mm, mm_counter_file(&folio->page), + -HPAGE_PMD_NR); } spin_unlock(ptl); if (flush_needed) - tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); + tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); } return 1; } From bce0f983689f66a230991c5beaf37c16dbc2b315 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:26 +0000 Subject: [PATCH 0995/1406] mm: use pfn_swap_entry_folio() in copy_nonpresent_pte() Call pfn_swap_entry_folio() as preparation for converting mm counter 
functions to take a folio. Link: https://lkml.kernel.org/r/20240111152429.3374566-8-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index a25bc8a370fd0e..c8dd249419142c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -806,9 +806,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, } rss[MM_SWAPENTS]++; } else if (is_migration_entry(entry)) { - page = pfn_swap_entry_to_page(entry); + folio = pfn_swap_entry_folio(entry); - rss[mm_counter(page)]++; + rss[mm_counter(&folio->page)]++; if (!is_readable_migration_entry(entry) && is_cow_mapping(vm_flags)) { From a2699eab902d9dd05ab5226bd44f22cdec028bef Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:27 +0000 Subject: [PATCH 0996/1406] mm: convert to should_zap_page() to should_zap_folio() Make should_zap_page() take a folio and rename it to should_zap_folio() as preparation for converting mm counter functions to take a folio. Saves a call to compound_head() hidden inside PageAnon(). Link: https://lkml.kernel.org/r/20240111152429.3374566-9-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/memory.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index c8dd249419142c..9d7c31bb994a6f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1369,19 +1369,20 @@ static inline bool should_zap_cows(struct zap_details *details) return details->even_cows; } -/* Decides whether we should zap this page with the page pointer specified */ -static inline bool should_zap_page(struct zap_details *details, struct page *page) +/* Decides whether we should zap this folio with the folio pointer specified */ +static inline bool should_zap_folio(struct zap_details *details, + struct folio *folio) { - /* If we can make a decision without *page.. */ + /* If we can make a decision without *folio.. */ if (should_zap_cows(details)) return true; - /* E.g. the caller passes NULL for the case of a zero page */ - if (!page) + /* E.g. 
the caller passes NULL for the case of a zero folio */ + if (!folio) return true; - /* Otherwise we should only zap non-anon pages */ - return !PageAnon(page); + /* Otherwise we should only zap non-anon folios */ + return !folio_test_anon(folio); } static inline bool zap_drop_file_uffd_wp(struct zap_details *details) @@ -1447,7 +1448,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, unsigned int delay_rmap; page = vm_normal_page(vma, addr, ptent); - if (unlikely(!should_zap_page(details, page))) + if (page) + folio = page_folio(page); + + if (unlikely(!should_zap_folio(details, folio))) continue; ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); @@ -1460,7 +1464,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, continue; } - folio = page_folio(page); delay_rmap = 0; if (!folio_test_anon(folio)) { if (pte_dirty(ptent)) { @@ -1492,7 +1495,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, is_device_exclusive_entry(entry)) { page = pfn_swap_entry_to_page(entry); folio = page_folio(page); - if (unlikely(!should_zap_page(details, page))) + if (unlikely(!should_zap_folio(details, folio))) continue; /* * Both device private/exclusive mappings should only @@ -1513,10 +1516,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (unlikely(!free_swap_and_cache(entry))) print_bad_pte(vma, addr, ptent, NULL); } else if (is_migration_entry(entry)) { - page = pfn_swap_entry_to_page(entry); - if (!should_zap_page(details, page)) + folio = pfn_swap_entry_folio(entry); + if (!should_zap_folio(details, folio)) continue; - rss[mm_counter(page)]--; + rss[mm_counter(&folio->page)]--; } else if (pte_marker_entry_uffd_wp(entry)) { /* * For anon: always drop the marker; for file: only From 991c829e6bbc8a2bd78317aaeb1239fbe7a0973d Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 12 Jan 2024 18:14:32 +0800 Subject: [PATCH 0997/1406] mm-convert-to-should_zap_page-to-should_zap_folio-fix fix used-uninitialized warning Link: https://lkml.kernel.org/r/962a7993-fce9-4de8-85cd-25e290f25736@huawei.com Signed-off-by: Kefeng Wang Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401121250.A221BL2D-lkp@intel.com/ Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 9d7c31bb994a6f..4bed82009ea7e0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1435,7 +1435,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, arch_enter_lazy_mmu_mode(); do { pte_t ptent = ptep_get(pte); - struct folio *folio; + struct folio *folio = NULL; struct page *page; if (pte_none(ptent)) From af7e524dc0c9229211dd1ff72005b69828563321 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:28 +0000 Subject: [PATCH 0998/1406] mm: convert mm_counter() to take a folio Now all callers of mm_counter() have a folio, convert mm_counter() to take a folio. Saves a call to compound_head() hidden inside PageAnon(). 
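A typical caller now reads (sketch, matching the zap path below):

	struct folio *folio = pfn_swap_entry_folio(entry);

	if (should_zap_folio(details, folio))
		rss[mm_counter(folio)]--;

with mm_counter(folio) selecting MM_ANONPAGES for anonymous folios and deferring to mm_counter_file() otherwise.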
Link: https://lkml.kernel.org/r/20240111152429.3374566-10-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- arch/s390/mm/pgtable.c | 2 +- include/linux/mm.h | 6 +++--- mm/memory.c | 10 +++++----- mm/rmap.c | 8 ++++---- mm/userfaultfd.c | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 7e5dd4b176642c..b71432b15d665c 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -723,7 +723,7 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) else if (is_migration_entry(entry)) { struct folio *folio = pfn_swap_entry_folio(entry); - dec_mm_counter(mm, mm_counter(&folio->page)); + dec_mm_counter(mm, mm_counter(folio)); } free_swap_and_cache(entry); } diff --git a/include/linux/mm.h b/include/linux/mm.h index f5a97dec516948..22e597b36b3887 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2603,11 +2603,11 @@ static inline int mm_counter_file(struct page *page) return MM_FILEPAGES; } -static inline int mm_counter(struct page *page) +static inline int mm_counter(struct folio *folio) { - if (PageAnon(page)) + if (folio_test_anon(folio)) return MM_ANONPAGES; - return mm_counter_file(page); + return mm_counter_file(&folio->page); } static inline unsigned long get_mm_rss(struct mm_struct *mm) diff --git a/mm/memory.c b/mm/memory.c index 4bed82009ea7e0..87ef9809984728 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -808,7 +808,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, } else if (is_migration_entry(entry)) { folio = pfn_swap_entry_folio(entry); - rss[mm_counter(&folio->page)]++; + rss[mm_counter(folio)]++; if (!is_readable_migration_entry(entry) && is_cow_mapping(vm_flags)) { @@ -840,7 +840,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * keep things as they are. */ folio_get(folio); - rss[mm_counter(page)]++; + rss[mm_counter(folio)]++; /* Cannot fail as these pages cannot get pinned. */ folio_try_dup_anon_rmap_pte(folio, page, src_vma); @@ -1476,7 +1476,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (pte_young(ptent) && likely(vma_has_recency(vma))) folio_mark_accessed(folio); } - rss[mm_counter(page)]--; + rss[mm_counter(folio)]--; if (!delay_rmap) { folio_remove_rmap_pte(folio, page, vma); if (unlikely(page_mapcount(page) < 0)) @@ -1504,7 +1504,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, * see zap_install_uffd_wp_if_needed(). 
*/ WARN_ON_ONCE(!vma_is_anonymous(vma)); - rss[mm_counter(page)]--; + rss[mm_counter(folio)]--; if (is_device_private_entry(entry)) folio_remove_rmap_pte(folio, page, vma); folio_put(folio); @@ -1519,7 +1519,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, folio = pfn_swap_entry_folio(entry); if (!should_zap_folio(details, folio)) continue; - rss[mm_counter(&folio->page)]--; + rss[mm_counter(folio)]--; } else if (pte_marker_entry_uffd_wp(entry)) { /* * For anon: always drop the marker; for file: only diff --git a/mm/rmap.c b/mm/rmap.c index f5d43edad529a7..4648cf1d8178b5 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1780,7 +1780,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); } else { - dec_mm_counter(mm, mm_counter(&folio->page)); + dec_mm_counter(mm, mm_counter(folio)); set_pte_at(mm, address, pvmw.pte, pteval); } @@ -1795,7 +1795,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * migration) will not expect userfaults on already * copied pages. */ - dec_mm_counter(mm, mm_counter(&folio->page)); + dec_mm_counter(mm, mm_counter(folio)); } else if (folio_test_anon(folio)) { swp_entry_t entry = page_swap_entry(subpage); pte_t swp_pte; @@ -2181,7 +2181,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); } else { - dec_mm_counter(mm, mm_counter(&folio->page)); + dec_mm_counter(mm, mm_counter(folio)); set_pte_at(mm, address, pvmw.pte, pteval); } @@ -2196,7 +2196,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * migration) will not expect userfaults on already * copied pages. */ - dec_mm_counter(mm, mm_counter(&folio->page)); + dec_mm_counter(mm, mm_counter(folio)); } else { swp_entry_t entry; pte_t swp_pte; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 7cf7d43842590c..ae80c37148290a 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -124,7 +124,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, * Must happen after rmap, as mm_counter() checks mapping (via * PageAnon()), which is set by __page_set_anon_rmap(). */ - inc_mm_counter(dst_mm, mm_counter(page)); + inc_mm_counter(dst_mm, mm_counter(folio)); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); From 0400c9365a8b2bbdf309de21a838ea646a928e43 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:29 +0000 Subject: [PATCH 0999/1406] mm: convert mm_counter_file() to take a folio Now all callers of mm_counter_file() have a folio, convert mm_counter_file() to take a folio. Saves a call to compound_head() hidden inside PageSwapBacked(). 
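The file-side split keys off whether the folio is swap backed, so shmem/tmpfs folios are charged to MM_SHMEMPAGES rather than MM_FILEPAGES. Sketch of a caller (member and nr are illustrative):

	int member = folio_test_swapbacked(folio) ? MM_SHMEMPAGES
						  : MM_FILEPAGES;

	add_mm_counter(mm, member, nr);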
Link: https://lkml.kernel.org/r/20240111152429.3374566-11-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 ++++---- kernel/events/uprobes.c | 2 +- mm/huge_memory.c | 4 ++-- mm/khugepaged.c | 4 ++-- mm/memory.c | 10 +++++----- mm/rmap.c | 2 +- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 22e597b36b3887..ac6b71cbdffbfa 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2595,10 +2595,10 @@ static inline void dec_mm_counter(struct mm_struct *mm, int member) mm_trace_rss_stat(mm, member); } -/* Optimized variant when page is already known not to be PageAnon */ -static inline int mm_counter_file(struct page *page) +/* Optimized variant when folio is already known not to be anon */ +static inline int mm_counter_file(struct folio *folio) { - if (PageSwapBacked(page)) + if (folio_test_swapbacked(folio)) return MM_SHMEMPAGES; return MM_FILEPAGES; } @@ -2607,7 +2607,7 @@ static inline int mm_counter(struct folio *folio) { if (folio_test_anon(folio)) return MM_ANONPAGES; - return mm_counter_file(&folio->page); + return mm_counter_file(folio); } static inline unsigned long get_mm_rss(struct mm_struct *mm) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 929e98c629652a..e4834d23e1d1a2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -188,7 +188,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, dec_mm_counter(mm, MM_ANONPAGES); if (!folio_test_anon(old_folio)) { - dec_mm_counter(mm, mm_counter_file(old_page)); + dec_mm_counter(mm, mm_counter_file(old_folio)); inc_mm_counter(mm, MM_ANONPAGES); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7a28a7db08ea0d..f005f04247355f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1931,7 +1931,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, } else { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); - add_mm_counter(tlb->mm, mm_counter_file(&folio->page), + add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PMD_NR); } @@ -2456,7 +2456,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, folio_remove_rmap_pmd(folio, page, vma); folio_put(folio); } - add_mm_counter(mm, mm_counter_file(&folio->page), -HPAGE_PMD_NR); + add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR); return; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 2b219acb528e25..fe43fbc4452539 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1634,7 +1634,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, /* step 3: set proper refcount and mm_counters. 
*/ if (nr_ptes) { folio_ref_sub(folio, nr_ptes); - add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes); + add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); } /* step 4: remove empty page table */ @@ -1665,7 +1665,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, if (nr_ptes) { flush_tlb_mm(mm); folio_ref_sub(folio, nr_ptes); - add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes); + add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); } if (start_pte) pte_unmap_unlock(start_pte, ptl); diff --git a/mm/memory.c b/mm/memory.c index 87ef9809984728..5e608edfe330c5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -966,7 +966,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, } else if (page) { folio_get(folio); folio_dup_file_rmap_pte(folio, page); - rss[mm_counter_file(page)]++; + rss[mm_counter_file(folio)]++; } /* @@ -1873,7 +1873,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, return -EBUSY; /* Ok, finally just insert the thing.. */ folio_get(folio); - inc_mm_counter(vma->vm_mm, mm_counter_file(page)); + inc_mm_counter(vma->vm_mm, mm_counter_file(folio)); folio_add_file_rmap_pte(folio, page, vma); set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot)); return 0; @@ -3178,7 +3178,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { if (old_folio) { if (!folio_test_anon(old_folio)) { - dec_mm_counter(mm, mm_counter_file(&old_folio->page)); + dec_mm_counter(mm, mm_counter_file(old_folio)); inc_mm_counter(mm, MM_ANONPAGES); } } else { @@ -4483,7 +4483,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) if (write) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); + add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR); folio_add_file_rmap_pmd(folio, page, vma); /* @@ -4546,7 +4546,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, folio_add_new_anon_rmap(folio, vma, addr); folio_add_lru_vma(folio, vma); } else { - add_mm_counter(vma->vm_mm, mm_counter_file(page), nr); + add_mm_counter(vma->vm_mm, mm_counter_file(folio), nr); folio_add_file_rmap_ptes(folio, page, nr, vma); } set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr); diff --git a/mm/rmap.c b/mm/rmap.c index 4648cf1d8178b5..1cf2bffa48ed87 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1903,7 +1903,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * * See Documentation/mm/mmu_notifier.rst */ - dec_mm_counter(mm, mm_counter_file(&folio->page)); + dec_mm_counter(mm, mm_counter_file(folio)); } discard: if (unlikely(folio_test_hugetlb(folio))) From bcd39f542b3c5a0e7dfe8c7fcbebe1822e0e246a Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Thu, 11 Jan 2024 08:45:33 +0000 Subject: [PATCH 1000/1406] fs/proc/task_mmu.c: add_to_pagemap: remove useless parameter addr Function parameter addr of add_to_pagemap() is useless. Remove it. Link: https://lkml.kernel.org/r/20240111084533.40038-1-teawaterz@linux.alibaba.com Signed-off-by: Hui Zhu Reviewed-by: Muhammad Usama Anjum Tested-by: Muhammad Usama Anjum Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: David Hildenbrand Cc: Hugh Dickins Cc: Kefeng Wang Cc: Liam R. 
Howlett Cc: Peter Xu Cc: Ryan Roberts Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ac6ea2cc2ee8fe..23fbab954c20b6 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1352,8 +1352,7 @@ static inline pagemap_entry_t make_pme(u64 frame, u64 flags) return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags }; } -static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, - struct pagemapread *pm) +static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm) { pm->buffer[pm->pos++] = *pme; if (pm->pos >= pm->len) @@ -1380,7 +1379,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, hole_end = end; for (; addr < hole_end; addr += PAGE_SIZE) { - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) goto out; } @@ -1392,7 +1391,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, if (vma->vm_flags & VM_SOFTDIRTY) pme = make_pme(0, PM_SOFT_DIRTY); for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) goto out; } @@ -1519,7 +1518,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, for (; addr != end; addr += PAGE_SIZE) { pagemap_entry_t pme = make_pme(frame, flags); - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) break; if (pm->show_pfn) { @@ -1547,7 +1546,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, pagemap_entry_t pme; pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) break; } @@ -1597,7 +1596,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, for (; addr != end; addr += PAGE_SIZE) { pagemap_entry_t pme = make_pme(frame, flags); - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) return err; if (pm->show_pfn && (flags & PM_PRESENT)) From 9131a9b60ff811fabad3ca5d90de8114fa1fc2b2 Mon Sep 17 00:00:00 2001 From: Carlos Galo Date: Thu, 11 Jan 2024 21:05:30 +0000 Subject: [PATCH 1001/1406] mm: update mark_victim tracepoints fields The current implementation of the mark_victim tracepoint provides only the process ID (pid) of the victim process. This limitation poses challenges for userspace tools that need additional information about the OOM victim. The association between pid and the additional data may be lost after the kill, making it difficult for userspace to correlate the OOM event with the specific process. In order to mitigate this limitation, add the following fields: - UID In Android each installed application has a unique UID. Including the `uid` assists in correlating OOM events with specific apps. - Process Name (comm) Enables identification of the affected process. - OOM Score Allows userspace to get additional insights of the relative kill priority of the OOM victim. 
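For illustration, a mark_victim event with the new fields renders along these lines in the trace buffer (all values below are invented; only the field layout follows the TP_printk format in the diff):

	mark_victim: pid=5821 uid=10142 comm=example.app oom_score_adj=900

Userspace can now correlate the OOM kill with the application (uid, comm) and its relative kill priority (oom_score_adj) even after the process is gone.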
Link: https://lkml.kernel.org/r/20240111210539.636607-1-carlosgalo@google.com Signed-off-by: Carlos Galo Cc: Steven Rostedt Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/trace/events/oom.h | 19 +++++++++++++++---- mm/oom_kill.c | 6 +++++- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h index 26a11e4a2c361d..3c5941da80755b 100644 --- a/include/trace/events/oom.h +++ b/include/trace/events/oom.h @@ -72,19 +72,30 @@ TRACE_EVENT(reclaim_retry_zone, ); TRACE_EVENT(mark_victim, - TP_PROTO(int pid), + TP_PROTO(struct task_struct *task, uid_t uid), - TP_ARGS(pid), + TP_ARGS(task, uid), TP_STRUCT__entry( __field(int, pid) + __field(uid_t, uid) + __string(comm, task->comm) + __field(short, oom_score_adj) ), TP_fast_assign( - __entry->pid = pid; + __entry->pid = task->pid; + __entry->uid = uid; + __assign_str(comm, task->comm); + __entry->oom_score_adj = task->signal->oom_score_adj; ), - TP_printk("pid=%d", __entry->pid) + TP_printk("pid=%d uid=%u comm=%s oom_score_adj=%hd", + __entry->pid, + __entry->uid, + __get_str(comm), + __entry->oom_score_adj + ) ); TRACE_EVENT(wake_reaper, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 91ccd82097c2ba..8d6a207c3c5905 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include "internal.h" @@ -754,6 +755,7 @@ static inline void queue_oom_reaper(struct task_struct *tsk) */ static void mark_oom_victim(struct task_struct *tsk) { + const struct cred *cred; struct mm_struct *mm = tsk->mm; WARN_ON(oom_killer_disabled); @@ -773,7 +775,9 @@ static void mark_oom_victim(struct task_struct *tsk) */ __thaw_task(tsk); atomic_inc(&oom_victims); - trace_mark_victim(tsk->pid); + cred = get_task_cred(tsk); + trace_mark_victim(tsk, cred->uid.val); + put_cred(cred); } /** From 767a0ea889a6130370cd5ba4c287f0ef1603b231 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Mon, 15 Jan 2024 11:25:22 +0100 Subject: [PATCH 1002/1406] readahead: use ilog2 instead of a while loop in page_cache_ra_order() A while loop is used to adjust the new_order to be lower than the ra->size. ilog2 could be used to do the same instead of using a loop. ilog2 typically resolves to a bit scan reverse instruction. This is particularly useful when ra->size is smaller than the 2^new_order as it resolves in one instruction instead of looping to find the new_order. No functional changes. 
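To see the equivalence: the old loop decrements new_order until (1 << new_order) <= ra->size, which is exactly min(new_order, ilog2(ra->size)) for any ra->size >= 1. A small userspace sketch of the two forms (assuming 64-bit unsigned long; __builtin_clzl stands in for the kernel's ilog2()):

	/* Old form: walk the order down until it fits in size pages. */
	static unsigned int order_by_loop(unsigned int order, unsigned long size)
	{
		while ((1UL << order) > size)
			order--;
		return order;
	}

	/* New form: cap the order at the position of size's top set bit. */
	static unsigned int order_by_ilog2(unsigned int order, unsigned long size)
	{
		unsigned int top = 63 - __builtin_clzl(size);	/* ilog2(size) */
		return order < top ? order : top;		/* min_t() in the patch */
	}

For example, size = 9 pages gives ilog2(9) = 3, and the loop also stops at 3 since (1 << 3) = 8 <= 9.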
Link: https://lkml.kernel.org/r/20240115102523.2336742-1-kernel@pankajraghav.com Signed-off-by: Pankaj Raghav Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/readahead.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 2648ec4f04947b..1e74455f908e50 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -500,10 +500,8 @@ void page_cache_ra_order(struct readahead_control *ractl, if (new_order < MAX_PAGECACHE_ORDER) { new_order += 2; - if (new_order > MAX_PAGECACHE_ORDER) - new_order = MAX_PAGECACHE_ORDER; - while ((1 << new_order) > ra->size) - new_order--; + new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order); + new_order = min_t(unsigned int, new_order, ilog2(ra->size)); } filemap_invalidate_lock_shared(mapping);

From ff29aec3bf3e37aa4c1bf5bf00b8f27cd0a74cd8 Mon Sep 17 00:00:00 2001 From: Ronald Monthero Date: Tue, 16 Jan 2024 23:31:45 +1000 Subject: [PATCH 1003/1406] mm/zswap: improve with alloc_workqueue() call

The core-api create_workqueue() is deprecated; this patch replaces it with alloc_workqueue(). The previous zswap workqueue was a bounded workqueue; this patch uses alloc_workqueue() to create an unbounded one. The WQ_UNBOUND attribute is desirable because it does not tie the workqueue to a specific cpu, leaving the scheduler free to place its work items wherever capacity is available, for example when other workqueues on the same cpu, such as WQ_HIGHPRI or WQ_CPU_INTENSIVE ones, must be served first. An unbound workqueue also tends to be more efficient under memory pressure than a bounded one. shrink_wq = alloc_workqueue("zswap-shrink", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); Overall the change should be seamless and does not alter the existing behavior, other than making the workqueue unbounded.

Link: https://lkml.kernel.org/r/20240116133145.12454-1-debug.penguin32@gmail.com Signed-off-by: Ronald Monthero Cc: Chris Li Cc: Dan Streetman Cc: Nhat Pham Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/zswap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index db4625af65fb7f..2c3d77c6fe72cd 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1887,7 +1887,8 @@ static int zswap_setup(void) zswap_enabled = false; } - shrink_wq = create_workqueue("zswap-shrink"); + shrink_wq = alloc_workqueue("zswap-shrink", + WQ_UNBOUND|WQ_MEM_RECLAIM, 1); if (!shrink_wq) goto fallback_fail;

From ff9a89a428d61d0d67219a4cfcc05ca501e02a4a Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 16 Jan 2024 14:12:35 +0000 Subject: [PATCH 1004/1406] tools/mm: add thpmaps script to dump THP usage info

With the proliferation of large folios for file-backed memory, and more recently the introduction of multi-size THP for anonymous memory, it is becoming useful to be able to see exactly how large folios are mapped into processes. For some architectures (e.g. arm64), if most memory is mapped using contpte-sized and -aligned blocks, TLB usage can be optimized so it's useful to see where these requirements are and are not being met. thpmaps is a Python utility that reads /proc/<pid>/smaps, /proc/<pid>/pagemap and /proc/kpageflags to print information about how transparent huge pages (both file and anon) are mapped to a specified process or cgroup. It aims to help users debug and optimize their workloads.
In future we may wish to introduce stats directly into the kernel (e.g. smaps or similar), but for now this provides a short term solution without the need to introduce any new ABI. Run with help option for a full listing of the arguments: # ./thpmaps --help --8<-- usage: thpmaps [-h] [--pid pid | --cgroup path] [--rollup] [--cont size[KMG]] [--inc-smaps] [--inc-empty] [--periodic sleep_ms] Prints information about how transparent huge pages are mapped, either system-wide, or for a specified process or cgroup. When run with --pid, the user explicitly specifies the set of pids to scan. e.g. "--pid 10 [--pid 134 ...]". When run with --cgroup, the user passes either a v1 or v2 cgroup and all pids that belong to the cgroup subtree are scanned. When run with neither --pid nor --cgroup, the full set of pids on the system is gathered from /proc and scanned as if the user had provided "--pid 1 --pid 2 ...". A default set of statistics is always generated for THP mappings. However, it is also possible to generate additional statistics for "contiguous block mappings" where the block size is user-defined. Statistics are maintained independently for anonymous and file-backed (pagecache) memory and are shown both in kB and as a percentage of either total anonymous or total file-backed memory as appropriate. THP Statistics -------------- Statistics are always generated for fully- and contiguously-mapped THPs whose mapping address is aligned to their size, for each supported by the system. Separate counters describe THPs mapped by PTE vs those mapped by PMD. (Although note a THP can only be mapped by PMD if it is PMD-sized): - anon-thp-pte-aligned-kB - file-thp-pte-aligned-kB - anon-thp-pmd-aligned-kB - file-thp-pmd-aligned-kB Similarly, statistics are always generated for fully- and contiguously- mapped THPs whose mapping address is *not* aligned to their size, for each supported by the system. Due to the unaligned mapping, it is impossible to map by PMD, so there are only PTE counters for this case: - anon-thp-pte-unaligned-kB - file-thp-pte-unaligned-kB Statistics are also always generated for mapped pages that belong to a THP but where the is THP is *not* fully- and contiguously- mapped. These "partial" mappings are all counted in the same counter regardless of the size of the THP that is partially mapped: - anon-thp-pte-partial - file-thp-pte-partial Contiguous Block Statistics --------------------------- An optional, additional set of statistics is generated for every contiguous block size specified with `--cont `. These statistics show how much memory is mapped in contiguous blocks of and also aligned to . A given contiguous block must all belong to the same THP, but there is no requirement for it to be the *whole* THP. Separate counters describe contiguous blocks mapped by PTE vs those mapped by PMD: - anon-cont-pte-aligned-kB - file-cont-pte-aligned-kB - anon-cont-pmd-aligned-kB - file-cont-pmd-aligned-kB As an example, if monitoring 64K contiguous blocks (--cont 64K), there are a number of sources that could provide such blocks: a fully- and contiguously-mapped 64K THP that is aligned to a 64K boundary would provide 1 block. A fully- and contiguously-mapped 128K THP that is aligned to at least a 64K boundary would provide 2 blocks. Or a 128K THP that maps its first 100K, but contiguously and starting at a 64K boundary would provide 1 block. A fully- and contiguously-mapped 2M THP would provide 32 blocks. There are many other possible permutations. 
options: -h, --help show this help message and exit --pid pid Process id of the target process. Maybe issued multiple times to scan multiple processes. --pid and --cgroup are mutually exclusive. If neither are provided, all processes are scanned to provide system-wide information. --cgroup path Path to the target cgroup in sysfs. Iterates over every pid in the cgroup and its children. --pid and --cgroup are mutually exclusive. If neither are provided, all processes are scanned to provide system-wide information. --rollup Sum the per-vma statistics to provide a summary over the whole system, process or cgroup. --cont size[KMG] Adds stats for memory that is mapped in contiguous blocks of and also aligned to . May be issued multiple times to track multiple sized blocks. Useful to infer e.g. arm64 contpte and hpa mappings. Size must be a power-of-2 number of pages. --inc-smaps Include all numerical, additive /proc//smaps stats in the output. --inc-empty Show all statistics including those whose value is 0. --periodic sleep_ms Run in a loop, polling every sleep_ms milliseconds. Requires root privilege to access pagemap and kpageflags. --8<-- Example command to summarise fully and partially mapped THPs and 64K contiguous blocks over all VMAs in all processes in the system (--inc-empty forces printing stats that are 0): # ./thpmaps --cont 64K --rollup --inc-empty --8<-- anon-thp-pmd-aligned-2048kB: 139264 kB ( 6%) file-thp-pmd-aligned-2048kB: 0 kB ( 0%) anon-thp-pte-aligned-16kB: 0 kB ( 0%) anon-thp-pte-aligned-32kB: 0 kB ( 0%) anon-thp-pte-aligned-64kB: 72256 kB ( 3%) anon-thp-pte-aligned-128kB: 0 kB ( 0%) anon-thp-pte-aligned-256kB: 0 kB ( 0%) anon-thp-pte-aligned-512kB: 0 kB ( 0%) anon-thp-pte-aligned-1024kB: 0 kB ( 0%) anon-thp-pte-aligned-2048kB: 0 kB ( 0%) anon-thp-pte-unaligned-16kB: 0 kB ( 0%) anon-thp-pte-unaligned-32kB: 0 kB ( 0%) anon-thp-pte-unaligned-64kB: 0 kB ( 0%) anon-thp-pte-unaligned-128kB: 0 kB ( 0%) anon-thp-pte-unaligned-256kB: 0 kB ( 0%) anon-thp-pte-unaligned-512kB: 0 kB ( 0%) anon-thp-pte-unaligned-1024kB: 0 kB ( 0%) anon-thp-pte-unaligned-2048kB: 0 kB ( 0%) anon-thp-pte-partial: 63232 kB ( 3%) file-thp-pte-aligned-16kB: 809024 kB (47%) file-thp-pte-aligned-32kB: 43168 kB ( 3%) file-thp-pte-aligned-64kB: 98496 kB ( 6%) file-thp-pte-aligned-128kB: 17536 kB ( 1%) file-thp-pte-aligned-256kB: 0 kB ( 0%) file-thp-pte-aligned-512kB: 0 kB ( 0%) file-thp-pte-aligned-1024kB: 0 kB ( 0%) file-thp-pte-aligned-2048kB: 0 kB ( 0%) file-thp-pte-unaligned-16kB: 21712 kB ( 1%) file-thp-pte-unaligned-32kB: 704 kB ( 0%) file-thp-pte-unaligned-64kB: 896 kB ( 0%) file-thp-pte-unaligned-128kB: 44928 kB ( 3%) file-thp-pte-unaligned-256kB: 0 kB ( 0%) file-thp-pte-unaligned-512kB: 0 kB ( 0%) file-thp-pte-unaligned-1024kB: 0 kB ( 0%) file-thp-pte-unaligned-2048kB: 0 kB ( 0%) file-thp-pte-partial: 9252 kB ( 1%) anon-cont-pmd-aligned-64kB: 139264 kB ( 6%) file-cont-pmd-aligned-64kB: 0 kB ( 0%) anon-cont-pte-aligned-64kB: 100672 kB ( 4%) file-cont-pte-aligned-64kB: 161856 kB ( 9%) --8<-- Link: https://lkml.kernel.org/r/20240116141235.960842-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: Barry Song Cc: Alistair Popple Cc: David Hildenbrand Cc: John Hubbard Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: William Kucharski Cc: Zenghui Yu Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/mm/Makefile | 9 +- tools/mm/thpmaps | 675 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 680 insertions(+), 4 deletions(-) create mode 100644 tools/mm/thpmaps diff --git a/tools/mm/Makefile 
b/tools/mm/Makefile index 1c5606cc33346b..7bb03606b9eaa2 100644 --- a/tools/mm/Makefile +++ b/tools/mm/Makefile @@ -3,7 +3,8 @@ # include ../scripts/Makefile.include -TARGETS=page-types slabinfo page_owner_sort +BUILD_TARGETS=page-types slabinfo page_owner_sort +INSTALL_TARGETS = $(BUILD_TARGETS) thpmaps LIB_DIR = ../lib/api LIBS = $(LIB_DIR)/libapi.a @@ -11,9 +12,9 @@ LIBS = $(LIB_DIR)/libapi.a CFLAGS += -Wall -Wextra -I../lib/ -pthread LDFLAGS += $(LIBS) -pthread -all: $(TARGETS) +all: $(BUILD_TARGETS) -$(TARGETS): $(LIBS) +$(BUILD_TARGETS): $(LIBS) $(LIBS): make -C $(LIB_DIR) @@ -29,4 +30,4 @@ sbindir ?= /usr/sbin install: all install -d $(DESTDIR)$(sbindir) - install -m 755 -p $(TARGETS) $(DESTDIR)$(sbindir) + install -m 755 -p $(INSTALL_TARGETS) $(DESTDIR)$(sbindir) diff --git a/tools/mm/thpmaps b/tools/mm/thpmaps new file mode 100644 index 00000000000000..803e0318f2fea1 --- /dev/null +++ b/tools/mm/thpmaps @@ -0,0 +1,675 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-only +# Copyright (C) 2024 ARM Ltd. +# +# Utility providing smaps-like output detailing transparent hugepage usage. +# For more info, run: +# ./thpmaps --help +# +# Requires numpy: +# pip3 install numpy + + +import argparse +import collections +import math +import os +import re +import resource +import shutil +import sys +import textwrap +import time +import numpy as np + + +with open('/sys/kernel/mm/transparent_hugepage/hpage_pmd_size') as f: + PAGE_SIZE = resource.getpagesize() + PAGE_SHIFT = int(math.log2(PAGE_SIZE)) + PMD_SIZE = int(f.read()) + PMD_ORDER = int(math.log2(PMD_SIZE / PAGE_SIZE)) + + +def align_forward(v, a): + return (v + (a - 1)) & ~(a - 1) + + +def align_offset(v, a): + return v & (a - 1) + + +def kbnr(kb): + # Convert KB to number of pages. + return (kb << 10) >> PAGE_SHIFT + + +def nrkb(nr): + # Convert number of pages to KB. + return (nr << PAGE_SHIFT) >> 10 + + +def odkb(order): + # Convert page order to KB. + return (PAGE_SIZE << order) >> 10 + + +def cont_ranges_all(search, index): + # Given a list of arrays, find the ranges for which values are monotonically + # incrementing in all arrays. all arrays in search and index must be the + # same size. + sz = len(search[0]) + r = np.full(sz, 2) + d = np.diff(search[0]) == 1 + for dd in [np.diff(arr) == 1 for arr in search[1:]]: + d &= dd + r[1:] -= d + r[:-1] -= d + return [np.repeat(arr, r).reshape(-1, 2) for arr in index] + + +class ArgException(Exception): + pass + + +class FileIOException(Exception): + pass + + +class BinArrayFile: + # Base class used to read /proc//pagemap and /proc/kpageflags into a + # numpy array. Use inherrited class in a with clause to ensure file is + # closed when it goes out of scope. 
+ def __init__(self, filename, element_size): + self.element_size = element_size + self.filename = filename + self.fd = os.open(self.filename, os.O_RDONLY) + + def cleanup(self): + os.close(self.fd) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.cleanup() + + def _readin(self, offset, buffer): + length = os.preadv(self.fd, (buffer,), offset) + if len(buffer) != length: + raise FileIOException('error: {} failed to read {} bytes at {:x}' + .format(self.filename, len(buffer), offset)) + + def _toarray(self, buf): + assert(self.element_size == 8) + return np.frombuffer(buf, dtype=np.uint64) + + def getv(self, vec): + vec *= self.element_size + offsets = vec[:, 0] + lengths = (np.diff(vec) + self.element_size).reshape(len(vec)) + buf = bytearray(int(np.sum(lengths))) + view = memoryview(buf) + pos = 0 + for offset, length in zip(offsets, lengths): + offset = int(offset) + length = int(length) + self._readin(offset, view[pos:pos+length]) + pos += length + return self._toarray(buf) + + def get(self, index, nr=1): + offset = index * self.element_size + length = nr * self.element_size + buf = bytearray(length) + self._readin(offset, buf) + return self._toarray(buf) + + +PM_PAGE_PRESENT = 1 << 63 +PM_PFN_MASK = (1 << 55) - 1 + +class PageMap(BinArrayFile): + # Read ranges of a given pid's pagemap into a numpy array. + def __init__(self, pid='self'): + super().__init__(f'/proc/{pid}/pagemap', 8) + + +KPF_ANON = 1 << 12 +KPF_COMPOUND_HEAD = 1 << 15 +KPF_COMPOUND_TAIL = 1 << 16 +KPF_THP = 1 << 22 + +class KPageFlags(BinArrayFile): + # Read ranges of /proc/kpageflags into a numpy array. + def __init__(self): + super().__init__(f'/proc/kpageflags', 8) + + +vma_all_stats = set([ + "Size", + "Rss", + "Pss", + "Pss_Dirty", + "Shared_Clean", + "Shared_Dirty", + "Private_Clean", + "Private_Dirty", + "Referenced", + "Anonymous", + "KSM", + "LazyFree", + "AnonHugePages", + "ShmemPmdMapped", + "FilePmdMapped", + "Shared_Hugetlb", + "Private_Hugetlb", + "Swap", + "SwapPss", + "Locked", +]) + +vma_min_stats = set([ + "Rss", + "Anonymous", + "AnonHugePages", + "ShmemPmdMapped", + "FilePmdMapped", +]) + +VMA = collections.namedtuple('VMA', [ + 'name', + 'start', + 'end', + 'read', + 'write', + 'execute', + 'private', + 'pgoff', + 'major', + 'minor', + 'inode', + 'stats', +]) + +class VMAList: + # A container for VMAs, parsed from /proc//smaps. Iterate over the + # instance to receive VMAs. 
+ def __init__(self, pid='self', stats=[]): + self.vmas = [] + with open(f'/proc/{pid}/smaps', 'r') as file: + for line in file: + elements = line.split() + if '-' in elements[0]: + start, end = map(lambda x: int(x, 16), elements[0].split('-')) + major, minor = map(lambda x: int(x, 16), elements[3].split(':')) + self.vmas.append(VMA( + name=elements[5] if len(elements) == 6 else '', + start=start, + end=end, + read=elements[1][0] == 'r', + write=elements[1][1] == 'w', + execute=elements[1][2] == 'x', + private=elements[1][3] == 'p', + pgoff=int(elements[2], 16), + major=major, + minor=minor, + inode=int(elements[4], 16), + stats={}, + )) + else: + param = elements[0][:-1] + if param in stats: + value = int(elements[1]) + self.vmas[-1].stats[param] = {'type': None, 'value': value} + + def __iter__(self): + yield from self.vmas + + +def thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads): + # Given 4 same-sized arrays representing a range within a page table backed + # by THPs (vfns: virtual frame numbers, pfns: physical frame numbers, anons: + # True if page is anonymous, heads: True if page is head of a THP), return a + # dictionary of statistics describing the mapped THPs. + stats = { + 'file': { + 'partial': 0, + 'aligned': [0] * (PMD_ORDER + 1), + 'unaligned': [0] * (PMD_ORDER + 1), + }, + 'anon': { + 'partial': 0, + 'aligned': [0] * (PMD_ORDER + 1), + 'unaligned': [0] * (PMD_ORDER + 1), + }, + } + + for rindex, rpfn in zip(ranges[0], ranges[2]): + index_next = int(rindex[0]) + index_end = int(rindex[1]) + 1 + pfn_end = int(rpfn[1]) + 1 + + folios = indexes[index_next:index_end][heads[index_next:index_end]] + + # Account pages for any partially mapped THP at the front. In that case, + # the first page of the range is a tail. + nr = (int(folios[0]) if len(folios) else index_end) - index_next + stats['anon' if anons[index_next] else 'file']['partial'] += nr + + # Account pages for any partially mapped THP at the back. In that case, + # the next page after the range is a tail. + if len(folios): + flags = int(kpageflags.get(pfn_end)[0]) + if flags & KPF_COMPOUND_TAIL: + nr = index_end - int(folios[-1]) + folios = folios[:-1] + index_end -= nr + stats['anon' if anons[index_end - 1] else 'file']['partial'] += nr + + # Account fully mapped THPs in the middle of the range. + if len(folios): + folio_nrs = np.append(np.diff(folios), np.uint64(index_end - folios[-1])) + folio_orders = np.log2(folio_nrs).astype(np.uint64) + for index, order in zip(folios, folio_orders): + index = int(index) + order = int(order) + nr = 1 << order + vfn = int(vfns[index]) + align = 'aligned' if align_forward(vfn, nr) == vfn else 'unaligned' + anon = 'anon' if anons[index] else 'file' + stats[anon][align][order] += nr + + # Account PMD-mapped THPs spearately, so filter out of the stats. There is a + # race between acquiring the smaps stats and reading pagemap, where memory + # could be deallocated. So clamp to zero incase it would have gone negative. 
+ anon_pmd_mapped = vma.stats['AnonHugePages']['value'] + file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \ + vma.stats['FilePmdMapped']['value'] + stats['anon']['aligned'][PMD_ORDER] = max(0, stats['anon']['aligned'][PMD_ORDER] - kbnr(anon_pmd_mapped)) + stats['file']['aligned'][PMD_ORDER] = max(0, stats['file']['aligned'][PMD_ORDER] - kbnr(file_pmd_mapped)) + + rstats = { + f"anon-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'anon', 'value': anon_pmd_mapped}, + f"file-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'file', 'value': file_pmd_mapped}, + } + + def flatten_sub(type, subtype, stats): + param = f"{type}-thp-pte-{subtype}-{{}}kB" + for od, nr in enumerate(stats[2:], 2): + rstats[param.format(odkb(od))] = {'type': type, 'value': nrkb(nr)} + + def flatten_type(type, stats): + flatten_sub(type, 'aligned', stats['aligned']) + flatten_sub(type, 'unaligned', stats['unaligned']) + rstats[f"{type}-thp-pte-partial"] = {'type': type, 'value': nrkb(stats['partial'])} + + flatten_type('anon', stats['anon']) + flatten_type('file', stats['file']) + + return rstats + + +def cont_parse(vma, order, ranges, anons, heads): + # Given 4 same-sized arrays representing a range within a page table backed + # by THPs (vfns: virtual frame numbers, pfns: physical frame numbers, anons: + # True if page is anonymous, heads: True if page is head of a THP), return a + # dictionary of statistics describing the contiguous blocks. + nr_cont = 1 << order + nr_anon = 0 + nr_file = 0 + + for rindex, rvfn, rpfn in zip(*ranges): + index_next = int(rindex[0]) + index_end = int(rindex[1]) + 1 + vfn_start = int(rvfn[0]) + pfn_start = int(rpfn[0]) + + if align_offset(pfn_start, nr_cont) != align_offset(vfn_start, nr_cont): + continue + + off = align_forward(vfn_start, nr_cont) - vfn_start + index_next += off + + while index_next + nr_cont <= index_end: + folio_boundary = heads[index_next+1:index_next+nr_cont].any() + if not folio_boundary: + if anons[index_next]: + nr_anon += nr_cont + else: + nr_file += nr_cont + index_next += nr_cont + + # Account blocks that are PMD-mapped spearately, so filter out of the stats. + # There is a race between acquiring the smaps stats and reading pagemap, + # where memory could be deallocated. So clamp to zero incase it would have + # gone negative. + anon_pmd_mapped = vma.stats['AnonHugePages']['value'] + file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \ + vma.stats['FilePmdMapped']['value'] + nr_anon = max(0, nr_anon - kbnr(anon_pmd_mapped)) + nr_file = max(0, nr_file - kbnr(file_pmd_mapped)) + + rstats = { + f"anon-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'anon', 'value': anon_pmd_mapped}, + f"file-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'file', 'value': file_pmd_mapped}, + } + + rstats[f"anon-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'anon', 'value': nrkb(nr_anon)} + rstats[f"file-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'file', 'value': nrkb(nr_file)} + + return rstats + + +def vma_print(vma, pid): + # Prints a VMA instance in a format similar to smaps. The main difference is + # that the pid is included as the first value. + print("{:010d}: {:016x}-{:016x} {}{}{}{} {:08x} {:02x}:{:02x} {:08x} {}" + .format( + pid, vma.start, vma.end, + 'r' if vma.read else '-', 'w' if vma.write else '-', + 'x' if vma.execute else '-', 'p' if vma.private else 's', + vma.pgoff, vma.major, vma.minor, vma.inode, vma.name + )) + + +def stats_print(stats, tot_anon, tot_file, inc_empty): + # Print a statistics dictionary. 
+ label_field = 32 + for label, stat in stats.items(): + type = stat['type'] + value = stat['value'] + if value or inc_empty: + pad = max(0, label_field - len(label) - 1) + if type == 'anon' and tot_anon > 0: + percent = f' ({value / tot_anon:3.0%})' + elif type == 'file' and tot_file > 0: + percent = f' ({value / tot_file:3.0%})' + else: + percent = '' + print(f"{label}:{' ' * pad}{value:8} kB{percent}") + + +def vma_parse(vma, pagemap, kpageflags, contorders): + # Generate thp and cont statistics for a single VMA. + start = vma.start >> PAGE_SHIFT + end = vma.end >> PAGE_SHIFT + + pmes = pagemap.get(start, end - start) + present = pmes & PM_PAGE_PRESENT != 0 + pfns = pmes & PM_PFN_MASK + pfns = pfns[present] + vfns = np.arange(start, end, dtype=np.uint64) + vfns = vfns[present] + + pfn_vec = cont_ranges_all([pfns], [pfns])[0] + flags = kpageflags.getv(pfn_vec) + anons = flags & KPF_ANON != 0 + heads = flags & KPF_COMPOUND_HEAD != 0 + thps = flags & KPF_THP != 0 + + vfns = vfns[thps] + pfns = pfns[thps] + anons = anons[thps] + heads = heads[thps] + + indexes = np.arange(len(vfns), dtype=np.uint64) + ranges = cont_ranges_all([vfns, pfns], [indexes, vfns, pfns]) + + thpstats = thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads) + contstats = [cont_parse(vma, order, ranges, anons, heads) for order in contorders] + + tot_anon = vma.stats['Anonymous']['value'] + tot_file = vma.stats['Rss']['value'] - tot_anon + + return { + **thpstats, + **{k: v for s in contstats for k, v in s.items()} + }, tot_anon, tot_file + + +def do_main(args): + pids = set() + rollup = {} + rollup_anon = 0 + rollup_file = 0 + + if args.cgroup: + strict = False + for walk_info in os.walk(args.cgroup): + cgroup = walk_info[0] + with open(f'{cgroup}/cgroup.procs') as pidfile: + for line in pidfile.readlines(): + pids.add(int(line.strip())) + elif args.pid: + strict = True + pids = pids.union(args.pid) + else: + strict = False + for pid in os.listdir('/proc'): + if pid.isdigit(): + pids.add(int(pid)) + + if not args.rollup: + print(" PID START END PROT OFFSET DEV INODE OBJECT") + + for pid in pids: + try: + with PageMap(pid) as pagemap: + with KPageFlags() as kpageflags: + for vma in VMAList(pid, vma_all_stats if args.inc_smaps else vma_min_stats): + if (vma.read or vma.write or vma.execute) and vma.stats['Rss']['value'] > 0: + stats, vma_anon, vma_file = vma_parse(vma, pagemap, kpageflags, args.cont) + else: + stats = {} + vma_anon = 0 + vma_file = 0 + if args.inc_smaps: + stats = {**vma.stats, **stats} + if args.rollup: + for k, v in stats.items(): + if k in rollup: + assert(rollup[k]['type'] == v['type']) + rollup[k]['value'] += v['value'] + else: + rollup[k] = v + rollup_anon += vma_anon + rollup_file += vma_file + else: + vma_print(vma, pid) + stats_print(stats, vma_anon, vma_file, args.inc_empty) + except (FileNotFoundError, ProcessLookupError, FileIOException): + if strict: + raise + + if args.rollup: + stats_print(rollup, rollup_anon, rollup_file, args.inc_empty) + + +def main(): + docs_width = shutil.get_terminal_size().columns + docs_width -= 2 + docs_width = min(80, docs_width) + + def format(string): + text = re.sub(r'\s+', ' ', string) + text = re.sub(r'\s*\\n\s*', '\n', text) + paras = text.split('\n') + paras = [textwrap.fill(p, width=docs_width) for p in paras] + return '\n'.join(paras) + + def formatter(prog): + return argparse.RawDescriptionHelpFormatter(prog, width=docs_width) + + def size2order(human): + units = { + "K": 2**10, "M": 2**20, "G": 2**30, + "k": 2**10, "m": 2**20, "g": 
2**30, + } + unit = 1 + if human[-1] in units: + unit = units[human[-1]] + human = human[:-1] + try: + size = int(human) + except ValueError: + raise ArgException('error: --cont value must be integer size with optional KMG unit') + size *= unit + order = int(math.log2(size / PAGE_SIZE)) + if order < 1: + raise ArgException('error: --cont value must be size of at least 2 pages') + if (1 << order) * PAGE_SIZE != size: + raise ArgException('error: --cont value must be size of power-of-2 pages') + if order > PMD_ORDER: + raise ArgException('error: --cont value must be less than or equal to PMD order') + return order + + parser = argparse.ArgumentParser(formatter_class=formatter, + description=format("""Prints information about how transparent huge + pages are mapped, either system-wide, or for a specified + process or cgroup.\\n + \\n + When run with --pid, the user explicitly specifies the set + of pids to scan. e.g. "--pid 10 [--pid 134 ...]". When run + with --cgroup, the user passes either a v1 or v2 cgroup and + all pids that belong to the cgroup subtree are scanned. When + run with neither --pid nor --cgroup, the full set of pids on + the system is gathered from /proc and scanned as if the user + had provided "--pid 1 --pid 2 ...".\\n + \\n + A default set of statistics is always generated for THP + mappings. However, it is also possible to generate + additional statistics for "contiguous block mappings" where + the block size is user-defined.\\n + \\n + Statistics are maintained independently for anonymous and + file-backed (pagecache) memory and are shown both in kB and + as a percentage of either total anonymous or total + file-backed memory as appropriate.\\n + \\n + THP Statistics\\n + --------------\\n + \\n + Statistics are always generated for fully- and + contiguously-mapped THPs whose mapping address is aligned to + their size, for each supported by the system. + Separate counters describe THPs mapped by PTE vs those + mapped by PMD. (Although note a THP can only be mapped by + PMD if it is PMD-sized):\\n + \\n + - anon-thp-pte-aligned-kB\\n + - file-thp-pte-aligned-kB\\n + - anon-thp-pmd-aligned-kB\\n + - file-thp-pmd-aligned-kB\\n + \\n + Similarly, statistics are always generated for fully- and + contiguously-mapped THPs whose mapping address is *not* + aligned to their size, for each supported by the + system. Due to the unaligned mapping, it is impossible to + map by PMD, so there are only PTE counters for this case:\\n + \\n + - anon-thp-pte-unaligned-kB\\n + - file-thp-pte-unaligned-kB\\n + \\n + Statistics are also always generated for mapped pages that + belong to a THP but where the is THP is *not* fully- and + contiguously- mapped. These "partial" mappings are all + counted in the same counter regardless of the size of the + THP that is partially mapped:\\n + \\n + - anon-thp-pte-partial\\n + - file-thp-pte-partial\\n + \\n + Contiguous Block Statistics\\n + ---------------------------\\n + \\n + An optional, additional set of statistics is generated for + every contiguous block size specified with `--cont `. + These statistics show how much memory is mapped in + contiguous blocks of and also aligned to . A + given contiguous block must all belong to the same THP, but + there is no requirement for it to be the *whole* THP. 
+ Separate counters describe contiguous blocks mapped by PTE + vs those mapped by PMD:\\n + \\n + - anon-cont-pte-aligned-kB\\n + - file-cont-pte-aligned-kB\\n + - anon-cont-pmd-aligned-kB\\n + - file-cont-pmd-aligned-kB\\n + \\n + As an example, if monitoring 64K contiguous blocks (--cont + 64K), there are a number of sources that could provide such + blocks: a fully- and contiguously-mapped 64K THP that is + aligned to a 64K boundary would provide 1 block. A fully- + and contiguously-mapped 128K THP that is aligned to at least + a 64K boundary would provide 2 blocks. Or a 128K THP that + maps its first 100K, but contiguously and starting at a 64K + boundary would provide 1 block. A fully- and + contiguously-mapped 2M THP would provide 32 blocks. There + are many other possible permutations.\\n"""), + epilog=format("""Requires root privilege to access pagemap and + kpageflags.""")) + + group = parser.add_mutually_exclusive_group(required=False) + group.add_argument('--pid', + metavar='pid', required=False, type=int, default=[], action='append', + help="""Process id of the target process. Maybe issued multiple times to + scan multiple processes. --pid and --cgroup are mutually exclusive. + If neither are provided, all processes are scanned to provide + system-wide information.""") + + group.add_argument('--cgroup', + metavar='path', required=False, + help="""Path to the target cgroup in sysfs. Iterates over every pid in + the cgroup and its children. --pid and --cgroup are mutually + exclusive. If neither are provided, all processes are scanned to + provide system-wide information.""") + + parser.add_argument('--rollup', + required=False, default=False, action='store_true', + help="""Sum the per-vma statistics to provide a summary over the whole + system, process or cgroup.""") + + parser.add_argument('--cont', + metavar='size[KMG]', required=False, default=[], action='append', + help="""Adds stats for memory that is mapped in contiguous blocks of + and also aligned to . May be issued multiple times to + track multiple sized blocks. Useful to infer e.g. arm64 contpte and + hpa mappings. Size must be a power-of-2 number of pages.""") + + parser.add_argument('--inc-smaps', + required=False, default=False, action='store_true', + help="""Include all numerical, additive /proc//smaps stats in the + output.""") + + parser.add_argument('--inc-empty', + required=False, default=False, action='store_true', + help="""Show all statistics including those whose value is 0.""") + + parser.add_argument('--periodic', + metavar='sleep_ms', required=False, type=int, + help="""Run in a loop, polling every sleep_ms milliseconds.""") + + args = parser.parse_args() + + try: + args.cont = [size2order(cont) for cont in args.cont] + except ArgException as e: + parser.print_usage() + raise + + if args.periodic: + while True: + do_main(args) + print() + time.sleep(args.periodic / 1000) + else: + do_main(args) + + +if __name__ == "__main__": + try: + main() + except Exception as e: + prog = os.path.basename(sys.argv[0]) + print(f'{prog}: {e}') + exit(1) From dc437404706382d11e3b7452cda9c6f29e31e70d Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 17 Jan 2024 18:39:54 +0800 Subject: [PATCH 1005/1406] mm: memory: move mem_cgroup_charge() into alloc_anon_folio() The GFP flags from vma_thp_gfp_mask() according to user configuration only used for large folio allocation but not for memory cgroup charge, and GFP_KERNEL is used for both order-0 and large order folio when memory cgroup charge at present. 
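In concrete terms, the patch moves the charge next to the allocation, so a charge failure simply falls through to the next smaller order instead of failing the fault outright. A condensed sketch of the resulting loop (simplified from the mm/memory.c hunk below; locking and the order-0 fallback are omitted); the paragraphs that follow explain why using the same GFP flags for both steps matters:

	/* Inside alloc_anon_folio(), orders are tried from largest to smallest. */
	while (orders) {
		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
		folio = vma_alloc_folio(gfp, order, vma, addr, true);
		if (folio) {
			if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
				folio_put(folio);	/* charge failed: retry smaller */
				goto next;
			}
			folio_throttle_swaprate(folio, gfp);
			clear_huge_page(&folio->page, vmf->address, 1 << order);
			return folio;
		}
next:
		order = next_order(&orders, order);
	}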
However, mem_cgroup_charge() uses the GFP flags in a fairly sophisticated way. In addition to checking gfpflags_allow_blocking(), it pays attention to __GFP_NORETRY and __GFP_RETRY_MAYFAIL to ensure that processes within this memcg do not exceed their quotas. So we'd better to move mem_cgroup_charge() into alloc_anon_folio(), 1) it will make us to allocate as much as possible large order folio, because we could try the next order if mem_cgroup_charge() fails, although the memcg's memory usage is close to its limits. 2) using same GFP flags for allocation and charge is to be consistent with PMD THP firstly, in addition, according to GFP flag returned from vma_thp_gfp_mask(), GFP_TRANSHUGE_LIGHT could make us skip direct reclaim, _GFP_NORETRY will make us skip mem_cgroup_oom() and won't trigger memory cgroup oom from large order(order <= COSTLY_ORDER) folio charging. Link: https://lkml.kernel.org/r/20240122011612.501029-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20240117103954.2756050-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memory.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 5e608edfe330c5..00f3f4fbd131d0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4173,8 +4173,8 @@ static bool pte_range_none(pte_t *pte, int nr_pages) static struct folio *alloc_anon_folio(struct vm_fault *vmf) { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE struct vm_area_struct *vma = vmf->vma; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE unsigned long orders; struct folio *folio; unsigned long addr; @@ -4226,15 +4226,21 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); folio = vma_alloc_folio(gfp, order, vma, addr, true); if (folio) { + if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { + folio_put(folio); + goto next; + } + folio_throttle_swaprate(folio, gfp); clear_huge_page(&folio->page, vmf->address, 1 << order); return folio; } +next: order = next_order(&orders, order); } fallback: #endif - return vma_alloc_zeroed_movable_folio(vmf->vma, vmf->address); + return folio_prealloc(vma->vm_mm, vma, vmf->address, true); } /* @@ -4301,10 +4307,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) nr_pages = folio_nr_pages(folio); addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); - if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) - goto oom_free_page; - folio_throttle_swaprate(folio, GFP_KERNEL); - /* * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before @@ -4358,8 +4360,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) release: folio_put(folio); goto unlock; -oom_free_page: - folio_put(folio); oom: return VM_FAULT_OOM; } From e60bf65418789fc63690cbdc483571f2639bbb46 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 18 Jan 2024 18:42:35 +0000 Subject: [PATCH 1006/1406] mm: writeback: ratelimit stat flush from mem_cgroup_wb_stats One of our workloads (Postgres 14) has regressed when migrated from 5.10 to 6.1 upstream kernel. The regression can be reproduced by sysbench's oltp_write_only benchmark. It seems like the always on rstat flush in mem_cgroup_wb_stats() is causing the regression. So, rate limit that specific rstat flush. 
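Conceptually, a ratelimited flush trades a little accuracy for a lot of cost: the expensive rstat tree walk is skipped unless enough time (or enough accumulated updates) has passed. A deliberately simplified illustration of the idea, not the actual mem_cgroup_flush_stats_ratelimited() implementation (the helper and interval below are hypothetical):

	static void flush_stats_ratelimited_sketch(void)
	{
		static unsigned long last_flush;

		/* Flush at most every 2s; otherwise tolerate slightly stale stats. */
		if (time_after(jiffies, last_flush + msecs_to_jiffies(2000))) {
			last_flush = jiffies;
			do_rstat_flush();	/* hypothetical expensive walk */
		}
	}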
One potential consequence would be the dirty throttling might be decided on stale memcg stats. However from our benchmarks and production traffic we have not observed any change in the dirty throttling behavior of the application. Link: https://lkml.kernel.org/r/20240118184235.618164-1-shakeelb@google.com Fixes: 2d146aa3aa84 ("mm: memcontrol: switch to rstat") Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Roman Gushchin Cc: Jan Kara Cc: Jens Axboe Cc: Michal Hocko Cc: Muchun Song Cc: Tejun Heo Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index db92401257f7d0..df11d6d19ee38f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4800,7 +4800,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - mem_cgroup_flush_stats(memcg); + mem_cgroup_flush_stats_ratelimited(memcg); *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); From 4566690b3d17ba35ae4a96d47fe806008a30f6d1 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 18 Jan 2024 01:50:57 -0800 Subject: [PATCH 1007/1406] selftests/memfd: delete unused declarations Commit 32d118ad50a5 ("selftests/memfd: add tests for F_SEAL_EXEC"): - added several unused 'nbytes' local variables Commit 6469b66e3f5a ("selftests: improve vm.memfd_noexec sysctl tests"): - orphaned 'newpid_thread_fn2()' forward declaration - orphaned 'join_newpid_thread()' forward declaration - added unused 'pid' local in sysctl_simple_child() - orphaned 'fd' local in sysctl_simple_child() - added unused 'fd' in sysctl_nested_child() Delete the unused locals and forward declarations. 
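A quick way to confirm that a forward declaration is orphaned is to search for any remaining references, e.g.:

	$ grep -n 'join_newpid_thread' tools/testing/selftests/memfd/memfd_test.c

If the only hit is the declaration itself, the prototype is dead, which is the situation this patch cleans up for the identifiers listed above.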
Link: https://lkml.kernel.org/r/20240118095057.677544-1-gthelen@google.com Signed-off-by: Greg Thelen Cc: Aleksa Sarai Cc: Daniel Verkamp Cc: Jeff Xu Cc: Kees Cook Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/memfd_test.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 3df00867723910..18f585684e2025 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -44,8 +44,6 @@ */ static size_t mfd_def_size = MFD_DEF_SIZE; static const char *memfd_str = MEMFD_STR; -static int newpid_thread_fn2(void *arg); -static void join_newpid_thread(pid_t pid); static ssize_t fd2name(int fd, char *buf, size_t bufsize) { @@ -194,7 +192,6 @@ static unsigned int mfd_assert_get_seals(int fd) static void mfd_assert_has_seals(int fd, unsigned int seals) { char buf[PATH_MAX]; - int nbytes; unsigned int s; fd2name(fd, buf, PATH_MAX); @@ -696,7 +693,6 @@ static void mfd_assert_mode(int fd, int mode) { struct stat st; char buf[PATH_MAX]; - int nbytes; fd2name(fd, buf, PATH_MAX); @@ -715,7 +711,6 @@ static void mfd_assert_mode(int fd, int mode) static void mfd_assert_chmod(int fd, int mode) { char buf[PATH_MAX]; - int nbytes; fd2name(fd, buf, PATH_MAX); @@ -731,7 +726,6 @@ static void mfd_fail_chmod(int fd, int mode) { struct stat st; char buf[PATH_MAX]; - int nbytes; fd2name(fd, buf, PATH_MAX); @@ -1254,9 +1248,6 @@ static void test_sysctl_set_sysctl2(void) static int sysctl_simple_child(void *arg) { - int fd; - int pid; - printf("%s sysctl 0\n", memfd_str); test_sysctl_set_sysctl0(); @@ -1321,7 +1312,6 @@ static void test_sysctl_sysctl2_failset(void) static int sysctl_nested_child(void *arg) { - int fd; int pid; printf("%s nested sysctl 0\n", memfd_str);

From 5dbe602e012b6f122bb818a988e16439b0357820 Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Wed, 17 Jan 2024 14:39:21 -0800 Subject: [PATCH 1008/1406] userfaultfd: fix return error if mmap_changing is non-zero in MOVE ioctl

To be consistent with other uffd ioctls returning EAGAIN when mmap_changing is detected, we should change UFFDIO_MOVE to do the same.

Link: https://lkml.kernel.org/r/20240117223922.1445327-1-lokeshgidra@google.com Signed-off-by: Lokesh Gidra Acked-by: Suren Baghdasaryan Acked-by: Mike Rapoport (IBM) Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Brian Geffon Cc: David Hildenbrand Cc: Jann Horn Cc: Kalesh Singh Cc: Matthew Wilcox (Oracle) Cc: Nicolas Geoffray Cc: Peter Xu Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 959551ff9a9514..05c8e8a054272f 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -2047,7 +2047,7 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx, ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src, uffdio_move.len, uffdio_move.mode); else - ret = -EINVAL; + ret = -EAGAIN; mmap_read_unlock(mm); mmput(mm);

From c7072461363d05805cd6ec20f3c1da95d5f72178 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Wed, 17 Jan 2024 11:00:37 -0700 Subject: [PATCH 1009/1406] selftests: mm: perform some system cleanup before using hugepages

When running with CATEGORY= (thp | hugetlb) we see a large number of tests failing. These failures are due to not being able to allocate a hugepage and normally occur on memory-constrained systems or when using large page sizes.
Run drop_caches and compact_memory before the tests for a higher chance at a successful hugepage allocation.

Link: https://lkml.kernel.org/r/20240117180037.15734-1-npache@redhat.com Signed-off-by: Nico Pache Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 246d53a5d7f287..040f27e21f47a3 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -206,6 +206,15 @@ pretty_name() { # Usage: run_test [test binary] [arbitrary test arguments...] run_test() { if test_selected ${CATEGORY}; then + # On memory-constrained systems some tests can fail to allocate hugepages. + # Perform some cleanup before the test for a higher success rate. + if [ ${CATEGORY} == "thp" ] || [ ${CATEGORY} == "hugetlb" ]; then + echo 3 > /proc/sys/vm/drop_caches + sleep 2 + echo 1 > /proc/sys/vm/compact_memory + sleep 2 + fi + local test=$(pretty_name "$*") local title="running $*" local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -)

From b4940304bc81ece3af18731cfab2307ac646867e Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 22 Jan 2024 11:20:00 +0100 Subject: [PATCH 1010/1406] maple_tree: avoid duplicate variable init in mast_spanning_rebalance()

The local variables r_tmp and l_tmp in mast_spanning_rebalance() are already initialized at their declaration; there is no need to assign the values again. Remove the duplicate initialization of {r,l}_tmp. No functional change. Due to common compiler optimizations, also no change to object code. This issue was identified with clang-analyzer's dead stores analysis.

Link: https://lkml.kernel.org/r/20240122102000.29558-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 7b161802860bdb..82fb5195c2354f 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2271,8 +2271,6 @@ bool mast_spanning_rebalance(struct maple_subtree_state *mast) struct ma_state l_tmp = *mast->orig_l; unsigned char depth = 0; - r_tmp = *mast->orig_r; - l_tmp = *mast->orig_l; do { mas_ascend(mast->orig_r); mas_ascend(mast->orig_l);

From 50bd10d6d40969f16601bdfc0ae6fddd49c97543 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 22 Jan 2024 10:25:04 +0100 Subject: [PATCH 1011/1406] mempolicy: clean up minor dead code in queue_pages_test_walk()

Commit 2cafb582173f ("mempolicy: remove confusing MPOL_MF_LAZY dead code") removes MPOL_MF_LAZY handling in queue_pages_test_walk(), and with that, there is no effective use of the local variable endvma in that function remaining. Remove the local variable endvma and its dead code. No functional change. This issue was identified with clang-analyzer's dead stores analysis.
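For reference, the pattern clang-analyzer's DeadStores checker flags is a value that is written and then never read, as in this hypothetical fragment (not kernel code):

	int endvma = vma_end;	/* dead store: overwritten before any read */
	endvma = end;
	return endvma;

Recent kernels can run the analyzer over the tree via the clang-tools integration, e.g. 'make CC=clang clang-analyzer' (the exact invocation depends on the kernel version and the installed clang tooling).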
Link: https://lkml.kernel.org/r/20240122092504.18377-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/mempolicy.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 10a590ee1c8997..5e519163c4dcb6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -654,7 +654,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, { struct vm_area_struct *next, *vma = walk->vma; struct queue_pages *qp = walk->private; - unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; /* range check first */ @@ -682,9 +681,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, !(flags & MPOL_MF_STRICT)) return 1; - if (endvma > end) - endvma = end; - /* * Check page nodes, and queue pages to move, in the current vma. * But if no moving, and no strict checking, the scan can be skipped.

From a8d5cdbd26e8c45785c0e9d48d9fa90f4884d8fb Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:41 +0800 Subject: [PATCH 1012/1406] kexec: split crashkernel reservation code out from crash_core.c

Patch series "Split crash out from kexec and clean up related config items", v3.

Motivation: ============= Previously, LKP reported a build error. When investigating, it couldn't be resolved reasonably with the present messy kdump config items. https://lore.kernel.org/oe-kbuild-all/202312182200.Ka7MzifQ-lkp@intel.com/

The kdump (crash dumping) related config items can cause confusion:

Firstly, CRASH_CORE enables code including - crashkernel reservation; - elfcorehdr updating; - vmcoreinfo exporting; - crash hotplug handling;

Now fadump of powerpc, kcore dynamic debugging and kdump all select CRASH_CORE, while - fadump needs crashkernel parsing, vmcoreinfo exporting, and access to the global variable 'elfcorehdr_addr'; - kcore only needs vmcoreinfo exporting; - kdump needs all of the current kernel/crash_core.c. So enabling only PROC_KCORE or FA_DUMP will enable CRASH_CORE; this misleads people into thinking crash dumping is enabled, when actually it is not.

Secondly, it's not reasonable to allow KEXEC_CORE to select CRASH_CORE, because KEXEC_CORE enables code which allocates control pages, copies kexec/kdump segments, and prepares for switching. This code is shared by both kexec reboot and kdump. We may want kexec reboot but not kdump; in that case, CRASH_CORE should not be selected. -------------------- CONFIG_CRASH_CORE=y CONFIG_KEXEC_CORE=y CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y ---------------------

Thirdly, it's not reasonable to allow CRASH_DUMP to select KEXEC_CORE. That could leave KEXEC_CORE and CRASH_DUMP enabled independently of KEXEC or KEXEC_FILE. However, w/o KEXEC or KEXEC_FILE, the KEXEC_CORE code built in doesn't make any sense because no kernel loading or switching will happen to utilize the KEXEC_CORE code. --------------------- CONFIG_CRASH_CORE=y CONFIG_KEXEC_CORE=y CONFIG_CRASH_DUMP=y --------------------- In this case, what is worse, on arches sh and arm, KEXEC relies on MMU while CRASH_DUMP can still be enabled when !MMU, and then a compile error is seen, as the lkp test robot reported in the above link. ------arch/sh/Kconfig------ config ARCH_SUPPORTS_KEXEC def_bool MMU config ARCH_SUPPORTS_CRASH_DUMP def_bool BROKEN_ON_SMP ---------------------------

Changes: =========== 1, split out crash_reserve.c from crash_core.c; 2, split out vmcore_info.c
from crash_core.c;
3, move crash related code in kexec_core.c into crash_core.c;
4, remove dependency of FA_DUMP on CRASH_DUMP;
5, clean up kdump related config items;
6, wrap up crash code in crash related ifdefs on all 8 arches which support crash dumping, except for ppc;

Achievement:
===========
With the above changes, the config item logic can be rearranged as below (the right item depends on or is selected by the left item):

PROC_KCORE -----------> VMCORE_INFO

           |----------> VMCORE_INFO
FA_DUMP----|
           |----------> CRASH_RESERVE

                                            ---->VMCORE_INFO
                                           /
                                           |---->CRASH_RESERVE
KEXEC      --|                            /|
             |--> KEXEC_CORE--> CRASH_DUMP-->/-|---->PROC_VMCORE
KEXEC_FILE --|                            \ |
                                           \---->CRASH_HOTPLUG

KEXEC      --|
             |--> KEXEC_CORE (for kexec reboot only)
KEXEC_FILE --|

Test
========
On all 8 architectures, including x86_64, arm64, s390x, sh, arm, mips, riscv, loongarch, I did the below three cases of config item setting, and building all passed. Take the configs on x86_64 as an example here:

(1) Both CONFIG_KEXEC and CONFIG_KEXEC_FILE are unset; then all kexec/kdump items are unset automatically:

# Kexec and crash features
# CONFIG_KEXEC is not set
# CONFIG_KEXEC_FILE is not set
# end of Kexec and crash features

(2) set CONFIG_KEXEC_FILE and 'make olddefconfig':
---------------
# Kexec and crash features
CONFIG_CRASH_RESERVE=y
CONFIG_VMCORE_INFO=y
CONFIG_KEXEC_CORE=y
CONFIG_KEXEC_FILE=y
CONFIG_CRASH_DUMP=y
CONFIG_CRASH_HOTPLUG=y
CONFIG_CRASH_MAX_MEMORY_RANGES=8192
# end of Kexec and crash features
---------------

(3) unset CONFIG_CRASH_DUMP in case 2 and execute 'make olddefconfig':
------------------------
# Kexec and crash features
CONFIG_KEXEC_CORE=y
CONFIG_KEXEC_FILE=y
# end of Kexec and crash features
------------------------

Note: For ppc, it needs investigation to make clear how to split out the crash code in the arch folder. Hope Hari and Pingfan can help have a look and see if it's doable. Now, I make it either have both kexec and crash enabled, or disable both of them altogether.

This patch (of 14):

Both kdump and fa_dump of ppc rely on crashkernel reservation. Move the relevant code into separate files: crash_reserve.c, include/linux/crash_reserve.h. And also add config item CRASH_RESERVE to control enabling of that code. And update config items which relate to crashkernel reservation. And also change ifdeffery from CONFIG_CRASH_CORE to CONFIG_CRASH_RESERVE when those scopes are only crashkernel reservation related. And also rename arch/XXX/include/asm/{crash_core.h => crash_reserve.h} on arm64, x86 and risc-v because those architectures' crash_core.h is only related to crashkernel reservation. Link: https://lkml.kernel.org/r/20240124051254.67105-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20240124051254.67105-2-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W.
Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 2 +- .../asm/{crash_core.h => crash_reserve.h} | 4 +- arch/powerpc/Kconfig | 1 + arch/powerpc/mm/nohash/kaslr_booke.c | 4 +- arch/riscv/Kconfig | 2 +- .../asm/{crash_core.h => crash_reserve.h} | 4 +- arch/x86/Kconfig | 2 +- .../asm/{crash_core.h => crash_reserve.h} | 6 +- include/linux/crash_core.h | 40 -- include/linux/crash_reserve.h | 48 ++ include/linux/kexec.h | 1 + kernel/Kconfig.kexec | 5 +- kernel/Makefile | 1 + kernel/crash_core.c | 438 ----------------- kernel/crash_reserve.c | 464 ++++++++++++++++++ 15 files changed, 531 insertions(+), 491 deletions(-) rename arch/arm64/include/asm/{crash_core.h => crash_reserve.h} (81%) rename arch/riscv/include/asm/{crash_core.h => crash_reserve.h} (78%) rename arch/x86/include/asm/{crash_core.h => crash_reserve.h} (92%) create mode 100644 include/linux/crash_reserve.h create mode 100644 kernel/crash_reserve.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index aa7c1d43513968..e8275a40afbd3b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1519,7 +1519,7 @@ config ARCH_SUPPORTS_CRASH_DUMP def_bool y config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION - def_bool CRASH_CORE + def_bool CRASH_RESERVE config TRANS_TABLE def_bool y diff --git a/arch/arm64/include/asm/crash_core.h b/arch/arm64/include/asm/crash_reserve.h similarity index 81% rename from arch/arm64/include/asm/crash_core.h rename to arch/arm64/include/asm/crash_reserve.h index 9f5c8d339f44f5..4afe027a4e7b2c 100644 --- a/arch/arm64/include/asm/crash_core.h +++ b/arch/arm64/include/asm/crash_reserve.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _ARM64_CRASH_CORE_H -#define _ARM64_CRASH_CORE_H +#ifndef _ARM64_CRASH_RESERVE_H +#define _ARM64_CRASH_RESERVE_H /* Current arm64 boot protocol requires 2MB alignment */ #define CRASH_ALIGN SZ_2M diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b9fc064d38d281..7f704ae5c5efcb 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -691,6 +691,7 @@ config FA_DUMP bool "Firmware-assisted dump" depends on PPC64 && (PPC_RTAS || PPC_POWERNV) select CRASH_CORE + select CRASH_RESERVE select CRASH_DUMP help A robust mechanism to get reliable kernel crash dump with diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c index b4f2786a7d2b0b..cdff129abb1446 100644 --- a/arch/powerpc/mm/nohash/kaslr_booke.c +++ b/arch/powerpc/mm/nohash/kaslr_booke.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include @@ -173,7 +173,7 @@ static __init bool overlaps_region(const void *fdt, u32 start, static void __init get_crash_kernel(void *fdt, unsigned long size) { -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_CRASH_RESERVE unsigned long long crash_size, crash_base; int ret; diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index bffbd869a06828..bd06ad1bb97cbb 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -767,7 +767,7 @@ config ARCH_SUPPORTS_CRASH_DUMP def_bool y config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION - def_bool CRASH_CORE + def_bool CRASH_RESERVE config COMPAT bool "Kernel support for 32-bit U-mode" diff --git a/arch/riscv/include/asm/crash_core.h b/arch/riscv/include/asm/crash_reserve.h similarity index 78% rename from arch/riscv/include/asm/crash_core.h rename to arch/riscv/include/asm/crash_reserve.h index 
e1874b23feaf11..013962e63587f3 100644 --- a/arch/riscv/include/asm/crash_core.h +++ b/arch/riscv/include/asm/crash_reserve.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _RISCV_CRASH_CORE_H -#define _RISCV_CRASH_CORE_H +#ifndef _RISCV_CRASH_RESERVE_H +#define _RISCV_CRASH_RESERVE_H #define CRASH_ALIGN PMD_SIZE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5edec175b9bfc9..71417c5b228c51 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2106,7 +2106,7 @@ config ARCH_SUPPORTS_CRASH_HOTPLUG def_bool y config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION - def_bool CRASH_CORE + def_bool CRASH_RESEERVE config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP) diff --git a/arch/x86/include/asm/crash_core.h b/arch/x86/include/asm/crash_reserve.h similarity index 92% rename from arch/x86/include/asm/crash_core.h rename to arch/x86/include/asm/crash_reserve.h index 76af98f4e80126..152239f9554195 100644 --- a/arch/x86/include/asm/crash_core.h +++ b/arch/x86/include/asm/crash_reserve.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _X86_CRASH_CORE_H -#define _X86_CRASH_CORE_H +#ifndef _X86_CRASH_RESERVE_H +#define _X86_CRASH_RESERVE_H /* 16M alignment for crash kernel regions */ #define CRASH_ALIGN SZ_16M @@ -39,4 +39,4 @@ static inline unsigned long crash_low_size_default(void) #endif } -#endif /* _X86_CRASH_CORE_H */ +#endif /* _X86_CRASH_RESERVE_H */ diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 9eaeaafe0cad3a..1fde49246fa6e3 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -5,14 +5,6 @@ #include #include #include -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION -#include -#endif - -/* Location of a reserved region to hold the crash kernel. 
- */ -extern struct resource crashk_res; -extern struct resource crashk_low_res; #define CRASH_CORE_NOTE_NAME "CORE" #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) @@ -87,38 +79,6 @@ Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len); void final_note(Elf_Word *buf); -int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base, - unsigned long long *low_size, bool *high); - -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION -#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE -#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) -#endif -#ifndef CRASH_ALIGN -#define CRASH_ALIGN SZ_2M -#endif -#ifndef CRASH_ADDR_LOW_MAX -#define CRASH_ADDR_LOW_MAX SZ_4G -#endif -#ifndef CRASH_ADDR_HIGH_MAX -#define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM() -#endif - -void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high); -#else -static inline void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high) -{} -#endif - /* Alignment required for elf header segment */ #define ELF_CORE_HEADER_ALIGN 4096 diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h new file mode 100644 index 00000000000000..5a9df944fb806a --- /dev/null +++ b/include/linux/crash_reserve.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_CRASH_RESERVE_H +#define LINUX_CRASH_RESERVE_H + +#include +#include +#include +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +#include +#endif + +/* Location of a reserved region to hold the crash kernel. 
+ */ +extern struct resource crashk_res; +extern struct resource crashk_low_res; + +int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base, + unsigned long long *low_size, bool *high); + +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE +#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) +#endif +#ifndef CRASH_ALIGN +#define CRASH_ALIGN SZ_2M +#endif +#ifndef CRASH_ADDR_LOW_MAX +#define CRASH_ADDR_LOW_MAX SZ_4G +#endif +#ifndef CRASH_ADDR_HIGH_MAX +#define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM() +#endif + +void __init reserve_crashkernel_generic(char *cmdline, + unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high); +#else +static inline void __init reserve_crashkernel_generic(char *cmdline, + unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high) +{} +#endif +#endif /* LINUX_CRASH_RESERVE_H */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 400cb6c02176e0..6d79bfb52e5bf0 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -16,6 +16,7 @@ #if !defined(__ASSEMBLY__) #include +#include #include #include diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 946dffa048b74c..8b7be71edd859e 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -2,11 +2,15 @@ menu "Kexec and crash features" +config CRASH_RESERVE + bool + config CRASH_CORE bool config KEXEC_CORE select CRASH_CORE + select CRASH_RESERVE bool config KEXEC_ELF @@ -96,7 +100,6 @@ config KEXEC_JUMP config CRASH_DUMP bool "kernel crash dumps" depends on ARCH_SUPPORTS_CRASH_DUMP - select CRASH_CORE select KEXEC_CORE help Generate crash dump after being started by kexec. diff --git a/kernel/Makefile b/kernel/Makefile index ce105a5558fcfa..05fa88b3ab7499 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_CRASH_CORE) += crash_core.o +obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 49b31e59d3ccd1..ae0d1ce89b46b8 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -34,444 +34,6 @@ u32 *vmcoreinfo_note; /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ static unsigned char *vmcoreinfo_data_safecopy; -/* Location of the reserved area for the crash kernel */ -struct resource crashk_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, - .desc = IORES_DESC_CRASH_KERNEL -}; -struct resource crashk_low_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, - .desc = IORES_DESC_CRASH_KERNEL -}; - -/* - * parsing the "crashkernel" commandline - * - * this code is intended to be called from architecture specific code - */ - - -/* - * This function parses command lines in the format - * - * crashkernel=ramsize-range:size[,...][@offset] - * - * The function returns 0 on success and -EINVAL on failure. 
- */ -static int __init parse_crashkernel_mem(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline, *tmp; - unsigned long long total_mem = system_ram; - - /* - * Firmware sometimes reserves some memory regions for its own use, - * so the system memory size is less than the actual physical memory - * size. Work around this by rounding up the total size to 128M, - * which is enough for most test cases. - */ - total_mem = roundup(total_mem, SZ_128M); - - /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warn("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warn("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= total_mem) { - pr_warn("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (total_mem >= start && total_mem < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); - - if (*crash_size > 0) { - while (*cur && *cur != ' ' && *cur != '@') - cur++; - if (*cur == '@') { - cur++; - *crash_base = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected after '@'\n"); - return -EINVAL; - } - } - } else - pr_info("crashkernel size resulted in zero bytes\n"); - - return 0; -} - -/* - * That function parses "simple" (old) crashkernel command lines like - * - * crashkernel=size[@offset] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_simple(char *cmdline, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - if (*cur == '@') - *crash_base = memparse(cur+1, &cur); - else if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - - return 0; -} - -#define SUFFIX_HIGH 0 -#define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 -static __initdata char *suffix_tbl[] = { - [SUFFIX_HIGH] = ",high", - [SUFFIX_LOW] = ",low", - [SUFFIX_NULL] = NULL, -}; - -/* - * That function parses "suffix" crashkernel command lines like - * - * crashkernel=size,[high|low] - * - * It returns 0 on success and -EINVAL on failure. 
- */ -static int __init parse_crashkernel_suffix(char *cmdline, - unsigned long long *crash_size, - const char *suffix) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - /* check with suffix */ - if (strncmp(cur, suffix, strlen(suffix))) { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - cur += strlen(suffix); - if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - - return 0; -} - -static __init char *get_last_crashkernel(char *cmdline, - const char *name, - const char *suffix) -{ - char *p = cmdline, *ck_cmdline = NULL; - - /* find crashkernel and use the last one if there are more */ - p = strstr(p, name); - while (p) { - char *end_p = strchr(p, ' '); - char *q; - - if (!end_p) - end_p = p + strlen(p); - - if (!suffix) { - int i; - - /* skip the one with any known suffix */ - for (i = 0; suffix_tbl[i]; i++) { - q = end_p - strlen(suffix_tbl[i]); - if (!strncmp(q, suffix_tbl[i], - strlen(suffix_tbl[i]))) - goto next; - } - ck_cmdline = p; - } else { - q = end_p - strlen(suffix); - if (!strncmp(q, suffix, strlen(suffix))) - ck_cmdline = p; - } -next: - p = strstr(p+1, name); - } - - return ck_cmdline; -} - -static int __init __parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base, - const char *suffix) -{ - char *first_colon, *first_space; - char *ck_cmdline; - char *name = "crashkernel="; - - BUG_ON(!crash_size || !crash_base); - *crash_size = 0; - *crash_base = 0; - - ck_cmdline = get_last_crashkernel(cmdline, name, suffix); - if (!ck_cmdline) - return -ENOENT; - - ck_cmdline += strlen(name); - - if (suffix) - return parse_crashkernel_suffix(ck_cmdline, crash_size, - suffix); - /* - * if the commandline contains a ':', then that's the extended - * syntax -- if not, it must be the classic syntax - */ - first_colon = strchr(ck_cmdline, ':'); - first_space = strchr(ck_cmdline, ' '); - if (first_colon && (!first_space || first_colon < first_space)) - return parse_crashkernel_mem(ck_cmdline, system_ram, - crash_size, crash_base); - - return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); -} - -/* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. - * - * If crashkernel=,high|low is supported on architecture, non-NULL values - * should be passed to parameters 'low_size' and 'high'. - */ -int __init parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base, - unsigned long long *low_size, - bool *high) -{ - int ret; - - /* crashkernel=X[@offset] */ - ret = __parse_crashkernel(cmdline, system_ram, crash_size, - crash_base, NULL); -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION - /* - * If non-NULL 'high' passed in and no normal crashkernel - * setting detected, try parsing crashkernel=,high|low. - */ - if (high && ret == -ENOENT) { - ret = __parse_crashkernel(cmdline, 0, crash_size, - crash_base, suffix_tbl[SUFFIX_HIGH]); - if (ret || !*crash_size) - return -EINVAL; - - /* - * crashkernel=Y,low can be specified or not, but invalid value - * is not allowed. 
- */ - ret = __parse_crashkernel(cmdline, 0, low_size, - crash_base, suffix_tbl[SUFFIX_LOW]); - if (ret == -ENOENT) { - *low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; - ret = 0; - } else if (ret) { - return ret; - } - - *high = true; - } -#endif - if (!*crash_size) - ret = -EINVAL; - - return ret; -} - -/* - * Add a dummy early_param handler to mark crashkernel= as a known command line - * parameter and suppress incorrect warnings in init/main.c. - */ -static int __init parse_crashkernel_dummy(char *arg) -{ - return 0; -} -early_param("crashkernel", parse_crashkernel_dummy); - -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION -static int __init reserve_crashkernel_low(unsigned long long low_size) -{ -#ifdef CONFIG_64BIT - unsigned long long low_base; - - low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX); - if (!low_base) { - pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size); - return -ENOMEM; - } - - pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n", - low_base, low_base + low_size, low_size >> 20); - - crashk_low_res.start = low_base; - crashk_low_res.end = low_base + low_size - 1; -#endif - return 0; -} - -void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high) -{ - unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0; - bool fixed_base = false; - - /* User specifies base address explicitly. */ - if (crash_base) { - fixed_base = true; - search_base = crash_base; - search_end = crash_base + crash_size; - } else if (high) { - search_base = CRASH_ADDR_LOW_MAX; - search_end = CRASH_ADDR_HIGH_MAX; - } - -retry: - crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, - search_base, search_end); - if (!crash_base) { - /* - * For crashkernel=size[KMG]@offset[KMG], print out failure - * message if can't reserve the specified region. - */ - if (fixed_base) { - pr_warn("crashkernel reservation failed - memory is in use.\n"); - return; - } - - /* - * For crashkernel=size[KMG], if the first attempt was for - * low memory, fall back to high memory, the minimum required - * low memory will be reserved later. - */ - if (!high && search_end == CRASH_ADDR_LOW_MAX) { - search_end = CRASH_ADDR_HIGH_MAX; - search_base = CRASH_ADDR_LOW_MAX; - crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; - goto retry; - } - - /* - * For crashkernel=size[KMG],high, if the first attempt was - * for high memory, fall back to low memory. - */ - if (high && search_end == CRASH_ADDR_HIGH_MAX) { - search_end = CRASH_ADDR_LOW_MAX; - search_base = 0; - goto retry; - } - pr_warn("cannot allocate crashkernel (size:0x%llx)\n", - crash_size); - return; - } - - if ((crash_base >= CRASH_ADDR_LOW_MAX) && - crash_low_size && reserve_crashkernel_low(crash_low_size)) { - memblock_phys_free(crash_base, crash_size); - return; - } - - pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n", - crash_base, crash_base + crash_size, crash_size >> 20); - - /* - * The crashkernel memory will be removed from the kernel linear - * map. Inform kmemleak so that it won't try to access it. 
- */ - kmemleak_ignore_phys(crash_base); - if (crashk_low_res.end) - kmemleak_ignore_phys(crashk_low_res.start); - - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; -} - -static __init int insert_crashkernel_resources(void) -{ - if (crashk_res.start < crashk_res.end) - insert_resource(&iomem_resource, &crashk_res); - - if (crashk_low_res.start < crashk_low_res.end) - insert_resource(&iomem_resource, &crashk_low_res); - - return 0; -} -early_initcall(insert_crashkernel_resources); -#endif - int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, void **addr, unsigned long *sz) { diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c new file mode 100644 index 00000000000000..bbb6c3cb00e460 --- /dev/null +++ b/kernel/crash_reserve.c @@ -0,0 +1,464 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * crash.c - kernel crash support code. + * Copyright (C) 2002-2004 Eric Biederman + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "kallsyms_internal.h" +#include "kexec_internal.h" + +/* Location of the reserved area for the crash kernel */ +struct resource crashk_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL +}; +struct resource crashk_low_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL +}; + +/* + * parsing the "crashkernel" commandline + * + * this code is intended to be called from architecture specific code + */ + + +/* + * This function parses command lines in the format + * + * crashkernel=ramsize-range:size[,...][@offset] + * + * The function returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_mem(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline, *tmp; + unsigned long long total_mem = system_ram; + + /* + * Firmware sometimes reserves some memory regions for its own use, + * so the system memory size is less than the actual physical memory + * size. Work around this by rounding up the total size to 128M, + * which is enough for most test cases. + */ + total_mem = roundup(total_mem, SZ_128M); + + /* for each entry of the comma-separated list */ + do { + unsigned long long start, end = ULLONG_MAX, size; + + /* get the start of the range */ + start = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (*cur != '-') { + pr_warn("crashkernel: '-' expected\n"); + return -EINVAL; + } + cur++; + + /* if no ':' is here, than we read the end */ + if (*cur != ':') { + end = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (end <= start) { + pr_warn("crashkernel: end <= start\n"); + return -EINVAL; + } + } + + if (*cur != ':') { + pr_warn("crashkernel: ':' expected\n"); + return -EINVAL; + } + cur++; + + size = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (size >= total_mem) { + pr_warn("crashkernel: invalid size\n"); + return -EINVAL; + } + + /* match ? 
*/ + if (total_mem >= start && total_mem < end) { + *crash_size = size; + break; + } + } while (*cur++ == ','); + + if (*crash_size > 0) { + while (*cur && *cur != ' ' && *cur != '@') + cur++; + if (*cur == '@') { + cur++; + *crash_base = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected after '@'\n"); + return -EINVAL; + } + } + } else + pr_info("crashkernel size resulted in zero bytes\n"); + + return 0; +} + +/* + * That function parses "simple" (old) crashkernel command lines like + * + * crashkernel=size[@offset] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_simple(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + if (*cur == '@') + *crash_base = memparse(cur+1, &cur); + else if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +#define SUFFIX_HIGH 0 +#define SUFFIX_LOW 1 +#define SUFFIX_NULL 2 +static __initdata char *suffix_tbl[] = { + [SUFFIX_HIGH] = ",high", + [SUFFIX_LOW] = ",low", + [SUFFIX_NULL] = NULL, +}; + +/* + * That function parses "suffix" crashkernel command lines like + * + * crashkernel=size,[high|low] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_suffix(char *cmdline, + unsigned long long *crash_size, + const char *suffix) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + /* check with suffix */ + if (strncmp(cur, suffix, strlen(suffix))) { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + cur += strlen(suffix); + if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +static __init char *get_last_crashkernel(char *cmdline, + const char *name, + const char *suffix) +{ + char *p = cmdline, *ck_cmdline = NULL; + + /* find crashkernel and use the last one if there are more */ + p = strstr(p, name); + while (p) { + char *end_p = strchr(p, ' '); + char *q; + + if (!end_p) + end_p = p + strlen(p); + + if (!suffix) { + int i; + + /* skip the one with any known suffix */ + for (i = 0; suffix_tbl[i]; i++) { + q = end_p - strlen(suffix_tbl[i]); + if (!strncmp(q, suffix_tbl[i], + strlen(suffix_tbl[i]))) + goto next; + } + ck_cmdline = p; + } else { + q = end_p - strlen(suffix); + if (!strncmp(q, suffix, strlen(suffix))) + ck_cmdline = p; + } +next: + p = strstr(p+1, name); + } + + return ck_cmdline; +} + +static int __init __parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base, + const char *suffix) +{ + char *first_colon, *first_space; + char *ck_cmdline; + char *name = "crashkernel="; + + BUG_ON(!crash_size || !crash_base); + *crash_size = 0; + *crash_base = 0; + + ck_cmdline = get_last_crashkernel(cmdline, name, suffix); + if (!ck_cmdline) + return -ENOENT; + + ck_cmdline += strlen(name); + + if (suffix) + return parse_crashkernel_suffix(ck_cmdline, crash_size, + suffix); + /* + * if the commandline contains a ':', then that's the extended + * syntax -- if not, it must be the classic syntax + */ + first_colon = strchr(ck_cmdline, ':'); + first_space = strchr(ck_cmdline, 
' '); + if (first_colon && (!first_space || first_colon < first_space)) + return parse_crashkernel_mem(ck_cmdline, system_ram, + crash_size, crash_base); + + return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); +} + +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. + * + * If crashkernel=,high|low is supported on architecture, non-NULL values + * should be passed to parameters 'low_size' and 'high'. + */ +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base, + unsigned long long *low_size, + bool *high) +{ + int ret; + + /* crashkernel=X[@offset] */ + ret = __parse_crashkernel(cmdline, system_ram, crash_size, + crash_base, NULL); +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION + /* + * If non-NULL 'high' passed in and no normal crashkernel + * setting detected, try parsing crashkernel=,high|low. + */ + if (high && ret == -ENOENT) { + ret = __parse_crashkernel(cmdline, 0, crash_size, + crash_base, suffix_tbl[SUFFIX_HIGH]); + if (ret || !*crash_size) + return -EINVAL; + + /* + * crashkernel=Y,low can be specified or not, but invalid value + * is not allowed. + */ + ret = __parse_crashkernel(cmdline, 0, low_size, + crash_base, suffix_tbl[SUFFIX_LOW]); + if (ret == -ENOENT) { + *low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; + ret = 0; + } else if (ret) { + return ret; + } + + *high = true; + } +#endif + if (!*crash_size) + ret = -EINVAL; + + return ret; +} + +/* + * Add a dummy early_param handler to mark crashkernel= as a known command line + * parameter and suppress incorrect warnings in init/main.c. + */ +static int __init parse_crashkernel_dummy(char *arg) +{ + return 0; +} +early_param("crashkernel", parse_crashkernel_dummy); + +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +static int __init reserve_crashkernel_low(unsigned long long low_size) +{ +#ifdef CONFIG_64BIT + unsigned long long low_base; + + low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX); + if (!low_base) { + pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size); + return -ENOMEM; + } + + pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n", + low_base, low_base + low_size, low_size >> 20); + + crashk_low_res.start = low_base; + crashk_low_res.end = low_base + low_size - 1; + insert_resource(&iomem_resource, &crashk_low_res); +#endif + return 0; +} + +void __init reserve_crashkernel_generic(char *cmdline, + unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high) +{ + unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0; + bool fixed_base = false; + + /* User specifies base address explicitly. */ + if (crash_base) { + fixed_base = true; + search_base = crash_base; + search_end = crash_base + crash_size; + } else if (high) { + search_base = CRASH_ADDR_LOW_MAX; + search_end = CRASH_ADDR_HIGH_MAX; + } + +retry: + crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, + search_base, search_end); + if (!crash_base) { + /* + * For crashkernel=size[KMG]@offset[KMG], print out failure + * message if can't reserve the specified region. 
+ */ + if (fixed_base) { + pr_warn("crashkernel reservation failed - memory is in use.\n"); + return; + } + + /* + * For crashkernel=size[KMG], if the first attempt was for + * low memory, fall back to high memory, the minimum required + * low memory will be reserved later. + */ + if (!high && search_end == CRASH_ADDR_LOW_MAX) { + search_end = CRASH_ADDR_HIGH_MAX; + search_base = CRASH_ADDR_LOW_MAX; + crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; + goto retry; + } + + /* + * For crashkernel=size[KMG],high, if the first attempt was + * for high memory, fall back to low memory. + */ + if (high && search_end == CRASH_ADDR_HIGH_MAX) { + search_end = CRASH_ADDR_LOW_MAX; + search_base = 0; + goto retry; + } + pr_warn("cannot allocate crashkernel (size:0x%llx)\n", + crash_size); + return; + } + + if ((crash_base >= CRASH_ADDR_LOW_MAX) && + crash_low_size && reserve_crashkernel_low(crash_low_size)) { + memblock_phys_free(crash_base, crash_size); + return; + } + + pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n", + crash_base, crash_base + crash_size, crash_size >> 20); + + /* + * The crashkernel memory will be removed from the kernel linear + * map. Inform kmemleak so that it won't try to access it. + */ + kmemleak_ignore_phys(crash_base); + if (crashk_low_res.end) + kmemleak_ignore_phys(crashk_low_res.start); + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; +} + +static __init int insert_crashkernel_resources(void) +{ + if (crashk_res.start < crashk_res.end) + insert_resource(&iomem_resource, &crashk_res); + + if (crashk_low_res.start < crashk_low_res.end) + insert_resource(&iomem_resource, &crashk_low_res); + + return 0; +} +early_initcall(insert_crashkernel_resources); +#endif From fa394df4a49cfb88e708f86797c34ee6d9c6c5c0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 28 Jan 2024 22:00:15 -0800 Subject: [PATCH 1013/1406] kexec-split-crashkernel-reservation-code-out-from-crash_corec-fix s/CRASH_RESEERVE/CRASH_RESERVE/, per Klara Modin Cc: Al Viro Cc: Baoquan He Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 71417c5b228c51..5bd9258151546e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2106,7 +2106,7 @@ config ARCH_SUPPORTS_CRASH_HOTPLUG def_bool y config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION - def_bool CRASH_RESEERVE + def_bool CRASH_RESERVE config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP) From 7d140ad8bd1faecba158a0d16ca4976d20635bdc Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:42 +0800 Subject: [PATCH 1014/1406] crash: split vmcoreinfo exporting code out from crash_core.c Now move the relevant code into separate files: kernel/vmcore_info.c, include/linux/vmcore_info.h. And add config item VMCORE_INFO to control its enabling. And also update the old ifdeffery of CONFIG_CRASH_CORE, the including of <linux/crash_core.h>, and the config item dependency on CRASH_CORE accordingly. And also do renaming as follows: - arch/xxx/kernel/{crash_core.c => vmcore_info.c} because they are only related to vmcoreinfo exporting on x86, arm64, riscv. And also remove config item CRASH_CORE, and rely on CONFIG_KEXEC_CORE to decide whether crash_core.c is built in.
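[Illustration, editor's sketch, not kernel code] The append_elf_note() and final_note() helpers moved by this patch carry the signatures shown in the diffs below. A hedged userspace model of the note layout they produce (the Elf_Word typedef, buffer size, payload string, and main() driver are assumptions of this sketch): each note is a 12-byte header, then the name and payload, each padded to a 4-byte boundary, and the list ends with an all-zero header:

---------------
#include <stdio.h>
#include <string.h>

typedef unsigned int Elf_Word;  /* assumed 32-bit, as in Elf32_Nhdr */

struct elf_note {
        Elf_Word n_namesz, n_descsz, n_type;
};

/* Models the kernel helper of the same name and signature. */
static Elf_Word *append_elf_note(Elf_Word *buf, const char *name,
                                 unsigned int type, const void *data,
                                 size_t data_len)
{
        struct elf_note *note = (struct elf_note *)buf;

        note->n_namesz = strlen(name) + 1;
        note->n_descsz = (Elf_Word)data_len;
        note->n_type = type;
        buf += (sizeof(*note) + sizeof(Elf_Word) - 1) / sizeof(Elf_Word);
        memcpy(buf, name, note->n_namesz);              /* 4-byte padded name */
        buf += (note->n_namesz + sizeof(Elf_Word) - 1) / sizeof(Elf_Word);
        memcpy(buf, data, data_len);                    /* 4-byte padded payload */
        buf += (data_len + sizeof(Elf_Word) - 1) / sizeof(Elf_Word);
        return buf;
}

int main(void)
{
        Elf_Word buf[64] = { 0 };               /* illustrative size only */
        const char desc[] = "PAGESIZE=4096\n";  /* one vmcoreinfo-style line */
        Elf_Word *end = append_elf_note(buf, "VMCOREINFO", 0, desc, sizeof(desc));

        memset(end, 0, sizeof(struct elf_note));        /* what final_note() does */
        printf("note data ends after %zu words\n", (size_t)(end - buf));
        return 0;
}
---------------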
Link: https://lkml.kernel.org/r/20240124051254.67105-3-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/arm64/kernel/Makefile | 2 +- .../kernel/{crash_core.c => vmcore_info.c} | 2 +- arch/powerpc/Kconfig | 2 +- arch/powerpc/kernel/setup-common.c | 2 +- arch/powerpc/platforms/powernv/opal-core.c | 2 +- arch/riscv/kernel/Makefile | 2 +- .../kernel/{crash_core.c => vmcore_info.c} | 2 +- arch/x86/kernel/Makefile | 2 +- .../{crash_core_32.c => vmcore_info_32.c} | 2 +- .../{crash_core_64.c => vmcore_info_64.c} | 2 +- drivers/firmware/qemu_fw_cfg.c | 14 +- fs/proc/Kconfig | 2 +- fs/proc/kcore.c | 2 +- include/linux/buildid.h | 2 +- include/linux/crash_core.h | 73 ------ include/linux/kexec.h | 1 + include/linux/vmcore_info.h | 81 ++++++ kernel/Kconfig.kexec | 4 +- kernel/Makefile | 4 +- kernel/crash_core.c | 206 ---------------- kernel/ksysfs.c | 6 +- kernel/printk/printk.c | 4 +- kernel/vmcore_info.c | 231 ++++++++++++++++++ lib/buildid.c | 2 +- 24 files changed, 343 insertions(+), 309 deletions(-) rename arch/arm64/kernel/{crash_core.c => vmcore_info.c} (97%) rename arch/riscv/kernel/{crash_core.c => vmcore_info.c} (96%) rename arch/x86/kernel/{crash_core_32.c => vmcore_info_32.c} (90%) rename arch/x86/kernel/{crash_core_64.c => vmcore_info_64.c} (94%) create mode 100644 include/linux/vmcore_info.h create mode 100644 kernel/vmcore_info.c diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 467cb711727309..a3882cccf049d0 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -66,7 +66,7 @@ obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o -obj-$(CONFIG_CRASH_CORE) += crash_core.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o obj-$(CONFIG_ARM64_MTE) += mte.o diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/vmcore_info.c similarity index 97% rename from arch/arm64/kernel/crash_core.c rename to arch/arm64/kernel/vmcore_info.c index 2a24199a9b81e0..b19d5d6cb8b387 100644 --- a/arch/arm64/kernel/crash_core.c +++ b/arch/arm64/kernel/vmcore_info.c @@ -4,7 +4,7 @@ * Copyright (C) Huawei Futurewei Technologies. */ -#include +#include #include #include #include diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7f704ae5c5efcb..495d197c9b2751 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -690,7 +690,7 @@ config ARCH_SELECTS_CRASH_DUMP config FA_DUMP bool "Firmware-assisted dump" depends on PPC64 && (PPC_RTAS || PPC_POWERNV) - select CRASH_CORE + select VMCORE_INFO select CRASH_RESERVE select CRASH_DUMP help diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 9b142b9d5187b2..733f210ffda1fe 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -109,7 +109,7 @@ int ppc_do_canonicalize_irqs; EXPORT_SYMBOL(ppc_do_canonicalize_irqs); #endif -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO /* This keeps a track of which one is the crashing cpu. 
*/ int crashing_cpu = -1; #endif diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c index bb7657115f1d27..c9a9b759cc928b 100644 --- a/arch/powerpc/platforms/powernv/opal-core.c +++ b/arch/powerpc/platforms/powernv/opal-core.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index f71910718053d8..d6fd8dcfceb5e3 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -92,7 +92,7 @@ obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_KEXEC_CORE) += kexec_relocate.o crash_save_regs.o machine_kexec.o obj-$(CONFIG_KEXEC_FILE) += elf_kexec.o machine_kexec_file.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o -obj-$(CONFIG_CRASH_CORE) += crash_core.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o diff --git a/arch/riscv/kernel/crash_core.c b/arch/riscv/kernel/vmcore_info.c similarity index 96% rename from arch/riscv/kernel/crash_core.c rename to arch/riscv/kernel/vmcore_info.c index d18d529fd9b984..6d7a22522d6309 100644 --- a/arch/riscv/kernel/crash_core.c +++ b/arch/riscv/kernel/vmcore_info.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include #include void arch_crash_save_vmcoreinfo(void) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0000325ab98f4d..913d4022131eba 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -98,7 +98,7 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_RETHOOK) += rethook.o -obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o diff --git a/arch/x86/kernel/crash_core_32.c b/arch/x86/kernel/vmcore_info_32.c similarity index 90% rename from arch/x86/kernel/crash_core_32.c rename to arch/x86/kernel/vmcore_info_32.c index 8a89c109e20a6c..5995a749288a95 100644 --- a/arch/x86/kernel/crash_core_32.c +++ b/arch/x86/kernel/vmcore_info_32.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include #include #include diff --git a/arch/x86/kernel/crash_core_64.c b/arch/x86/kernel/vmcore_info_64.c similarity index 94% rename from arch/x86/kernel/crash_core_64.c rename to arch/x86/kernel/vmcore_info_64.c index 7d255f882afe6f..0dec7d86875447 100644 --- a/arch/x86/kernel/crash_core_64.c +++ b/arch/x86/kernel/vmcore_info_64.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include #include #include diff --git a/drivers/firmware/qemu_fw_cfg.c b/drivers/firmware/qemu_fw_cfg.c index 03da9a4354f886..5f43dfa22f799c 100644 --- a/drivers/firmware/qemu_fw_cfg.c +++ b/drivers/firmware/qemu_fw_cfg.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include MODULE_AUTHOR("Gabriel L. 
Somlo "); MODULE_DESCRIPTION("QEMU fw_cfg sysfs support"); @@ -67,7 +67,7 @@ static void fw_cfg_sel_endianness(u16 key) iowrite16(key, fw_cfg_reg_ctrl); } -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO static inline bool fw_cfg_dma_enabled(void) { return (fw_cfg_rev & FW_CFG_VERSION_DMA) && fw_cfg_reg_dma; @@ -156,7 +156,7 @@ static ssize_t fw_cfg_read_blob(u16 key, return count; } -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO /* write chunk of given fw_cfg blob (caller responsible for sanity-check) */ static ssize_t fw_cfg_write_blob(u16 key, void *buf, loff_t pos, size_t count) @@ -195,7 +195,7 @@ static ssize_t fw_cfg_write_blob(u16 key, return ret; } -#endif /* CONFIG_CRASH_CORE */ +#endif /* CONFIG_VMCORE_INFO */ /* clean up fw_cfg device i/o */ static void fw_cfg_io_cleanup(void) @@ -319,7 +319,7 @@ struct fw_cfg_sysfs_entry { struct list_head list; }; -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO static ssize_t fw_cfg_write_vmcoreinfo(const struct fw_cfg_file *f) { static struct fw_cfg_vmcoreinfo *data; @@ -343,7 +343,7 @@ static ssize_t fw_cfg_write_vmcoreinfo(const struct fw_cfg_file *f) kfree(data); return ret; } -#endif /* CONFIG_CRASH_CORE */ +#endif /* CONFIG_VMCORE_INFO */ /* get fw_cfg_sysfs_entry from kobject member */ static inline struct fw_cfg_sysfs_entry *to_entry(struct kobject *kobj) @@ -583,7 +583,7 @@ static int fw_cfg_register_file(const struct fw_cfg_file *f) int err; struct fw_cfg_sysfs_entry *entry; -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO if (fw_cfg_dma_enabled() && strcmp(f->name, FW_CFG_VMCOREINFO_FILENAME) == 0 && !is_kdump_kernel()) { diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 32b1116ae137c6..d80a1431ef7be0 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -32,7 +32,7 @@ config PROC_FS config PROC_KCORE bool "/proc/kcore support" if !ARM depends on PROC_FS && MMU - select CRASH_CORE + select VMCORE_INFO help Provides a virtual ELF core file of the live kernel. This can be read with gdb and other ELF tools. No modifications can be diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 6422e569b08085..8e08a9a1b7ed57 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -10,7 +10,7 @@ * Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar */ -#include +#include #include #include #include diff --git a/include/linux/buildid.h b/include/linux/buildid.h index 8a582d242f0672..20aa3c2d89f760 100644 --- a/include/linux/buildid.h +++ b/include/linux/buildid.h @@ -11,7 +11,7 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size); int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size); -#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_CRASH_CORE) +#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO) extern unsigned char vmlinux_build_id[BUILD_ID_SIZE_MAX]; void init_vmlinux_build_id(void); #else diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 1fde49246fa6e3..7f19f62018ef9c 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -6,79 +6,6 @@ #include #include -#define CRASH_CORE_NOTE_NAME "CORE" -#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) -#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) -#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) - -/* - * The per-cpu notes area is a list of notes terminated by a "NULL" - * note header. 
For kdump, the code in vmcore.c runs in the context - * of the second kernel to combine them into one note. - */ -#define CRASH_CORE_NOTE_BYTES ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ - CRASH_CORE_NOTE_NAME_BYTES + \ - CRASH_CORE_NOTE_DESC_BYTES) - -#define VMCOREINFO_BYTES PAGE_SIZE -#define VMCOREINFO_NOTE_NAME "VMCOREINFO" -#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) -#define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ - VMCOREINFO_NOTE_NAME_BYTES + \ - VMCOREINFO_BYTES) - -typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; -/* Per cpu memory for storing cpu states in case of system crash. */ -extern note_buf_t __percpu *crash_notes; - -void crash_update_vmcoreinfo_safecopy(void *ptr); -void crash_save_vmcoreinfo(void); -void arch_crash_save_vmcoreinfo(void); -__printf(1, 2) -void vmcoreinfo_append_str(const char *fmt, ...); -phys_addr_t paddr_vmcoreinfo_note(void); - -#define VMCOREINFO_OSRELEASE(value) \ - vmcoreinfo_append_str("OSRELEASE=%s\n", value) -#define VMCOREINFO_BUILD_ID() \ - ({ \ - static_assert(sizeof(vmlinux_build_id) == 20); \ - vmcoreinfo_append_str("BUILD-ID=%20phN\n", vmlinux_build_id); \ - }) - -#define VMCOREINFO_PAGESIZE(value) \ - vmcoreinfo_append_str("PAGESIZE=%ld\n", value) -#define VMCOREINFO_SYMBOL(name) \ - vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) -#define VMCOREINFO_SYMBOL_ARRAY(name) \ - vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)name) -#define VMCOREINFO_SIZE(name) \ - vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ - (unsigned long)sizeof(name)) -#define VMCOREINFO_STRUCT_SIZE(name) \ - vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ - (unsigned long)sizeof(struct name)) -#define VMCOREINFO_OFFSET(name, field) \ - vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ - (unsigned long)offsetof(struct name, field)) -#define VMCOREINFO_TYPE_OFFSET(name, field) \ - vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ - (unsigned long)offsetof(name, field)) -#define VMCOREINFO_LENGTH(name, value) \ - vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) -#define VMCOREINFO_NUMBER(name) \ - vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) -#define VMCOREINFO_CONFIG(name) \ - vmcoreinfo_append_str("CONFIG_%s=y\n", #name) - -extern unsigned char *vmcoreinfo_data; -extern size_t vmcoreinfo_size; -extern u32 *vmcoreinfo_note; - -Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, - void *data, size_t data_len); -void final_note(Elf_Word *buf); - /* Alignment required for elf header segment */ #define ELF_CORE_HEADER_ALIGN 4096 diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 6d79bfb52e5bf0..9c7bb8b56ed66d 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -16,6 +16,7 @@ #if !defined(__ASSEMBLY__) #include +#include #include #include #include diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h new file mode 100644 index 00000000000000..e1dec1a6a749dc --- /dev/null +++ b/include/linux/vmcore_info.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_VMCORE_INFO_H +#define LINUX_VMCORE_INFO_H + +#include +#include +#include + +#define CRASH_CORE_NOTE_NAME "CORE" +#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) +#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) +#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) + +/* + * The per-cpu notes area is a list of notes terminated 
by a "NULL" + * note header. For kdump, the code in vmcore.c runs in the context + * of the second kernel to combine them into one note. + */ +#define CRASH_CORE_NOTE_BYTES ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ + CRASH_CORE_NOTE_NAME_BYTES + \ + CRASH_CORE_NOTE_DESC_BYTES) + +#define VMCOREINFO_BYTES PAGE_SIZE +#define VMCOREINFO_NOTE_NAME "VMCOREINFO" +#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) +#define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ + VMCOREINFO_NOTE_NAME_BYTES + \ + VMCOREINFO_BYTES) + +typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; +/* Per cpu memory for storing cpu states in case of system crash. */ +extern note_buf_t __percpu *crash_notes; + +void crash_update_vmcoreinfo_safecopy(void *ptr); +void crash_save_vmcoreinfo(void); +void arch_crash_save_vmcoreinfo(void); +__printf(1, 2) +void vmcoreinfo_append_str(const char *fmt, ...); +phys_addr_t paddr_vmcoreinfo_note(void); + +#define VMCOREINFO_OSRELEASE(value) \ + vmcoreinfo_append_str("OSRELEASE=%s\n", value) +#define VMCOREINFO_BUILD_ID() \ + ({ \ + static_assert(sizeof(vmlinux_build_id) == 20); \ + vmcoreinfo_append_str("BUILD-ID=%20phN\n", vmlinux_build_id); \ + }) + +#define VMCOREINFO_PAGESIZE(value) \ + vmcoreinfo_append_str("PAGESIZE=%ld\n", value) +#define VMCOREINFO_SYMBOL(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) +#define VMCOREINFO_SYMBOL_ARRAY(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)name) +#define VMCOREINFO_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned long)sizeof(name)) +#define VMCOREINFO_STRUCT_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned long)sizeof(struct name)) +#define VMCOREINFO_OFFSET(name, field) \ + vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ + (unsigned long)offsetof(struct name, field)) +#define VMCOREINFO_TYPE_OFFSET(name, field) \ + vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ + (unsigned long)offsetof(name, field)) +#define VMCOREINFO_LENGTH(name, value) \ + vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) +#define VMCOREINFO_NUMBER(name) \ + vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) +#define VMCOREINFO_CONFIG(name) \ + vmcoreinfo_append_str("CONFIG_%s=y\n", #name) + +extern unsigned char *vmcoreinfo_data; +extern size_t vmcoreinfo_size; +extern u32 *vmcoreinfo_note; + +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len); +void final_note(Elf_Word *buf); +#endif /* LINUX_VMCORE_INFO_H */ diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 8b7be71edd859e..8faf27043432fe 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -5,11 +5,11 @@ menu "Kexec and crash features" config CRASH_RESERVE bool -config CRASH_CORE +config VMCORE_INFO bool config KEXEC_CORE - select CRASH_CORE + select VMCORE_INFO select CRASH_RESERVE bool diff --git a/kernel/Makefile b/kernel/Makefile index 05fa88b3ab7499..649272a1d6b9f8 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -68,9 +68,9 @@ obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o -obj-$(CONFIG_CRASH_CORE) += crash_core.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o -obj-$(CONFIG_KEXEC_CORE) += kexec_core.o +obj-$(CONFIG_KEXEC_CORE) += kexec_core.o 
crash_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o diff --git a/kernel/crash_core.c b/kernel/crash_core.c index ae0d1ce89b46b8..2f4df1fe6f7af5 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -26,14 +26,6 @@ /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; -/* vmcoreinfo stuff */ -unsigned char *vmcoreinfo_data; -size_t vmcoreinfo_size; -u32 *vmcoreinfo_note; - -/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ -static unsigned char *vmcoreinfo_data_safecopy; - int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, void **addr, unsigned long *sz) { @@ -195,204 +187,6 @@ int crash_exclude_mem_range(struct crash_mem *mem, return 0; } -Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, - void *data, size_t data_len) -{ - struct elf_note *note = (struct elf_note *)buf; - - note->n_namesz = strlen(name) + 1; - note->n_descsz = data_len; - note->n_type = type; - buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word)); - memcpy(buf, name, note->n_namesz); - buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word)); - memcpy(buf, data, data_len); - buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word)); - - return buf; -} - -void final_note(Elf_Word *buf) -{ - memset(buf, 0, sizeof(struct elf_note)); -} - -static void update_vmcoreinfo_note(void) -{ - u32 *buf = vmcoreinfo_note; - - if (!vmcoreinfo_size) - return; - buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, - vmcoreinfo_size); - final_note(buf); -} - -void crash_update_vmcoreinfo_safecopy(void *ptr) -{ - if (ptr) - memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size); - - vmcoreinfo_data_safecopy = ptr; -} - -void crash_save_vmcoreinfo(void) -{ - if (!vmcoreinfo_note) - return; - - /* Use the safe copy to generate vmcoreinfo note if have */ - if (vmcoreinfo_data_safecopy) - vmcoreinfo_data = vmcoreinfo_data_safecopy; - - vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds()); - update_vmcoreinfo_note(); -} - -void vmcoreinfo_append_str(const char *fmt, ...) 
-{ - va_list args; - char buf[0x50]; - size_t r; - - va_start(args, fmt); - r = vscnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - - r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size); - - memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); - - vmcoreinfo_size += r; - - WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES, - "vmcoreinfo data exceeds allocated size, truncating"); -} - -/* - * provide an empty default implementation here -- architecture - * code may override this - */ -void __weak arch_crash_save_vmcoreinfo(void) -{} - -phys_addr_t __weak paddr_vmcoreinfo_note(void) -{ - return __pa(vmcoreinfo_note); -} -EXPORT_SYMBOL(paddr_vmcoreinfo_note); - -static int __init crash_save_vmcoreinfo_init(void) -{ - vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); - if (!vmcoreinfo_data) { - pr_warn("Memory allocation for vmcoreinfo_data failed\n"); - return -ENOMEM; - } - - vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE, - GFP_KERNEL | __GFP_ZERO); - if (!vmcoreinfo_note) { - free_page((unsigned long)vmcoreinfo_data); - vmcoreinfo_data = NULL; - pr_warn("Memory allocation for vmcoreinfo_note failed\n"); - return -ENOMEM; - } - - VMCOREINFO_OSRELEASE(init_uts_ns.name.release); - VMCOREINFO_BUILD_ID(); - VMCOREINFO_PAGESIZE(PAGE_SIZE); - - VMCOREINFO_SYMBOL(init_uts_ns); - VMCOREINFO_OFFSET(uts_namespace, name); - VMCOREINFO_SYMBOL(node_online_map); -#ifdef CONFIG_MMU - VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir); -#endif - VMCOREINFO_SYMBOL(_stext); - vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", (unsigned long) VMALLOC_START); - -#ifndef CONFIG_NUMA - VMCOREINFO_SYMBOL(mem_map); - VMCOREINFO_SYMBOL(contig_page_data); -#endif -#ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL_ARRAY(mem_section); - VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); - VMCOREINFO_STRUCT_SIZE(mem_section); - VMCOREINFO_OFFSET(mem_section, section_mem_map); - VMCOREINFO_NUMBER(SECTION_SIZE_BITS); - VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS); -#endif - VMCOREINFO_STRUCT_SIZE(page); - VMCOREINFO_STRUCT_SIZE(pglist_data); - VMCOREINFO_STRUCT_SIZE(zone); - VMCOREINFO_STRUCT_SIZE(free_area); - VMCOREINFO_STRUCT_SIZE(list_head); - VMCOREINFO_SIZE(nodemask_t); - VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _refcount); - VMCOREINFO_OFFSET(page, mapping); - VMCOREINFO_OFFSET(page, lru); - VMCOREINFO_OFFSET(page, _mapcount); - VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(page, compound_head); - VMCOREINFO_OFFSET(pglist_data, node_zones); - VMCOREINFO_OFFSET(pglist_data, nr_zones); -#ifdef CONFIG_FLATMEM - VMCOREINFO_OFFSET(pglist_data, node_mem_map); -#endif - VMCOREINFO_OFFSET(pglist_data, node_start_pfn); - VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); - VMCOREINFO_OFFSET(pglist_data, node_id); - VMCOREINFO_OFFSET(zone, free_area); - VMCOREINFO_OFFSET(zone, vm_stat); - VMCOREINFO_OFFSET(zone, spanned_pages); - VMCOREINFO_OFFSET(free_area, free_list); - VMCOREINFO_OFFSET(list_head, next); - VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS); - log_buf_vmcoreinfo_setup(); - VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); - VMCOREINFO_NUMBER(NR_FREE_PAGES); - VMCOREINFO_NUMBER(PG_lru); - VMCOREINFO_NUMBER(PG_private); - VMCOREINFO_NUMBER(PG_swapcache); - VMCOREINFO_NUMBER(PG_swapbacked); - VMCOREINFO_NUMBER(PG_slab); -#ifdef CONFIG_MEMORY_FAILURE - VMCOREINFO_NUMBER(PG_hwpoison); -#endif - VMCOREINFO_NUMBER(PG_head_mask); -#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) - VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#ifdef 
CONFIG_HUGETLB_PAGE - VMCOREINFO_NUMBER(PG_hugetlb); -#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) - VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); -#endif - -#ifdef CONFIG_KALLSYMS - VMCOREINFO_SYMBOL(kallsyms_names); - VMCOREINFO_SYMBOL(kallsyms_num_syms); - VMCOREINFO_SYMBOL(kallsyms_token_table); - VMCOREINFO_SYMBOL(kallsyms_token_index); -#ifdef CONFIG_KALLSYMS_BASE_RELATIVE - VMCOREINFO_SYMBOL(kallsyms_offsets); - VMCOREINFO_SYMBOL(kallsyms_relative_base); -#else - VMCOREINFO_SYMBOL(kallsyms_addresses); -#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */ -#endif /* CONFIG_KALLSYMS */ - - arch_crash_save_vmcoreinfo(); - update_vmcoreinfo_note(); - - return 0; -} - -subsys_initcall(crash_save_vmcoreinfo_init); - static int __init crash_notes_memory_init(void) { /* Allocate memory for saving cpu registers. */ diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 1d4bc493b2f4b2..11526fc42bc24c 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -154,7 +154,7 @@ KERNEL_ATTR_RW(kexec_crash_size); #endif /* CONFIG_KEXEC_CORE */ -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO static ssize_t vmcoreinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -177,7 +177,7 @@ KERNEL_ATTR_RO(crash_elfcorehdr_size); #endif -#endif /* CONFIG_CRASH_CORE */ +#endif /* CONFIG_VMCORE_INFO */ /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, @@ -265,7 +265,7 @@ static struct attribute * kernel_attrs[] = { &kexec_crash_loaded_attr.attr, &kexec_crash_size_attr.attr, #endif -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO &vmcoreinfo_attr.attr, #ifdef CONFIG_CRASH_HOTPLUG &crash_elfcorehdr_size_attr.attr, diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f2444b581e16c3..7d74b000b43a9b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include @@ -951,7 +951,7 @@ const struct file_operations kmsg_fops = { .release = devkmsg_release, }; -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO /* * This appends the listed symbols to /proc/vmcore * diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c new file mode 100644 index 00000000000000..8f48c0a42e2eed --- /dev/null +++ b/kernel/vmcore_info.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * crash.c - kernel crash support code. + * Copyright (C) 2002-2004 Eric Biederman + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "kallsyms_internal.h" +#include "kexec_internal.h" + +/* vmcoreinfo stuff */ +unsigned char *vmcoreinfo_data; +size_t vmcoreinfo_size; +u32 *vmcoreinfo_note; + +/* trusted vmcoreinfo, e.g. 
we can make a copy in the crash memory */ +static unsigned char *vmcoreinfo_data_safecopy; + +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len) +{ + struct elf_note *note = (struct elf_note *)buf; + + note->n_namesz = strlen(name) + 1; + note->n_descsz = data_len; + note->n_type = type; + buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word)); + memcpy(buf, name, note->n_namesz); + buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word)); + memcpy(buf, data, data_len); + buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word)); + + return buf; +} + +void final_note(Elf_Word *buf) +{ + memset(buf, 0, sizeof(struct elf_note)); +} + +static void update_vmcoreinfo_note(void) +{ + u32 *buf = vmcoreinfo_note; + + if (!vmcoreinfo_size) + return; + buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, + vmcoreinfo_size); + final_note(buf); +} + +void crash_update_vmcoreinfo_safecopy(void *ptr) +{ + if (ptr) + memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size); + + vmcoreinfo_data_safecopy = ptr; +} + +void crash_save_vmcoreinfo(void) +{ + if (!vmcoreinfo_note) + return; + + /* Use the safe copy to generate vmcoreinfo note if have */ + if (vmcoreinfo_data_safecopy) + vmcoreinfo_data = vmcoreinfo_data_safecopy; + + vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds()); + update_vmcoreinfo_note(); +} + +void vmcoreinfo_append_str(const char *fmt, ...) +{ + va_list args; + char buf[0x50]; + size_t r; + + va_start(args, fmt); + r = vscnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size); + + memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); + + vmcoreinfo_size += r; + + WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES, + "vmcoreinfo data exceeds allocated size, truncating"); +} + +/* + * provide an empty default implementation here -- architecture + * code may override this + */ +void __weak arch_crash_save_vmcoreinfo(void) +{} + +phys_addr_t __weak paddr_vmcoreinfo_note(void) +{ + return __pa(vmcoreinfo_note); +} +EXPORT_SYMBOL(paddr_vmcoreinfo_note); + +static int __init crash_save_vmcoreinfo_init(void) +{ + vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); + if (!vmcoreinfo_data) { + pr_warn("Memory allocation for vmcoreinfo_data failed\n"); + return -ENOMEM; + } + + vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE, + GFP_KERNEL | __GFP_ZERO); + if (!vmcoreinfo_note) { + free_page((unsigned long)vmcoreinfo_data); + vmcoreinfo_data = NULL; + pr_warn("Memory allocation for vmcoreinfo_note failed\n"); + return -ENOMEM; + } + + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); + VMCOREINFO_BUILD_ID(); + VMCOREINFO_PAGESIZE(PAGE_SIZE); + + VMCOREINFO_SYMBOL(init_uts_ns); + VMCOREINFO_OFFSET(uts_namespace, name); + VMCOREINFO_SYMBOL(node_online_map); +#ifdef CONFIG_MMU + VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir); +#endif + VMCOREINFO_SYMBOL(_stext); + vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", (unsigned long) VMALLOC_START); + +#ifndef CONFIG_NUMA + VMCOREINFO_SYMBOL(mem_map); + VMCOREINFO_SYMBOL(contig_page_data); +#endif +#ifdef CONFIG_SPARSEMEM + VMCOREINFO_SYMBOL_ARRAY(mem_section); + VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); + VMCOREINFO_STRUCT_SIZE(mem_section); + VMCOREINFO_OFFSET(mem_section, section_mem_map); + VMCOREINFO_NUMBER(SECTION_SIZE_BITS); + VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS); +#endif + VMCOREINFO_STRUCT_SIZE(page); + VMCOREINFO_STRUCT_SIZE(pglist_data); + VMCOREINFO_STRUCT_SIZE(zone); + VMCOREINFO_STRUCT_SIZE(free_area); + 
VMCOREINFO_STRUCT_SIZE(list_head); + VMCOREINFO_SIZE(nodemask_t); + VMCOREINFO_OFFSET(page, flags); + VMCOREINFO_OFFSET(page, _refcount); + VMCOREINFO_OFFSET(page, mapping); + VMCOREINFO_OFFSET(page, lru); + VMCOREINFO_OFFSET(page, _mapcount); + VMCOREINFO_OFFSET(page, private); + VMCOREINFO_OFFSET(page, compound_head); + VMCOREINFO_OFFSET(pglist_data, node_zones); + VMCOREINFO_OFFSET(pglist_data, nr_zones); +#ifdef CONFIG_FLATMEM + VMCOREINFO_OFFSET(pglist_data, node_mem_map); +#endif + VMCOREINFO_OFFSET(pglist_data, node_start_pfn); + VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); + VMCOREINFO_OFFSET(pglist_data, node_id); + VMCOREINFO_OFFSET(zone, free_area); + VMCOREINFO_OFFSET(zone, vm_stat); + VMCOREINFO_OFFSET(zone, spanned_pages); + VMCOREINFO_OFFSET(free_area, free_list); + VMCOREINFO_OFFSET(list_head, next); + VMCOREINFO_OFFSET(list_head, prev); + VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS); + log_buf_vmcoreinfo_setup(); + VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); + VMCOREINFO_NUMBER(NR_FREE_PAGES); + VMCOREINFO_NUMBER(PG_lru); + VMCOREINFO_NUMBER(PG_private); + VMCOREINFO_NUMBER(PG_swapcache); + VMCOREINFO_NUMBER(PG_swapbacked); + VMCOREINFO_NUMBER(PG_slab); +#ifdef CONFIG_MEMORY_FAILURE + VMCOREINFO_NUMBER(PG_hwpoison); +#endif + VMCOREINFO_NUMBER(PG_head_mask); +#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) + VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); +#ifdef CONFIG_HUGETLB_PAGE + VMCOREINFO_NUMBER(PG_hugetlb); +#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) + VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); +#endif + +#ifdef CONFIG_KALLSYMS + VMCOREINFO_SYMBOL(kallsyms_names); + VMCOREINFO_SYMBOL(kallsyms_num_syms); + VMCOREINFO_SYMBOL(kallsyms_token_table); + VMCOREINFO_SYMBOL(kallsyms_token_index); +#ifdef CONFIG_KALLSYMS_BASE_RELATIVE + VMCOREINFO_SYMBOL(kallsyms_offsets); + VMCOREINFO_SYMBOL(kallsyms_relative_base); +#else + VMCOREINFO_SYMBOL(kallsyms_addresses); +#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */ +#endif /* CONFIG_KALLSYMS */ + + arch_crash_save_vmcoreinfo(); + update_vmcoreinfo_note(); + + return 0; +} + +subsys_initcall(crash_save_vmcoreinfo_init); diff --git a/lib/buildid.c b/lib/buildid.c index e3a7acdeef0ed4..3e6868c86b45a8 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -174,7 +174,7 @@ int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size) return parse_build_id_buf(build_id, NULL, buf, buf_size); } -#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_CRASH_CORE) +#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO) unsigned char vmlinux_build_id[BUILD_ID_SIZE_MAX] __ro_after_init; /** From acbabfd8eac037a01e45db07a27b064f4361d8ac Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 26 Jan 2024 08:57:44 +0800 Subject: [PATCH 1015/1406] crash: remove duplicated include in vmcore_info.c The header file kexec.h is included twice in vmcore_info.c, so one inclusion can be removed. Link: https://lkml.kernel.org/r/20240126005744.16561-1-yang.lee@linux.alibaba.com Signed-off-by: Yang Li Cc: Baoquan He Cc: Al Viro Cc: Eric W.
Biederman Cc: Hari Bathini Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Pingfan Liu Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- kernel/vmcore_info.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index 8f48c0a42e2eed..8f77e238a54f54 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include From 391f16b927b215e2c99a5d04fc371401d5e632b2 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:43 +0800 Subject: [PATCH 1016/1406] crash: remove dependency of FA_DUMP on CRASH_DUMP In the kdump kernel, /proc/vmcore is an ELF file mapping the crashed kernel's old memory content. Its ELF header is constructed in the 1st kernel and passed to the kdump kernel via elfcorehdr_addr. Config CRASH_DUMP enables the code for accessing the 1st kernel's old memory on different architectures. Currently, config FA_DUMP depends on CRASH_DUMP because fadump needs to access the global variable 'elfcorehdr_addr' to judge whether it's in a kdump kernel within the function is_kdump_kernel(). In the current kernel/crash_dump.c, the variable 'elfcorehdr_addr' is defined, and the function setup_elfcorehdr() is used to parse the kernel parameter and fetch the passed value of elfcorehdr_addr. Just for accessing elfcorehdr_addr, FA_DUMP really doesn't have to depend on CRASH_DUMP. To remove the dependency of FA_DUMP on CRASH_DUMP and avoid confusion, rename kernel/crash_dump.c to kernel/elfcorehdr.c, and build it when CONFIG_VMCORE_INFO is enabled. With this, FA_DUMP doesn't need to depend on CRASH_DUMP. Link: https://lkml.kernel.org/r/20240124051254.67105-4-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 1 - kernel/Makefile | 3 +-- kernel/{crash_dump.c => elfcorehdr.c} | 0 kernel/kexec_internal.h | 2 ++ 4 files changed, 3 insertions(+), 3 deletions(-) rename kernel/{crash_dump.c => elfcorehdr.c} (100%) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 495d197c9b2751..e66fd9923250ea 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -692,7 +692,6 @@ config FA_DUMP depends on PPC64 && (PPC_RTAS || PPC_POWERNV) select VMCORE_INFO select CRASH_RESERVE - select CRASH_DUMP help A robust mechanism to get reliable kernel crash dump with assistance from firmware.
This approach does not use kexec, diff --git a/kernel/Makefile b/kernel/Makefile index 649272a1d6b9f8..35abc65e1f1ade 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -68,7 +68,7 @@ obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o -obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o elfcorehdr.o obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o crash_core.o obj-$(CONFIG_KEXEC) += kexec.o @@ -121,7 +121,6 @@ obj-$(CONFIG_PERF_EVENTS) += events/ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o obj-$(CONFIG_PADATA) += padata.o -obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o diff --git a/kernel/crash_dump.c b/kernel/elfcorehdr.c similarity index 100% rename from kernel/crash_dump.c rename to kernel/elfcorehdr.c diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 74da1409cd14b5..2595defe8c0d92 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -4,6 +4,8 @@ #include +struct kexec_segment; + struct kimage *do_kimage_alloc_init(void); int sanity_check_segment_list(struct kimage *image); void kimage_free_page_list(struct list_head *list); From e24430b8ffd062e67849a26494e68c99c25598b0 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Sun, 4 Feb 2024 11:06:54 +0800 Subject: [PATCH 1017/1406] power/fadump: make FA_DUMP select CRASH_DUMP FA_DUMP, which is similar to kdump, needs vmcoreinfo exporting, crashkernel reservation, and the /proc/vmcore file. After refactoring crash related code and Kconfig items, make FA_DUMP select CRASH_DUMP. Now the dependency layout is like below:

                           ---->VMCORE_INFO
                          /|
 FA_DUMP--> CRASH_DUMP-->--|---->CRASH_RESERVE
                          \|
                           \---->PROC_VMCORE

Link: https://lkml.kernel.org/r/Zb8D1ASrgX0qVm9z@MiWiFi-R3L-srv Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Pingfan Liu Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index e66fd9923250ea..a9efaf87966dfd 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -690,8 +690,7 @@ config ARCH_SELECTS_CRASH_DUMP config FA_DUMP bool "Firmware-assisted dump" depends on PPC64 && (PPC_RTAS || PPC_POWERNV) - select VMCORE_INFO - select CRASH_RESERVE + select CRASH_DUMP help A robust mechanism to get reliable kernel crash dump with assistance from firmware. This approach does not use kexec, From 92255e86ccb861f654b886b12ea0aeddfacace3a Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:44 +0800 Subject: [PATCH 1018/1406] crash: split crash dumping code out from kexec_core.c Currently, KEXEC_CORE selects CRASH_CORE automatically because the crash code needs to be built in to avoid compile errors when building the kexec code, even though the crash dumping functionality is not enabled. E.g.:

--------------------
CONFIG_CRASH_CORE=y
CONFIG_KEXEC_CORE=y
CONFIG_KEXEC=y
CONFIG_KEXEC_FILE=y
---------------------

After splitting out the crashkernel reservation code and the vmcoreinfo exporting code, there's only crash related code left in kernel/crash_core.c.
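For illustration, the declare-or-stub idiom the split relies on looks roughly like this (a minimal sketch of the pattern applied by the include/linux/crash_core.h hunks below, not an extra hunk of the patch):

#ifdef CONFIG_CRASH_DUMP
/* real implementations are built from kernel/crash_core.c */
void crash_kexec(struct pt_regs *regs);
int kexec_crash_loaded(void);
#else
/* empty stubs keep kexec-only builds compiling when crash dumping is off */
struct pt_regs;
static inline void crash_kexec(struct pt_regs *regs) { }
static inline int kexec_crash_loaded(void) { return 0; }
#endif

With stubs like these, call sites such as panic()'s crash_kexec() call compile unchanged whether or not crash dumping is configured in.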
Now, move the crash related code from kexec_core.c to crash_core.c and only build it in when CONFIG_CRASH_DUMP=y. Also wrap the crash code inside the CONFIG_CRASH_DUMP ifdeffery scope, or replace inappropriate CONFIG_KEXEC_CORE ifdefs with CONFIG_CRASH_DUMP ifdefs, in generic kernel files. With these changes, the crash_core code is abstracted from the kexec code and can be disabled entirely if only the kexec reboot feature is wanted. Link: https://lkml.kernel.org/r/20240124051254.67105-5-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- drivers/base/cpu.c | 6 +- include/linux/crash_core.h | 61 +++++++++ include/linux/kexec.h | 45 +------ init/initramfs.c | 2 +- kernel/Makefile | 3 +- kernel/crash_core.c | 256 +++++++++++++++++++++++++++++++++++++ kernel/kexec.c | 11 +- kernel/kexec_core.c | 250 ++---------------------------------- kernel/kexec_file.c | 13 +- kernel/ksysfs.c | 4 + 10 files changed, 359 insertions(+), 292 deletions(-) diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 47de0f140ba65e..b621a0fc75e15a 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -144,7 +144,7 @@ static DEVICE_ATTR(release, S_IWUSR, NULL, cpu_release_store); #endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */ #endif /* CONFIG_HOTPLUG_CPU */ -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP #include static ssize_t crash_notes_show(struct device *dev, @@ -189,14 +189,14 @@ static const struct attribute_group crash_note_cpu_attr_group = { #endif static const struct attribute_group *common_cpu_attr_groups[] = { -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP &crash_note_cpu_attr_group, #endif NULL }; static const struct attribute_group *hotplugable_cpu_attr_groups[] = { -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP &crash_note_cpu_attr_group, #endif NULL diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 7f19f62018ef9c..23270b16e1dbf3 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -6,6 +6,48 @@ #include #include +struct kimage; + +#ifdef CONFIG_CRASH_DUMP + +int crash_shrink_memory(unsigned long new_size); +ssize_t crash_get_memory_size(void); + +#ifndef arch_kexec_protect_crashkres +/* + * Protection mechanism for crashkernel reserved memory after + * the kdump kernel is loaded.
+ * + * Provide an empty default implementation here -- architecture + * code may override this + */ +static inline void arch_kexec_protect_crashkres(void) { } +#endif + +#ifndef arch_kexec_unprotect_crashkres +static inline void arch_kexec_unprotect_crashkres(void) { } +#endif + + + +#ifndef arch_crash_handle_hotplug_event +static inline void arch_crash_handle_hotplug_event(struct kimage *image) { } +#endif + +int crash_check_update_elfcorehdr(void); + +#ifndef crash_hotplug_cpu_support +static inline int crash_hotplug_cpu_support(void) { return 0; } +#endif + +#ifndef crash_hotplug_memory_support +static inline int crash_hotplug_memory_support(void) { return 0; } +#endif + +#ifndef crash_get_elfcorehdr_size +static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; } +#endif + /* Alignment required for elf header segment */ #define ELF_CORE_HEADER_ALIGN 4096 @@ -31,4 +73,23 @@ struct kexec_segment; #define KEXEC_CRASH_HP_REMOVE_MEMORY 4 #define KEXEC_CRASH_HP_INVALID_CPU -1U +extern void __crash_kexec(struct pt_regs *regs); +extern void crash_kexec(struct pt_regs *regs); +int kexec_should_crash(struct task_struct *p); +int kexec_crash_loaded(void); +void crash_save_cpu(struct pt_regs *regs, int cpu); +extern int kimage_crash_copy_vmcoreinfo(struct kimage *image); + +#else /* !CONFIG_CRASH_DUMP*/ +struct pt_regs; +struct task_struct; +struct kimage; +static inline void __crash_kexec(struct pt_regs *regs) { } +static inline void crash_kexec(struct pt_regs *regs) { } +static inline int kexec_should_crash(struct task_struct *p) { return 0; } +static inline int kexec_crash_loaded(void) { return 0; } +static inline void crash_save_cpu(struct pt_regs *regs, int cpu) {}; +static inline int kimage_crash_copy_vmcoreinfo(struct kimage *image) { return 0; }; +#endif /* CONFIG_CRASH_DUMP*/ + #endif /* LINUX_CRASH_CORE_H */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 9c7bb8b56ed66d..060835bb82d52f 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -15,7 +15,6 @@ #if !defined(__ASSEMBLY__) -#include #include #include #include @@ -33,6 +32,7 @@ extern note_buf_t __percpu *crash_notes; #include #include #include +#include /* Verify architecture specific macros are defined */ @@ -380,13 +380,6 @@ extern struct page *kimage_alloc_control_pages(struct kimage *image, static inline int machine_kexec_post_load(struct kimage *image) { return 0; } #endif -extern void __crash_kexec(struct pt_regs *); -extern void crash_kexec(struct pt_regs *); -int kexec_should_crash(struct task_struct *); -int kexec_crash_loaded(void); -void crash_save_cpu(struct pt_regs *regs, int cpu); -extern int kimage_crash_copy_vmcoreinfo(struct kimage *image); - extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; @@ -410,24 +403,6 @@ bool kexec_load_permitted(int kexec_image_type); /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; -int crash_shrink_memory(unsigned long new_size); -ssize_t crash_get_memory_size(void); - -#ifndef arch_kexec_protect_crashkres -/* - * Protection mechanism for crashkernel reserved memory after - * the kdump kernel is loaded. 
- * - * Provide an empty default implementation here -- architecture - * code may override this - */ -static inline void arch_kexec_protect_crashkres(void) { } -#endif - -#ifndef arch_kexec_unprotect_crashkres -static inline void arch_kexec_unprotect_crashkres(void) { } -#endif - #ifndef page_to_boot_pfn static inline unsigned long page_to_boot_pfn(struct page *page) { @@ -484,24 +459,6 @@ static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, g static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { } #endif -#ifndef arch_crash_handle_hotplug_event -static inline void arch_crash_handle_hotplug_event(struct kimage *image) { } -#endif - -int crash_check_update_elfcorehdr(void); - -#ifndef crash_hotplug_cpu_support -static inline int crash_hotplug_cpu_support(void) { return 0; } -#endif - -#ifndef crash_hotplug_memory_support -static inline int crash_hotplug_memory_support(void) { return 0; } -#endif - -#ifndef crash_get_elfcorehdr_size -static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; } -#endif - extern bool kexec_file_dbg_print; #define kexec_dprintk(fmt, ...) \ diff --git a/init/initramfs.c b/init/initramfs.c index 76deb48c38cb16..6f095f54eec976 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -642,7 +642,7 @@ void __weak __init free_initrd_mem(unsigned long start, unsigned long end) "initrd"); } -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_RESERVE static bool __init kexec_free_initrd(void) { unsigned long crashk_start = (unsigned long)__va(crashk_res.start); diff --git a/kernel/Makefile b/kernel/Makefile index 35abc65e1f1ade..3c13240dfc9f09 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -70,7 +70,8 @@ obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o elfcorehdr.o obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o -obj-$(CONFIG_KEXEC_CORE) += kexec_core.o crash_core.o +obj-$(CONFIG_KEXEC_CORE) += kexec_core.o +obj-$(CONFIG_CRASH_DUMP) += crash_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 2f4df1fe6f7af5..78b5dc7cee3ab7 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -11,9 +11,14 @@ #include #include #include +#include #include #include #include +#include +#include +#include +#include #include #include @@ -26,6 +31,131 @@ /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; +#ifdef CONFIG_CRASH_DUMP + +int kimage_crash_copy_vmcoreinfo(struct kimage *image) +{ + struct page *vmcoreinfo_page; + void *safecopy; + + if (!IS_ENABLED(CONFIG_CRASH_DUMP)) + return 0; + if (image->type != KEXEC_TYPE_CRASH) + return 0; + + /* + * For kdump, allocate one vmcoreinfo safe copy from the + * crash memory. as we have arch_kexec_protect_crashkres() + * after kexec syscall, we naturally protect it from write + * (even read) access under kernel direct mapping. But on + * the other hand, we still need to operate it when crash + * happens to generate vmcoreinfo note, hereby we rely on + * vmap for this purpose. 
+ */ + vmcoreinfo_page = kimage_alloc_control_pages(image, 0); + if (!vmcoreinfo_page) { + pr_warn("Could not allocate vmcoreinfo buffer\n"); + return -ENOMEM; + } + safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); + if (!safecopy) { + pr_warn("Could not vmap vmcoreinfo buffer\n"); + return -ENOMEM; + } + + image->vmcoreinfo_data_copy = safecopy; + crash_update_vmcoreinfo_safecopy(safecopy); + + return 0; +} + + + +int kexec_should_crash(struct task_struct *p) +{ + /* + * If crash_kexec_post_notifiers is enabled, don't run + * crash_kexec() here yet, which must be run after panic + * notifiers in panic(). + */ + if (crash_kexec_post_notifiers) + return 0; + /* + * There are 4 panic() calls in make_task_dead() path, each of which + * corresponds to each of these 4 conditions. + */ + if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) + return 1; + return 0; +} + +int kexec_crash_loaded(void) +{ + return !!kexec_crash_image; +} +EXPORT_SYMBOL_GPL(kexec_crash_loaded); + +/* + * No panic_cpu check version of crash_kexec(). This function is called + * only when panic_cpu holds the current CPU number; this is the only CPU + * which processes crash_kexec routines. + */ +void __noclone __crash_kexec(struct pt_regs *regs) +{ + /* Take the kexec_lock here to prevent sys_kexec_load + * running on one cpu from replacing the crash kernel + * we are using after a panic on a different cpu. + * + * If the crash kernel was not located in a fixed area + * of memory the xchg(&kexec_crash_image) would be + * sufficient. But since I reuse the memory... + */ + if (kexec_trylock()) { + if (kexec_crash_image) { + struct pt_regs fixed_regs; + + crash_setup_regs(&fixed_regs, regs); + crash_save_vmcoreinfo(); + machine_crash_shutdown(&fixed_regs); + machine_kexec(kexec_crash_image); + } + kexec_unlock(); + } +} +STACK_FRAME_NON_STANDARD(__crash_kexec); + +__bpf_kfunc void crash_kexec(struct pt_regs *regs) +{ + int old_cpu, this_cpu; + + /* + * Only one CPU is allowed to execute the crash_kexec() code as with + * panic(). Otherwise parallel calls of panic() and crash_kexec() + * may stop each other. To exclude them, we use panic_cpu here too. + */ + old_cpu = PANIC_CPU_INVALID; + this_cpu = raw_smp_processor_id(); + + if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { + /* This is the 1st CPU which comes here, so go ahead. */ + __crash_kexec(regs); + + /* + * Reset panic_cpu to allow another panic()/crash_kexec() + * call. + */ + atomic_set(&panic_cpu, PANIC_CPU_INVALID); + } +} + +static inline resource_size_t crash_resource_size(const struct resource *res) +{ + return !res->end ? 
0 : resource_size(res); +} + + + + int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, void **addr, unsigned long *sz) { @@ -187,6 +317,130 @@ int crash_exclude_mem_range(struct crash_mem *mem, return 0; } +ssize_t crash_get_memory_size(void) +{ + ssize_t size = 0; + + if (!kexec_trylock()) + return -EBUSY; + + size += crash_resource_size(&crashk_res); + size += crash_resource_size(&crashk_low_res); + + kexec_unlock(); + return size; +} + +static int __crash_shrink_memory(struct resource *old_res, + unsigned long new_size) +{ + struct resource *ram_res; + + ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); + if (!ram_res) + return -ENOMEM; + + ram_res->start = old_res->start + new_size; + ram_res->end = old_res->end; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; + ram_res->name = "System RAM"; + + if (!new_size) { + release_resource(old_res); + old_res->start = 0; + old_res->end = 0; + } else { + crashk_res.end = ram_res->start - 1; + } + + crash_free_reserved_phys_range(ram_res->start, ram_res->end); + insert_resource(&iomem_resource, ram_res); + + return 0; +} + +int crash_shrink_memory(unsigned long new_size) +{ + int ret = 0; + unsigned long old_size, low_size; + + if (!kexec_trylock()) + return -EBUSY; + + if (kexec_crash_image) { + ret = -ENOENT; + goto unlock; + } + + low_size = crash_resource_size(&crashk_low_res); + old_size = crash_resource_size(&crashk_res) + low_size; + new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN); + if (new_size >= old_size) { + ret = (new_size == old_size) ? 0 : -EINVAL; + goto unlock; + } + + /* + * (low_size > new_size) implies that low_size is greater than zero. + * This also means that if low_size is zero, the else branch is taken. + * + * If low_size is greater than 0, (low_size > new_size) indicates that + * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res + * needs to be shrunken. + */ + if (low_size > new_size) { + ret = __crash_shrink_memory(&crashk_res, 0); + if (ret) + goto unlock; + + ret = __crash_shrink_memory(&crashk_low_res, new_size); + } else { + ret = __crash_shrink_memory(&crashk_res, new_size - low_size); + } + + /* Swap crashk_res and crashk_low_res if needed */ + if (!crashk_res.end && crashk_low_res.end) { + crashk_res.start = crashk_low_res.start; + crashk_res.end = crashk_low_res.end; + release_resource(&crashk_low_res); + crashk_low_res.start = 0; + crashk_low_res.end = 0; + insert_resource(&iomem_resource, &crashk_res); + } + +unlock: + kexec_unlock(); + return ret; +} + +void crash_save_cpu(struct pt_regs *regs, int cpu) +{ + struct elf_prstatus prstatus; + u32 *buf; + + if ((cpu < 0) || (cpu >= nr_cpu_ids)) + return; + + /* Using ELF notes here is opportunistic. + * I need a well defined structure format + * for the data I pass, and I need tags + * on the data to indicate what information I have + * squirrelled away. ELF notes happen to provide + * all of that, so there is no need to invent something new. + */ + buf = (u32 *)per_cpu_ptr(crash_notes, cpu); + if (!buf) + return; + memset(&prstatus, 0, sizeof(prstatus)); + prstatus.common.pr_pid = current->pid; + elf_core_copy_regs(&prstatus.pr_reg, regs); + buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); + final_note(buf); +} + + + static int __init crash_notes_memory_init(void) { /* Allocate memory for saving cpu registers. 
*/ @@ -220,6 +474,8 @@ static int __init crash_notes_memory_init(void) } subsys_initcall(crash_notes_memory_init); +#endif /*CONFIG_CRASH_DUMP*/ + #ifdef CONFIG_CRASH_HOTPLUG #undef pr_fmt #define pr_fmt(fmt) "crash hp: " fmt diff --git a/kernel/kexec.c b/kernel/kexec.c index 8f35a5a42af852..bab542fc1463d2 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -28,12 +28,14 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, struct kimage *image; bool kexec_on_panic = flags & KEXEC_ON_CRASH; +#ifdef CONFIG_CRASH_DUMP if (kexec_on_panic) { /* Verify we have a valid entry point */ if ((entry < phys_to_boot_phys(crashk_res.start)) || (entry > phys_to_boot_phys(crashk_res.end))) return -EADDRNOTAVAIL; } +#endif /* Allocate and initialize a controlling structure */ image = do_kimage_alloc_init(); @@ -44,11 +46,13 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, image->nr_segments = nr_segments; memcpy(image->segment, segments, nr_segments * sizeof(*segments)); +#ifdef CONFIG_CRASH_DUMP if (kexec_on_panic) { /* Enable special crash kernel control page alloc policy. */ image->control_page = crashk_res.start; image->type = KEXEC_TYPE_CRASH; } +#endif ret = sanity_check_segment_list(image); if (ret) @@ -99,13 +103,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, if (!kexec_trylock()) return -EBUSY; +#ifdef CONFIG_CRASH_DUMP if (flags & KEXEC_ON_CRASH) { dest_image = &kexec_crash_image; if (kexec_crash_image) arch_kexec_unprotect_crashkres(); - } else { + } else +#endif dest_image = &kexec_image; - } if (nr_segments == 0) { /* Uninstall image */ @@ -162,8 +167,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, image = xchg(dest_image, image); out: +#ifdef CONFIG_CRASH_DUMP if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres(); +#endif kimage_free(image); out_unlock: diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d08fc7b5db9790..ce3429e7972ccd 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -54,30 +54,6 @@ bool kexec_in_progress = false; bool kexec_file_dbg_print; -int kexec_should_crash(struct task_struct *p) -{ - /* - * If crash_kexec_post_notifiers is enabled, don't run - * crash_kexec() here yet, which must be run after panic - * notifiers in panic(). - */ - if (crash_kexec_post_notifiers) - return 0; - /* - * There are 4 panic() calls in make_task_dead() path, each of which - * corresponds to each of these 4 conditions. - */ - if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) - return 1; - return 0; -} - -int kexec_crash_loaded(void) -{ - return !!kexec_crash_image; -} -EXPORT_SYMBOL_GPL(kexec_crash_loaded); - /* * When kexec transitions to the new kernel there is a one-to-one * mapping between physical and virtual addresses. On processors @@ -209,6 +185,7 @@ int sanity_check_segment_list(struct kimage *image) if (total_pages > nr_pages / 2) return -EINVAL; +#ifdef CONFIG_CRASH_DUMP /* * Verify we have good destination addresses. 
Normally * the caller is responsible for making certain we don't @@ -231,6 +208,7 @@ int sanity_check_segment_list(struct kimage *image) return -EADDRNOTAVAIL; } } +#endif return 0; } @@ -403,6 +381,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, return pages; } +#ifdef CONFIG_CRASH_DUMP static struct page *kimage_alloc_crash_control_pages(struct kimage *image, unsigned int order) { @@ -468,6 +447,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, return pages; } +#endif struct page *kimage_alloc_control_pages(struct kimage *image, @@ -479,48 +459,16 @@ struct page *kimage_alloc_control_pages(struct kimage *image, case KEXEC_TYPE_DEFAULT: pages = kimage_alloc_normal_control_pages(image, order); break; +#ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: pages = kimage_alloc_crash_control_pages(image, order); break; +#endif } return pages; } -int kimage_crash_copy_vmcoreinfo(struct kimage *image) -{ - struct page *vmcoreinfo_page; - void *safecopy; - - if (image->type != KEXEC_TYPE_CRASH) - return 0; - - /* - * For kdump, allocate one vmcoreinfo safe copy from the - * crash memory. as we have arch_kexec_protect_crashkres() - * after kexec syscall, we naturally protect it from write - * (even read) access under kernel direct mapping. But on - * the other hand, we still need to operate it when crash - * happens to generate vmcoreinfo note, hereby we rely on - * vmap for this purpose. - */ - vmcoreinfo_page = kimage_alloc_control_pages(image, 0); - if (!vmcoreinfo_page) { - pr_warn("Could not allocate vmcoreinfo buffer\n"); - return -ENOMEM; - } - safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); - if (!safecopy) { - pr_warn("Could not vmap vmcoreinfo buffer\n"); - return -ENOMEM; - } - - image->vmcoreinfo_data_copy = safecopy; - crash_update_vmcoreinfo_safecopy(safecopy); - - return 0; -} - static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) { if (*image->entry != 0) @@ -603,10 +551,12 @@ void kimage_free(struct kimage *image) if (!image) return; +#ifdef CONFIG_CRASH_DUMP if (image->vmcoreinfo_data_copy) { crash_update_vmcoreinfo_safecopy(NULL); vunmap(image->vmcoreinfo_data_copy); } +#endif kimage_free_extra_pages(image); for_each_kimage_entry(image, ptr, entry) { @@ -824,6 +774,7 @@ static int kimage_load_normal_segment(struct kimage *image, return result; } +#ifdef CONFIG_CRASH_DUMP static int kimage_load_crash_segment(struct kimage *image, struct kexec_segment *segment) { @@ -891,6 +842,7 @@ static int kimage_load_crash_segment(struct kimage *image, out: return result; } +#endif int kimage_load_segment(struct kimage *image, struct kexec_segment *segment) @@ -901,9 +853,11 @@ int kimage_load_segment(struct kimage *image, case KEXEC_TYPE_DEFAULT: result = kimage_load_normal_segment(image, segment); break; +#ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: result = kimage_load_crash_segment(image, segment); break; +#endif } return result; @@ -1027,186 +981,6 @@ bool kexec_load_permitted(int kexec_image_type) return true; } -/* - * No panic_cpu check version of crash_kexec(). This function is called - * only when panic_cpu holds the current CPU number; this is the only CPU - * which processes crash_kexec routines. - */ -void __noclone __crash_kexec(struct pt_regs *regs) -{ - /* Take the kexec_lock here to prevent sys_kexec_load - * running on one cpu from replacing the crash kernel - * we are using after a panic on a different cpu. 
- * - * If the crash kernel was not located in a fixed area - * of memory the xchg(&kexec_crash_image) would be - * sufficient. But since I reuse the memory... - */ - if (kexec_trylock()) { - if (kexec_crash_image) { - struct pt_regs fixed_regs; - - crash_setup_regs(&fixed_regs, regs); - crash_save_vmcoreinfo(); - machine_crash_shutdown(&fixed_regs); - machine_kexec(kexec_crash_image); - } - kexec_unlock(); - } -} -STACK_FRAME_NON_STANDARD(__crash_kexec); - -__bpf_kfunc void crash_kexec(struct pt_regs *regs) -{ - int old_cpu, this_cpu; - - /* - * Only one CPU is allowed to execute the crash_kexec() code as with - * panic(). Otherwise parallel calls of panic() and crash_kexec() - * may stop each other. To exclude them, we use panic_cpu here too. - */ - old_cpu = PANIC_CPU_INVALID; - this_cpu = raw_smp_processor_id(); - - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { - /* This is the 1st CPU which comes here, so go ahead. */ - __crash_kexec(regs); - - /* - * Reset panic_cpu to allow another panic()/crash_kexec() - * call. - */ - atomic_set(&panic_cpu, PANIC_CPU_INVALID); - } -} - -static inline resource_size_t crash_resource_size(const struct resource *res) -{ - return !res->end ? 0 : resource_size(res); -} - -ssize_t crash_get_memory_size(void) -{ - ssize_t size = 0; - - if (!kexec_trylock()) - return -EBUSY; - - size += crash_resource_size(&crashk_res); - size += crash_resource_size(&crashk_low_res); - - kexec_unlock(); - return size; -} - -static int __crash_shrink_memory(struct resource *old_res, - unsigned long new_size) -{ - struct resource *ram_res; - - ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); - if (!ram_res) - return -ENOMEM; - - ram_res->start = old_res->start + new_size; - ram_res->end = old_res->end; - ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; - ram_res->name = "System RAM"; - - if (!new_size) { - release_resource(old_res); - old_res->start = 0; - old_res->end = 0; - } else { - crashk_res.end = ram_res->start - 1; - } - - crash_free_reserved_phys_range(ram_res->start, ram_res->end); - insert_resource(&iomem_resource, ram_res); - - return 0; -} - -int crash_shrink_memory(unsigned long new_size) -{ - int ret = 0; - unsigned long old_size, low_size; - - if (!kexec_trylock()) - return -EBUSY; - - if (kexec_crash_image) { - ret = -ENOENT; - goto unlock; - } - - low_size = crash_resource_size(&crashk_low_res); - old_size = crash_resource_size(&crashk_res) + low_size; - new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN); - if (new_size >= old_size) { - ret = (new_size == old_size) ? 0 : -EINVAL; - goto unlock; - } - - /* - * (low_size > new_size) implies that low_size is greater than zero. - * This also means that if low_size is zero, the else branch is taken. - * - * If low_size is greater than 0, (low_size > new_size) indicates that - * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res - * needs to be shrunken. 
- */ - if (low_size > new_size) { - ret = __crash_shrink_memory(&crashk_res, 0); - if (ret) - goto unlock; - - ret = __crash_shrink_memory(&crashk_low_res, new_size); - } else { - ret = __crash_shrink_memory(&crashk_res, new_size - low_size); - } - - /* Swap crashk_res and crashk_low_res if needed */ - if (!crashk_res.end && crashk_low_res.end) { - crashk_res.start = crashk_low_res.start; - crashk_res.end = crashk_low_res.end; - release_resource(&crashk_low_res); - crashk_low_res.start = 0; - crashk_low_res.end = 0; - insert_resource(&iomem_resource, &crashk_res); - } - -unlock: - kexec_unlock(); - return ret; -} - -void crash_save_cpu(struct pt_regs *regs, int cpu) -{ - struct elf_prstatus prstatus; - u32 *buf; - - if ((cpu < 0) || (cpu >= nr_cpu_ids)) - return; - - /* Using ELF notes here is opportunistic. - * I need a well defined structure format - * for the data I pass, and I need tags - * on the data to indicate what information I have - * squirrelled away. ELF notes happen to provide - * all of that, so there is no need to invent something new. - */ - buf = (u32 *)per_cpu_ptr(crash_notes, cpu); - if (!buf) - return; - memset(&prstatus, 0, sizeof(prstatus)); - prstatus.common.pr_pid = current->pid; - elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, - &prstatus, sizeof(prstatus)); - final_note(buf); -} - /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index bef2f6f2571b42..ce7ce2ae27cdfe 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -285,11 +285,13 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, kexec_file_dbg_print = !!(flags & KEXEC_FILE_DEBUG); image->file_mode = 1; +#ifdef CONFIG_CRASH_DUMP if (kexec_on_panic) { /* Enable special crash kernel control page alloc policy. 
*/ image->control_page = crashk_res.start; image->type = KEXEC_TYPE_CRASH; } +#endif ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, cmdline_ptr, cmdline_len, flags); @@ -349,13 +351,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (!kexec_trylock()) return -EBUSY; +#ifdef CONFIG_CRASH_DUMP if (image_type == KEXEC_TYPE_CRASH) { dest_image = &kexec_crash_image; if (kexec_crash_image) arch_kexec_unprotect_crashkres(); - } else { + } else +#endif dest_image = &kexec_image; - } if (flags & KEXEC_FILE_UNLOAD) goto exchange; @@ -419,8 +422,10 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, exchange: image = xchg(dest_image, image); out: +#ifdef CONFIG_CRASH_DUMP if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres(); +#endif kexec_unlock(); kimage_free(image); @@ -595,12 +600,14 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, static int kexec_walk_resources(struct kexec_buf *kbuf, int (*func)(struct resource *, void *)) { +#ifdef CONFIG_CRASH_DUMP if (kbuf->image->type == KEXEC_TYPE_CRASH) return walk_iomem_res_desc(crashk_res.desc, IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, crashk_res.start, crashk_res.end, kbuf, func); - else if (kbuf->top_down) +#endif + if (kbuf->top_down) return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func); else return walk_system_ram_res(0, ULONG_MAX, kbuf, func); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 11526fc42bc24c..fe7a517fc4abbf 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -120,6 +120,7 @@ static ssize_t kexec_loaded_show(struct kobject *kobj, } KERNEL_ATTR_RO(kexec_loaded); +#ifdef CONFIG_CRASH_DUMP static ssize_t kexec_crash_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -152,6 +153,7 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj, } KERNEL_ATTR_RW(kexec_crash_size); +#endif /* CONFIG_CRASH_DUMP*/ #endif /* CONFIG_KEXEC_CORE */ #ifdef CONFIG_VMCORE_INFO @@ -262,9 +264,11 @@ static struct attribute * kernel_attrs[] = { #endif #ifdef CONFIG_KEXEC_CORE &kexec_loaded_attr.attr, +#ifdef CONFIG_CRASH_DUMP &kexec_crash_loaded_attr.attr, &kexec_crash_size_attr.attr, #endif +#endif #ifdef CONFIG_VMCORE_INFO &vmcoreinfo_attr.attr, #ifdef CONFIG_CRASH_HOTPLUG From a7c993c0e7517180779782fdc3c3105fad168d38 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:45 +0800 Subject: [PATCH 1019/1406] crash: clean up kdump related config items By splitting CRASH_RESERVE and VMCORE_INFO out from CRASH_CORE, cleaning up the dependency of FA_DUMP on CRASH_DUMP, and moving the crash code from kexec_core.c to crash_core.c, now we can rearrange CRASH_DUMP to depend on KEXEC_CORE, and make CRASH_DUMP select CRASH_RESERVE and VMCORE_INFO. KEXEC_CORE won't select CRASH_RESERVE and VMCORE_INFO any more because KEXEC_CORE enables the code which allocates control pages, copies kexec/kdump segments, and prepares for switching. This code is shared by both kexec reboot and crash dumping. Doing this makes the code and the corresponding config items more logical (the right item depends on or is selected by the left item).
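As a quick reminder of the Kconfig semantics the diagram below relies on: "depends on" only gates whether an option may be enabled, while "select" forces the selected symbol on. An illustrative sketch, paraphrasing the kernel/Kconfig.kexec hunk in this patch rather than adding anything to it:

config CRASH_DUMP
	bool "kernel crash dumps"
	depends on KEXEC_CORE	# only offered once KEXEC_CORE is already enabled
	select VMCORE_INFO	# enabling CRASH_DUMP forces VMCORE_INFO on
	select CRASH_RESERVE	# ...and CRASH_RESERVE too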
 PROC_KCORE -----------> VMCORE_INFO

            |----------> VMCORE_INFO
 FA_DUMP----|
            |----------> CRASH_RESERVE

                                            ---->VMCORE_INFO
                                           /
                                          |---->CRASH_RESERVE
 KEXEC      --|                          /|
              |--> KEXEC_CORE--> CRASH_DUMP-->-|---->PROC_VMCORE
 KEXEC_FILE --|                          \ |
                                          \---->CRASH_HOTPLUG

 KEXEC      --|
              |--> KEXEC_CORE--> kexec reboot
 KEXEC_FILE --|

Link: https://lkml.kernel.org/r/20240124051254.67105-6-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- kernel/Kconfig.kexec | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 8faf27043432fe..6c34e63c88ff4c 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -9,8 +9,6 @@ config VMCORE_INFO bool config KEXEC_CORE - select VMCORE_INFO - select CRASH_RESERVE bool config KEXEC_ELF @@ -99,8 +97,11 @@ config KEXEC_JUMP config CRASH_DUMP bool "kernel crash dumps" + default y depends on ARCH_SUPPORTS_CRASH_DUMP - select KEXEC_CORE + depends on KEXEC_CORE + select VMCORE_INFO + select CRASH_RESERVE help Generate crash dump after being started by kexec. This should be normally only set in special crash dump kernels From 387395e6ded89bd53d846357a1d82a59b542cf21 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:46 +0800 Subject: [PATCH 1020/1406] x86, crash: wrap crash dumping code into crash related ifdefs Now that the crash code under the kernel/ folder has been split out from the kexec code, crash dumping can be separated from kexec reboot in config items on x86 with some adjustments. Here, also change some ifdefs or IS_ENABLED() checks to more appropriate ones, e.g.:

 - #ifdef CONFIG_KEXEC_CORE -> #ifdef CONFIG_CRASH_DUMP
 - (!IS_ENABLED(CONFIG_KEXEC_CORE)) -> (!IS_ENABLED(CONFIG_CRASH_RESERVE))

Link: https://lkml.kernel.org/r/20240124051254.67105-7-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W.
Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/x86/kernel/Makefile | 4 ++-- arch/x86/kernel/cpu/mshyperv.c | 4 ++++ arch/x86/kernel/kexec-bzimage64.c | 4 ++++ arch/x86/kernel/kvm.c | 4 ++-- arch/x86/kernel/machine_kexec_64.c | 3 +++ arch/x86/kernel/reboot.c | 2 +- arch/x86/kernel/setup.c | 2 +- arch/x86/kernel/smp.c | 2 +- arch/x86/xen/enlighten_hvm.c | 4 ++++ 9 files changed, 22 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 913d4022131eba..3668b1edef2d28 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -100,9 +100,9 @@ obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_RETHOOK) += rethook.o obj-$(CONFIG_VMCORE_INFO) += vmcore_info_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o -obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o +obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o -obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o +obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o crash.o obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_X86_32) += doublefault_32.o diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 01fa06dd06b66c..f8163a59026ba5 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -210,6 +210,7 @@ static void hv_machine_shutdown(void) hyperv_cleanup(); } +#ifdef CONFIG_CRASH_DUMP static void hv_machine_crash_shutdown(struct pt_regs *regs) { if (hv_crash_handler) @@ -221,6 +222,7 @@ static void hv_machine_crash_shutdown(struct pt_regs *regs) /* Disable the hypercall page when there is only 1 active CPU. */ hyperv_cleanup(); } +#endif #endif /* CONFIG_KEXEC_CORE */ #endif /* CONFIG_HYPERV */ @@ -497,7 +499,9 @@ static void __init ms_hyperv_init_platform(void) #if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_KEXEC_CORE) machine_ops.shutdown = hv_machine_shutdown; +#ifdef CONFIG_CRASH_DUMP machine_ops.crash_shutdown = hv_machine_crash_shutdown; +#endif #endif if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) { /* diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 2a422e00ed4b42..b55737b83a841a 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -263,11 +263,13 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, memset(¶ms->hd0_info, 0, sizeof(params->hd0_info)); memset(¶ms->hd1_info, 0, sizeof(params->hd1_info)); +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) { ret = crash_setup_memmap_entries(image, params); if (ret) return ret; } else +#endif setup_e820_entries(params); nr_e820_entries = params->e820_entries; @@ -433,12 +435,14 @@ static void *bzImage64_load(struct kimage *image, char *kernel, return ERR_PTR(-EINVAL); } +#ifdef CONFIG_CRASH_DUMP /* Allocate and load backup region */ if (image->type == KEXEC_TYPE_CRASH) { ret = crash_load_segments(image); if (ret) return ERR_PTR(ret); } +#endif /* * Load purgatory. For 64bit entry point, purgatory code can be diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 428ee74002e1ea..3c9c327d6706c5 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -770,7 +770,7 @@ static struct notifier_block kvm_pv_reboot_nb = { * won't be valid. In cases like kexec, in which you install a new kernel, this * means a random memory location will be kept being written. 
*/ -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP static void kvm_crash_shutdown(struct pt_regs *regs) { kvm_guest_cpu_offline(true); @@ -853,7 +853,7 @@ static void __init kvm_guest_init(void) kvm_guest_cpu_init(); #endif -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP machine_ops.crash_shutdown = kvm_crash_shutdown; #endif diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index bc0a5348b4a627..b180d8e497c317 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -508,6 +508,8 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) } #endif /* CONFIG_KEXEC_FILE */ +#ifdef CONFIG_CRASH_DUMP + static int kexec_mark_range(unsigned long start, unsigned long end, bool protect) { @@ -552,6 +554,7 @@ void arch_kexec_unprotect_crashkres(void) { kexec_mark_crashkres(false); } +#endif /* * During a traditional boot under SME, SME will encrypt the kernel, diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 830425e6d38e2f..1287b0d5962f7f 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -796,7 +796,7 @@ struct machine_ops machine_ops __ro_after_init = { .emergency_restart = native_machine_emergency_restart, .restart = native_machine_restart, .halt = native_machine_halt, -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP .crash_shutdown = native_machine_crash_shutdown, #endif }; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 84201071dfacd1..899d839a2954a7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -471,7 +471,7 @@ static void __init arch_reserve_crashkernel(void) bool high = false; int ret; - if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) return; ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 96a771f9f930a6..52c3823b721191 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -282,7 +282,7 @@ struct smp_ops smp_ops = { .smp_cpus_done = native_smp_cpus_done, .stop_other_cpus = native_stop_other_cpus, -#if defined(CONFIG_KEXEC_CORE) +#if defined(CONFIG_CRASH_DUMP) .crash_stop_other_cpus = kdump_nmi_shootdown_cpus, #endif .smp_send_reschedule = native_smp_send_reschedule, diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 3f8c34707c5001..09e3db7ff99066 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -149,12 +149,14 @@ static void xen_hvm_shutdown(void) xen_reboot(SHUTDOWN_soft_reset); } +#ifdef CONFIG_CRASH_DUMP static void xen_hvm_crash_shutdown(struct pt_regs *regs) { native_machine_crash_shutdown(regs); xen_reboot(SHUTDOWN_soft_reset); } #endif +#endif static int xen_cpu_up_prepare_hvm(unsigned int cpu) { @@ -236,8 +238,10 @@ static void __init xen_hvm_guest_init(void) #ifdef CONFIG_KEXEC_CORE machine_ops.shutdown = xen_hvm_shutdown; +#ifdef CONFIG_CRASH_DUMP machine_ops.crash_shutdown = xen_hvm_crash_shutdown; #endif +#endif } static __init int xen_parse_nopv(char *arg) From f4648fe8ad439321fc02d6190ae786ef61af20c8 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 30 Jan 2024 10:59:59 +0800 Subject: [PATCH 1021/1406] x86, crash: don't nest CONFIG_CRASH_DUMP ifdef inside CONFIG_KEXEC_CORE ifdef scope Michael pointed out that the CONFIG_CRASH_DUMP ifdef is nested inside the CONFIG_KEXEC_CORE ifdef scope in some Xen and Hyper-V code.
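Schematically, the fix replaces the nested layout with sibling ifdef blocks (an illustrative sketch distilled from the enlighten_hvm.c hunks below, not an extra change):

/* Before: CONFIG_CRASH_DUMP nested inside CONFIG_KEXEC_CORE */
#ifdef CONFIG_KEXEC_CORE
	machine_ops.shutdown = xen_hvm_shutdown;
#ifdef CONFIG_CRASH_DUMP
	machine_ops.crash_shutdown = xen_hvm_crash_shutdown;
#endif
#endif

/* After: two independent, sibling ifdef blocks */
#ifdef CONFIG_KEXEC_CORE
	machine_ops.shutdown = xen_hvm_shutdown;
#endif
#ifdef CONFIG_CRASH_DUMP
	machine_ops.crash_shutdown = xen_hvm_crash_shutdown;
#endif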
Although the nesting works well too, since CONFIG_CRASH_DUMP depends on CONFIG_KEXEC_CORE, it may cause confusion because there are places where it's not nested, and people may think it needs to be nested even though it doesn't have to. Fix that by moving the CONFIG_CRASH_DUMP ifdeffery out of the CONFIG_KEXEC_CORE ifdeffery scope. Also put the machine_crash_shutdown() definition inside the CONFIG_CRASH_DUMP ifdef scope instead of CONFIG_KEXEC_CORE. Also fix a build error Nathan reported, shown below, by replacing a CONFIG_KEXEC_CORE ifdef with a CONFIG_VMCORE_INFO ifdef.

====
$ curl -LSso .config https://git.alpinelinux.org/aports/plain/community/linux-edge/config-edge.x86_64
$ make -skj"$(nproc)" ARCH=x86_64 CROSS_COMPILE=x86_64-linux- olddefconfig all
..
x86_64-linux-ld: arch/x86/xen/mmu_pv.o: in function `paddr_vmcoreinfo_note':
mmu_pv.c:(.text+0x3af3): undefined reference to `vmcoreinfo_note'
====

Link: https://lore.kernel.org/all/SN6PR02MB4157931105FA68D72E3D3DB8D47B2@SN6PR02MB4157.namprd02.prod.outlook.com/T/#u Link: https://lore.kernel.org/all/20240126045551.GA126645@dev-arch.thelio-3990X/T/#u Link: https://lkml.kernel.org/r/ZbhmL/jQtZ7TFZqV@MiWiFi-R3L-srv Signed-off-by: Baoquan He Reviewed-by: Michael Kelley Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Klara Modin Cc: Nathan Chancellor Cc: Pingfan Liu Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/x86/kernel/cpu/mshyperv.c | 10 ++++++---- arch/x86/kernel/reboot.c | 2 +- arch/x86/xen/enlighten_hvm.c | 4 ++-- arch/x86/xen/mmu_pv.c | 2 +- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index f8163a59026ba5..2e8cd5a4ae859f 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -209,6 +209,7 @@ static void hv_machine_shutdown(void) if (kexec_in_progress) hyperv_cleanup(); } +#endif /* CONFIG_KEXEC_CORE */ #ifdef CONFIG_CRASH_DUMP static void hv_machine_crash_shutdown(struct pt_regs *regs) @@ -222,8 +223,7 @@ static void hv_machine_crash_shutdown(struct pt_regs *regs) /* Disable the hypercall page when there is only 1 active CPU.
*/ hyperv_cleanup(); } -#endif -#endif /* CONFIG_KEXEC_CORE */ +#endif /* CONFIG_CRASH_DUMP */ #endif /* CONFIG_HYPERV */ static uint32_t __init ms_hyperv_platform(void) @@ -497,9 +497,11 @@ static void __init ms_hyperv_init_platform(void) no_timer_check = 1; #endif -#if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_KEXEC_CORE) +#if IS_ENABLED(CONFIG_HYPERV) +#if defined(CONFIG_KEXEC_CORE) machine_ops.shutdown = hv_machine_shutdown; -#ifdef CONFIG_CRASH_DUMP +#endif +#if defined(CONFIG_CRASH_DUMP) machine_ops.crash_shutdown = hv_machine_crash_shutdown; #endif #endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 1287b0d5962f7f..f3130f762784a1 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -826,7 +826,7 @@ void machine_halt(void) machine_ops.halt(); } -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP void machine_crash_shutdown(struct pt_regs *regs) { machine_ops.crash_shutdown(regs); diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 09e3db7ff99066..0b367c1e086d61 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -148,6 +148,7 @@ static void xen_hvm_shutdown(void) if (kexec_in_progress) xen_reboot(SHUTDOWN_soft_reset); } +#endif #ifdef CONFIG_CRASH_DUMP static void xen_hvm_crash_shutdown(struct pt_regs *regs) @@ -156,7 +157,6 @@ static void xen_hvm_crash_shutdown(struct pt_regs *regs) xen_reboot(SHUTDOWN_soft_reset); } #endif -#endif static int xen_cpu_up_prepare_hvm(unsigned int cpu) { @@ -238,10 +238,10 @@ static void __init xen_hvm_guest_init(void) #ifdef CONFIG_KEXEC_CORE machine_ops.shutdown = xen_hvm_shutdown; +#endif #ifdef CONFIG_CRASH_DUMP machine_ops.crash_shutdown = xen_hvm_crash_shutdown; #endif -#endif } static __init int xen_parse_nopv(char *arg) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 72af496a160c8b..5744043deb6c85 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2520,7 +2520,7 @@ int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL_GPL(xen_remap_pfn); -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_VMCORE_INFO phys_addr_t paddr_vmcoreinfo_note(void) { if (xen_pv_domain()) From bbffebca10cce16191f72ef1f169760dbbe44b43 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:47 +0800 Subject: [PATCH 1022/1406] arm64, crash: wrap crash dumping code into crash related ifdefs Now that the crash code under the kernel/ folder has been split out from the kexec code, crash dumping can be separated from kexec reboot in config items on arm64 with some adjustments. Here, wrap the crash dumping code with CONFIG_CRASH_DUMP ifdeffery. Link: https://lkml.kernel.org/r/20240124051254.67105-8-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W.
Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/arm64/include/asm/kexec.h | 2 +- arch/arm64/kernel/machine_kexec.c | 2 +- arch/arm64/kernel/machine_kexec_file.c | 10 ++++++++-- arch/arm64/mm/init.c | 2 +- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h index 9ac9572a3bbee2..4d9cc7a76d9ca1 100644 --- a/arch/arm64/include/asm/kexec.h +++ b/arch/arm64/include/asm/kexec.h @@ -80,7 +80,7 @@ static inline void crash_setup_regs(struct pt_regs *newregs, } } -#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_HIBERNATION) +#if defined(CONFIG_CRASH_DUMP) && defined(CONFIG_HIBERNATION) extern bool crash_is_nosave(unsigned long pfn); extern void crash_prepare_suspend(void); extern void crash_post_resume(void); diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index b38aae5b488d07..82e2203d86a31f 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -255,7 +255,7 @@ void machine_crash_shutdown(struct pt_regs *regs) pr_info("Starting crashdump kernel...\n"); } -#ifdef CONFIG_HIBERNATION +#if defined(CONFIG_CRASH_DUMP) && defined(CONFIG_HIBERNATION) /* * To preserve the crash dump kernel image, the relevant memory segments * should be mapped again around the hibernation. diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index 0e017358f4ba64..af1ca875c52ce2 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -39,6 +39,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) return kexec_image_post_load_cleanup_default(image); } +#ifdef CONFIG_CRASH_DUMP static int prepare_elf_headers(void **addr, unsigned long *sz) { struct crash_mem *cmem; @@ -80,6 +81,7 @@ static int prepare_elf_headers(void **addr, unsigned long *sz) kfree(cmem); return ret; } +#endif /* * Tries to add the initrd and DTB to the image. 
If it is not possible to find @@ -93,8 +95,8 @@ int load_other_segments(struct kimage *image, char *cmdline) { struct kexec_buf kbuf; - void *headers, *dtb = NULL; - unsigned long headers_sz, initrd_load_addr = 0, dtb_len, + void *dtb = NULL; + unsigned long initrd_load_addr = 0, dtb_len, orig_segments = image->nr_segments; int ret = 0; @@ -102,7 +104,10 @@ int load_other_segments(struct kimage *image, /* not allocate anything below the kernel */ kbuf.buf_min = kernel_load_addr + kernel_size; +#ifdef CONFIG_CRASH_DUMP /* load elf core header */ + void *headers; + unsigned long headers_sz; if (image->type == KEXEC_TYPE_CRASH) { ret = prepare_elf_headers(&headers, &headers_sz); if (ret) { @@ -130,6 +135,7 @@ int load_other_segments(struct kimage *image, kexec_dprintk("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n", image->elf_load_addr, kbuf.bufsz, kbuf.memsz); } +#endif /* load initrd */ if (initrd) { diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 74c1db8ce271d8..c1f6213e77f328 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -100,7 +100,7 @@ static void __init arch_reserve_crashkernel(void) bool high = false; int ret; - if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) return; ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), From e9f441866d095f4939edcb67e21f06faf1a6c8a8 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 29 Jan 2024 21:50:32 +0800 Subject: [PATCH 1023/1406] crash: fix building error in generic codes Nathan reported some building errors on arm64 as below: ========== $ curl -LSso .config https://github.com/archlinuxarm/PKGBUILDs/raw/master/core/linux-aarch64/config $ make -skj"$(nproc)" ARCH=arm64 CROSS_COMPILE=aarch64-linux- olddefconfig all .. aarch64-linux-ld: kernel/kexec_file.o: in function `kexec_walk_memblock.constprop.0': kexec_file.c:(.text+0x314): undefined reference to `crashk_res' .. aarch64-linux-ld: drivers/of/kexec.o: in function `of_kexec_alloc_and_setup_fdt': kexec.c:(.text+0x580): undefined reference to `crashk_res' .. aarch64-linux-ld: kexec.c:(.text+0x5c0): undefined reference to `crashk_low_res' ========== On the provided config, it has: === CONFIG_VMCORE_INFO=y CONFIG_KEXEC_CORE=y CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y === These crash related code blocks need to be put inside the CONFIG_CRASH_DUMP ifdeffery scope to avoid a build error when CONFIG_CRASH_DUMP is not set. Link: https://lkml.kernel.org/r/20240129135033.157195-2-bhe@redhat.com Signed-off-by: Baoquan He Reported-by: Nathan Chancellor Closes: https://lore.kernel.org/all/20240126045551.GA126645@dev-arch.thelio-3990X/T/#u Cc: Al Viro Cc: Eric W.
Biederman Cc: Hari Bathini Cc: Klara Modin Cc: Michael Kelley Cc: Pingfan Liu Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- drivers/of/kexec.c | 2 ++ kernel/kexec_file.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c index 68278340cecfe5..9ccde2fd77cbf5 100644 --- a/drivers/of/kexec.c +++ b/drivers/of/kexec.c @@ -395,6 +395,7 @@ void *of_kexec_alloc_and_setup_fdt(const struct kimage *image, if (ret) goto out; +#ifdef CONFIG_CRASH_DUMP /* add linux,usable-memory-range */ ret = fdt_appendprop_addrrange(fdt, 0, chosen_node, "linux,usable-memory-range", crashk_res.start, @@ -410,6 +411,7 @@ void *of_kexec_alloc_and_setup_fdt(const struct kimage *image, if (ret) goto out; } +#endif } /* add bootargs */ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index ce7ce2ae27cdfe..2d1db05fbf04f3 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -540,8 +540,10 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, phys_addr_t mstart, mend; struct resource res = { }; +#ifdef CONFIG_CRASH_DUMP if (kbuf->image->type == KEXEC_TYPE_CRASH) return func(&crashk_res, kbuf); +#endif /* * Using MEMBLOCK_NONE will properly skip MEMBLOCK_DRIVER_MANAGED. See From 48d9c93763e3ebdd7939b18557a3364cfb0ff699 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:48 +0800 Subject: [PATCH 1024/1406] ppc, crash: enforce KEXEC and KEXEC_FILE to select CRASH_DUMP In PowerPC, the crash dumping and kexec reboot share code in arch_kexec_locate_mem_hole(), in which struct crash_mem is used. Here enforce KEXEC and KEXEC_FILE to select CRASH_DUMP for now. Link: https://lkml.kernel.org/r/20240124051254.67105-9-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index a9efaf87966dfd..644b0cfba902ed 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -608,6 +608,10 @@ config PPC64_SUPPORTS_MEMORY_FAILURE config ARCH_SUPPORTS_KEXEC def_bool PPC_BOOK3S || PPC_E500 || (44x && !SMP) +config ARCH_SELECTS_KEXEC + def_bool y + select CRASH_DUMP + config ARCH_SUPPORTS_KEXEC_FILE def_bool PPC64 @@ -618,6 +622,7 @@ config ARCH_SELECTS_KEXEC_FILE def_bool y depends on KEXEC_FILE select KEXEC_ELF + select CRASH_DUMP select HAVE_IMA_KEXEC if IMA config PPC64_BIG_ENDIAN_ELF_ABI_V2 From 6ddc37cef1f3b6847b70a8742ac99b8ed76cfcb3 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 25 Jan 2024 22:29:07 +0800 Subject: [PATCH 1025/1406] ppc-crash-enforce-kexec-and-kexec_file-to-select-crash_dump-fix I reproduced the failure with allnoconfig on ppc, and found the below change can fix it too. And the change makes ARCH_SELECTS_KEXEC consistent with ARCH_SELECTS_KEXEC_FILE on the dependency. What do you think? Link: https://lkml.kernel.org/r/ZbJwMyCpz4HDySoo@MiWiFi-R3L-srv Signed-off-by: Baoquan He Reported-by: Stephen Rothwell Cc: Al Viro Cc: Eric W.
Biederman Cc: Hari Bathini Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Pingfan Liu Cc: Yang Li Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 644b0cfba902ed..c3b44eba053323 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -610,6 +610,7 @@ config ARCH_SUPPORTS_KEXEC config ARCH_SELECTS_KEXEC def_bool y + depends on KEXEC select CRASH_DUMP config ARCH_SUPPORTS_KEXEC_FILE From 3868960800c06935db22841f8efb42042c31410e Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:49 +0800 Subject: [PATCH 1026/1406] s390, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on s390 with some adjustments. Here wrap up crash dumping codes with CONFIG_CRASH_DUMP ifdeffery. Link: https://lkml.kernel.org/r/20240124051254.67105-10-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/s390/kernel/kexec_elf.c | 2 ++ arch/s390/kernel/kexec_image.c | 2 ++ arch/s390/kernel/machine_kexec_file.c | 10 ++++++++++ 3 files changed, 14 insertions(+) diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 9da6fa30c44749..4d364de4379921 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -40,8 +40,10 @@ static int kexec_file_add_kernel_elf(struct kimage *image, buf.bufsz = phdr->p_filesz; buf.mem = ALIGN(phdr->p_paddr, phdr->p_align); +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif buf.memsz = phdr->p_memsz; data->memsz = ALIGN(data->memsz, phdr->p_align) + buf.memsz; diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c index af23eff5774dba..a32ce8bea745cf 100644 --- a/arch/s390/kernel/kexec_image.c +++ b/arch/s390/kernel/kexec_image.c @@ -24,8 +24,10 @@ static int kexec_file_add_kernel_image(struct kimage *image, buf.bufsz = image->kernel_buf_len; buf.mem = 0; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif buf.memsz = buf.bufsz; data->kernel_buf = image->kernel_buf; diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index 8d207b82d9fedd..c2bac14dd668ae 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -105,6 +105,7 @@ static int kexec_file_update_purgatory(struct kimage *image, if (ret) return ret; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) { u64 crash_size; @@ -121,6 +122,7 @@ static int kexec_file_update_purgatory(struct kimage *image, sizeof(crash_size), false); } +#endif return ret; } @@ -134,8 +136,10 @@ static int kexec_file_add_purgatory(struct kimage *image, data->memsz = ALIGN(data->memsz, PAGE_SIZE); buf.mem = data->memsz; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif ret = kexec_load_purgatory(image, &buf); if (ret) @@ -158,8 +162,10 @@ static int kexec_file_add_initrd(struct kimage *image, data->memsz = ALIGN(data->memsz, PAGE_SIZE); buf.mem = data->memsz; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif buf.memsz = buf.bufsz; data->parm->initrd_start = 
data->memsz; @@ -223,8 +229,10 @@ static int kexec_file_add_ipl_report(struct kimage *image, data->kernel_buf + offsetof(struct lowcore, ipl_parmblock_ptr); *lc_ipl_parmblock_ptr = (__u32)buf.mem; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif ret = kexec_add_buffer(&buf); out: @@ -268,10 +276,12 @@ void *kexec_file_add_components(struct kimage *image, memcpy(data.parm->command_line, image->cmdline_buf, image->cmdline_buf_len); +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) { data.parm->oldmem_base = crashk_res.start; data.parm->oldmem_size = crashk_res.end - crashk_res.start + 1; } +#endif if (image->initrd_buf) { ret = kexec_file_add_initrd(image, &data); From 4083cf1ecd8501cac02888b8ce06bbe8bfad4f8e Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:50 +0800 Subject: [PATCH 1027/1406] sh, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on SuperH with some adjustments. Wrap up crash dumping codes with CONFIG_CRASH_DUMP ifdeffery, and use IS_ENABLED(CONFIG_CRASH_RESERVE) check to decide if compiling in the crashkernel reservation code. Link: https://lkml.kernel.org/r/20240124051254.67105-11-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/sh/kernel/machine_kexec.c | 3 +++ arch/sh/kernel/setup.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c index fa3a7b36190a2a..8daa8a6e6fa683 100644 --- a/arch/sh/kernel/machine_kexec.c +++ b/arch/sh/kernel/machine_kexec.c @@ -153,6 +153,9 @@ void __init reserve_crashkernel(void) unsigned long long crash_size, crash_base; int ret; + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) + return; + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base, NULL, NULL); if (ret == 0 && crash_size > 0) { diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c index d3175f09b3aad9..620e5cf8ae1e74 100644 --- a/arch/sh/kernel/setup.c +++ b/arch/sh/kernel/setup.c @@ -220,7 +220,7 @@ void __init __add_active_range(unsigned int nid, unsigned long start_pfn, request_resource(res, &code_resource); request_resource(res, &data_resource); request_resource(res, &bss_resource); -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_RESERVE request_resource(res, &crashk_res); #endif From af7c6249c0b7db1b20c632312235479995e47c17 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:51 +0800 Subject: [PATCH 1028/1406] mips, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on mips with some adjustments. Here use IS_ENABLED(CONFIG_CRASH_RESERVE) check to decide if compiling in the crashkernel reservation code. Link: https://lkml.kernel.org/r/20240124051254.67105-12-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. 
Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/mips/kernel/setup.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 9c30de15159761..12a1a4ffb60211 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -442,8 +442,6 @@ static void __init mips_reserve_vmcore(void) #endif } -#ifdef CONFIG_KEXEC - /* 64M alignment for crash kernel regions */ #define CRASH_ALIGN SZ_64M #define CRASH_ADDR_MAX SZ_512M @@ -454,6 +452,9 @@ static void __init mips_parse_crashkernel(void) unsigned long long crash_size, crash_base; int ret; + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) + return; + total_mem = memblock_phys_mem_size(); ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base, @@ -489,6 +490,9 @@ static void __init request_crashkernel(struct resource *res) { int ret; + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) + return; + if (crashk_res.start == crashk_res.end) return; @@ -498,15 +502,6 @@ static void __init request_crashkernel(struct resource *res) (unsigned long)(resource_size(&crashk_res) >> 20), (unsigned long)(crashk_res.start >> 20)); } -#else /* !defined(CONFIG_KEXEC) */ -static void __init mips_parse_crashkernel(void) -{ -} - -static void __init request_crashkernel(struct resource *res) -{ -} -#endif /* !defined(CONFIG_KEXEC) */ static void __init check_kernel_sections_mem(void) { From 4b0c67e6aeedfb28a0e6a96e688a572a17bc4e67 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:52 +0800 Subject: [PATCH 1029/1406] riscv, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on risc-v with some adjustments. Here wrap up crash dumping codes with CONFIG_CRASH_DUMP ifdeffery, and use IS_ENABLED(CONFIG_CRASH_RESERVE) check to decide if compiling in the crashkernel reservation code. Link: https://lkml.kernel.org/r/20240124051254.67105-13-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. 
Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/riscv/kernel/elf_kexec.c | 9 +++++++-- arch/riscv/mm/init.c | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/elf_kexec.c b/arch/riscv/kernel/elf_kexec.c index 5bd1ec3341fe9c..54260c16f9912a 100644 --- a/arch/riscv/kernel/elf_kexec.c +++ b/arch/riscv/kernel/elf_kexec.c @@ -117,6 +117,7 @@ static int elf_find_pbase(struct kimage *image, unsigned long kernel_len, return ret; } +#ifdef CONFIG_CRASH_DUMP static int get_nr_ram_ranges_callback(struct resource *res, void *arg) { unsigned int *nr_ranges = arg; @@ -189,6 +190,7 @@ static char *setup_kdump_cmdline(struct kimage *image, char *cmdline, cmdline_ptr[COMMAND_LINE_SIZE - 1] = '\0'; return cmdline_ptr; } +#endif static void *elf_kexec_load(struct kimage *image, char *kernel_buf, unsigned long kernel_len, char *initrd, @@ -196,12 +198,11 @@ static void *elf_kexec_load(struct kimage *image, char *kernel_buf, unsigned long cmdline_len) { int ret; + void *fdt; unsigned long old_kernel_pbase = ULONG_MAX; unsigned long new_kernel_pbase = 0UL; unsigned long initrd_pbase = 0UL; - unsigned long headers_sz; unsigned long kernel_start; - void *fdt, *headers; struct elfhdr ehdr; struct kexec_buf kbuf; struct kexec_elf_info elf_info; @@ -227,8 +228,11 @@ static void *elf_kexec_load(struct kimage *image, char *kernel_buf, kbuf.buf_min = new_kernel_pbase + kernel_len; kbuf.buf_max = ULONG_MAX; +#ifdef CONFIG_CRASH_DUMP /* Add elfcorehdr */ if (image->type == KEXEC_TYPE_CRASH) { + void *headers; + unsigned long headers_sz; ret = prepare_elf_headers(&headers, &headers_sz); if (ret) { pr_err("Preparing elf core header failed\n"); @@ -264,6 +268,7 @@ static void *elf_kexec_load(struct kimage *image, char *kernel_buf, } cmdline = modified_cmdline; } +#endif #ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY /* Add purgatory to the image */ diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index fa34cf55037bd3..eea8adae058be2 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1358,7 +1358,7 @@ static void __init arch_reserve_crashkernel(void) bool high = false; int ret; - if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) return; ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), From d61f2fb4b51359f7350e6377e52475c14aa80f38 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:53 +0800 Subject: [PATCH 1030/1406] arm, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on arm with some adjustments. Here use CONFIG_CRASH_RESERVE ifdef to replace CONFIG_KEXEC ifdef. Link: https://lkml.kernel.org/r/20240124051254.67105-14-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. 
Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/arm/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index ff2299ce1ad7a3..7b33b157fca0dc 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -979,7 +979,7 @@ static int __init init_machine_late(void) } late_initcall(init_machine_late); -#ifdef CONFIG_KEXEC +#ifdef CONFIG_CRASH_RESERVE /* * The crash region must be aligned to 128MB to avoid * zImage relocating below the reserved region. @@ -1066,7 +1066,7 @@ static void __init reserve_crashkernel(void) } #else static inline void reserve_crashkernel(void) {} -#endif /* CONFIG_KEXEC */ +#endif /* CONFIG_CRASH_RESERVE*/ void __init hyp_mode_check(void) { From d2366cb8409881d31c5dd91b3503716ee8e0888f Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:54 +0800 Subject: [PATCH 1031/1406] loongarch, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on loongarch with some adjustments. Here use IS_ENABLED(CONFIG_CRASH_RESERVE) check to decide if compiling in the crashkernel reservation code. Link: https://lkml.kernel.org/r/20240124051254.67105-15-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/loongarch/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index edf2bba8013067..57d37dd9f964d3 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -260,7 +260,7 @@ static void __init arch_reserve_crashkernel(void) char *cmdline = boot_command_line; bool high = false; - if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) return; ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), From ca38020d4bcf45483c717d307acbb02708e00a9b Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 29 Jan 2024 21:50:33 +0800 Subject: [PATCH 1032/1406] arch, crash: move arch_crash_save_vmcoreinfo() out to file vmcore_info.c Nathan reported the build error below: ==== $ curl -LSso .config https://git.alpinelinux.org/aports/plain/community/linux-edge/config-edge.armv7 $ make -skj"$(nproc)" ARCH=arm CROSS_COMPILE=arm-linux-gnueabi- olddefconfig all .. arm-linux-gnueabi-ld: arch/arm/kernel/machine_kexec.o: in function `arch_crash_save_vmcoreinfo': machine_kexec.c:(.text+0x488): undefined reference to `vmcoreinfo_append_str' ==== On architectures like arm, s390, ppc and sh, the function arch_crash_save_vmcoreinfo() is located in machine_kexec.c and can only be compiled in when CONFIG_KEXEC_CORE=y. That's not right because arch_crash_save_vmcoreinfo() is used to export arch specific vmcoreinfo. CONFIG_VMCORE_INFO is supposed to control its compiling in. However, CONFIG_VMCORE_INFO could be independent of CONFIG_KEXEC_CORE, e.g. CONFIG_PROC_KCORE=y will select CONFIG_VMCORE_INFO. Or, if CONFIG_KEXEC/CONFIG_KEXEC_FILE is set while CONFIG_CRASH_DUMP is not, it will report a linking error. So, on arm, s390, ppc and sh, move arch_crash_save_vmcoreinfo out to a new file vmcore_info.c.
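(In sketch form, using the Kbuild lines the diffs below add: the function used to be built only via obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o; after the move each architecture gains obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o, so the vmcoreinfo export is compiled in exactly when vmcoreinfo support is configured.)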
Let CONFIG_VMCORE_INFO decide if compiling in arch_crash_save_vmcoreinfo(). Link: https://lkml.kernel.org/r/20240129135033.157195-3-bhe@redhat.com Signed-off-by: Baoquan He Reported-by: Nathan Chancellor Closes: https://lore.kernel.org/all/20240126045551.GA126645@dev-arch.thelio-3990X/T/#u Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Klara Modin Cc: Michael Kelley Cc: Pingfan Liu Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/arm/kernel/Makefile | 1 + arch/arm/kernel/machine_kexec.c | 7 ------- arch/arm/kernel/vmcore_info.c | 10 ++++++++++ arch/powerpc/kexec/Makefile | 1 + arch/powerpc/kexec/core.c | 28 -------------------------- arch/powerpc/kexec/vmcore_info.c | 34 ++++++++++++++++++++++++++++++++ arch/s390/kernel/Makefile | 1 + arch/s390/kernel/machine_kexec.c | 15 -------------- arch/s390/kernel/vmcore_info.c | 23 +++++++++++++++++++++ arch/sh/kernel/Makefile | 1 + arch/sh/kernel/machine_kexec.c | 11 ----------- arch/sh/kernel/vmcore_info.c | 17 ++++++++++++++++ 12 files changed, 88 insertions(+), 61 deletions(-) create mode 100644 arch/arm/kernel/vmcore_info.c create mode 100644 arch/powerpc/kexec/vmcore_info.c create mode 100644 arch/s390/kernel/vmcore_info.c create mode 100644 arch/sh/kernel/vmcore_info.c diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index 771264d4726a73..6a9de826ffd3c0 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -60,6 +60,7 @@ obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o insn.o patch.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o insn.o patch.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o insn.o patch.o obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o # Main staffs in KPROBES are in arch/arm/probes/ . 
obj-$(CONFIG_KPROBES) += patch.o insn.o obj-$(CONFIG_OABI_COMPAT) += sys_oabi-compat.o diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c index 5d07cf9e0044d1..80ceb5bd2680bc 100644 --- a/arch/arm/kernel/machine_kexec.c +++ b/arch/arm/kernel/machine_kexec.c @@ -198,10 +198,3 @@ void machine_kexec(struct kimage *image) soft_restart(reboot_entry_phys); } - -void arch_crash_save_vmcoreinfo(void) -{ -#ifdef CONFIG_ARM_LPAE - VMCOREINFO_CONFIG(ARM_LPAE); -#endif -} diff --git a/arch/arm/kernel/vmcore_info.c b/arch/arm/kernel/vmcore_info.c new file mode 100644 index 00000000000000..1437aba47787fd --- /dev/null +++ b/arch/arm/kernel/vmcore_info.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include + +void arch_crash_save_vmcoreinfo(void) +{ +#ifdef CONFIG_ARM_LPAE + VMCOREINFO_CONFIG(ARM_LPAE); +#endif +} diff --git a/arch/powerpc/kexec/Makefile b/arch/powerpc/kexec/Makefile index 0c2abe7f990874..91e96f5168b753 100644 --- a/arch/powerpc/kexec/Makefile +++ b/arch/powerpc/kexec/Makefile @@ -8,6 +8,7 @@ obj-y += core.o crash.o core_$(BITS).o obj-$(CONFIG_PPC32) += relocate_32.o obj-$(CONFIG_KEXEC_FILE) += file_load.o ranges.o file_load_$(BITS).o elf_$(BITS).o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o # Disable GCOV, KCOV & sanitizers in odd or sensitive code GCOV_PROFILE_core_$(BITS).o := n diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c index 27fa9098a5b74b..3ff4411ed49671 100644 --- a/arch/powerpc/kexec/core.c +++ b/arch/powerpc/kexec/core.c @@ -53,34 +53,6 @@ void machine_kexec_cleanup(struct kimage *image) { } -void arch_crash_save_vmcoreinfo(void) -{ - -#ifdef CONFIG_NUMA - VMCOREINFO_SYMBOL(node_data); - VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); -#endif -#ifndef CONFIG_NUMA - VMCOREINFO_SYMBOL(contig_page_data); -#endif -#if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP) - VMCOREINFO_SYMBOL(vmemmap_list); - VMCOREINFO_SYMBOL(mmu_vmemmap_psize); - VMCOREINFO_SYMBOL(mmu_psize_defs); - VMCOREINFO_STRUCT_SIZE(vmemmap_backing); - VMCOREINFO_OFFSET(vmemmap_backing, list); - VMCOREINFO_OFFSET(vmemmap_backing, phys); - VMCOREINFO_OFFSET(vmemmap_backing, virt_addr); - VMCOREINFO_STRUCT_SIZE(mmu_psize_def); - VMCOREINFO_OFFSET(mmu_psize_def, shift); -#endif - VMCOREINFO_SYMBOL(cur_cpu_spec); - VMCOREINFO_OFFSET(cpu_spec, cpu_features); - VMCOREINFO_OFFSET(cpu_spec, mmu_features); - vmcoreinfo_append_str("NUMBER(RADIX_MMU)=%d\n", early_radix_enabled()); - vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); -} - /* * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. 
diff --git a/arch/powerpc/kexec/vmcore_info.c b/arch/powerpc/kexec/vmcore_info.c new file mode 100644 index 00000000000000..c15f0adaaab50f --- /dev/null +++ b/arch/powerpc/kexec/vmcore_info.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +void arch_crash_save_vmcoreinfo(void) +{ + +#ifdef CONFIG_NUMA + VMCOREINFO_SYMBOL(node_data); + VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); +#endif +#ifndef CONFIG_NUMA + VMCOREINFO_SYMBOL(contig_page_data); +#endif +#if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP) + VMCOREINFO_SYMBOL(vmemmap_list); + VMCOREINFO_SYMBOL(mmu_vmemmap_psize); + VMCOREINFO_SYMBOL(mmu_psize_defs); + VMCOREINFO_STRUCT_SIZE(vmemmap_backing); + VMCOREINFO_OFFSET(vmemmap_backing, list); + VMCOREINFO_OFFSET(vmemmap_backing, phys); + VMCOREINFO_OFFSET(vmemmap_backing, virt_addr); + VMCOREINFO_STRUCT_SIZE(mmu_psize_def); + VMCOREINFO_OFFSET(mmu_psize_def, shift); +#endif + VMCOREINFO_SYMBOL(cur_cpu_spec); + VMCOREINFO_OFFSET(cpu_spec, cpu_features); + VMCOREINFO_OFFSET(cpu_spec, mmu_features); + vmcoreinfo_append_str("NUMBER(RADIX_MMU)=%d\n", early_radix_enabled()); + vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); +} + + diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 7a562b4199c81b..fa029d0dc28ff9 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -64,6 +64,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o obj-$(CONFIG_FUNCTION_TRACER) += mcount.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index aa22ffc16bcd3d..10277a4602049b 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -209,21 +209,6 @@ void machine_kexec_cleanup(struct kimage *image) { } -void arch_crash_save_vmcoreinfo(void) -{ - struct lowcore *abs_lc; - - VMCOREINFO_SYMBOL(lowcore_ptr); - VMCOREINFO_SYMBOL(high_memory); - VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); - vmcoreinfo_append_str("SAMODE31=%lx\n", (unsigned long)__samode31); - vmcoreinfo_append_str("EAMODE31=%lx\n", (unsigned long)__eamode31); - vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); - abs_lc = get_abs_lowcore(); - abs_lc->vmcore_info = paddr_vmcoreinfo_note(); - put_abs_lowcore(abs_lc); -} - void machine_shutdown(void) { } diff --git a/arch/s390/kernel/vmcore_info.c b/arch/s390/kernel/vmcore_info.c new file mode 100644 index 00000000000000..eccb6b20b50529 --- /dev/null +++ b/arch/s390/kernel/vmcore_info.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include + +void arch_crash_save_vmcoreinfo(void) +{ + struct lowcore *abs_lc; + + VMCOREINFO_SYMBOL(lowcore_ptr); + VMCOREINFO_SYMBOL(high_memory); + VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); + vmcoreinfo_append_str("SAMODE31=%lx\n", (unsigned long)__samode31); + vmcoreinfo_append_str("EAMODE31=%lx\n", (unsigned long)__eamode31); + vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); + abs_lc = get_abs_lowcore(); + abs_lc->vmcore_info = paddr_vmcoreinfo_note(); + put_abs_lowcore(abs_lc); +} + + diff --git a/arch/sh/kernel/Makefile b/arch/sh/kernel/Makefile index 2d7e70537de04c..ba917008d63ed9 100644 --- a/arch/sh/kernel/Makefile +++ b/arch/sh/kernel/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_SH_STANDARD_BIOS) += sh_bios.o obj-$(CONFIG_KGDB) += 
kgdb.o obj-$(CONFIG_MODULES) += sh_ksyms_32.o module.o obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_IO_TRAPPED) += io_trapped.o diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c index 8daa8a6e6fa683..8321b31d2e19dc 100644 --- a/arch/sh/kernel/machine_kexec.c +++ b/arch/sh/kernel/machine_kexec.c @@ -137,17 +137,6 @@ void machine_kexec(struct kimage *image) __ftrace_enabled_restore(save_ftrace_enabled); } -void arch_crash_save_vmcoreinfo(void) -{ -#ifdef CONFIG_NUMA - VMCOREINFO_SYMBOL(node_data); - VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); -#endif -#ifdef CONFIG_X2TLB - VMCOREINFO_CONFIG(X2TLB); -#endif -} - void __init reserve_crashkernel(void) { unsigned long long crash_size, crash_base; diff --git a/arch/sh/kernel/vmcore_info.c b/arch/sh/kernel/vmcore_info.c new file mode 100644 index 00000000000000..04c4387e63153d --- /dev/null +++ b/arch/sh/kernel/vmcore_info.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +void arch_crash_save_vmcoreinfo(void) +{ +#ifdef CONFIG_NUMA + VMCOREINFO_SYMBOL(node_data); + VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); +#endif +#ifdef CONFIG_X2TLB + VMCOREINFO_CONFIG(X2TLB); +#endif +} + + From 8b35c2c849812225d7fe086415eaa63154b162d1 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 15 Feb 2024 17:33:42 -0800 Subject: [PATCH 1033/1406] arch-crash-move-arch_crash_save_vmcoreinfo-out-to-file-vmcore_infoc-fix remove stray newlines at eof Cc: Al Viro Cc: Baoquan He Cc: Eric W. Biederman Cc: Hari Bathini Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Pingfan Liu Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/powerpc/kexec/vmcore_info.c | 2 -- arch/s390/kernel/vmcore_info.c | 2 -- arch/sh/kernel/vmcore_info.c | 2 -- 3 files changed, 6 deletions(-) diff --git a/arch/powerpc/kexec/vmcore_info.c b/arch/powerpc/kexec/vmcore_info.c index c15f0adaaab50f..2b65d2adca5e29 100644 --- a/arch/powerpc/kexec/vmcore_info.c +++ b/arch/powerpc/kexec/vmcore_info.c @@ -30,5 +30,3 @@ void arch_crash_save_vmcoreinfo(void) vmcoreinfo_append_str("NUMBER(RADIX_MMU)=%d\n", early_radix_enabled()); vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); } - - diff --git a/arch/s390/kernel/vmcore_info.c b/arch/s390/kernel/vmcore_info.c index eccb6b20b50529..d296dfc22191ca 100644 --- a/arch/s390/kernel/vmcore_info.c +++ b/arch/s390/kernel/vmcore_info.c @@ -19,5 +19,3 @@ void arch_crash_save_vmcoreinfo(void) abs_lc->vmcore_info = paddr_vmcoreinfo_note(); put_abs_lowcore(abs_lc); } - - diff --git a/arch/sh/kernel/vmcore_info.c b/arch/sh/kernel/vmcore_info.c index 04c4387e63153d..a244a204a1b127 100644 --- a/arch/sh/kernel/vmcore_info.c +++ b/arch/sh/kernel/vmcore_info.c @@ -13,5 +13,3 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_CONFIG(X2TLB); #endif } - - From 1e95966fa0ae0ea4512a871666d4976ba4b940b4 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 19 Jan 2024 11:22:22 +0000 Subject: [PATCH 1034/1406] mm/zswap: make sure each swapfile always have zswap rb-tree Patch series "mm/zswap: optimize the scalability of zswap rb-tree", v2. When testing the zswap performance by using kernel build -j32 in a tmpfs directory, I found the scalability of zswap rb-tree is not good, which is protected by the only spinlock. That would cause heavy lock contention if multiple tasks zswap_store/load concurrently. 
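For illustration only, here is a simplified sketch of the pre-series layout in mm/zswap.c (not part of the patch): every store and load for a given swap type funnels through a single tree and a single lock.

struct zswap_tree {
	struct rb_root rbroot;	/* entries keyed by swap offset */
	spinlock_t lock;	/* taken by every zswap_store()/zswap_load() */
};

static struct zswap_tree *zswap_trees[MAX_SWAPFILES];	/* one tree per swapfile */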
So a simple solution is to split the single zswap rb-tree into multiple rb-trees, each corresponding to SWAP_ADDRESS_SPACE_PAGES (64M). This idea is from the commit 4b3ef9daa4fc ("mm/swap: split swap cache into 64MB trunks"). Although this method can't solve the spinlock contention completely, it can mitigate much of that contention. Below are the results of a kernel build in tmpfs with the zswap shrinker enabled: linux-next zswap-lock-optimize real 1m9.181s 1m3.820s user 17m44.036s 17m40.100s sys 7m37.297s 4m54.622s So there are clearly improvements. And it's complementary with the ongoing zswap xarray conversion by Chris. Anyway, I think we can also merge this first, it's complementary IMHO. So I just refresh and resend this for further discussion. This patch (of 2): Not all zswap interfaces can handle the absence of the zswap rb-tree, actually only zswap_store() has handled it for now. To make things simple, we make sure each swapfile always has the zswap rb-tree prepared before being enabled and used. The preparation is unlikely to fail in practice; this patch just makes it explicit. Link: https://lkml.kernel.org/r/20240117-b4-zswap-lock-optimize-v2-0-b5cc55479090@bytedance.com Link: https://lkml.kernel.org/r/20240117-b4-zswap-lock-optimize-v2-1-b5cc55479090@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Nhat Pham Acked-by: Johannes Weiner Acked-by: Yosry Ahmed Cc: Chris Li Signed-off-by: Andrew Morton --- include/linux/zswap.h | 7 +++++-- mm/swapfile.c | 10 +++++++--- mm/zswap.c | 8 +++----- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 0b709f5bc65fac..eca388229d9a76 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -30,7 +30,7 @@ struct zswap_lruvec_state { bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); -void zswap_swapon(int type); +int zswap_swapon(int type); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); @@ -51,7 +51,10 @@ static inline bool zswap_load(struct folio *folio) } static inline void zswap_invalidate(int type, pgoff_t offset) {} -static inline void zswap_swapon(int type) {} +static inline int zswap_swapon(int type) +{ + return 0; +} static inline void zswap_swapoff(int type) {} static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} diff --git a/mm/swapfile.c b/mm/swapfile.c index 746aa9da530255..b3a83c5dcbb8a2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2348,8 +2348,6 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info) { - zswap_swapon(p->type); - spin_lock(&swap_lock); spin_lock(&p->lock); setup_swap_info(p, prio, swap_map, cluster_info); @@ -3167,6 +3165,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (error) goto bad_swap_unlock_inode; + error = zswap_swapon(p->type); + if (error) + goto free_swap_address_space; + /* * Flush any pending IO and dirty mappings before we start using this * swap device.
@@ -3175,7 +3177,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = inode_drain_writes(inode); if (error) { inode->i_flags &= ~S_SWAPFILE; - goto free_swap_address_space; + goto free_swap_zswap; } mutex_lock(&swapon_mutex); @@ -3199,6 +3201,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = 0; goto out; +free_swap_zswap: + zswap_swapoff(p->type); free_swap_address_space: exit_swap_address_space(p->type); bad_swap_unlock_inode: diff --git a/mm/zswap.c b/mm/zswap.c index 2c3d77c6fe72cd..5a40a7b4bae8cd 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1518,9 +1518,6 @@ bool zswap_store(struct folio *folio) if (folio_test_large(folio)) return false; - if (!tree) - return false; - /* * If this is a duplicate, it must be removed before attempting to store * it, otherwise, if the store fails the old page won't be removed from @@ -1775,19 +1772,20 @@ void zswap_invalidate(int type, pgoff_t offset) spin_unlock(&tree->lock); } -void zswap_swapon(int type) +int zswap_swapon(int type) { struct zswap_tree *tree; tree = kzalloc(sizeof(*tree), GFP_KERNEL); if (!tree) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); - return; + return -ENOMEM; } tree->rbroot = RB_ROOT; spin_lock_init(&tree->lock); zswap_trees[type] = tree; + return 0; } void zswap_swapoff(int type) From 22a17d55b12585883904852af884d82276ea885c Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 19 Jan 2024 11:22:23 +0000 Subject: [PATCH 1035/1406] mm/zswap: split zswap rb-tree Each swapfile has one rb-tree to search the mapping of swp_entry_t to zswap_entry, protected by a spinlock, which can cause heavy lock contention if multiple tasks zswap_store/load concurrently. Optimize the scalability problem by splitting the zswap rb-tree into multiple rb-trees, each corresponding to SWAP_ADDRESS_SPACE_PAGES (64M), just like we did in the swap cache address_space splitting. Although this method can't solve the spinlock contention completely, it can mitigate much of that contention. Below are the results of a kernel build in tmpfs with the zswap shrinker enabled: linux-next zswap-lock-optimize real 1m9.181s 1m3.820s user 17m44.036s 17m40.100s sys 7m37.297s 4m54.622s So there are clearly improvements.
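As a worked example (assuming the usual 4 KiB page size, so SWAP_ADDRESS_SPACE_PAGES = 16384 pages = 64M per tree): a 4 GiB swapfile covers 1048576 pages, so zswap_swapon() allocates DIV_ROUND_UP(1048576, 16384) = 64 trees, and an entry at swap offset 300000 is looked up in tree 300000 >> SWAP_ADDRESS_SPACE_SHIFT = 18. Stores and loads that touch different 64M ranges of the swapfile now take different locks.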
Link: https://lkml.kernel.org/r/20240117-b4-zswap-lock-optimize-v2-2-b5cc55479090@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Acked-by: Nhat Pham Acked-by: Yosry Ahmed Cc: Chris Li Signed-off-by: Andrew Morton --- include/linux/zswap.h | 4 +-- mm/swapfile.c | 2 +- mm/zswap.c | 71 ++++++++++++++++++++++++++++--------------- 3 files changed, 49 insertions(+), 28 deletions(-) diff --git a/include/linux/zswap.h b/include/linux/zswap.h index eca388229d9a76..91895ce1fdbc4f 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -30,7 +30,7 @@ struct zswap_lruvec_state { bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); -int zswap_swapon(int type); +int zswap_swapon(int type, unsigned long nr_pages); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); @@ -51,7 +51,7 @@ static inline bool zswap_load(struct folio *folio) } static inline void zswap_invalidate(int type, pgoff_t offset) {} -static inline int zswap_swapon(int type) +static inline int zswap_swapon(int type, unsigned long nr_pages) { return 0; } diff --git a/mm/swapfile.c b/mm/swapfile.c index b3a83c5dcbb8a2..0c6dde8b8604d8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3165,7 +3165,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (error) goto bad_swap_unlock_inode; - error = zswap_swapon(p->type); + error = zswap_swapon(p->type, maxpages); if (error) goto free_swap_address_space; diff --git a/mm/zswap.c b/mm/zswap.c index 5a40a7b4bae8cd..464179d4339979 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -239,6 +239,7 @@ struct zswap_tree { }; static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; +static unsigned int nr_zswap_trees[MAX_SWAPFILES]; /* RCU-protected iteration */ static LIST_HEAD(zswap_pools); @@ -265,6 +266,12 @@ static bool zswap_has_pool; * helpers and fwd declarations **********************************/ +static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) +{ + return &zswap_trees[swp_type(swp)][swp_offset(swp) + >> SWAP_ADDRESS_SPACE_SHIFT]; +} + #define zswap_pool_debug(msg, p) \ pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ zpool_get_type((p)->zpools[0])) @@ -864,7 +871,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o * until the entry is verified to still be alive in the tree. 
*/ swpoffset = swp_offset(entry->swpentry); - tree = zswap_trees[swp_type(entry->swpentry)]; + tree = swap_zswap_tree(entry->swpentry); list_lru_isolate(l, item); /* * It's safe to drop the lock here because we return either @@ -1493,10 +1500,9 @@ static void zswap_fill_page(void *ptr, unsigned long value) bool zswap_store(struct folio *folio) { swp_entry_t swp = folio->swap; - int type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; - struct zswap_tree *tree = zswap_trees[type]; + struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry, *dupentry; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; @@ -1569,7 +1575,7 @@ bool zswap_store(struct folio *folio) src = kmap_local_page(page); if (zswap_is_page_same_filled(src, &value)) { kunmap_local(src); - entry->swpentry = swp_entry(type, offset); + entry->swpentry = swp; entry->length = 0; entry->value = value; atomic_inc(&zswap_same_filled_pages); @@ -1651,7 +1657,7 @@ bool zswap_store(struct folio *folio) mutex_unlock(&acomp_ctx->mutex); /* populate entry */ - entry->swpentry = swp_entry(type, offset); + entry->swpentry = swp; entry->handle = handle; entry->length = dlen; @@ -1711,10 +1717,9 @@ bool zswap_store(struct folio *folio) bool zswap_load(struct folio *folio) { swp_entry_t swp = folio->swap; - int type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; - struct zswap_tree *tree = zswap_trees[type]; + struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry; u8 *dst; @@ -1757,7 +1762,7 @@ bool zswap_load(struct folio *folio) void zswap_invalidate(int type, pgoff_t offset) { - struct zswap_tree *tree = zswap_trees[type]; + struct zswap_tree *tree = swap_zswap_tree(swp_entry(type, offset)); struct zswap_entry *entry; /* find */ @@ -1772,37 +1777,53 @@ void zswap_invalidate(int type, pgoff_t offset) spin_unlock(&tree->lock); } -int zswap_swapon(int type) +int zswap_swapon(int type, unsigned long nr_pages) { - struct zswap_tree *tree; + struct zswap_tree *trees, *tree; + unsigned int nr, i; - tree = kzalloc(sizeof(*tree), GFP_KERNEL); - if (!tree) { + nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); + trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL); + if (!trees) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); return -ENOMEM; } - tree->rbroot = RB_ROOT; - spin_lock_init(&tree->lock); - zswap_trees[type] = tree; + for (i = 0; i < nr; i++) { + tree = trees + i; + tree->rbroot = RB_ROOT; + spin_lock_init(&tree->lock); + } + + nr_zswap_trees[type] = nr; + zswap_trees[type] = trees; return 0; } void zswap_swapoff(int type) { - struct zswap_tree *tree = zswap_trees[type]; - struct zswap_entry *entry, *n; + struct zswap_tree *trees = zswap_trees[type]; + unsigned int i; - if (!tree) + if (!trees) return; - /* walk the tree and free everything */ - spin_lock(&tree->lock); - rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) - zswap_free_entry(entry); - tree->rbroot = RB_ROOT; - spin_unlock(&tree->lock); - kfree(tree); + for (i = 0; i < nr_zswap_trees[type]; i++) { + struct zswap_tree *tree = trees + i; + struct zswap_entry *entry, *n; + + /* walk the tree and free everything */ + spin_lock(&tree->lock); + rbtree_postorder_for_each_entry_safe(entry, n, + &tree->rbroot, + rbnode) + zswap_free_entry(entry); + tree->rbroot = RB_ROOT; + spin_unlock(&tree->lock); + } + + kvfree(trees); + nr_zswap_trees[type] = 0; zswap_trees[type] = NULL; } From 
4acb2cbf51fe6336aec378cc7fbda6fe7e99b062 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 24 Jan 2024 04:51:11 +0000 Subject: [PATCH 1036/1406] mm: swap: enforce updating inuse_pages at the end of swap_range_free() Patch series "mm: zswap: simplify zswap_swapoff()", v2. These patches aim to simplify zswap_swapoff() by removing the unnecessary trees cleanup code. Patch 1 makes sure that the order of operations during swapoff is enforced correctly, making sure the simplification in patch 2 is correct in a future-proof manner. This patch (of 2): In swap_range_free(), we update inuse_pages then do some cleanups (arch invalidation, zswap invalidation, swap cache cleanups, etc). During swapoff, try_to_unuse() checks that inuse_pages is 0 to make sure all swap entries are freed. Make sure we only update inuse_pages after we are done with the cleanups in swap_range_free(), and use the proper memory barriers to enforce it. This makes sure that code following try_to_unuse() can safely assume that swap_range_free() ran for all entries in the swapfile (e.g. swap cache cleanup, zswap_swapoff()). In practice, this currently isn't a problem because swap_range_free() is called with the swap info lock held, and the swapoff code happens to spin for that after try_to_unuse(). However, this seems fragile and unintentional, so make it more reliable and future-proof. This also facilitates a following simplification of zswap_swapoff(). Link: https://lkml.kernel.org/r/20240124045113.415378-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20240124045113.415378-2-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: "Huang, Ying" Cc: Chengming Zhou Cc: Chris Li Cc: Johannes Weiner Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swapfile.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 0c6dde8b8604d8..a8edaf4e5b8ad9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -737,8 +737,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, if (was_full && (si->flags & SWP_WRITEOK)) add_to_avail_list(si); } - atomic_long_add(nr_entries, &nr_swap_pages); - WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); if (si->flags & SWP_BLKDEV) swap_slot_free_notify = si->bdev->bd_disk->fops->swap_slot_free_notify; @@ -752,6 +750,14 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, offset++; } clear_shadow_from_swap_cache(si->type, begin, end); + + /* + * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 + * only after the above cleanups are done. + */ + smp_wmb(); + atomic_long_add(nr_entries, &nr_swap_pages); + WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); } static void set_cluster_next(struct swap_info_struct *si, unsigned long next) @@ -2049,7 +2055,7 @@ static int try_to_unuse(unsigned int type) unsigned int i; if (!READ_ONCE(si->inuse_pages)) - return 0; + goto success; retry: retval = shmem_unuse(type); @@ -2130,6 +2136,12 @@ static int try_to_unuse(unsigned int type) return -EINTR; } +success: + /* + * Make sure that further cleanups after try_to_unuse() returns happen + * after swap_range_free() reduces si->inuse_pages to 0.
+ */ + smp_mb(); return 0; } From e08ec8ea71842fdc027b788d39bc90e014308a1d Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 24 Jan 2024 04:51:12 +0000 Subject: [PATCH 1037/1406] mm: zswap: remove unnecessary trees cleanups in zswap_swapoff() During swapoff, try_to_unuse() makes sure that zswap_invalidate() is called for all swap entries before zswap_swapoff() is called. This means that all zswap entries should already be removed from the tree. Simplify zswap_swapoff() by removing the trees cleanup code, and leave an assertion in its place. Link: https://lkml.kernel.org/r/20240124045113.415378-3-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Chengming Zhou Cc: Chris Li Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/zswap.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 464179d4339979..0e4a869b6fd8aa 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1808,19 +1808,9 @@ void zswap_swapoff(int type) if (!trees) return; - for (i = 0; i < nr_zswap_trees[type]; i++) { - struct zswap_tree *tree = trees + i; - struct zswap_entry *entry, *n; - - /* walk the tree and free everything */ - spin_lock(&tree->lock); - rbtree_postorder_for_each_entry_safe(entry, n, - &tree->rbroot, - rbnode) - zswap_free_entry(entry); - tree->rbroot = RB_ROOT; - spin_unlock(&tree->lock); - } + /* try_to_unuse() invalidated all the entries already */ + for (i = 0; i < nr_zswap_trees[type]; i++) + WARN_ON_ONCE(!RB_EMPTY_ROOT(&trees[i].rbroot)); kvfree(trees); nr_zswap_trees[type] = 0; From 1df36cce6cfd17504f96feb62806210bbe2133cc Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Wed, 24 Jan 2024 11:57:19 +0800 Subject: [PATCH 1038/1406] mm/mmap: introduce vma_set_range() There is a lot of code needs to set the range of vma in mmap.c, introduce vma_set_range() to simplify the code. Link: https://lkml.kernel.org/r/20240124035719.3685193-1-yajun.deng@linux.dev Signed-off-by: Yajun Deng Reviewed-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- mm/internal.h | 9 +++++++++ mm/mmap.c | 29 +++++++---------------------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index f309a010d50fb6..1e29c5821a1dde 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1114,6 +1114,15 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma, extern bool mirrored_kernelcore; extern bool memblock_has_mirror(void); +static __always_inline void vma_set_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + pgoff_t pgoff) +{ + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; +} + static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) { /* diff --git a/mm/mmap.c b/mm/mmap.c index 66f534ec90a55e..476de5daf598d1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -663,9 +663,7 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, 0); - vma->vm_start = start; - vma->vm_end = end; - vma->vm_pgoff = pgoff; + vma_set_range(vma, start, end, pgoff); vma_iter_store(vmi, vma); vma_complete(&vp, vmi, vma->vm_mm); @@ -708,9 +706,7 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_adjust_trans_huge(vma, start, end, 0); vma_iter_clear(vmi); - vma->vm_start = start; - vma->vm_end = end; - vma->vm_pgoff = pgoff; + vma_set_range(vma, start, end, pgoff); vma_complete(&vp, vmi, vma->vm_mm); return 0; } @@ -1015,10 +1011,7 @@ static struct vm_area_struct vma_prepare(&vp); vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start); - - vma->vm_start = vma_start; - vma->vm_end = vma_end; - vma->vm_pgoff = vma_pgoff; + vma_set_range(vma, vma_start, vma_end, vma_pgoff); if (vma_expanded) vma_iter_store(vmi, vma); @@ -2811,11 +2804,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } vma_iter_config(&vmi, addr, end); - vma->vm_start = addr; - vma->vm_end = end; + vma_set_range(vma, addr, end, pgoff); vm_flags_init(vma, vm_flags); vma->vm_page_prot = vm_get_page_prot(vm_flags); - vma->vm_pgoff = pgoff; if (file) { vma->vm_file = get_file(file); @@ -3165,9 +3156,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, goto unacct_fail; vma_set_anonymous(vma); - vma->vm_start = addr; - vma->vm_end = addr + len; - vma->vm_pgoff = addr >> PAGE_SHIFT; + vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); vm_flags_init(vma, flags); vma->vm_page_prot = vm_get_page_prot(flags); vma_start_write(vma); @@ -3404,9 +3393,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma = vm_area_dup(vma); if (!new_vma) goto out; - new_vma->vm_start = addr; - new_vma->vm_end = addr + len; - new_vma->vm_pgoff = pgoff; + vma_set_range(new_vma, addr, addr + len, pgoff); if (vma_dup_policy(vma, new_vma)) goto out_free_vma; if (anon_vma_clone(new_vma, vma)) @@ -3574,9 +3561,7 @@ static struct vm_area_struct *__install_special_mapping( if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); - vma->vm_start = addr; - vma->vm_end = addr + len; - + vma_set_range(vma, addr, addr + len, 0); vm_flags_init(vma, (vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); From b71af5805eeddd17e42cba71c2b10c23cf1153e6 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 25 Jan 2024 08:14:23 +0000 Subject: [PATCH 1039/1406] mm: zswap: remove unused tree argument in zswap_entry_put() Commit 7310895779624 ("mm: zswap: tighten up entry 
invalidation") removed the usage of tree argument, delete it. Link: https://lkml.kernel.org/r/20240125081423.1200336-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Signed-off-by: Andrew Morton --- mm/zswap.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 0e4a869b6fd8aa..bccef2af43cc37 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -568,8 +568,7 @@ static void zswap_entry_get(struct zswap_entry *entry) /* caller must hold the tree lock * remove from the tree and free it, if nobody reference the entry */ -static void zswap_entry_put(struct zswap_tree *tree, - struct zswap_entry *entry) +static void zswap_entry_put(struct zswap_entry *entry) { int refcount = --entry->refcount; @@ -852,7 +851,7 @@ static void zswap_invalidate_entry(struct zswap_tree *tree, struct zswap_entry *entry) { if (zswap_rb_erase(&tree->rbroot, entry)) - zswap_entry_put(tree, entry); + zswap_entry_put(entry); } static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, @@ -921,7 +920,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o put_unlock: /* Drop local reference */ - zswap_entry_put(tree, entry); + zswap_entry_put(entry); unlock: spin_unlock(&tree->lock); spin_lock(lock); @@ -1754,7 +1753,7 @@ bool zswap_load(struct folio *folio) zswap_lru_del(&entry->pool->list_lru, entry); zswap_lru_add(&entry->pool->list_lru, entry); } - zswap_entry_put(tree, entry); + zswap_entry_put(entry); spin_unlock(&tree->lock); return true; From d721fbbd6ddacea97fe7df66bd3db144b33992ef Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 24 Jan 2024 12:03:46 -0800 Subject: [PATCH 1040/1406] dax/bus.c: replace driver-core lock usage by a local rwsem Patch series "Add DAX ABI for memmap_on_memory", v7. This series adds sysfs ABI to control memmap_on_memory behavior for DAX devices. Patch 1 replaces incorrect device_lock() usage with a local rwsem - this was identified during review. Patch 2 is also a preparatory patch that replaces sprintf() for sysfs operations with sysfs_emit() Patch 3 adds the missing documentation for the sysfs ABI for DAX regions and Dax devices. Patch 4 exports mhp_supports_memmap_on_memory(). Patch 5 adds the new ABI for toggling memmap_on_memory semantics for dax devices. This patch (of 5): The dax driver incorrectly used driver-core device locks to protect internal dax region and dax device configuration structures. Replace the device lock usage with a local rwsem, one each for dax region configuration and dax device configuration. As a result of this conversion, no device_lock() usage remains in dax/bus.c. 
Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-0-20d16cb8d23d@intel.com Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-1-20d16cb8d23d@intel.com Signed-off-by: Vishal Verma Reported-by: Greg Kroah-Hartman Reviewed-by: Alison Schofield Cc: Dan Williams Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Huang Ying Cc: Jonathan Cameron Cc: Li Zhijian Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton --- drivers/dax/bus.c | 218 +++++++++++++++++++++++++++++++++------------- 1 file changed, 156 insertions(+), 62 deletions(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 1ff1ab5fa105a6..cb148f74ceda67 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -12,6 +12,18 @@ static DEFINE_MUTEX(dax_bus_lock); +/* + * All changes to the dax region configuration occur with this lock held + * for write. + */ +DECLARE_RWSEM(dax_region_rwsem); + +/* + * All changes to the dax device configuration occur with this lock held + * for write. + */ +DECLARE_RWSEM(dax_dev_rwsem); + #define DAX_NAME_LEN 30 struct dax_id { struct list_head list; @@ -180,7 +192,7 @@ static u64 dev_dax_size(struct dev_dax *dev_dax) u64 size = 0; int i; - device_lock_assert(&dev_dax->dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_dev_rwsem)); for (i = 0; i < dev_dax->nr_range; i++) size += range_len(&dev_dax->ranges[i].range); @@ -194,8 +206,15 @@ static int dax_bus_probe(struct device *dev) struct dev_dax *dev_dax = to_dev_dax(dev); struct dax_region *dax_region = dev_dax->region; int rc; + u64 size; - if (dev_dax_size(dev_dax) == 0 || dev_dax->id < 0) + rc = down_read_interruptible(&dax_dev_rwsem); + if (rc) + return rc; + size = dev_dax_size(dev_dax); + up_read(&dax_dev_rwsem); + + if (size == 0 || dev_dax->id < 0) return -ENXIO; rc = dax_drv->probe(dev_dax); @@ -283,7 +302,7 @@ static unsigned long long dax_region_avail_size(struct dax_region *dax_region) resource_size_t size = resource_size(&dax_region->res); struct resource *res; - device_lock_assert(dax_region->dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem)); for_each_dax_region_resource(dax_region, res) size -= resource_size(res); @@ -295,10 +314,13 @@ static ssize_t available_size_show(struct device *dev, { struct dax_region *dax_region = dev_get_drvdata(dev); unsigned long long size; + int rc; - device_lock(dev); + rc = down_read_interruptible(&dax_region_rwsem); + if (rc) + return rc; size = dax_region_avail_size(dax_region); - device_unlock(dev); + up_read(&dax_region_rwsem); return sprintf(buf, "%llu\n", size); } @@ -314,10 +336,12 @@ static ssize_t seed_show(struct device *dev, if (is_static(dax_region)) return -EINVAL; - device_lock(dev); + rc = down_read_interruptible(&dax_region_rwsem); + if (rc) + return rc; seed = dax_region->seed; rc = sprintf(buf, "%s\n", seed ? dev_name(seed) : ""); - device_unlock(dev); + up_read(&dax_region_rwsem); return rc; } @@ -333,14 +357,18 @@ static ssize_t create_show(struct device *dev, if (is_static(dax_region)) return -EINVAL; - device_lock(dev); + rc = down_read_interruptible(&dax_region_rwsem); + if (rc) + return rc; youngest = dax_region->youngest; rc = sprintf(buf, "%s\n", youngest ? 
dev_name(youngest) : ""); - device_unlock(dev); + up_read(&dax_region_rwsem); return rc; } +static struct dev_dax *__devm_create_dev_dax(struct dev_dax_data *data); + static ssize_t create_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -358,7 +386,9 @@ static ssize_t create_store(struct device *dev, struct device_attribute *attr, if (val != 1) return -EINVAL; - device_lock(dev); + rc = down_write_killable(&dax_region_rwsem); + if (rc) + return rc; avail = dax_region_avail_size(dax_region); if (avail == 0) rc = -ENOSPC; @@ -369,7 +399,7 @@ static ssize_t create_store(struct device *dev, struct device_attribute *attr, .id = -1, .memmap_on_memory = false, }; - struct dev_dax *dev_dax = devm_create_dev_dax(&data); + struct dev_dax *dev_dax = __devm_create_dev_dax(&data); if (IS_ERR(dev_dax)) rc = PTR_ERR(dev_dax); @@ -387,7 +417,7 @@ static ssize_t create_store(struct device *dev, struct device_attribute *attr, rc = len; } } - device_unlock(dev); + up_write(&dax_region_rwsem); return rc; } @@ -417,7 +447,7 @@ static void trim_dev_dax_range(struct dev_dax *dev_dax) struct range *range = &dev_dax->ranges[i].range; struct dax_region *dax_region = dev_dax->region; - device_lock_assert(dax_region->dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem)); dev_dbg(&dev_dax->dev, "delete range[%d]: %#llx:%#llx\n", i, (unsigned long long)range->start, (unsigned long long)range->end); @@ -435,7 +465,7 @@ static void free_dev_dax_ranges(struct dev_dax *dev_dax) trim_dev_dax_range(dev_dax); } -static void unregister_dev_dax(void *dev) +static void __unregister_dev_dax(void *dev) { struct dev_dax *dev_dax = to_dev_dax(dev); @@ -447,6 +477,17 @@ static void unregister_dev_dax(void *dev) put_device(dev); } +static void unregister_dev_dax(void *dev) +{ + if (rwsem_is_locked(&dax_region_rwsem)) + return __unregister_dev_dax(dev); + + if (WARN_ON_ONCE(down_write_killable(&dax_region_rwsem) != 0)) + return; + __unregister_dev_dax(dev); + up_write(&dax_region_rwsem); +} + static void dax_region_free(struct kref *kref) { struct dax_region *dax_region; @@ -463,11 +504,10 @@ static void dax_region_put(struct dax_region *dax_region) /* a return value >= 0 indicates this invocation invalidated the id */ static int __free_dev_dax_id(struct dev_dax *dev_dax) { - struct device *dev = &dev_dax->dev; struct dax_region *dax_region; int rc = dev_dax->id; - device_lock_assert(dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_dev_rwsem)); if (!dev_dax->dyn_id || dev_dax->id < 0) return -1; @@ -480,12 +520,13 @@ static int __free_dev_dax_id(struct dev_dax *dev_dax) static int free_dev_dax_id(struct dev_dax *dev_dax) { - struct device *dev = &dev_dax->dev; int rc; - device_lock(dev); + rc = down_write_killable(&dax_dev_rwsem); + if (rc) + return rc; rc = __free_dev_dax_id(dev_dax); - device_unlock(dev); + up_write(&dax_dev_rwsem); return rc; } @@ -519,8 +560,14 @@ static ssize_t delete_store(struct device *dev, struct device_attribute *attr, if (!victim) return -ENXIO; - device_lock(dev); - device_lock(victim); + rc = down_write_killable(&dax_region_rwsem); + if (rc) + return rc; + rc = down_write_killable(&dax_dev_rwsem); + if (rc) { + up_write(&dax_region_rwsem); + return rc; + } dev_dax = to_dev_dax(victim); if (victim->driver || dev_dax_size(dev_dax)) rc = -EBUSY; @@ -541,12 +588,12 @@ static ssize_t delete_store(struct device *dev, struct device_attribute *attr, } else rc = -EBUSY; } - device_unlock(victim); + up_write(&dax_dev_rwsem); /* won the race to invalidate the device, clean it 
up */ if (do_del) devm_release_action(dev, unregister_dev_dax, victim); - device_unlock(dev); + up_write(&dax_region_rwsem); put_device(victim); return rc; @@ -658,16 +705,15 @@ static void dax_mapping_release(struct device *dev) put_device(parent); } -static void unregister_dax_mapping(void *data) +static void __unregister_dax_mapping(void *data) { struct device *dev = data; struct dax_mapping *mapping = to_dax_mapping(dev); struct dev_dax *dev_dax = to_dev_dax(dev->parent); - struct dax_region *dax_region = dev_dax->region; dev_dbg(dev, "%s\n", __func__); - device_lock_assert(dax_region->dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem)); dev_dax->ranges[mapping->range_id].mapping = NULL; mapping->range_id = -1; @@ -675,28 +721,37 @@ static void unregister_dax_mapping(void *data) device_unregister(dev); } +static void unregister_dax_mapping(void *data) +{ + if (rwsem_is_locked(&dax_region_rwsem)) + return __unregister_dax_mapping(data); + + if (WARN_ON_ONCE(down_write_killable(&dax_region_rwsem) != 0)) + return; + __unregister_dax_mapping(data); + up_write(&dax_region_rwsem); +} + static struct dev_dax_range *get_dax_range(struct device *dev) { struct dax_mapping *mapping = to_dax_mapping(dev); struct dev_dax *dev_dax = to_dev_dax(dev->parent); - struct dax_region *dax_region = dev_dax->region; + int rc; - device_lock(dax_region->dev); + rc = down_write_killable(&dax_region_rwsem); + if (rc) + return NULL; if (mapping->range_id < 0) { - device_unlock(dax_region->dev); + up_write(&dax_region_rwsem); return NULL; } return &dev_dax->ranges[mapping->range_id]; } -static void put_dax_range(struct dev_dax_range *dax_range) +static void put_dax_range(void) { - struct dax_mapping *mapping = dax_range->mapping; - struct dev_dax *dev_dax = to_dev_dax(mapping->dev.parent); - struct dax_region *dax_region = dev_dax->region; - - device_unlock(dax_region->dev); + up_write(&dax_region_rwsem); } static ssize_t start_show(struct device *dev, @@ -709,7 +764,7 @@ static ssize_t start_show(struct device *dev, if (!dax_range) return -ENXIO; rc = sprintf(buf, "%#llx\n", dax_range->range.start); - put_dax_range(dax_range); + put_dax_range(); return rc; } @@ -725,7 +780,7 @@ static ssize_t end_show(struct device *dev, if (!dax_range) return -ENXIO; rc = sprintf(buf, "%#llx\n", dax_range->range.end); - put_dax_range(dax_range); + put_dax_range(); return rc; } @@ -741,7 +796,7 @@ static ssize_t pgoff_show(struct device *dev, if (!dax_range) return -ENXIO; rc = sprintf(buf, "%#lx\n", dax_range->pgoff); - put_dax_range(dax_range); + put_dax_range(); return rc; } @@ -775,7 +830,7 @@ static int devm_register_dax_mapping(struct dev_dax *dev_dax, int range_id) struct device *dev; int rc; - device_lock_assert(dax_region->dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem)); if (dev_WARN_ONCE(&dev_dax->dev, !dax_region->dev->driver, "region disabled\n")) @@ -821,7 +876,7 @@ static int alloc_dev_dax_range(struct dev_dax *dev_dax, u64 start, struct resource *alloc; int i, rc; - device_lock_assert(dax_region->dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem)); /* handle the seed alloc special case */ if (!size) { @@ -875,13 +930,12 @@ static int adjust_dev_dax_range(struct dev_dax *dev_dax, struct resource *res, r { int last_range = dev_dax->nr_range - 1; struct dev_dax_range *dax_range = &dev_dax->ranges[last_range]; - struct dax_region *dax_region = dev_dax->region; bool is_shrink = resource_size(res) > size; struct range *range = &dax_range->range; struct device *dev = &dev_dax->dev; int rc; - 
device_lock_assert(dax_region->dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem)); if (dev_WARN_ONCE(dev, !size, "deletion is handled by dev_dax_shrink\n")) return -EINVAL; @@ -907,10 +961,13 @@ static ssize_t size_show(struct device *dev, { struct dev_dax *dev_dax = to_dev_dax(dev); unsigned long long size; + int rc; - device_lock(dev); + rc = down_write_killable(&dax_dev_rwsem); + if (rc) + return rc; size = dev_dax_size(dev_dax); - device_unlock(dev); + up_write(&dax_dev_rwsem); return sprintf(buf, "%llu\n", size); } @@ -1080,17 +1137,27 @@ static ssize_t size_store(struct device *dev, struct device_attribute *attr, return -EINVAL; } - device_lock(dax_region->dev); + rc = down_write_killable(&dax_region_rwsem); + if (rc) + return rc; if (!dax_region->dev->driver) { - device_unlock(dax_region->dev); - return -ENXIO; + rc = -ENXIO; + goto err_region; } - device_lock(dev); + rc = down_write_killable(&dax_dev_rwsem); + if (rc) + goto err_dev; + rc = dev_dax_resize(dax_region, dev_dax, val); - device_unlock(dev); - device_unlock(dax_region->dev); - return rc == 0 ? len : rc; +err_dev: + up_write(&dax_dev_rwsem); +err_region: + up_write(&dax_region_rwsem); + + if (rc == 0) + return len; + return rc; } static DEVICE_ATTR_RW(size); @@ -1138,18 +1205,24 @@ static ssize_t mapping_store(struct device *dev, struct device_attribute *attr, return rc; rc = -ENXIO; - device_lock(dax_region->dev); + rc = down_write_killable(&dax_region_rwsem); + if (rc) + return rc; if (!dax_region->dev->driver) { - device_unlock(dax_region->dev); + up_write(&dax_region_rwsem); + return rc; + } + rc = down_write_killable(&dax_dev_rwsem); + if (rc) { + up_write(&dax_region_rwsem); return rc; } - device_lock(dev); to_alloc = range_len(&r); if (alloc_is_aligned(dev_dax, to_alloc)) rc = alloc_dev_dax_range(dev_dax, r.start, to_alloc); - device_unlock(dev); - device_unlock(dax_region->dev); + up_write(&dax_dev_rwsem); + up_write(&dax_region_rwsem); return rc == 0 ? len : rc; } @@ -1196,13 +1269,19 @@ static ssize_t align_store(struct device *dev, struct device_attribute *attr, if (!dax_align_valid(val)) return -EINVAL; - device_lock(dax_region->dev); + rc = down_write_killable(&dax_region_rwsem); + if (rc) + return rc; if (!dax_region->dev->driver) { - device_unlock(dax_region->dev); + up_write(&dax_region_rwsem); return -ENXIO; } - device_lock(dev); + rc = down_write_killable(&dax_dev_rwsem); + if (rc) { + up_write(&dax_region_rwsem); + return rc; + } if (dev->driver) { rc = -EBUSY; goto out_unlock; @@ -1214,8 +1293,8 @@ static ssize_t align_store(struct device *dev, struct device_attribute *attr, if (rc) dev_dax->align = align_save; out_unlock: - device_unlock(dev); - device_unlock(dax_region->dev); + up_write(&dax_dev_rwsem); + up_write(&dax_region_rwsem); return rc == 0 ? 
len : rc; } static DEVICE_ATTR_RW(align); @@ -1325,7 +1404,7 @@ static const struct device_type dev_dax_type = { .groups = dax_attribute_groups, }; -struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data) +static struct dev_dax *__devm_create_dev_dax(struct dev_dax_data *data) { struct dax_region *dax_region = data->dax_region; struct device *parent = dax_region->dev; @@ -1440,6 +1519,21 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data) return ERR_PTR(rc); } + +struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data) +{ + struct dev_dax *dev_dax; + int rc; + + rc = down_write_killable(&dax_region_rwsem); + if (rc) + return ERR_PTR(rc); + + dev_dax = __devm_create_dev_dax(data); + up_write(&dax_region_rwsem); + + return dev_dax; +} EXPORT_SYMBOL_GPL(devm_create_dev_dax); int __dax_driver_register(struct dax_device_driver *dax_drv, From ce95905b9489d826c886df5670bae6bd1bae44ab Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 24 Jan 2024 12:03:47 -0800 Subject: [PATCH 1041/1406] dax/bus.c: replace several sprintf() with sysfs_emit() There were several places where drivers/dax/bus.c uses 'sprintf' to print sysfs data. Since a sysfs_emit() helper is available specifically for this purpose, replace all the sprintf() usage for sysfs with sysfs_emit() in this file. Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-2-20d16cb8d23d@intel.com Signed-off-by: Vishal Verma Reported-by: Greg Kroah-Hartman Reviewed-by: Alison Schofield Cc: Dan Williams Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Huang Ying Cc: Jonathan Cameron Cc: Li Zhijian Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton --- drivers/dax/bus.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index cb148f74ceda67..0fd948a4443e38 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -269,7 +269,7 @@ static ssize_t id_show(struct device *dev, { struct dax_region *dax_region = dev_get_drvdata(dev); - return sprintf(buf, "%d\n", dax_region->id); + return sysfs_emit(buf, "%d\n", dax_region->id); } static DEVICE_ATTR_RO(id); @@ -278,8 +278,8 @@ static ssize_t region_size_show(struct device *dev, { struct dax_region *dax_region = dev_get_drvdata(dev); - return sprintf(buf, "%llu\n", (unsigned long long) - resource_size(&dax_region->res)); + return sysfs_emit(buf, "%llu\n", + (unsigned long long)resource_size(&dax_region->res)); } static struct device_attribute dev_attr_region_size = __ATTR(size, 0444, region_size_show, NULL); @@ -289,7 +289,7 @@ static ssize_t region_align_show(struct device *dev, { struct dax_region *dax_region = dev_get_drvdata(dev); - return sprintf(buf, "%u\n", dax_region->align); + return sysfs_emit(buf, "%u\n", dax_region->align); } static struct device_attribute dev_attr_region_align = __ATTR(align, 0400, region_align_show, NULL); @@ -322,7 +322,7 @@ static ssize_t available_size_show(struct device *dev, size = dax_region_avail_size(dax_region); up_read(&dax_region_rwsem); - return sprintf(buf, "%llu\n", size); + return sysfs_emit(buf, "%llu\n", size); } static DEVICE_ATTR_RO(available_size); @@ -340,7 +340,7 @@ static ssize_t seed_show(struct device *dev, if (rc) return rc; seed = dax_region->seed; - rc = sprintf(buf, "%s\n", seed ? dev_name(seed) : ""); + rc = sysfs_emit(buf, "%s\n", seed ? 
dev_name(seed) : ""); up_read(&dax_region_rwsem); return rc; @@ -361,7 +361,7 @@ static ssize_t create_show(struct device *dev, if (rc) return rc; youngest = dax_region->youngest; - rc = sprintf(buf, "%s\n", youngest ? dev_name(youngest) : ""); + rc = sysfs_emit(buf, "%s\n", youngest ? dev_name(youngest) : ""); up_read(&dax_region_rwsem); return rc; @@ -763,7 +763,7 @@ static ssize_t start_show(struct device *dev, dax_range = get_dax_range(dev); if (!dax_range) return -ENXIO; - rc = sprintf(buf, "%#llx\n", dax_range->range.start); + rc = sysfs_emit(buf, "%#llx\n", dax_range->range.start); put_dax_range(); return rc; @@ -779,7 +779,7 @@ static ssize_t end_show(struct device *dev, dax_range = get_dax_range(dev); if (!dax_range) return -ENXIO; - rc = sprintf(buf, "%#llx\n", dax_range->range.end); + rc = sysfs_emit(buf, "%#llx\n", dax_range->range.end); put_dax_range(); return rc; @@ -795,7 +795,7 @@ static ssize_t pgoff_show(struct device *dev, dax_range = get_dax_range(dev); if (!dax_range) return -ENXIO; - rc = sprintf(buf, "%#lx\n", dax_range->pgoff); + rc = sysfs_emit(buf, "%#lx\n", dax_range->pgoff); put_dax_range(); return rc; @@ -969,7 +969,7 @@ static ssize_t size_show(struct device *dev, size = dev_dax_size(dev_dax); up_write(&dax_dev_rwsem); - return sprintf(buf, "%llu\n", size); + return sysfs_emit(buf, "%llu\n", size); } static bool alloc_is_aligned(struct dev_dax *dev_dax, resource_size_t size) @@ -1233,7 +1233,7 @@ static ssize_t align_show(struct device *dev, { struct dev_dax *dev_dax = to_dev_dax(dev); - return sprintf(buf, "%d\n", dev_dax->align); + return sysfs_emit(buf, "%d\n", dev_dax->align); } static ssize_t dev_dax_validate_align(struct dev_dax *dev_dax) @@ -1311,7 +1311,7 @@ static ssize_t target_node_show(struct device *dev, { struct dev_dax *dev_dax = to_dev_dax(dev); - return sprintf(buf, "%d\n", dev_dax_target_node(dev_dax)); + return sysfs_emit(buf, "%d\n", dev_dax_target_node(dev_dax)); } static DEVICE_ATTR_RO(target_node); @@ -1327,7 +1327,7 @@ static ssize_t resource_show(struct device *dev, else start = dev_dax->ranges[0].range.start; - return sprintf(buf, "%#llx\n", start); + return sysfs_emit(buf, "%#llx\n", start); } static DEVICE_ATTR(resource, 0400, resource_show, NULL); @@ -1338,14 +1338,14 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, * We only ever expect to handle device-dax instances, i.e. the * @type argument to MODULE_ALIAS_DAX_DEVICE() is always zero */ - return sprintf(buf, DAX_DEVICE_MODALIAS_FMT "\n", 0); + return sysfs_emit(buf, DAX_DEVICE_MODALIAS_FMT "\n", 0); } static DEVICE_ATTR_RO(modalias); static ssize_t numa_node_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", dev_to_node(dev)); + return sysfs_emit(buf, "%d\n", dev_to_node(dev)); } static DEVICE_ATTR_RO(numa_node); From 5ec5176dade8ee4930eff854f43ff825337d1173 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 24 Jan 2024 12:03:48 -0800 Subject: [PATCH 1042/1406] Documentation/ABI: add ABI documentation for sys-bus-dax Add the missing sysfs ABI documentation for the device DAX subsystem. Various ABI attributes under this have been present since v5.1, and more have been added over time. In preparation for adding a new attribute, add this file with the historical details.
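As a rough illustration of how the documented attributes are consumed, a userspace reader might look like the sketch below; the dax0.0 instance name and the choice of the 'size' attribute are assumptions for the example:
```
#include <stdio.h>

/* Read the 'size' attribute documented below for one dax device. */
int main(void)
{
	unsigned long long size;
	FILE *f = fopen("/sys/bus/dax/devices/dax0.0/size", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%llu", &size) == 1)
		printf("dax0.0 size: %llu bytes\n", size);
	fclose(f);
	return 0;
}
```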
Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-3-20d16cb8d23d@intel.com Signed-off-by: Vishal Verma Cc: Dan Williams Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Huang Ying Cc: Jonathan Cameron Cc: Li Zhijian Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-bus-dax | 136 ++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-bus-dax diff --git a/Documentation/ABI/testing/sysfs-bus-dax b/Documentation/ABI/testing/sysfs-bus-dax new file mode 100644 index 00000000000000..6359f7bc9bf430 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-dax @@ -0,0 +1,136 @@ +What: /sys/bus/dax/devices/daxX.Y/align +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (RW) Provides a way to specify an alignment for a dax device. + Values allowed are constrained by the physical address ranges + that back the dax device, and also by arch requirements. + +What: /sys/bus/dax/devices/daxX.Y/mapping +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (WO) Provides a way to allocate a mapping range under a dax + device. Specified in the format <start>-<end>. + +What: /sys/bus/dax/devices/daxX.Y/mapping[0..N]/start +What: /sys/bus/dax/devices/daxX.Y/mapping[0..N]/end +What: /sys/bus/dax/devices/daxX.Y/mapping[0..N]/page_offset +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (RO) A dax device may have multiple constituent discontiguous + address ranges. These are represented by the different + 'mappingX' subdirectories. The 'start' attribute indicates the + start physical address for the given range. The 'end' attribute + indicates the end physical address for the given range. The + 'page_offset' attribute indicates the offset of the current + range in the dax device. + +What: /sys/bus/dax/devices/daxX.Y/resource +Date: June, 2019 +KernelVersion: v5.3 +Contact: nvdimm@lists.linux.dev +Description: + (RO) The resource attribute indicates the starting physical + address of a dax device. In case of a device with multiple + constituent ranges, it indicates the starting address of the + first range. + +What: /sys/bus/dax/devices/daxX.Y/size +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (RW) The size attribute indicates the total size of a dax + device. For creating subdivided dax devices, or for resizing + an existing device, the new size can be written to this as + part of the reconfiguration process. + +What: /sys/bus/dax/devices/daxX.Y/numa_node +Date: November, 2019 +KernelVersion: v5.5 +Contact: nvdimm@lists.linux.dev +Description: + (RO) If NUMA is enabled and the platform has affinitized the + backing device for this dax device, emit the CPU node + affinity for this device. + +What: /sys/bus/dax/devices/daxX.Y/target_node +Date: February, 2019 +KernelVersion: v5.1 +Contact: nvdimm@lists.linux.dev +Description: + (RO) The target-node attribute is the Linux numa-node that a + device-dax instance may create when it is online. Prior to + being online the device's 'numa_node' property reflects the + closest online cpu node which is the typical expectation of a + device 'numa_node'. Once it is online it becomes its own + distinct numa node.
+ +What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/available_size +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (RO) The available_size attribute tracks available dax region + capacity. This only applies to volatile hmem devices, not pmem + devices, since pmem devices are defined by nvdimm namespace + boundaries. + +What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/size +Date: July, 2017 +KernelVersion: v5.1 +Contact: nvdimm@lists.linux.dev +Description: + (RO) The size attribute indicates the size of a given dax region + in bytes. + +What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/align +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (RO) The align attribute indicates alignment of the dax region. + Changes on align may not always be valid, when say certain + mappings were created with 2M and then we switch to 1G. This + validates all ranges against the new value being attempted, post + resizing. + +What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/seed +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (RO) The seed device is a concept for dynamic dax regions to be + able to split the region amongst multiple sub-instances. The + seed device, similar to libnvdimm seed devices, is a device + that starts with zero capacity allocated and unbound to a + driver. + +What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/create +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (RW) The create interface to the dax region provides a way to + create a new unconfigured dax device under the given region, which + can then be configured (with a size etc.) and then probed. + +What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/delete +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (WO) The delete interface for a dax region provides for deletion + of any 0-sized and idle dax devices. + +What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/id +Date: July, 2017 +KernelVersion: v5.1 +Contact: nvdimm@lists.linux.dev +Description: + (RO) The id attribute indicates the region id of a dax region. From c82d969822b6f7e736388b5adbdb37340087fcbf Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 24 Jan 2024 12:03:49 -0800 Subject: [PATCH 1043/1406] mm/memory_hotplug: export mhp_supports_memmap_on_memory() In preparation for adding sysfs ABI to toggle memmap_on_memory semantics for drivers adding memory, export the mhp_supports_memmap_on_memory() helper. This allows drivers to check if memmap_on_memory support is available before trying to request it, and display an appropriate message if it isn't available. As part of this, remove the size argument from this helper - with recent updates to allow memmap_on_memory for larger ranges, and the internal splitting of altmaps into respective memory blocks, the size argument is meaningless.
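The intended caller pattern is roughly the following sketch; the function, its parameters, and the resource name are illustrative assumptions, not code from this patch:
```
#include <linux/memory_hotplug.h>

static int example_add_memory(int nid, u64 start, u64 size, bool want_altmap)
{
	mhp_t mhp_flags = MHP_MERGE_RESOURCE;

	if (want_altmap) {
		if (mhp_supports_memmap_on_memory())
			mhp_flags |= MHP_MEMMAP_ON_MEMORY;
		else
			pr_info("memmap_on_memory requested but not supported\n");
	}

	/* driver-managed resources follow the "System RAM (driver)" naming */
	return add_memory_driver_managed(nid, start, size,
					 "System RAM (example)", mhp_flags);
}
```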
Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-4-20d16cb8d23d@intel.com Signed-off-by: Vishal Verma Acked-by: David Hildenbrand Suggested-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Jonathan Cameron Cc: Li Zhijian Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Cc: Dan Williams Cc: Dave Jiang Cc: Dave Hansen Cc: Huang Ying Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 6 ++++++ mm/memory_hotplug.c | 17 ++++++----------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ee00015575aab3..70aadb2009a08c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -137,6 +137,7 @@ struct mhp_params { bool mhp_range_allowed(u64 start, u64 size, bool need_mapping); struct range mhp_get_pluggable_range(bool need_mapping); +bool mhp_supports_memmap_on_memory(void); /* * Zone resizing functions @@ -278,6 +279,11 @@ static inline bool movable_node_is_enabled(void) return false; } +static bool mhp_supports_memmap_on_memory(void) +{ + return false; +} + static inline void pgdat_kswapd_lock(pg_data_t *pgdat) {} static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) {} static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) {} diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 707027f691503f..a444e2d7dd2bff 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1337,7 +1337,7 @@ static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size) } #endif -static bool mhp_supports_memmap_on_memory(unsigned long size) +bool mhp_supports_memmap_on_memory(void) { unsigned long vmemmap_size = memory_block_memmap_size(); unsigned long memmap_pages = memory_block_memmap_on_memory_pages(); @@ -1346,17 +1346,11 @@ static bool mhp_supports_memmap_on_memory(unsigned long size) * Besides having arch support and the feature enabled at runtime, we * need a few more assumptions to hold true: * - * a) We span a single memory block: memory onlining/offlining happens - * in memory block granularity. We don't want the vmemmap of online - * memory blocks to reside on offline memory blocks. In the future, - * we might want to support variable-sized memory blocks to make the - * feature more versatile. - * - * b) The vmemmap pages span complete PMDs: We don't want vmemmap code + * a) The vmemmap pages span complete PMDs: We don't want vmemmap code * to populate memory from the altmap for unrelated parts (i.e., * other memory blocks) * - * c) The vmemmap pages (and thereby the pages that will be exposed to + * b) The vmemmap pages (and thereby the pages that will be exposed to * the buddy) have to cover full pageblocks: memory onlining/offlining * code requires applicable ranges to be page-aligned, for example, to * set the migratetypes properly. @@ -1368,7 +1362,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long size) * altmap as an alternative source of memory, and we do not exactly * populate a single PMD.
*/ - if (!mhp_memmap_on_memory() || size != memory_block_size_bytes()) + if (!mhp_memmap_on_memory()) return false; /* @@ -1391,6 +1385,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long size) return arch_supports_memmap_on_memory(vmemmap_size); } +EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory); static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size) { @@ -1526,7 +1521,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) * Self hosted memmap array */ if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) && - mhp_supports_memmap_on_memory(memory_block_size_bytes())) { + mhp_supports_memmap_on_memory()) { ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags); if (ret) goto error; From 5fa288b8c9c916e8a84a8a692ce7cf3d2a52e346 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 27 Jan 2024 19:13:46 -0800 Subject: [PATCH 1044/1406] mm-memory_hotplug-export-mhp_supports_memmap_on_memory-fix fix build In file included from ./include/linux/mmzone.h:1425, from ./include/linux/gfp.h:7, from ./include/linux/slab.h:16, from ./include/linux/crypto.h:17, from arch/x86/kernel/asm-offsets.c:9: ./include/linux/memory_hotplug.h:282:13: warning: 'mhp_supports_memmap_on_memory' defined but not used [-Wunused-function] 282 | static bool mhp_supports_memmap_on_memory(void) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Cc: Vishal Verma Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 70aadb2009a08c..7a9ff464608d70 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -279,7 +279,7 @@ static inline bool movable_node_is_enabled(void) return false; } -static bool mhp_supports_memmap_on_memory(void) +static inline bool mhp_supports_memmap_on_memory(void) { return false; } From 27130a1fab2d233563e7b154c50159577050285a Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 24 Jan 2024 12:03:50 -0800 Subject: [PATCH 1045/1406] dax: add a sysfs knob to control memmap_on_memory behavior Add a sysfs knob for dax devices to control the memmap_on_memory setting if the dax device were to be hotplugged as system memory. The default memmap_on_memory setting for dax devices originating via pmem or hmem is set to 'false' - i.e. no memmap_on_memory semantics, to preserve legacy behavior. For dax devices via CXL, the default is on. The sysfs control allows the administrator to override the above defaults if needed. Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-5-20d16cb8d23d@intel.com Signed-off-by: Vishal Verma Tested-by: Li Zhijian Reviewed-by: Jonathan Cameron Reviewed-by: David Hildenbrand Reviewed-by: Huang, Ying Reviewed-by: Alison Schofield Cc: Dan Williams Cc: Dave Jiang Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-bus-dax | 17 ++++++++++ drivers/dax/bus.c | 43 +++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-bus-dax b/Documentation/ABI/testing/sysfs-bus-dax index 6359f7bc9bf430..b34266bfae49ae 100644 --- a/Documentation/ABI/testing/sysfs-bus-dax +++ b/Documentation/ABI/testing/sysfs-bus-dax @@ -134,3 +134,20 @@ KernelVersion: v5.1 Contact: nvdimm@lists.linux.dev Description: (RO) The id attribute indicates the region id of a dax region. 
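A hypothetical userspace sequence (dax0.0 is an assumed instance name) sets the knob before the device is handed over to the kmem driver:
```
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/bus/dax/devices/dax0.0/memmap_on_memory", "w");

	if (!f)
		return 1;
	/* The store fails with EBUSY once dax0.0 is bound to kmem, and
	 * with EOPNOTSUPP when the kernel cannot honor the request. */
	if (fputs("1\n", f) == EOF)
		perror("memmap_on_memory");
	return fclose(f) ? 1 : 0;
}
```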
+ +What: /sys/bus/dax/devices/daxX.Y/memmap_on_memory +Date: January, 2024 +KernelVersion: v6.8 +Contact: nvdimm@lists.linux.dev +Description: + (RW) Control the memmap_on_memory setting if the dax device + were to be hotplugged as system memory. This determines whether + the 'altmap' for the hotplugged memory will be placed on the + device being hotplugged (memmap_on_memory=1) or if it will be + placed on regular memory (memmap_on_memory=0). This attribute + must be set before the device is handed over to the 'kmem' + driver (i.e. hotplugged into system-ram). Additionally, this + depends on CONFIG_MHP_MEMMAP_ON_MEMORY, and a globally enabled + memmap_on_memory parameter for memory_hotplug. This is + typically set on the kernel command line - + memory_hotplug.memmap_on_memory set to 'true' or 'force'." diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 0fd948a4443e38..27c86d0ca7118d 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -1349,6 +1349,48 @@ static ssize_t numa_node_show(struct device *dev, } static DEVICE_ATTR_RO(numa_node); +static ssize_t memmap_on_memory_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + + return sysfs_emit(buf, "%d\n", dev_dax->memmap_on_memory); +} + +static ssize_t memmap_on_memory_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + bool val; + int rc; + + rc = kstrtobool(buf, &val); + if (rc) + return rc; + + if (val == true && !mhp_supports_memmap_on_memory()) { + dev_dbg(dev, "memmap_on_memory is not available\n"); + return -EOPNOTSUPP; + } + + rc = down_write_killable(&dax_dev_rwsem); + if (rc) + return rc; + + if (dev_dax->memmap_on_memory != val && dev->driver && + to_dax_drv(dev->driver)->type == DAXDRV_KMEM_TYPE) { + up_write(&dax_dev_rwsem); + return -EBUSY; + } + + dev_dax->memmap_on_memory = val; + up_write(&dax_dev_rwsem); + + return len; +} +static DEVICE_ATTR_RW(memmap_on_memory); + static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, struct device, kobj); @@ -1375,6 +1417,7 @@ static struct attribute *dev_dax_attributes[] = { &dev_attr_align.attr, &dev_attr_resource.attr, &dev_attr_numa_node.attr, + &dev_attr_memmap_on_memory.attr, NULL, }; From f65526010602cb9456a79a46af07e88b7b1f9f36 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 18:12:15 +0000 Subject: [PATCH 1046/1406] highmem: add kernel-doc for memcpy_*_folio() This was inadvertently skipped when adding the new functions. Link: https://lkml.kernel.org/r/20240124181217.1761674-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/highmem.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 451c1dff0e873c..00341b56d2910d 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -439,6 +439,13 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len) kunmap_local(addr); } +/** + * memcpy_from_folio - Copy a range of bytes from a folio. + * @to: The memory to copy to. + * @folio: The folio to read from. + * @offset: The first byte in the folio to read. + * @len: The number of bytes to copy. 
+ */ static inline void memcpy_from_folio(char *to, struct folio *folio, size_t offset, size_t len) { @@ -460,6 +467,13 @@ static inline void memcpy_from_folio(char *to, struct folio *folio, } while (len > 0); } +/** + * memcpy_to_folio - Copy a range of bytes to a folio. + * @folio: The folio to write to. + * @offset: The first byte in the folio to store to. + * @from: The memory to copy from. + * @len: The number of bytes to copy. + */ static inline void memcpy_to_folio(struct folio *folio, size_t offset, const char *from, size_t len) { From 77ff881c7e658135b99423a61bc591063746a665 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 24 Jan 2024 18:31:34 +0100 Subject: [PATCH 1047/1406] mm: kmsan: remove runtime checks from kmsan_unpoison_memory() Similarly to what's been done in commit 85716a80c16d ("kmsan: allow using __msan_instrument_asm_store() inside runtime"), it should be safe to call kmsan_unpoison_memory() from within the runtime, as it does not allocate memory or take locks. Remove the redundant runtime checks. This should fix false positives seen with CONFIG_DEBUG_LIST=y when the non-instrumented lib/stackdepot.c failed to unpoison the memory chunks later checked by the instrumented lib/list_debug.c. Also replace the implementation of kmsan_unpoison_entry_regs() with a call to kmsan_unpoison_memory(). Link: https://lkml.kernel.org/r/20240124173134.1165747-1-glider@google.com Fixes: f80be4571b19 ("kmsan: add KMSAN runtime core") Signed-off-by: Alexander Potapenko Tested-by: Marco Elver Cc: Dmitry Vyukov Cc: Ilya Leoshkevich Cc: Nicholas Miehlbradt Signed-off-by: Andrew Morton --- mm/kmsan/hooks.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 5d6e2dee5692a3..0b09daa188ef6c 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -359,6 +359,12 @@ void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, } /* Functions from kmsan-checks.h follow. */ + +/* + * To create an origin, kmsan_poison_memory() unwinds the stacks and stores it + * into the stack depot. This may cause deadlocks if done from within KMSAN + * runtime, therefore we bail out if kmsan_in_runtime(). + */ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) { if (!kmsan_enabled || kmsan_in_runtime()) @@ -371,47 +377,31 @@ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) } EXPORT_SYMBOL(kmsan_poison_memory); +/* + * Unlike kmsan_poison_memory(), this function can be used from within KMSAN + * runtime, because it does not trigger allocations or call instrumented code. + */ void kmsan_unpoison_memory(const void *address, size_t size) { unsigned long ua_flags; - if (!kmsan_enabled || kmsan_in_runtime()) + if (!kmsan_enabled) return; ua_flags = user_access_save(); - kmsan_enter_runtime(); /* The users may want to poison/unpoison random memory. */ kmsan_internal_unpoison_memory((void *)address, size, KMSAN_POISON_NOCHECK); - kmsan_leave_runtime(); user_access_restore(ua_flags); } EXPORT_SYMBOL(kmsan_unpoison_memory); /* - * Version of kmsan_unpoison_memory() that can be called from within the KMSAN - * runtime. - * - * Non-instrumented IRQ entry functions receive struct pt_regs from assembly - * code. Those regs need to be unpoisoned, otherwise using them will result in - * false positives.
- * Using kmsan_unpoison_memory() is not an option in entry code, because the - * return value of in_task() is inconsistent - as a result, certain calls to - * kmsan_unpoison_memory() are ignored. kmsan_unpoison_entry_regs() ensures that - * the registers are unpoisoned even if kmsan_in_runtime() is true in the early - * entry code. + * Version of kmsan_unpoison_memory() called from IRQ entry functions. */ void kmsan_unpoison_entry_regs(const struct pt_regs *regs) { - unsigned long ua_flags; - - if (!kmsan_enabled) - return; - - ua_flags = user_access_save(); - kmsan_internal_unpoison_memory((void *)regs, sizeof(*regs), - KMSAN_POISON_NOCHECK); - user_access_restore(ua_flags); + kmsan_unpoison_memory((void *)regs, sizeof(*regs)); } void kmsan_check_memory(const void *addr, size_t size) From 7148ffdbece9f06df0c16cbc6a61d0d3ab9e7d3c Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Fri, 26 Jan 2024 21:19:25 +0000 Subject: [PATCH 1048/1406] mm: memcg: don't periodically flush stats when memcg is disabled The root memcg is onlined even when memcg is disabled. When it's onlined a 2 second periodic stat flush is started, but no stat flushing is required when memcg is disabled because there can be no child memcgs. Most calls to flush memcg stats are avoided when memcg is disabled as a result of the mem_cgroup_disabled check added in 7d7ef0a4686a ("mm: memcg: restore subtree stats flushing"), but the periodic flushing started in mem_cgroup_css_online is not. Skip it. Link: https://lkml.kernel.org/r/20240126211927.1171338-1-tjmercier@google.com Fixes: aa48e47e3906 ("memcg: infrastructure to flush memcg stats") Signed-off-by: T.J. Mercier Acked-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Chris Li Reported-by: Minchan Kim Reviewed-by: Yosry Ahmed Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Michal Koutn Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index df11d6d19ee38f..484a9d2862d4fe 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5621,7 +5621,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) if (alloc_shrinker_info(memcg)) goto offline_kmem; - if (unlikely(mem_cgroup_is_root(memcg))) + if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled()) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); lru_gen_online_memcg(memcg); From bd7fb1183f1fa6146cb9fab435b83db66465828f Mon Sep 17 00:00:00 2001 From: Levi Yun Date: Fri, 26 Jan 2024 15:25:54 +0000 Subject: [PATCH 1049/1406] kswapd: replace try_to_freeze() with kthread_freezable_should_stop() Instead of using try_to_freeze, use kthread_freezable_should_stop in kswapd. By this, we can avoid unnecessary freezing when kswapd should stop. 
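The resulting loop shape, as a minimal sketch (the thread function here is illustrative, not kswapd itself):
```
#include <linux/kthread.h>

static int example_thread(void *data)
{
	bool was_frozen;

	/* One call both freezes when the freezer asks for it and reports
	 * a pending kthread_stop(), so the thread is never left sitting
	 * in the freezer when it is supposed to exit. */
	while (!kthread_freezable_should_stop(&was_frozen)) {
		if (was_frozen)
			continue;	/* re-check state after a thaw */
		/* ... do one unit of work ... */
	}
	return 0;
}
```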
Link: https://lkml.kernel.org/r/20240126152556.58791-1-ppbuk5246@gmail.com Signed-off-by: Levi Yun Signed-off-by: Andrew Morton --- mm/vmscan.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4f9c854ce6cc66..1f139830b26f6c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6796,6 +6796,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) bool raise_priority = true; bool balanced; bool ret; + bool was_frozen; sc.reclaim_idx = highest_zoneidx; @@ -6894,9 +6895,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) /* Check if kswapd should be suspending */ __fs_reclaim_release(_THIS_IP_); - ret = try_to_freeze(); + ret = kthread_freezable_should_stop(&was_frozen); __fs_reclaim_acquire(_THIS_IP_); - if (ret || kthread_should_stop()) + if (was_frozen || ret) break; /* @@ -7102,7 +7103,7 @@ static int kswapd(void *p) WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); atomic_set(&pgdat->nr_writeback_throttled, 0); for ( ; ; ) { - bool ret; + bool was_frozen; alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); highest_zoneidx = kswapd_highest_zoneidx(pgdat, @@ -7119,15 +7120,14 @@ static int kswapd(void *p) WRITE_ONCE(pgdat->kswapd_order, 0); WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); - ret = try_to_freeze(); - if (kthread_should_stop()) + if (kthread_freezable_should_stop(&was_frozen)) break; /* * We can speed up thawing tasks if we don't call balance_pgdat * after returning from the refrigerator */ - if (ret) + if (was_frozen) continue; /* From 7bee2d15bcef0ba873b985924560bc5e05572378 Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 26 Jan 2024 23:24:05 +0800 Subject: [PATCH 1050/1406] hugetlb: code clean for hugetlb_hstate_alloc_pages Patch series "hugetlb: parallelize hugetlb page init on boot", v5. # Introduction Hugetlb initialization during boot takes up a considerable amount of time. For instance, on a 2TB system, initializing 1,800 1GB huge pages takes 1-2 seconds out of 10 seconds. Initializing 11,776 1GB pages on a 12TB Intel host takes more than 1 minute[1]. This is a noteworthy figure. Inspired by [2] and [3], hugetlb initialization can also be accelerated through parallelization. The kernel already has infrastructure like padata_do_multithreaded; this patch uses it to achieve effective results with minimal modifications. [1] https://lore.kernel.org/all/783f8bac-55b8-5b95-eb6a-11a583675000@google.com/ [2] https://lore.kernel.org/all/20200527173608.2885243-1-daniel.m.jordan@oracle.com/ [3] https://lore.kernel.org/all/20230906112605.2286994-1-usama.arif@bytedance.com/ [4] https://lore.kernel.org/all/76becfc1-e609-e3e8-2966-4053143170b6@google.com/ # max_threads This patch uses `padata_do_multithreaded` like this:
```
job.max_threads = num_node_state(N_MEMORY) * multiplier;
padata_do_multithreaded(&job);
```
To fully utilize the CPU, the number of parallel threads needs to be carefully considered. `max_threads = num_node_state(N_MEMORY)` does not fully utilize the CPU, so we need to multiply it by a multiplier. Tests below indicate that a multiplier of 2 significantly improves performance, and although larger values also provide improvements, the gains are marginal.
multiplier      1       2       3       4       5
------------ ------- ------- ------- ------- -------
256G 2node    358ms   215ms   157ms   134ms   126ms
2T   4node    979ms   679ms   543ms   489ms   481ms
50G  2node     71ms    44ms    37ms    30ms    31ms
Therefore, choosing 2 as the multiplier strikes a good balance between enhancing parallel processing capabilities and maintaining efficient resource management. # Test result
test case            no patch(ms)   patched(ms)   saved
-------------------  -------------  ------------  --------
256c2T(4 node) 1G        4745           2024       57.34%
128c1T(2 node) 1G        3358           1712       49.02%
12T            1G       77000          18300       76.23%
256c2T(4 node) 2M        3336           1051       68.52%
128c1T(2 node) 2M        1943            716       63.15%
This patch (of 7): The readability of `hugetlb_hstate_alloc_pages` is poor. By cleaning the code, its readability can be improved, facilitating future modifications. This patch extracts two functions to reduce the complexity of `hugetlb_hstate_alloc_pages` and has no functional changes. - hugetlb_hstate_alloc_pages_specific_nodes(), which iterates through each online node and performs node-specific allocation where requested. - hugetlb_hstate_alloc_pages_errcheck(), which reports errors during allocation and updates h->max_huge_pages accordingly. Link: https://lkml.kernel.org/r/20240126152411.1238072-1-gang.li@linux.dev Link: https://lkml.kernel.org/r/20240126152411.1238072-2-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Muchun Song Reviewed-by: Tim Chen Cc: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ed1581b670d42e..b8e4a6adefd67c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3482,6 +3482,33 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) h->max_huge_pages_node[nid] = i; } +static bool __init hugetlb_hstate_alloc_pages_specific_nodes(struct hstate *h) +{ + int i; + bool node_specific_alloc = false; + + for_each_online_node(i) { + if (h->max_huge_pages_node[i] > 0) { + hugetlb_hstate_alloc_pages_onenode(h, i); + node_specific_alloc = true; + } + } + + return node_specific_alloc; +} + +static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, struct hstate *h) +{ + if (allocated < h->max_huge_pages) { + char buf[32]; + + string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); + pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", + h->max_huge_pages, buf, allocated); + h->max_huge_pages = allocated; + } +} + /* * NOTE: this routine is called in different contexts for gigantic and * non-gigantic pages.
@@ -3499,7 +3526,6 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) struct folio *folio; LIST_HEAD(folio_list); nodemask_t *node_alloc_noretry; - bool node_specific_alloc = false; /* skip gigantic hugepages allocation if hugetlb_cma enabled */ if (hstate_is_gigantic(h) && hugetlb_cma_size) { @@ -3508,14 +3534,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) } /* do node specific alloc */ - for_each_online_node(i) { - if (h->max_huge_pages_node[i] > 0) { - hugetlb_hstate_alloc_pages_onenode(h, i); - node_specific_alloc = true; - } - } - - if (node_specific_alloc) + if (hugetlb_hstate_alloc_pages_specific_nodes(h)) return; /* below will do all node balanced alloc */ @@ -3558,14 +3577,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) /* list will be empty if hstate_is_gigantic */ prep_and_add_allocated_folios(h, &folio_list); - if (i < h->max_huge_pages) { - char buf[32]; - - string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); - pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", - h->max_huge_pages, buf, i); - h->max_huge_pages = i; - } + hugetlb_hstate_alloc_pages_errcheck(i, h); kfree(node_alloc_noretry); } From 43b179cdf3a1a720edb74408330e41568876b9a9 Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 26 Jan 2024 23:24:06 +0800 Subject: [PATCH 1051/1406] hugetlb: split hugetlb_hstate_alloc_pages 1G and 2M huge pages have different allocation and initialization logic, which leads to subtle differences in parallelization. Therefore, it is appropriate to split hugetlb_hstate_alloc_pages into gigantic and non-gigantic. This patch has no functional changes. Link: https://lkml.kernel.org/r/20240126152411.1238072-3-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Tim Chen Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 87 ++++++++++++++++++++++++++-------------------------- 1 file changed, 43 insertions(+), 44 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b8e4a6adefd67c..98ae108e1fac56 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3509,6 +3509,43 @@ static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, } } +static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h) +{ + unsigned long i; + + for (i = 0; i < h->max_huge_pages; ++i) { + if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) + break; + cond_resched(); + } + + return i; +} + +static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) +{ + unsigned long i; + struct folio *folio; + LIST_HEAD(folio_list); + nodemask_t node_alloc_noretry; + + /* Bit mask controlling how hard we retry per-node allocations.*/ + nodes_clear(node_alloc_noretry); + + for (i = 0; i < h->max_huge_pages; ++i) { + folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], + &node_alloc_noretry); + if (!folio) + break; + list_add(&folio->lru, &folio_list); + cond_resched(); + } + + prep_and_add_allocated_folios(h, &folio_list); + + return i; +} + /* * NOTE: this routine is called in different contexts for gigantic and * non-gigantic pages. 
@@ -3522,10 +3559,7 @@ static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, */ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { - unsigned long i; - struct folio *folio; - LIST_HEAD(folio_list); - nodemask_t *node_alloc_noretry; + unsigned long allocated; /* skip gigantic hugepages allocation if hugetlb_cma enabled */ if (hstate_is_gigantic(h) && hugetlb_cma_size) { @@ -3538,47 +3572,12 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) return; /* below will do all node balanced alloc */ - if (!hstate_is_gigantic(h)) { - /* - * Bit mask controlling how hard we retry per-node allocations. - * Ignore errors as lower level routines can deal with - * node_alloc_noretry == NULL. If this kmalloc fails at boot - * time, we are likely in bigger trouble. - */ - node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry), - GFP_KERNEL); - } else { - /* allocations done at boot time */ - node_alloc_noretry = NULL; - } - - /* bit mask controlling how hard we retry per-node allocations */ - if (node_alloc_noretry) - nodes_clear(*node_alloc_noretry); - - for (i = 0; i < h->max_huge_pages; ++i) { - if (hstate_is_gigantic(h)) { - /* - * gigantic pages not added to list as they are not - * added to pools now. - */ - if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) - break; - } else { - folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], - node_alloc_noretry); - if (!folio) - break; - list_add(&folio->lru, &folio_list); - } - cond_resched(); - } - - /* list will be empty if hstate_is_gigantic */ - prep_and_add_allocated_folios(h, &folio_list); + if (hstate_is_gigantic(h)) + allocated = hugetlb_gigantic_pages_alloc_boot(h); + else + allocated = hugetlb_pages_alloc_boot(h); - hugetlb_hstate_alloc_pages_errcheck(i, h); - kfree(node_alloc_noretry); + hugetlb_hstate_alloc_pages_errcheck(allocated, h); } static void __init hugetlb_init_hstates(void) From 879a232cc8ca2e055de51847264017b9ec1003e2 Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 26 Jan 2024 23:24:07 +0800 Subject: [PATCH 1052/1406] padata: dispatch works on different nodes When a group of tasks that access different nodes are scheduled on the same node, they may encounter bandwidth bottlenecks and access latency. Thus, a numa_aware flag is introduced here, allowing tasks to be distributed across different nodes to take full advantage of multi-node systems. Link: https://lkml.kernel.org/r/20240126152411.1238072-4-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Muchun Song Reviewed-by: Tim Chen Cc: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton --- include/linux/padata.h | 2 ++ kernel/padata.c | 14 ++++++++++++-- mm/mm_init.c | 1 + 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/linux/padata.h b/include/linux/padata.h index 495b16b6b4d729..8f418711351bcc 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -137,6 +137,7 @@ struct padata_shell { * appropriate for one worker thread to do at once. * @max_threads: Max threads to use for the job, actual number may be less * depending on task size and minimum chunk size. + * @numa_aware: Distribute jobs to different nodes with CPU in a round robin fashion.
*/ struct padata_mt_job { void (*thread_fn)(unsigned long start, unsigned long end, void *arg); @@ -146,6 +147,7 @@ struct padata_mt_job { unsigned long align; unsigned long min_chunk; int max_threads; + bool numa_aware; }; /** diff --git a/kernel/padata.c b/kernel/padata.c index 179fb1518070c2..e3f639ff16707a 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -485,7 +485,8 @@ void __init padata_do_multithreaded(struct padata_mt_job *job) struct padata_work my_work, *pw; struct padata_mt_job_state ps; LIST_HEAD(works); - int nworks; + int nworks, nid; + static atomic_t last_used_nid __initdata; if (job->size == 0) return; @@ -517,7 +518,16 @@ void __init padata_do_multithreaded(struct padata_mt_job *job) ps.chunk_size = roundup(ps.chunk_size, job->align); list_for_each_entry(pw, &works, pw_list) - queue_work(system_unbound_wq, &pw->pw_work); + if (job->numa_aware) { + int old_node = atomic_read(&last_used_nid); + + do { + nid = next_node_in(old_node, node_states[N_CPU]); + } while (!atomic_try_cmpxchg(&last_used_nid, &old_node, nid)); + queue_work_node(nid, system_unbound_wq, &pw->pw_work); + } else { + queue_work(system_unbound_wq, &pw->pw_work); + } /* Use the current thread, which saves starting a workqueue worker. */ padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK); diff --git a/mm/mm_init.c b/mm/mm_init.c index 2c19f5515e36c4..549e76af8f82a8 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2231,6 +2231,7 @@ static int __init deferred_init_memmap(void *data) .align = PAGES_PER_SECTION, .min_chunk = PAGES_PER_SECTION, .max_threads = max_threads, + .numa_aware = false, }; padata_do_multithreaded(&job); From c32dcdea67bd2233bd60744dd33d43b1ee3d88a3 Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 26 Jan 2024 23:24:08 +0800 Subject: [PATCH 1053/1406] hugetlb: pass *next_nid_to_alloc directly to for_each_node_mask_to_alloc With parallelization of hugetlb allocation across different threads, each thread works on a different node to allocate pages from, instead of all allocating from a common node h->next_nid_to_alloc. To address this, it's necessary to assign a separate next_nid_to_alloc for each thread. Consequently, the hstate_next_node_to_alloc and for_each_node_mask_to_alloc have been modified to directly accept a *next_nid_to_alloc parameter, ensuring thread-specific allocation and avoiding concurrent access issues. Link: https://lkml.kernel.org/r/20240126152411.1238072-5-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Tim Chen Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 98ae108e1fac56..effe5539e545c7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1464,15 +1464,15 @@ static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) * next node from which to allocate, handling wrap at end of node * mask.
*/ -static int hstate_next_node_to_alloc(struct hstate *h, +static int hstate_next_node_to_alloc(int *next_node, nodemask_t *nodes_allowed) { int nid; VM_BUG_ON(!nodes_allowed); - nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); - h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); + nid = get_valid_node_allowed(*next_node, nodes_allowed); + *next_node = next_node_allowed(nid, nodes_allowed); return nid; } @@ -1495,10 +1495,10 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) return nid; } -#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ +#define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask) \ for (nr_nodes = nodes_weight(*mask); \ nr_nodes > 0 && \ - ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ + ((node = hstate_next_node_to_alloc(next_node, mask)) || 1); \ nr_nodes--) #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ @@ -2350,12 +2350,13 @@ static void prep_and_add_allocated_folios(struct hstate *h, */ static struct folio *alloc_pool_huge_folio(struct hstate *h, nodemask_t *nodes_allowed, - nodemask_t *node_alloc_noretry) + nodemask_t *node_alloc_noretry, + int *next_node) { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nr_nodes, node; - for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + for_each_node_mask_to_alloc(next_node, nr_nodes, node, nodes_allowed) { struct folio *folio; folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node, @@ -3310,7 +3311,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) goto found; } /* allocate from next node when distributing huge pages */ - for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { + for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_MEMORY]) { m = memblock_alloc_try_nid_raw( huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); @@ -3679,7 +3680,7 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, VM_BUG_ON(delta != -1 && delta != 1); if (delta < 0) { - for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, nodes_allowed) { if (h->surplus_huge_pages_node[node]) goto found; } @@ -3794,7 +3795,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, cond_resched(); folio = alloc_pool_huge_folio(h, nodes_allowed, - node_alloc_noretry); + node_alloc_noretry, + &h->next_nid_to_alloc); if (!folio) { prep_and_add_allocated_folios(h, &page_list); spin_lock_irq(&hugetlb_lock); From 1ddc988b0b7b5bc582110d7a31394a2a8d975df9 Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 26 Jan 2024 23:24:09 +0800 Subject: [PATCH 1054/1406] hugetlb: have CONFIG_HUGETLBFS select CONFIG_PADATA Allow hugetlb to use padata_do_multithreaded for parallel initialization. Select CONFIG_PADATA in this case.
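The padata and hugetlb patches above share one idiom: a global cursor that dispatch advances around the online nodes with a lock-free compare-exchange, so that concurrent submitters spread their work instead of piling onto one node. Below is a minimal standalone sketch of that rotation in plain C11 atomics; NR_NODES, the bitmask, and the stubbed next_node_in() are invented for illustration and are not the kernel interfaces:

    #include <stdatomic.h>
    #include <stdio.h>

    #define NR_NODES 4

    static _Atomic int last_used_nid;

    /* Toy stand-in for the kernel's next_node_in(): the next set bit
     * after 'node' in the mask, wrapping around. */
    static int next_node_in(int node, unsigned online_mask)
    {
            for (int i = 1; i <= NR_NODES; i++) {
                    int candidate = (node + i) % NR_NODES;

                    if (online_mask & (1u << candidate))
                            return candidate;
            }
            return node;
    }

    /* Advance the shared cursor one node per successful claim. If another
     * thread moved it first, the compare-exchange fails, old_node is
     * refreshed with the current value and we recompute; the same shape
     * as the atomic_try_cmpxchg() loop in padata_do_multithreaded(). */
    static int claim_next_node(unsigned online_mask)
    {
            int old_node = atomic_load(&last_used_nid);
            int nid;

            do {
                    nid = next_node_in(old_node, online_mask);
            } while (!atomic_compare_exchange_weak(&last_used_nid,
                                                   &old_node, nid));
            return nid;
    }

    int main(void)
    {
            unsigned online = 0xb;  /* pretend nodes 0, 1 and 3 are online */

            for (int i = 0; i < 6; i++)
                    printf("work %d -> node %d\n", i, claim_next_node(online));
            return 0;
    }

Each successful exchange moves the cursor exactly once, so a burst of submissions here walks nodes 1, 3, 0, 1, 3, 0 rather than landing repeatedly on a single node.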
Link: https://lkml.kernel.org/r/20240126152411.1238072-6-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Cc: Tim Chen Signed-off-by: Andrew Morton --- fs/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/Kconfig b/fs/Kconfig index 89fdbefd1075f8..a57d6e6c41e6f1 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -262,6 +262,7 @@ menuconfig HUGETLBFS depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN depends on (SYSFS || SYSCTL) select MEMFD_CREATE + select PADATA help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read From 26f8c856248e26eed1294382f9ead6ca2364d64c Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 26 Jan 2024 23:24:10 +0800 Subject: [PATCH 1055/1406] hugetlb: parallelize 2M hugetlb allocation and initialization By distributing both the allocation and the initialization tasks across multiple threads, the initialization of 2M hugetlb will be faster, thereby improving the boot speed. Here are some test results:

    test case            no patch(ms)   patched(ms)   saved
    -------------------  -------------  ------------  ------
    256c2T(4 node) 2M    3336           1051          68.52%
    128c1T(2 node) 2M    1943           716           63.15%

Link: https://lkml.kernel.org/r/20240126152411.1238072-7-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Cc: Tim Chen Signed-off-by: Andrew Morton --- mm/hugetlb.c | 73 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 17 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index effe5539e545c7..19d4dce2642bb1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -3510,6 +3511,30 @@ static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, } } +static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned long end, void *arg) +{ + struct hstate *h = (struct hstate *)arg; + int i, num = end - start; + nodemask_t node_alloc_noretry; + LIST_HEAD(folio_list); + int next_node = first_online_node; + + /* Bit mask controlling how hard we retry per-node allocations.*/ + nodes_clear(node_alloc_noretry); + + for (i = 0; i < num; ++i) { + struct folio *folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], + &node_alloc_noretry, &next_node); + if (!folio) + break; + + list_move(&folio->lru, &folio_list); + cond_resched(); + } + + prep_and_add_allocated_folios(h, &folio_list); +} + static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h) { unsigned long i; @@ -3525,26 +3550,40 @@ static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h) static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) { - unsigned long i; - struct folio *folio; - LIST_HEAD(folio_list); - nodemask_t node_alloc_noretry; - - /* Bit mask controlling how hard we retry per-node allocations.*/ - nodes_clear(node_alloc_noretry); + struct padata_mt_job job = { + .fn_arg = h, + .align = 1, + .numa_aware = true + }; - for (i = 0; i < h->max_huge_pages; ++i) { - folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], - &node_alloc_noretry); - if (!folio) - break; - list_add(&folio->lru, &folio_list); - cond_resched(); - } + job.thread_fn = hugetlb_pages_alloc_boot_node; + job.start = 0; + job.size = h->max_huge_pages; - prep_and_add_allocated_folios(h, &folio_list); + /*
+ * job.max_threads is twice the num_node_state(N_MEMORY),
+ *
+ * Tests below indicate that a multiplier of 2 significantly improves
+ * performance, and although larger values also provide improvements,
+ * the gains are marginal.
+ *
+ * Therefore, choosing 2 as the multiplier strikes a good balance between
+ * enhancing parallel processing capabilities and maintaining efficient
+ * resource management.
+ *
+ * +------------+-------+-------+-------+-------+-------+
+ * | multiplier |   1   |   2   |   3   |   4   |   5   |
+ * +------------+-------+-------+-------+-------+-------+
+ * | 256G 2node | 358ms | 215ms | 157ms | 134ms | 126ms |
+ * | 2T   4node | 979ms | 679ms | 543ms | 489ms | 481ms |
+ * | 50G  2node |  71ms |  44ms |  37ms |  30ms |  31ms |
+ * +------------+-------+-------+-------+-------+-------+
+ */
+ job.max_threads = num_node_state(N_MEMORY) * 2;
+ job.min_chunk = h->max_huge_pages / num_node_state(N_MEMORY) / 2;
+ padata_do_multithreaded(&job);
- return i;
+ return h->nr_huge_pages;
}

/*
From 854acb7091616be3b015fe29466e69f76b1b779b Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 26 Jan 2024 23:24:11 +0800 Subject: [PATCH 1056/1406] hugetlb: parallelize 1G hugetlb initialization Optimize the initialization speed of 1G huge pages through parallelization. 1G hugetlbs are allocated from bootmem, a process that is already very fast and does not currently require optimization. Therefore, we focus on parallelizing only the initialization phase in `gather_bootmem_prealloc`. Here are some test results:

    test case            no patch(ms)   patched(ms)   saved
    -------------------  -------------  ------------  ------
    256c2T(4 node) 1G    4745           2024          57.34%
    128c1T(2 node) 1G    3358           1712          49.02%
    12T            1G    77000          18300         76.23%

Link: https://lkml.kernel.org/r/20240126152411.1238072-8-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Cc: Tim Chen Signed-off-by: Andrew Morton --- arch/powerpc/mm/hugetlbpage.c | 2 +- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 44 ++++++++++++++++++++++++++++------- 3 files changed, 38 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 0a540b37aab62c..a1651d54718626 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -226,7 +226,7 @@ static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate) return 0; m = phys_to_virt(gpage_freearray[--nr_gpages]); gpage_freearray[nr_gpages] = 0; - list_add(&m->list, &huge_boot_pages); + list_add(&m->list, &huge_boot_pages[0]); m->hstate = hstate; return 1; } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c1ee640d87b11d..77b30a8c6076b6 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -178,7 +178,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage); extern int sysctl_hugetlb_shm_group; -extern struct list_head huge_boot_pages; +extern struct list_head huge_boot_pages[MAX_NUMNODES]; /* arch callbacks */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 19d4dce2642bb1..9d996fe4ecd9cc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -69,7 +69,7 @@ static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) #endif static unsigned long hugetlb_cma_size __initdata; -__initdata LIST_HEAD(huge_boot_pages); +__initdata struct list_head huge_boot_pages[MAX_NUMNODES]; /* for command line parsing */ static struct hstate * __initdata parsed_hstate; @@ -3301,7 +3301,7 @@ int
alloc_bootmem_huge_page(struct hstate *h, int nid) int __alloc_bootmem_huge_page(struct hstate *h, int nid) { struct huge_bootmem_page *m = NULL; /* initialize for clang */ - int nr_nodes, node; + int nr_nodes, node = nid; /* do node specific alloc */ if (nid != NUMA_NO_NODE) { @@ -3339,7 +3339,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) huge_page_size(h) - PAGE_SIZE); /* Put them into a private list first because mem_map is not up yet */ INIT_LIST_HEAD(&m->list); - list_add(&m->list, &huge_boot_pages); + list_add(&m->list, &huge_boot_pages[node]); m->hstate = h; return 1; } @@ -3390,8 +3390,6 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, /* Send list for bulk vmemmap optimization processing */ hugetlb_vmemmap_optimize_folios(h, folio_list); - /* Add all new pool pages to free lists in one lock cycle */ - spin_lock_irqsave(&hugetlb_lock, flags); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { if (!folio_test_hugetlb_vmemmap_optimized(folio)) { /* @@ -3404,23 +3402,27 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, HUGETLB_VMEMMAP_RESERVE_PAGES, pages_per_huge_page(h)); } + /* Subdivide locks to achieve better parallel performance */ + spin_lock_irqsave(&hugetlb_lock, flags); __prep_account_new_huge_page(h, folio_nid(folio)); enqueue_hugetlb_folio(h, folio); + spin_unlock_irqrestore(&hugetlb_lock, flags); } - spin_unlock_irqrestore(&hugetlb_lock, flags); } /* * Put bootmem huge pages into the standard lists after mem_map is up. * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages. */ -static void __init gather_bootmem_prealloc(void) +static void __init gather_bootmem_prealloc_node(unsigned long start, unsigned long end, void *arg) + { + int nid = start; LIST_HEAD(folio_list); struct huge_bootmem_page *m; struct hstate *h = NULL, *prev_h = NULL; - list_for_each_entry(m, &huge_boot_pages, list) { + list_for_each_entry(m, &huge_boot_pages[nid], list) { struct page *page = virt_to_page(m); struct folio *folio = (void *)page; @@ -3453,6 +3455,22 @@ static void __init gather_bootmem_prealloc(void) prep_and_add_bootmem_folios(h, &folio_list); } +static void __init gather_bootmem_prealloc(void) +{ + struct padata_mt_job job = { + .thread_fn = gather_bootmem_prealloc_node, + .fn_arg = NULL, + .start = 0, + .size = num_node_state(N_MEMORY), + .align = 1, + .min_chunk = 1, + .max_threads = num_node_state(N_MEMORY), + .numa_aware = true, + }; + + padata_do_multithreaded(&job); +} + static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) { unsigned long i; @@ -3600,6 +3618,7 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { unsigned long allocated; + static bool initialized __initdata; /* skip gigantic hugepages allocation if hugetlb_cma enabled */ if (hstate_is_gigantic(h) && hugetlb_cma_size) { @@ -3607,6 +3626,15 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) return; } + /* hugetlb_hstate_alloc_pages will be called many times, initialize huge_boot_pages once */ + if (!initialized) { + int i = 0; + + for (i = 0; i < MAX_NUMNODES; i++) + INIT_LIST_HEAD(&huge_boot_pages[i]); + initialized = true; + } + /* do node specific alloc */ if (hugetlb_hstate_alloc_pages_specific_nodes(h)) return; From 9db5312b96d1a50dc37f50389cda30af23f25d8c Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 26 Jan 2024 16:19:44 +0800 Subject: [PATCH 1057/1406] mm and cache_info: remove unnecessary CPU cache info update
For each CPU hotplug event, we will update per-CPU data slice size and corresponding PCP configuration for every online CPU to make the implementation simple. But Kyle reported that this takes tens of seconds during boot on a machine with 34 zones and 3840 CPUs. So, in this patch, for each CPU hotplug event, we only update per-CPU data slice size and corresponding PCP configuration for the CPUs that share caches with the hotplugged CPU. With the patch, the system boot time is reduced by 67 seconds on the machine. Link: https://lkml.kernel.org/r/20240126081944.414520-1-ying.huang@intel.com Fixes: 362d37a106dd ("mm, pcp: reduce lock contention for draining high-order pages") Signed-off-by: "Huang, Ying" Originally-by: Kyle Meyer Reported-and-tested-by: Kyle Meyer Cc: Sudeep Holla Cc: Mel Gorman Signed-off-by: Andrew Morton --- drivers/base/cacheinfo.c | 50 +++++++++++++++++++++++++++++++++++----- include/linux/gfp.h | 2 +- mm/page_alloc.c | 39 +++++++++++++++---------------- 3 files changed, 63 insertions(+), 28 deletions(-) diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c index f1e79263fe61eb..23b8cba4a2a3b8 100644 --- a/drivers/base/cacheinfo.c +++ b/drivers/base/cacheinfo.c @@ -898,6 +898,37 @@ static int cache_add_dev(unsigned int cpu) return rc; } +static unsigned int cpu_map_shared_cache(bool online, unsigned int cpu, + cpumask_t **map) +{ + struct cacheinfo *llc, *sib_llc; + unsigned int sibling; + + if (!last_level_cache_is_valid(cpu)) + return 0; + + llc = per_cpu_cacheinfo_idx(cpu, cache_leaves(cpu) - 1); + + if (llc->type != CACHE_TYPE_DATA && llc->type != CACHE_TYPE_UNIFIED) + return 0; + + if (online) { + *map = &llc->shared_cpu_map; + return cpumask_weight(*map); + } + + /* shared_cpu_map of offlined CPU will be cleared, so use sibling map */ + for_each_cpu(sibling, &llc->shared_cpu_map) { + if (sibling == cpu || !last_level_cache_is_valid(sibling)) + continue; + sib_llc = per_cpu_cacheinfo_idx(sibling, cache_leaves(sibling) - 1); + *map = &sib_llc->shared_cpu_map; + return cpumask_weight(*map); + } + + return 0; +} + /* * Calculate the size of the per-CPU data cache slice.
This can be * used to estimate the size of the data cache slice that can be used @@ -929,28 +960,31 @@ static void update_per_cpu_data_slice_size_cpu(unsigned int cpu) ci->per_cpu_data_slice_size = llc->size / nr_shared; } -static void update_per_cpu_data_slice_size(bool cpu_online, unsigned int cpu) +static void update_per_cpu_data_slice_size(bool cpu_online, unsigned int cpu, + cpumask_t *cpu_map) { unsigned int icpu; - for_each_online_cpu(icpu) { + for_each_cpu(icpu, cpu_map) { if (!cpu_online && icpu == cpu) continue; update_per_cpu_data_slice_size_cpu(icpu); + setup_pcp_cacheinfo(icpu); } } static int cacheinfo_cpu_online(unsigned int cpu) { int rc = detect_cache_attributes(cpu); + cpumask_t *cpu_map; if (rc) return rc; rc = cache_add_dev(cpu); if (rc) goto err; - update_per_cpu_data_slice_size(true, cpu); - setup_pcp_cacheinfo(); + if (cpu_map_shared_cache(true, cpu, &cpu_map)) + update_per_cpu_data_slice_size(true, cpu, cpu_map); return 0; err: free_cache_attributes(cpu); @@ -959,12 +993,16 @@ static int cacheinfo_cpu_online(unsigned int cpu) static int cacheinfo_cpu_pre_down(unsigned int cpu) { + cpumask_t *cpu_map; + unsigned int nr_shared; + + nr_shared = cpu_map_shared_cache(false, cpu, &cpu_map); if (cpumask_test_and_clear_cpu(cpu, &cache_dev_map)) cpu_cache_sysfs_exit(cpu); free_cache_attributes(cpu); - update_per_cpu_data_slice_size(false, cpu); - setup_pcp_cacheinfo(); + if (nr_shared > 1) + update_per_cpu_data_slice_size(false, cpu, cpu_map); return 0; } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index de292a0071389e..09e22091f1b03f 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -334,7 +334,7 @@ void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); void page_alloc_init_late(void); -void setup_pcp_cacheinfo(void); +void setup_pcp_cacheinfo(unsigned int cpu); /* * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 150d4f23b01048..9faca05d124e60 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5572,37 +5572,34 @@ static void zone_pcp_update(struct zone *zone, int cpu_online) mutex_unlock(&pcp_batch_high_lock); } -static void zone_pcp_update_cacheinfo(struct zone *zone) +static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu) { - int cpu; struct per_cpu_pages *pcp; struct cpu_cacheinfo *cci; - for_each_online_cpu(cpu) { - pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); - cci = get_cpu_cacheinfo(cpu); - /* - * If data cache slice of CPU is large enough, "pcp->batch" - * pages can be preserved in PCP before draining PCP for - * consecutive high-order pages freeing without allocation. - * This can reduce zone lock contention without hurting - * cache-hot pages sharing. - */ - spin_lock(&pcp->lock); - if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) - pcp->flags |= PCPF_FREE_HIGH_BATCH; - else - pcp->flags &= ~PCPF_FREE_HIGH_BATCH; - spin_unlock(&pcp->lock); - } + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + cci = get_cpu_cacheinfo(cpu); + /* + * If data cache slice of CPU is large enough, "pcp->batch" + * pages can be preserved in PCP before draining PCP for + * consecutive high-order pages freeing without allocation. + * This can reduce zone lock contention without hurting + * cache-hot pages sharing. 
+ */ + spin_lock(&pcp->lock); + if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) + pcp->flags |= PCPF_FREE_HIGH_BATCH; + else + pcp->flags &= ~PCPF_FREE_HIGH_BATCH; + spin_unlock(&pcp->lock); } -void setup_pcp_cacheinfo(void) +void setup_pcp_cacheinfo(unsigned int cpu) { struct zone *zone; for_each_populated_zone(zone) - zone_pcp_update_cacheinfo(zone); + zone_pcp_update_cacheinfo(zone, cpu); } /* From 848466d75f85fbd232035f17331627d847f59ea2 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 26 Jan 2024 08:06:43 +0000 Subject: [PATCH 1058/1406] x86/mm: delete unused cpu argument to leave_mm() The argument is unused since commit 3d28ebceaffa ("x86/mm: Rework lazy TLB to track the actual loaded mm"), delete it. Link: https://lkml.kernel.org/r/20240126080644.1714297-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Cc: Andy Lutomirski Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/include/asm/mmu.h | 2 +- arch/x86/kernel/alternative.c | 2 +- arch/x86/mm/tlb.c | 2 +- arch/x86/xen/mmu_pv.c | 2 +- drivers/cpuidle/cpuidle.c | 2 +- include/linux/mmu_context.h | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 0da5c227f490c0..ce4677b8b7356c 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -75,7 +75,7 @@ typedef struct { .lock = __MUTEX_INITIALIZER(mm.context.lock), \ } -void leave_mm(int cpu); +void leave_mm(void); #define leave_mm leave_mm #endif /* _ASM_X86_MMU_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 1d85cb7071cb21..21108d8e6f6b2e 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1805,7 +1805,7 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) * restoring the previous mm. */ if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) - leave_mm(smp_processor_id()); + leave_mm(); temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); switch_mm_irqs_off(NULL, mm, current); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5768d386efab6e..80b0caa82a91b4 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -299,7 +299,7 @@ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam, write_cr3(new_mm_cr3); } -void leave_mm(int cpu) +void leave_mm(void) { struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 5744043deb6c85..e21974f2cf2d7d 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -913,7 +913,7 @@ static void drop_mm_ref_this_cpu(void *info) struct mm_struct *mm = info; if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) - leave_mm(smp_processor_id()); + leave_mm(); /* * If this cpu still has a stale cr3 reference, then make sure diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 737a026ef58a38..02e40fd7d948c9 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -237,7 +237,7 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev, } if (target_state->flags & CPUIDLE_FLAG_TLB_FLUSHED) - leave_mm(dev->cpu); + leave_mm(); /* Take note of the planned idle state. 
*/ sched_idle_set_state(target_state); diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h index f2b7a3f040999e..bbaec80c78c505 100644 --- a/include/linux/mmu_context.h +++ b/include/linux/mmu_context.h @@ -11,7 +11,7 @@ #endif #ifndef leave_mm -static inline void leave_mm(int cpu) { } +static inline void leave_mm(void) { } #endif /* From 14568c99fdea232f7c35073554a3b15e6ef6ae95 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 26 Jan 2024 08:06:44 +0000 Subject: [PATCH 1059/1406] x86/mm: clarify "prev" usage in switch_mm_irqs_off() In the x86 implementation of switch_mm_irqs_off(), we do not use the "prev" argument passed in by the caller; we exclusively use "real_prev", which is cpu_tlbstate.loaded_mm. This is not obvious at first sight. Furthermore, a comment describes a condition that happens when called with prev == next, but this should not affect the function in any way since prev is unused. Apparently, the comment is intended to clarify why we don't rely on prev == next to decide whether we need to update CR3, but again, it is not obvious. The comment also references the fact that leave_mm() calls it with prev == NULL and tsk == NULL, but this also shouldn't matter because prev is unused and tsk is only used in one function which has a NULL check. Clarify things by renaming (prev -> unused) and (real_prev -> prev); also move and rewrite the comment as an explanation for why we don't rely on "prev" supplied by the caller in x86 code and use our own. Hopefully this makes reading the code easier. Link: https://lkml.kernel.org/r/20240126080644.1714297-2-yosryahmed@google.com Signed-off-by: Yosry Ahmed Cc: Andy Lutomirski Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/mm/tlb.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 80b0caa82a91b4..bf9605caf24f74 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -492,10 +492,16 @@ void cr4_update_pce(void *ignored) static inline void cr4_update_pce_mm(struct mm_struct *mm) { } #endif -void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, +/* + * The "prev" argument passed by the caller does not always match CR3. For + * example, the scheduler passes in active_mm when switching from lazy TLB mode + * to normal mode, but switch_mm_irqs_off() can be called from x86 code without + * updating active_mm. Use cpu_tlbstate.loaded_mm instead. + */ +void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, struct task_struct *tsk) { - struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); + struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); unsigned long new_lam = mm_lam_cr3_mask(next); bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy); @@ -504,15 +510,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, bool need_flush; u16 new_asid; - /* - * NB: The scheduler will call us with prev == next when switching - * from lazy TLB mode to normal mode if active_mm isn't changing. - * When this happens, we don't assume that CR3 (and hence - * cpu_tlbstate.loaded_mm) matches next. - * - * NB: leave_mm() calls us with prev == NULL and tsk == NULL. - */ - /* We don't want flush_tlb_func() to run concurrently with us.
*/ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) WARN_ON_ONCE(!irqs_disabled()); @@ -527,7 +524,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * isn't free. */ #ifdef CONFIG_DEBUG_VM - if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid, + if (WARN_ON_ONCE(__read_cr3() != build_cr3(prev->pgd, prev_asid, tlbstate_lam_cr3_mask()))) { /* * If we were to BUG here, we'd be very likely to kill @@ -559,7 +556,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * provides that full memory barrier and core serializing * instruction. */ - if (real_prev == next) { + if (prev == next) { /* Not actually switching mm's */ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != next->context.ctx_id); @@ -574,7 +571,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * mm_cpumask. The TLB shootdown code can figure out from * cpu_tlbstate_shared.is_lazy whether or not to send an IPI. */ - if (WARN_ON_ONCE(real_prev != &init_mm && + if (WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); @@ -616,10 +613,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * Skip kernel threads; we never send init_mm TLB flushing IPIs, * but the bitmap manipulation can cause cache line contention. */ - if (real_prev != &init_mm) { + if (prev != &init_mm) { VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, - mm_cpumask(real_prev))); - cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); + mm_cpumask(prev))); + cpumask_clear_cpu(cpu, mm_cpumask(prev)); } /* @@ -656,9 +653,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.loaded_mm, next); this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); - if (next != real_prev) { + if (next != prev) { cr4_update_pce_mm(next); - switch_ldt(real_prev, next); + switch_ldt(prev, next); } } From 3fe04866df799964bb2e2e864db1c9c5db342422 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 28 Jan 2024 13:28:50 +0000 Subject: [PATCH 1060/1406] mm/zswap: fix race between lru writeback and swapoff LRU writeback has a race problem with swapoff, as spotted by Yosry [1]:

    CPU1                           CPU2
    shrink_memcg_cb                swap_off
      list_lru_isolate               zswap_invalidate
                                     zswap_swapoff
                                       kfree(tree)
      spin_lock(&tree->lock)       // UAF

The problem is that the entry in the LRU list can't protect the tree from being freed by swapoff, and the entry can also be invalidated and freed concurrently after we unlock the lru lock. We can fix it by moving the swap cache allocation ahead of referencing the tree, then checking the invalidate race with the tree lock; only after that can we safely deref the entry. Note we can't deref the entry or tree anymore after we unlock the folio, since we depend on the locked folio to hold off swapoff. So this patch moves all tree and entry usage to zswap_writeback_entry(); we only use the copied swpentry on the stack to allocate the swap cache, and if it returns with the folio locked we can reference the tree safely. Then we can check the invalidate race with the tree lock; what follows is much the same as zswap_load(). Since we can't deref the entry after zswap_writeback_entry(), we can't use zswap_lru_putback() anymore; instead we rotate the entry at the beginning. And it will be unlinked and freed when invalidated if writeback succeeds. Another change is we don't update the memcg nr_zswap_protected in the -ENOMEM and -EEXIST cases anymore.
The -EEXIST case means we raced with swapin or a concurrent shrinker action; since swapin already has memcg nr_zswap_protected updated, we don't need to double count here. For a concurrent shrinker, the folio will be written back and freed anyway. The -ENOMEM case is extremely rare and doesn't happen spuriously either, so don't bother distinguishing this case. [1] https://lore.kernel.org/all/CAJD7tkasHsRnT_75-TXsEe58V9_OW6m3g6CF7Kmsvz8CKRG_EA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20240126-zswap-writeback-race-v2-2-b10479847099@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Chris Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 114 +++++++++++++++++++++++------------------------------ 1 file changed, 49 insertions(+), 65 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index bccef2af43cc37..ddc8f930d3434e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -277,7 +277,7 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) zpool_get_type((p)->zpools[0])) static int zswap_writeback_entry(struct zswap_entry *entry, - struct zswap_tree *tree); + swp_entry_t swpentry); static int zswap_pool_get(struct zswap_pool *pool); static void zswap_pool_put(struct zswap_pool *pool); @@ -444,27 +444,6 @@ static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) rcu_read_unlock(); } -static void zswap_lru_putback(struct list_lru *list_lru, - struct zswap_entry *entry) -{ - int nid = entry_to_nid(entry); - spinlock_t *lock = &list_lru->node[nid].lock; - struct mem_cgroup *memcg; - struct lruvec *lruvec; - - rcu_read_lock(); - memcg = mem_cgroup_from_entry(entry); - spin_lock(lock); - /* we cannot use list_lru_add here, because it increments node's lru count */ - list_lru_putback(list_lru, &entry->lru, nid, memcg); - spin_unlock(lock); - - lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry))); - /* increment the protection area to account for the LRU rotation. */ - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); - rcu_read_unlock(); -} - /********************************* * rbtree functions **********************************/ @@ -859,40 +838,47 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o { struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); bool *encountered_page_in_swapcache = (bool *)arg; - struct zswap_tree *tree; - pgoff_t swpoffset; + swp_entry_t swpentry; enum lru_status ret = LRU_REMOVED_RETRY; int writeback_result; + /* + * Rotate the entry to the tail before unlocking the LRU, + * so that in case of an invalidation race concurrent + * reclaimers don't waste their time on it. + * + * If writeback succeeds, or failure is due to the entry + * being invalidated by the swap subsystem, the invalidation + * will unlink and free it. + * + * Temporary failures, where the same entry should be tried + * again immediately, almost never happen for this shrinker. + * We don't do any trylocking; -ENOMEM comes closest, + * but that's extremely rare and doesn't happen spuriously + * either. Don't bother distinguishing this case. + * + * But since they do exist in theory, the entry cannot just + * be unlinked, or we could leak it. Hence, rotate. + */ + list_move_tail(item, &l->list); + /* * Once the lru lock is dropped, the entry might get freed. The - * swpoffset is copied to the stack, and entry isn't deref'd again + * swpentry is copied to the stack, and entry isn't deref'd again * until the entry is verified to still be alive in the tree.
*/ - swpoffset = swp_offset(entry->swpentry); - tree = swap_zswap_tree(entry->swpentry); - list_lru_isolate(l, item); + swpentry = entry->swpentry; + /* * It's safe to drop the lock here because we return either * LRU_REMOVED_RETRY or LRU_RETRY. */ spin_unlock(lock); - /* Check for invalidate() race */ - spin_lock(&tree->lock); - if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) - goto unlock; - - /* Hold a reference to prevent a free during writeback */ - zswap_entry_get(entry); - spin_unlock(&tree->lock); + writeback_result = zswap_writeback_entry(entry, swpentry); - writeback_result = zswap_writeback_entry(entry, tree); - - spin_lock(&tree->lock); if (writeback_result) { zswap_reject_reclaim_fail++; - zswap_lru_putback(&entry->pool->list_lru, entry); ret = LRU_RETRY; /* @@ -902,27 +888,10 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o */ if (writeback_result == -EEXIST && encountered_page_in_swapcache) *encountered_page_in_swapcache = true; - - goto put_unlock; + } else { + zswap_written_back_pages++; } - zswap_written_back_pages++; - - if (entry->objcg) - count_objcg_event(entry->objcg, ZSWPWB); - count_vm_event(ZSWPWB); - /* - * Writeback started successfully, the page now belongs to the - * swapcache. Drop the entry from zswap - unless invalidate already - * took it out while we had the tree->lock released for IO. - */ - zswap_invalidate_entry(tree, entry); - -put_unlock: - /* Drop local reference */ - zswap_entry_put(entry); -unlock: - spin_unlock(&tree->lock); spin_lock(lock); return ret; } @@ -1407,9 +1376,9 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page) * freed. */ static int zswap_writeback_entry(struct zswap_entry *entry, - struct zswap_tree *tree) + swp_entry_t swpentry) { - swp_entry_t swpentry = entry->swpentry; + struct zswap_tree *tree; struct folio *folio; struct mempolicy *mpol; bool folio_was_allocated; @@ -1425,9 +1394,11 @@ static int zswap_writeback_entry(struct zswap_entry *entry, return -ENOMEM; /* - * Found an existing folio, we raced with load/swapin. We generally - * writeback cold folios from zswap, and swapin means the folio just - * became hot. Skip this folio and let the caller find another one. + * Found an existing folio, we raced with swapin or concurrent + * shrinker. We generally writeback cold folios from zswap, and + * swapin means the folio just became hot, so skip this folio. + * For unlikely concurrent shrinker case, it will be unlinked + * and freed when invalidated by the concurrent shrinker anyway. */ if (!folio_was_allocated) { folio_put(folio); @@ -1441,18 +1412,31 @@ static int zswap_writeback_entry(struct zswap_entry *entry, * backs (our zswap_entry reference doesn't prevent that), to * avoid overwriting a new swap folio with old compressed data. */ + tree = swap_zswap_tree(swpentry); spin_lock(&tree->lock); - if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { + if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) { spin_unlock(&tree->lock); delete_from_swap_cache(folio); folio_unlock(folio); folio_put(folio); return -ENOMEM; } + + /* Safe to deref entry after the entry is verified above. 
*/ + zswap_entry_get(entry); spin_unlock(&tree->lock); __zswap_load(entry, &folio->page); + count_vm_event(ZSWPWB); + if (entry->objcg) + count_objcg_event(entry->objcg, ZSWPWB); + + spin_lock(&tree->lock); + zswap_invalidate_entry(tree, entry); + zswap_entry_put(entry); + spin_unlock(&tree->lock); + /* folio is up to date */ folio_mark_uptodate(folio); From 63d89e123b8d76ed2a161a2784dcae67c27082aa Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 28 Jan 2024 13:28:51 +0000 Subject: [PATCH 1061/1406] mm/list_lru: remove list_lru_putback() Since the only user zswap_lru_putback() has gone, remove list_lru_putback() too. Link: https://lkml.kernel.org/r/20240126-zswap-writeback-race-v2-3-b10479847099@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Yosry Ahmed Cc: Chris Li Cc: Johannes Weiner Cc: Nhat Pham Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 16 ---------------- mm/list_lru.c | 14 -------------- mm/zswap.c | 2 +- 3 files changed, 1 insertion(+), 31 deletions(-) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index c679e6b293c4c4..f2882a82069027 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -168,22 +168,6 @@ static inline unsigned long list_lru_count(struct list_lru *lru) void list_lru_isolate(struct list_lru_one *list, struct list_head *item); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head); -/** - * list_lru_putback: undo list_lru_isolate - * @lru: the lru pointer. - * @item: the item to put back. - * @nid: the node id of the sublist to put the item back to. - * @memcg: the cgroup of the sublist to put the item back to. - * - * Put back an isolated item into its original LRU. Note that unlike - * list_lru_add, this does not increment the node LRU count (as - * list_lru_isolate does not originally decrement this count). - * - * Since we might have dropped the LRU lock in between, recompute list_lru_one - * from the node's id and memcg. - */ -void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid, - struct mem_cgroup *memcg); typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, struct list_lru_one *list, spinlock_t *lock, void *cb_arg); diff --git a/mm/list_lru.c b/mm/list_lru.c index 158781d1d3c215..61f3b6b1134fbe 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -162,20 +162,6 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, } EXPORT_SYMBOL_GPL(list_lru_isolate_move); -void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid, - struct mem_cgroup *memcg) -{ - struct list_lru_one *list = - list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); - - if (list_empty(item)) { - list_add_tail(item, &list->list); - if (!list->nr_items++) - set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); - } -} -EXPORT_SYMBOL_GPL(list_lru_putback); - unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg) { diff --git a/mm/zswap.c b/mm/zswap.c index ddc8f930d3434e..2d7f594e6d07fa 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -410,7 +410,7 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The * new entry will be added directly to memcg's parent's list_lru. * - * Similar reasoning holds for list_lru_del() and list_lru_putback(). + * Similar reasoning holds for list_lru_del(). 
*/ rcu_read_lock(); memcg = mem_cgroup_from_entry(entry); From 9c50a832dbfedc88116fc7c66ccc2883d9f9dbeb Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Mon, 29 Jan 2024 13:45:51 +0800 Subject: [PATCH 1062/1406] mm/khugepaged: bypassing unnecessary scans with MMF_DISABLE_THP check khugepaged scans the entire address space in the background for each given mm, looking for opportunities to merge sequences of basic pages into huge pages. However, when an mm is inserted to the mm_slots list, and the MMF_DISABLE_THP flag is set later, this scanning process becomes unnecessary for that mm and can be skipped to avoid redundant operations, especially in scenarios with a large address space. This commit introduces a check before each scanning process to test the MMF_DISABLE_THP flag for the given mm; if the flag is set, the scanning process is bypassed, thereby improving the efficiency of khugepaged. Link: https://lkml.kernel.org/r/20240129054551.57728-1-ioworker0@gmail.com Signed-off-by: Lance Yang Cc: David Hildenbrand Cc: Michal Hocko Cc: Minchan Kim Cc: Muchun Song Cc: Peter Xu Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/khugepaged.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index fe43fbc4452539..2771fc043b3b8b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -410,6 +410,12 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } +static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) +{ + return hpage_collapse_test_exit(mm) || + test_bit(MMF_DISABLE_THP, &mm->flags); +} + void __khugepaged_enter(struct mm_struct *mm) { struct khugepaged_mm_slot *mm_slot; @@ -1422,7 +1428,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) lockdep_assert_held(&khugepaged_mm_lock); - if (hpage_collapse_test_exit(mm)) { + if (hpage_collapse_test_exit_or_disable(mm)) { /* free mm_slot */ hash_del(&slot->hash); list_del(&slot->mm_node); @@ -2360,7 +2366,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, goto breakouterloop_mmap_lock; progress++; - if (unlikely(hpage_collapse_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) goto breakouterloop; vma_iter_init(&vmi, mm, khugepaged_scan.address); @@ -2368,7 +2374,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, unsigned long hstart, hend; cond_resched(); - if (unlikely(hpage_collapse_test_exit(mm))) { + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) { progress++; break; } @@ -2390,7 +2396,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, bool mmap_locked = true; cond_resched(); - if (unlikely(hpage_collapse_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) goto breakouterloop; VM_BUG_ON(khugepaged_scan.address < hstart || @@ -2408,7 +2414,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, fput(file); if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { mmap_read_lock(mm); - if (hpage_collapse_test_exit(mm)) + if (hpage_collapse_test_exit_or_disable(mm)) goto breakouterloop; *result = collapse_pte_mapped_thp(mm, khugepaged_scan.address, false); @@ -2450,7 +2456,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. 
*/ - if (hpage_collapse_test_exit(mm) || !vma) { + if (hpage_collapse_test_exit_or_disable(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find From df94195adb7569ed8e158e8dd00755136c126bc2 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 22 Jan 2024 21:01:53 +0800 Subject: [PATCH 1063/1406] mm: compaction: limit the suitable target page order to be less than cc->order It cannot improve fragmentation if we isolate the target free pages exceeding cc->order, especially when cc->order is less than pageblock_order. For example, suppose the pageblock_order is MAX_ORDER (size is 4M) and cc->order is 2M THP size, we should not isolate other 2M free pages to be the migration target, which cannot improve the fragmentation. Moreover, this is also applicable to large folio compaction. Link: https://lkml.kernel.org/r/afcd9377351c259df7a25a388a4a0d5862b986f4.1705928395.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 4add68d40e8d99..4494b2914386c1 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1365,12 +1365,14 @@ static bool suitable_migration_target(struct compact_control *cc, { /* If the page is a large free page, then disallow migration */ if (PageBuddy(page)) { + int order = cc->order > 0 ? cc->order : pageblock_order; + /* * We are checking page_order without zone->lock taken. But * the only small danger is that we skip a potentially suitable * pageblock, so it's not worth to check order for valid range. */ - if (buddy_order_unsafe(page) >= pageblock_order) + if (buddy_order_unsafe(page) >= order) return false; } From a9b42fed77348d322333c0b69760d16f9749b98a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:37 -0500 Subject: [PATCH 1064/1406] mm: zswap: rename zswap_free_entry to zswap_entry_free There is a zswap_entry_ namespace with multiple functions already. Link: https://lkml.kernel.org/r/20240130014208.565554-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 2d7f594e6d07fa..5cb79cb497c06b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -519,7 +519,7 @@ static struct zpool *zswap_find_zpool(struct zswap_entry *entry) * Carries out the common pattern of freeing and entry's zpool allocation, * freeing the entry itself, and decrementing the number of stored pages. */ -static void zswap_free_entry(struct zswap_entry *entry) +static void zswap_entry_free(struct zswap_entry *entry) { if (!entry->length) atomic_dec(&zswap_same_filled_pages); @@ -554,7 +554,7 @@ static void zswap_entry_put(struct zswap_entry *entry) WARN_ON_ONCE(refcount < 0); if (refcount == 0) { WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode)); - zswap_free_entry(entry); + zswap_entry_free(entry); } } From 4e71eeeb58250db7963281564e84c0d3d41d6354 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:38 -0500 Subject: [PATCH 1065/1406] mm: zswap: inline and remove zswap_entry_find_get() There is only one caller and the function is trivial. Inline it.
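Inlined, the call site reduces to the familiar refcounted-lookup shape: search the tree and take a reference while still holding the lock that protects it. A rough standalone sketch of that shape, with a toy one-slot table standing in for the rbtree (illustration only, not the kernel code):

    #include <pthread.h>

    struct entry {
            unsigned long key;
            int refcount;           /* protected by tree->lock; the tree owns one ref */
    };

    struct tree {
            pthread_mutex_t lock;
            struct entry *slot;     /* toy single-slot stand-in for the rbtree */
    };

    /* tree->lock must be held */
    static void entry_get(struct entry *e)
    {
            e->refcount++;
    }

    /* Take the reference before dropping tree->lock: once the lock is
     * released, a concurrent invalidation may unlink the entry and drop
     * the tree's reference, and only our own reference keeps it alive. */
    static struct entry *lookup_get(struct tree *t, unsigned long key)
    {
            struct entry *e;

            pthread_mutex_lock(&t->lock);
            e = (t->slot && t->slot->key == key) ? t->slot : NULL;
            if (e)
                    entry_get(e);
            pthread_mutex_unlock(&t->lock);
            return e;
    }

The ordering is the whole point: were the reference taken after the unlock, the entry could be freed in the window between lookup and use.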
Link: https://lkml.kernel.org/r/20240130014208.565554-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 5cb79cb497c06b..3df8b6329cf5c5 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -558,19 +558,6 @@ static void zswap_entry_put(struct zswap_entry *entry) } } -/* caller must hold the tree lock */ -static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, - pgoff_t offset) -{ - struct zswap_entry *entry; - - entry = zswap_rb_search(root, offset); - if (entry) - zswap_entry_get(entry); - - return entry; -} - /********************************* * shrinker functions **********************************/ @@ -1708,13 +1695,13 @@ bool zswap_load(struct folio *folio) VM_WARN_ON_ONCE(!folio_test_locked(folio)); - /* find */ spin_lock(&tree->lock); - entry = zswap_entry_find_get(&tree->rbroot, offset); + entry = zswap_rb_search(&tree->rbroot, offset); if (!entry) { spin_unlock(&tree->lock); return false; } + zswap_entry_get(entry); spin_unlock(&tree->lock); if (entry->length) From 9bd9047a76c8cc362a770e932ae53754e9b97ce6 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:39 -0500 Subject: [PATCH 1066/1406] mm: zswap: move zswap_invalidate_entry() to related functions Move it up to the other tree and refcounting functions. Link: https://lkml.kernel.org/r/20240130014208.565554-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 3df8b6329cf5c5..0dfd410d1b3c31 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -558,6 +558,18 @@ static void zswap_entry_put(struct zswap_entry *entry) } } +/* + * If the entry is still valid in the tree, drop the initial ref and remove it + * from the tree. This function must be called with an additional ref held, + * otherwise it may race with another invalidation freeing the entry. + */ +static void zswap_invalidate_entry(struct zswap_tree *tree, + struct zswap_entry *entry) +{ + if (zswap_rb_erase(&tree->rbroot, entry)) + zswap_entry_put(entry); +} + /********************************* * shrinker functions **********************************/ @@ -808,18 +820,6 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) return NULL; } -/* - * If the entry is still valid in the tree, drop the initial ref and remove it - * from the tree. This function must be called with an additional ref held, - * otherwise it may race with another invalidation freeing the entry. - */ -static void zswap_invalidate_entry(struct zswap_tree *tree, - struct zswap_entry *entry) -{ - if (zswap_rb_erase(&tree->rbroot, entry)) - zswap_entry_put(entry); -} - static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg) { From 324e30e73ca340055bf79271b6753160759fd164 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:40 -0500 Subject: [PATCH 1067/1406] mm: zswap: warn when referencing a dead entry Put a standard sanity check on zswap_entry_get() for UAF scenario. 
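The reasoning behind the check: a refcount that is already zero means teardown of the entry has begun, so taking a new reference would resurrect an object that is concurrently being freed. A tiny standalone sketch of the same guard, with assert() standing in for WARN_ON_ONCE() and toy types (not the kernel code):

    #include <assert.h>

    struct obj {
            int refcount;   /* protected by an external lock */
    };

    /* lock must be held; refuse to take a new reference on a dying object */
    static void obj_get(struct obj *o)
    {
            assert(o->refcount > 0);        /* the WARN_ON_ONCE(!entry->refcount) analogue */
            o->refcount++;
    }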
Link: https://lkml.kernel.org/r/20240130014208.565554-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/zswap.c b/mm/zswap.c index 0dfd410d1b3c31..70e409add32b2e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -541,6 +541,7 @@ static void zswap_entry_free(struct zswap_entry *entry) /* caller must hold the tree lock */ static void zswap_entry_get(struct zswap_entry *entry) { + WARN_ON_ONCE(!entry->refcount); entry->refcount++; } From 5934b964eb8c9d420a77d866e371cc007a89b2c5 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:41 -0500 Subject: [PATCH 1068/1406] mm: zswap: clean up zswap_entry_put() Remove stale comment and unnecessary local variable. Link: https://lkml.kernel.org/r/20240130014208.565554-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Yosry Ahmed Reviewed-by: Nhat Pham Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 70e409add32b2e..32bcc291397ffc 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -545,15 +545,11 @@ static void zswap_entry_get(struct zswap_entry *entry) entry->refcount++; } -/* caller must hold the tree lock -* remove from the tree and free it, if nobody reference the entry -*/ +/* caller must hold the tree lock */ static void zswap_entry_put(struct zswap_entry *entry) { - int refcount = --entry->refcount; - - WARN_ON_ONCE(refcount < 0); - if (refcount == 0) { + WARN_ON_ONCE(!entry->refcount); + if (--entry->refcount == 0) { WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode)); zswap_entry_free(entry); } From 0c1bc7f66c0d3624e761c5324758261a2a2c7f1a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:42 -0500 Subject: [PATCH 1069/1406] mm: zswap: rename __zswap_load() to zswap_decompress() Link: https://lkml.kernel.org/r/20240130014208.565554-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 32bcc291397ffc..4a0849bf893d21 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1315,7 +1315,7 @@ static int zswap_enabled_param_set(const char *val, return ret; } -static void __zswap_load(struct zswap_entry *entry, struct page *page) +static void zswap_decompress(struct zswap_entry *entry, struct page *page) { struct zpool *zpool = zswap_find_zpool(entry); struct scatterlist input, output; @@ -1410,7 +1410,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, zswap_entry_get(entry); spin_unlock(&tree->lock); - __zswap_load(entry, &folio->page); + zswap_decompress(entry, &folio->page); count_vm_event(ZSWPWB); if (entry->objcg) @@ -1702,7 +1702,7 @@ bool zswap_load(struct folio *folio) spin_unlock(&tree->lock); if (entry->length) - __zswap_load(entry, page); + zswap_decompress(entry, page); else { dst = kmap_local_page(page); zswap_fill_page(dst, entry->value); From 41bb36d6bf5f0c306745df2a8c58b14bd9e0e1f9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:43 -0500 Subject: [PATCH 1070/1406] mm: zswap: break out zwap_compress() zswap_store() is long and mixes work at the zswap layer with work at the backend and compression layer. 
Move compression & backend work to zswap_compress(), mirroring zswap_decompress(). Link: https://lkml.kernel.org/r/20240130014208.565554-8-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 145 ++++++++++++++++++++++++++++------------------------- 1 file changed, 77 insertions(+), 68 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 4a0849bf893d21..82f788a93a33ed 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1315,6 +1315,79 @@ static int zswap_enabled_param_set(const char *val, return ret; } +static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) +{ + struct crypto_acomp_ctx *acomp_ctx; + struct scatterlist input, output; + unsigned int dlen = PAGE_SIZE; + unsigned long handle; + struct zpool *zpool; + char *buf; + gfp_t gfp; + int ret; + u8 *dst; + + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + + mutex_lock(&acomp_ctx->mutex); + + dst = acomp_ctx->buffer; + sg_init_table(&input, 1); + sg_set_page(&input, &folio->page, PAGE_SIZE, 0); + + /* + * We need PAGE_SIZE * 2 here since there maybe over-compression case, + * and hardware-accelerators may won't check the dst buffer size, so + * giving the dst buffer with enough length to avoid buffer overflow. + */ + sg_init_one(&output, dst, PAGE_SIZE * 2); + acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); + + /* + * it maybe looks a little bit silly that we send an asynchronous request, + * then wait for its completion synchronously. This makes the process look + * synchronous in fact. + * Theoretically, acomp supports users send multiple acomp requests in one + * acomp instance, then get those requests done simultaneously. but in this + * case, zswap actually does store and load page by page, there is no + * existing method to send the second page before the first page is done + * in one thread doing zwap. + * but in different threads running on different cpu, we have different + * acomp instance, so multiple threads can do (de)compression in parallel. 
+ */ + ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; + if (ret) { + zswap_reject_compress_fail++; + goto unlock; + } + + zpool = zswap_find_zpool(entry); + gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + if (zpool_malloc_support_movable(zpool)) + gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; + ret = zpool_malloc(zpool, dlen, gfp, &handle); + if (ret == -ENOSPC) { + zswap_reject_compress_poor++; + goto unlock; + } + if (ret) { + zswap_reject_alloc_fail++; + goto unlock; + } + + buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); + memcpy(buf, dst, dlen); + zpool_unmap_handle(zpool, handle); + + entry->handle = handle; + entry->length = dlen; + +unlock: + mutex_unlock(&acomp_ctx->mutex); + return ret == 0; +} + static void zswap_decompress(struct zswap_entry *entry, struct page *page) { struct zpool *zpool = zswap_find_zpool(entry); @@ -1471,18 +1544,11 @@ bool zswap_store(struct folio *folio) struct page *page = &folio->page; struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry, *dupentry; - struct scatterlist input, output; - struct crypto_acomp_ctx *acomp_ctx; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; struct zswap_pool *pool; - struct zpool *zpool; - unsigned int dlen = PAGE_SIZE; - unsigned long handle, value; - char *buf; - u8 *src, *dst; - gfp_t gfp; - int ret; + unsigned long value; + u8 *src; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1568,65 +1634,10 @@ bool zswap_store(struct folio *folio) mem_cgroup_put(memcg); } - /* compress */ - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - - mutex_lock(&acomp_ctx->mutex); - - dst = acomp_ctx->buffer; - sg_init_table(&input, 1); - sg_set_page(&input, &folio->page, PAGE_SIZE, 0); + if (!zswap_compress(folio, entry)) + goto put_pool; - /* - * We need PAGE_SIZE * 2 here since there maybe over-compression case, - * and hardware-accelerators may won't check the dst buffer size, so - * giving the dst buffer with enough length to avoid buffer overflow. - */ - sg_init_one(&output, dst, PAGE_SIZE * 2); - acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); - /* - * it maybe looks a little bit silly that we send an asynchronous request, - * then wait for its completion synchronously. This makes the process look - * synchronous in fact. - * Theoretically, acomp supports users send multiple acomp requests in one - * acomp instance, then get those requests done simultaneously. but in this - * case, zswap actually does store and load page by page, there is no - * existing method to send the second page before the first page is done - * in one thread doing zwap. - * but in different threads running on different cpu, we have different - * acomp instance, so multiple threads can do (de)compression in parallel. 
- */ - ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); - dlen = acomp_ctx->req->dlen; - - if (ret) { - zswap_reject_compress_fail++; - goto put_dstmem; - } - - /* store */ - zpool = zswap_find_zpool(entry); - gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - if (zpool_malloc_support_movable(zpool)) - gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; - ret = zpool_malloc(zpool, dlen, gfp, &handle); - if (ret == -ENOSPC) { - zswap_reject_compress_poor++; - goto put_dstmem; - } - if (ret) { - zswap_reject_alloc_fail++; - goto put_dstmem; - } - buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); - memcpy(buf, dst, dlen); - zpool_unmap_handle(zpool, handle); - mutex_unlock(&acomp_ctx->mutex); - - /* populate entry */ entry->swpentry = swp; - entry->handle = handle; - entry->length = dlen; insert_entry: entry->objcg = objcg; @@ -1663,8 +1674,6 @@ bool zswap_store(struct folio *folio) return true; -put_dstmem: - mutex_unlock(&acomp_ctx->mutex); put_pool: zswap_pool_put(entry->pool); freepage: From 8a1552d21f062db7000305386568230a2d7db6c4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:44 -0500 Subject: [PATCH 1071/1406] mm: zswap: further cleanup zswap_store() - Remove dupentry, reusing entry works just fine. - Rename pool to shrink_pool, as this one actually is confusing. - Remove page, use folio_nid() and kmap_local_folio() directly. - Set entry->swpentry in a common path. - Move value and src to local scope of use. Link: https://lkml.kernel.org/r/20240130014208.565554-9-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 82f788a93a33ed..1a86659e1173de 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1541,14 +1541,11 @@ bool zswap_store(struct folio *folio) { swp_entry_t swp = folio->swap; pgoff_t offset = swp_offset(swp); - struct page *page = &folio->page; struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry, *dupentry; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; - struct zswap_pool *pool; - unsigned long value; - u8 *src; + struct zswap_pool *shrink_pool; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1563,10 +1560,10 @@ bool zswap_store(struct folio *folio) * the tree, and it might be written back overriding the new data. 
 */
 	spin_lock(&tree->lock);
-	dupentry = zswap_rb_search(&tree->rbroot, offset);
-	if (dupentry) {
+	entry = zswap_rb_search(&tree->rbroot, offset);
+	if (entry) {
+		zswap_invalidate_entry(tree, entry);
 		zswap_duplicate_entry++;
-		zswap_invalidate_entry(tree, dupentry);
 	}
 	spin_unlock(&tree->lock);
@@ -1598,17 +1595,19 @@ bool zswap_store(struct folio *folio)
 	}
 
 	/* allocate entry */
-	entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
+	entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio));
 	if (!entry) {
 		zswap_reject_kmemcache_fail++;
 		goto reject;
 	}
 
 	if (zswap_same_filled_pages_enabled) {
-		src = kmap_local_page(page);
+		unsigned long value;
+		u8 *src;
+
+		src = kmap_local_folio(folio, 0);
 		if (zswap_is_page_same_filled(src, &value)) {
 			kunmap_local(src);
-			entry->swpentry = swp;
 			entry->length = 0;
 			entry->value = value;
 			atomic_inc(&zswap_same_filled_pages);
@@ -1637,9 +1636,8 @@ bool zswap_store(struct folio *folio)
 	if (!zswap_compress(folio, entry))
 		goto put_pool;
 
-	entry->swpentry = swp;
-
 insert_entry:
+	entry->swpentry = swp;
 	entry->objcg = objcg;
 	if (objcg) {
 		obj_cgroup_charge_zswap(objcg, entry->length);
@@ -1684,9 +1682,9 @@ bool zswap_store(struct folio *folio)
 	return false;
 
 shrink:
-	pool = zswap_pool_last_get();
-	if (pool && !queue_work(shrink_wq, &pool->shrink_work))
-		zswap_pool_put(pool);
+	shrink_pool = zswap_pool_last_get();
+	if (shrink_pool && !queue_work(shrink_wq, &shrink_pool->shrink_work))
+		zswap_pool_put(shrink_pool);
 	goto reject;
 }

From 8bc23a05de7c21458a5391b248613faab57bc3ab Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Mon, 29 Jan 2024 20:36:45 -0500
Subject: [PATCH 1072/1406] mm: zswap: simplify zswap_invalidate()

The branching is awkward and duplicates code. The comment about
writeback is also misleading: yes, the entry might have been written
back. Or it might have never been stored in zswap to begin with due to
a rejection - zswap_invalidate() is called on all exiting swap entries.

Link: https://lkml.kernel.org/r/20240130014208.565554-10-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner
Reviewed-by: Nhat Pham
Acked-by: Yosry Ahmed
Reviewed-by: Chengming Zhou
Signed-off-by: Andrew Morton
---
 mm/zswap.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 1a86659e1173de..732b0a701b77fc 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1739,15 +1739,10 @@ void zswap_invalidate(int type, pgoff_t offset)
 	struct zswap_tree *tree = swap_zswap_tree(swp_entry(type, offset));
 	struct zswap_entry *entry;
 
-	/* find */
 	spin_lock(&tree->lock);
 	entry = zswap_rb_search(&tree->rbroot, offset);
-	if (!entry) {
-		/* entry was written back */
-		spin_unlock(&tree->lock);
-		return;
-	}
-	zswap_invalidate_entry(tree, entry);
+	if (entry)
+		zswap_invalidate_entry(tree, entry);
 	spin_unlock(&tree->lock);
 }

From 109d7ecc16896b752b2a19325e793b6dfd9c31af Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Mon, 29 Jan 2024 20:36:46 -0500
Subject: [PATCH 1073/1406] mm: zswap: function ordering: pool alloc & free

The function ordering in zswap.c is a little chaotic, which requires
jumping in unexpected directions when following related code. This is a
series of patches that brings the file into the following order:

- pool functions
- lru functions
- rbtree functions
- zswap entry functions
- compression/backend functions
- writeback & shrinking functions
- store, load, invalidate, swapon, swapoff
- debugfs
- init

But it has to be split up such that the moving still produces halfway
readable diffs.
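As a reading aid, the target layout can be sketched as a file skeleton
in zswap.c's own banner-comment style (assembled from the section
banners visible in the diffs below; a sketch, not literal file
contents):

	/* Sketch: intended section order of mm/zswap.c after this series. */

	/*********************************
	* pool functions
	**********************************/
	/* alloc/free, refcounting, the zswap_pools list, param callbacks */

	/*********************************
	* lru functions
	**********************************/

	/*********************************
	* rbtree functions
	**********************************/

	/*********************************
	* zswap entry functions
	**********************************/

	/*********************************
	* compressed storage functions
	**********************************/
	/* per-CPU acomp contexts, zswap_compress(), zswap_decompress() */

	/*********************************
	* writeback code
	**********************************/

	/*********************************
	* shrinker functions
	**********************************/

	/* ...followed by store/load/invalidate/swapon/swapoff, debugfs, init */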
In this patch, move pool allocation and freeing functions. Link: https://lkml.kernel.org/r/20240130014208.565554-11-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 297 +++++++++++++++++++++++++++-------------------------- 1 file changed, 152 insertions(+), 145 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 732b0a701b77fc..5947eeb8d799c4 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -320,6 +320,158 @@ static void zswap_update_total_size(void) zswap_pool_total_size = total; } +/********************************* +* pool functions +**********************************/ + +static void zswap_alloc_shrinker(struct zswap_pool *pool); +static void shrink_worker(struct work_struct *w); + +static struct zswap_pool *zswap_pool_create(char *type, char *compressor) +{ + int i; + struct zswap_pool *pool; + char name[38]; /* 'zswap' + 32 char (max) num + \0 */ + gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + int ret; + + if (!zswap_has_pool) { + /* if either are unset, pool initialization failed, and we + * need both params to be set correctly before trying to + * create a pool. + */ + if (!strcmp(type, ZSWAP_PARAM_UNSET)) + return NULL; + if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) + return NULL; + } + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return NULL; + + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) { + /* unique name for each pool specifically required by zsmalloc */ + snprintf(name, 38, "zswap%x", + atomic_inc_return(&zswap_pools_count)); + + pool->zpools[i] = zpool_create_pool(type, name, gfp); + if (!pool->zpools[i]) { + pr_err("%s zpool not available\n", type); + goto error; + } + } + pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0])); + + strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); + + pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); + if (!pool->acomp_ctx) { + pr_err("percpu alloc failed\n"); + goto error; + } + + ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, + &pool->node); + if (ret) + goto error; + + zswap_alloc_shrinker(pool); + if (!pool->shrinker) + goto error; + + pr_debug("using %s compressor\n", pool->tfm_name); + + /* being the current pool takes 1 ref; this func expects the + * caller to always add the new pool as the current pool + */ + kref_init(&pool->kref); + INIT_LIST_HEAD(&pool->list); + if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) + goto lru_fail; + shrinker_register(pool->shrinker); + INIT_WORK(&pool->shrink_work, shrink_worker); + atomic_set(&pool->nr_stored, 0); + + zswap_pool_debug("created", pool); + + return pool; + +lru_fail: + list_lru_destroy(&pool->list_lru); + shrinker_free(pool->shrinker); +error: + if (pool->acomp_ctx) + free_percpu(pool->acomp_ctx); + while (i--) + zpool_destroy_pool(pool->zpools[i]); + kfree(pool); + return NULL; +} + +static struct zswap_pool *__zswap_pool_create_fallback(void) +{ + bool has_comp, has_zpool; + + has_comp = crypto_has_acomp(zswap_compressor, 0, 0); + if (!has_comp && strcmp(zswap_compressor, + CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { + pr_err("compressor %s not available, using default %s\n", + zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); + param_free_charp(&zswap_compressor); + zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; + has_comp = crypto_has_acomp(zswap_compressor, 0, 0); + } + if (!has_comp) { + pr_err("default compressor %s not available\n", + zswap_compressor); + param_free_charp(&zswap_compressor); + 
zswap_compressor = ZSWAP_PARAM_UNSET; + } + + has_zpool = zpool_has_pool(zswap_zpool_type); + if (!has_zpool && strcmp(zswap_zpool_type, + CONFIG_ZSWAP_ZPOOL_DEFAULT)) { + pr_err("zpool %s not available, using default %s\n", + zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT); + param_free_charp(&zswap_zpool_type); + zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; + has_zpool = zpool_has_pool(zswap_zpool_type); + } + if (!has_zpool) { + pr_err("default zpool %s not available\n", + zswap_zpool_type); + param_free_charp(&zswap_zpool_type); + zswap_zpool_type = ZSWAP_PARAM_UNSET; + } + + if (!has_comp || !has_zpool) + return NULL; + + return zswap_pool_create(zswap_zpool_type, zswap_compressor); +} + +static void zswap_pool_destroy(struct zswap_pool *pool) +{ + int i; + + zswap_pool_debug("destroying", pool); + + shrinker_free(pool->shrinker); + cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); + free_percpu(pool->acomp_ctx); + list_lru_destroy(&pool->list_lru); + + spin_lock(&zswap_pools_lock); + mem_cgroup_iter_break(NULL, pool->next_shrink); + pool->next_shrink = NULL; + spin_unlock(&zswap_pools_lock); + + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) + zpool_destroy_pool(pool->zpools[i]); + kfree(pool); +} + /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) @@ -969,151 +1121,6 @@ static void shrink_worker(struct work_struct *w) zswap_pool_put(pool); } -static struct zswap_pool *zswap_pool_create(char *type, char *compressor) -{ - int i; - struct zswap_pool *pool; - char name[38]; /* 'zswap' + 32 char (max) num + \0 */ - gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - int ret; - - if (!zswap_has_pool) { - /* if either are unset, pool initialization failed, and we - * need both params to be set correctly before trying to - * create a pool. 
- */ - if (!strcmp(type, ZSWAP_PARAM_UNSET)) - return NULL; - if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) - return NULL; - } - - pool = kzalloc(sizeof(*pool), GFP_KERNEL); - if (!pool) - return NULL; - - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) { - /* unique name for each pool specifically required by zsmalloc */ - snprintf(name, 38, "zswap%x", - atomic_inc_return(&zswap_pools_count)); - - pool->zpools[i] = zpool_create_pool(type, name, gfp); - if (!pool->zpools[i]) { - pr_err("%s zpool not available\n", type); - goto error; - } - } - pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0])); - - strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); - - pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); - if (!pool->acomp_ctx) { - pr_err("percpu alloc failed\n"); - goto error; - } - - ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, - &pool->node); - if (ret) - goto error; - - zswap_alloc_shrinker(pool); - if (!pool->shrinker) - goto error; - - pr_debug("using %s compressor\n", pool->tfm_name); - - /* being the current pool takes 1 ref; this func expects the - * caller to always add the new pool as the current pool - */ - kref_init(&pool->kref); - INIT_LIST_HEAD(&pool->list); - if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) - goto lru_fail; - shrinker_register(pool->shrinker); - INIT_WORK(&pool->shrink_work, shrink_worker); - atomic_set(&pool->nr_stored, 0); - - zswap_pool_debug("created", pool); - - return pool; - -lru_fail: - list_lru_destroy(&pool->list_lru); - shrinker_free(pool->shrinker); -error: - if (pool->acomp_ctx) - free_percpu(pool->acomp_ctx); - while (i--) - zpool_destroy_pool(pool->zpools[i]); - kfree(pool); - return NULL; -} - -static struct zswap_pool *__zswap_pool_create_fallback(void) -{ - bool has_comp, has_zpool; - - has_comp = crypto_has_acomp(zswap_compressor, 0, 0); - if (!has_comp && strcmp(zswap_compressor, - CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { - pr_err("compressor %s not available, using default %s\n", - zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); - param_free_charp(&zswap_compressor); - zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; - has_comp = crypto_has_acomp(zswap_compressor, 0, 0); - } - if (!has_comp) { - pr_err("default compressor %s not available\n", - zswap_compressor); - param_free_charp(&zswap_compressor); - zswap_compressor = ZSWAP_PARAM_UNSET; - } - - has_zpool = zpool_has_pool(zswap_zpool_type); - if (!has_zpool && strcmp(zswap_zpool_type, - CONFIG_ZSWAP_ZPOOL_DEFAULT)) { - pr_err("zpool %s not available, using default %s\n", - zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT); - param_free_charp(&zswap_zpool_type); - zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; - has_zpool = zpool_has_pool(zswap_zpool_type); - } - if (!has_zpool) { - pr_err("default zpool %s not available\n", - zswap_zpool_type); - param_free_charp(&zswap_zpool_type); - zswap_zpool_type = ZSWAP_PARAM_UNSET; - } - - if (!has_comp || !has_zpool) - return NULL; - - return zswap_pool_create(zswap_zpool_type, zswap_compressor); -} - -static void zswap_pool_destroy(struct zswap_pool *pool) -{ - int i; - - zswap_pool_debug("destroying", pool); - - shrinker_free(pool->shrinker); - cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); - free_percpu(pool->acomp_ctx); - list_lru_destroy(&pool->list_lru); - - spin_lock(&zswap_pools_lock); - mem_cgroup_iter_break(NULL, pool->next_shrink); - pool->next_shrink = NULL; - spin_unlock(&zswap_pools_lock); - - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) - zpool_destroy_pool(pool->zpools[i]); - 
kfree(pool);
-}
-
 static int __must_check zswap_pool_get(struct zswap_pool *pool)
 {
 	if (!pool)

From ceff3b099f5957f104de608da64bfc99437845b2 Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Mon, 29 Jan 2024 20:36:47 -0500
Subject: [PATCH 1074/1406] mm: zswap: function ordering: pool refcounting

Move pool refcounting functions into the pool section. First the
destroy functions, then the get and put which use them.

__zswap_pool_empty() has an upward reference to the global zswap_pools,
to sanity check it's not the currently active pool that's being freed.
That gets the forward decl for zswap_pool_current().

This puts the get and put functions above all callers, so kill the
forward decls as well.

Link: https://lkml.kernel.org/r/20240130014208.565554-12-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner
Reviewed-by: Nhat Pham
Cc: Chengming Zhou
Cc: Yosry Ahmed
Signed-off-by: Andrew Morton
---
 mm/zswap.c | 94 +++++++++++++++++++++++++++---------------------------
 1 file changed, 47 insertions(+), 47 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 5947eeb8d799c4..6f4e1c22a0dec2 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -278,8 +276,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
 
 static int zswap_writeback_entry(struct zswap_entry *entry,
 				 swp_entry_t swpentry);
-static int zswap_pool_get(struct zswap_pool *pool);
-static void zswap_pool_put(struct zswap_pool *pool);
 
 static bool zswap_is_full(void)
 {
@@ -472,6 +470,53 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
 	kfree(pool);
 }
 
+static void __zswap_pool_release(struct work_struct *work)
+{
+	struct zswap_pool *pool = container_of(work, typeof(*pool),
+						release_work);
+
+	synchronize_rcu();
+
+	/* nobody should have been able to get a kref... */
+	WARN_ON(kref_get_unless_zero(&pool->kref));
+
+	/* pool is now off zswap_pools list and has no references. */
+	zswap_pool_destroy(pool);
+}
+
+static struct zswap_pool *zswap_pool_current(void);
+
+static void __zswap_pool_empty(struct kref *kref)
+{
+	struct zswap_pool *pool;
+
+	pool = container_of(kref, typeof(*pool), kref);
+
+	spin_lock(&zswap_pools_lock);
+
+	WARN_ON(pool == zswap_pool_current());
+
+	list_del_rcu(&pool->list);
+
+	INIT_WORK(&pool->release_work, __zswap_pool_release);
+	schedule_work(&pool->release_work);
+
+	spin_unlock(&zswap_pools_lock);
+}
+
+static int __must_check zswap_pool_get(struct zswap_pool *pool)
+{
+	if (!pool)
+		return 0;
+
+	return kref_get_unless_zero(&pool->kref);
+}
+
+static void zswap_pool_put(struct zswap_pool *pool)
+{
+	kref_put(&pool->kref, __zswap_pool_empty);
+}
+
 /* should be called under RCU */
 #ifdef CONFIG_MEMCG
 static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
@@ -1121,51 +1166,6 @@ static void shrink_worker(struct work_struct *w)
 	zswap_pool_put(pool);
 }
 
-static int __must_check zswap_pool_get(struct zswap_pool *pool)
-{
-	if (!pool)
-		return 0;
-
-	return kref_get_unless_zero(&pool->kref);
-}
-
-static void __zswap_pool_release(struct work_struct *work)
-{
-	struct zswap_pool *pool = container_of(work, typeof(*pool),
-						release_work);
-
-	synchronize_rcu();
-
-	/* nobody should have been able to get a kref... */
-	WARN_ON(kref_get_unless_zero(&pool->kref));
-
-	/* pool is now off zswap_pools list and has no references.
*/ - zswap_pool_destroy(pool); -} - -static void __zswap_pool_empty(struct kref *kref) -{ - struct zswap_pool *pool; - - pool = container_of(kref, typeof(*pool), kref); - - spin_lock(&zswap_pools_lock); - - WARN_ON(pool == zswap_pool_current()); - - list_del_rcu(&pool->list); - - INIT_WORK(&pool->release_work, __zswap_pool_release); - schedule_work(&pool->release_work); - - spin_unlock(&zswap_pools_lock); -} - -static void zswap_pool_put(struct zswap_pool *pool) -{ - kref_put(&pool->kref, __zswap_pool_empty); -} - /********************************* * param callbacks **********************************/ From 08cf8efe9d5c9043b39ecb05f8f5feb31b161259 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:48 -0500 Subject: [PATCH 1075/1406] mm: zswap: function ordering: zswap_pools Move the operations against the global zswap_pools list (current pool, last, find) to the pool section. Link: https://lkml.kernel.org/r/20240130014208.565554-13-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 150 ++++++++++++++++++++++++++--------------------------- 1 file changed, 73 insertions(+), 77 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 6f4e1c22a0dec2..77104406649b05 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -517,6 +517,79 @@ static void zswap_pool_put(struct zswap_pool *pool) kref_put(&pool->kref, __zswap_pool_empty); } +static struct zswap_pool *__zswap_pool_current(void) +{ + struct zswap_pool *pool; + + pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); + WARN_ONCE(!pool && zswap_has_pool, + "%s: no page storage pool!\n", __func__); + + return pool; +} + +static struct zswap_pool *zswap_pool_current(void) +{ + assert_spin_locked(&zswap_pools_lock); + + return __zswap_pool_current(); +} + +static struct zswap_pool *zswap_pool_current_get(void) +{ + struct zswap_pool *pool; + + rcu_read_lock(); + + pool = __zswap_pool_current(); + if (!zswap_pool_get(pool)) + pool = NULL; + + rcu_read_unlock(); + + return pool; +} + +static struct zswap_pool *zswap_pool_last_get(void) +{ + struct zswap_pool *pool, *last = NULL; + + rcu_read_lock(); + + list_for_each_entry_rcu(pool, &zswap_pools, list) + last = pool; + WARN_ONCE(!last && zswap_has_pool, + "%s: no page storage pool!\n", __func__); + if (!zswap_pool_get(last)) + last = NULL; + + rcu_read_unlock(); + + return last; +} + +/* type and compressor must be null-terminated */ +static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) +{ + struct zswap_pool *pool; + + assert_spin_locked(&zswap_pools_lock); + + list_for_each_entry_rcu(pool, &zswap_pools, list) { + if (strcmp(pool->tfm_name, compressor)) + continue; + /* all zpools share the same type */ + if (strcmp(zpool_get_type(pool->zpools[0]), type)) + continue; + /* if we can't get it, it's about to be destroyed */ + if (!zswap_pool_get(pool)) + continue; + return pool; + } + + return NULL; +} + /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) @@ -937,83 +1010,6 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) return 0; } -/********************************* -* pool functions -**********************************/ - -static struct zswap_pool *__zswap_pool_current(void) -{ - struct zswap_pool *pool; - - pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); - WARN_ONCE(!pool && zswap_has_pool, - "%s: no page storage 
pool!\n", __func__); - - return pool; -} - -static struct zswap_pool *zswap_pool_current(void) -{ - assert_spin_locked(&zswap_pools_lock); - - return __zswap_pool_current(); -} - -static struct zswap_pool *zswap_pool_current_get(void) -{ - struct zswap_pool *pool; - - rcu_read_lock(); - - pool = __zswap_pool_current(); - if (!zswap_pool_get(pool)) - pool = NULL; - - rcu_read_unlock(); - - return pool; -} - -static struct zswap_pool *zswap_pool_last_get(void) -{ - struct zswap_pool *pool, *last = NULL; - - rcu_read_lock(); - - list_for_each_entry_rcu(pool, &zswap_pools, list) - last = pool; - WARN_ONCE(!last && zswap_has_pool, - "%s: no page storage pool!\n", __func__); - if (!zswap_pool_get(last)) - last = NULL; - - rcu_read_unlock(); - - return last; -} - -/* type and compressor must be null-terminated */ -static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) -{ - struct zswap_pool *pool; - - assert_spin_locked(&zswap_pools_lock); - - list_for_each_entry_rcu(pool, &zswap_pools, list) { - if (strcmp(pool->tfm_name, compressor)) - continue; - /* all zpools share the same type */ - if (strcmp(zpool_get_type(pool->zpools[0]), type)) - continue; - /* if we can't get it, it's about to be destroyed */ - if (!zswap_pool_get(pool)) - continue; - return pool; - } - - return NULL; -} - static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg) { From d5d9121ebc4a0a0b1e1953f2e2120ee41afdd369 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:49 -0500 Subject: [PATCH 1076/1406] mm: zswap: function ordering: pool params Patch series "mm: zswap: cleanups". Cleanups and maintenance items that accumulated while reviewing zswap patches. This patch (of 20): The parameters primarily control pool attributes. Move those operations up to the pool section. Link: https://lkml.kernel.org/r/20240130014208.565554-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20240130014208.565554-14-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 312 ++++++++++++++++++++++++++--------------------------- 1 file changed, 156 insertions(+), 156 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 77104406649b05..98a9cd0a827a3c 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -590,6 +590,162 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) return NULL; } +/********************************* +* param callbacks +**********************************/ + +static bool zswap_pool_changed(const char *s, const struct kernel_param *kp) +{ + /* no change required */ + if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) + return false; + return true; +} + +/* val must be a null-terminated string */ +static int __zswap_param_set(const char *val, const struct kernel_param *kp, + char *type, char *compressor) +{ + struct zswap_pool *pool, *put_pool = NULL; + char *s = strstrip((char *)val); + int ret = 0; + bool new_pool = false; + + mutex_lock(&zswap_init_lock); + switch (zswap_init_state) { + case ZSWAP_UNINIT: + /* if this is load-time (pre-init) param setting, + * don't create a pool; that's done during init. 
+ */ + ret = param_set_charp(s, kp); + break; + case ZSWAP_INIT_SUCCEED: + new_pool = zswap_pool_changed(s, kp); + break; + case ZSWAP_INIT_FAILED: + pr_err("can't set param, initialization failed\n"); + ret = -ENODEV; + } + mutex_unlock(&zswap_init_lock); + + /* no need to create a new pool, return directly */ + if (!new_pool) + return ret; + + if (!type) { + if (!zpool_has_pool(s)) { + pr_err("zpool %s not available\n", s); + return -ENOENT; + } + type = s; + } else if (!compressor) { + if (!crypto_has_acomp(s, 0, 0)) { + pr_err("compressor %s not available\n", s); + return -ENOENT; + } + compressor = s; + } else { + WARN_ON(1); + return -EINVAL; + } + + spin_lock(&zswap_pools_lock); + + pool = zswap_pool_find_get(type, compressor); + if (pool) { + zswap_pool_debug("using existing", pool); + WARN_ON(pool == zswap_pool_current()); + list_del_rcu(&pool->list); + } + + spin_unlock(&zswap_pools_lock); + + if (!pool) + pool = zswap_pool_create(type, compressor); + + if (pool) + ret = param_set_charp(s, kp); + else + ret = -EINVAL; + + spin_lock(&zswap_pools_lock); + + if (!ret) { + put_pool = zswap_pool_current(); + list_add_rcu(&pool->list, &zswap_pools); + zswap_has_pool = true; + } else if (pool) { + /* add the possibly pre-existing pool to the end of the pools + * list; if it's new (and empty) then it'll be removed and + * destroyed by the put after we drop the lock + */ + list_add_tail_rcu(&pool->list, &zswap_pools); + put_pool = pool; + } + + spin_unlock(&zswap_pools_lock); + + if (!zswap_has_pool && !pool) { + /* if initial pool creation failed, and this pool creation also + * failed, maybe both compressor and zpool params were bad. + * Allow changing this param, so pool creation will succeed + * when the other param is changed. We already verified this + * param is ok in the zpool_has_pool() or crypto_has_acomp() + * checks above. + */ + ret = param_set_charp(s, kp); + } + + /* drop the ref from either the old current pool, + * or the new pool we failed to add + */ + if (put_pool) + zswap_pool_put(put_pool); + + return ret; +} + +static int zswap_compressor_param_set(const char *val, + const struct kernel_param *kp) +{ + return __zswap_param_set(val, kp, zswap_zpool_type, NULL); +} + +static int zswap_zpool_param_set(const char *val, + const struct kernel_param *kp) +{ + return __zswap_param_set(val, kp, NULL, zswap_compressor); +} + +static int zswap_enabled_param_set(const char *val, + const struct kernel_param *kp) +{ + int ret = -ENODEV; + + /* if this is load-time (pre-init) param setting, only set param. 
*/ + if (system_state != SYSTEM_RUNNING) + return param_set_bool(val, kp); + + mutex_lock(&zswap_init_lock); + switch (zswap_init_state) { + case ZSWAP_UNINIT: + if (zswap_setup()) + break; + fallthrough; + case ZSWAP_INIT_SUCCEED: + if (!zswap_has_pool) + pr_err("can't enable, no pool configured\n"); + else + ret = param_set_bool(val, kp); + break; + case ZSWAP_INIT_FAILED: + pr_err("can't enable, initialization failed\n"); + } + mutex_unlock(&zswap_init_lock); + + return ret; +} + /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) @@ -1162,162 +1318,6 @@ static void shrink_worker(struct work_struct *w) zswap_pool_put(pool); } -/********************************* -* param callbacks -**********************************/ - -static bool zswap_pool_changed(const char *s, const struct kernel_param *kp) -{ - /* no change required */ - if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) - return false; - return true; -} - -/* val must be a null-terminated string */ -static int __zswap_param_set(const char *val, const struct kernel_param *kp, - char *type, char *compressor) -{ - struct zswap_pool *pool, *put_pool = NULL; - char *s = strstrip((char *)val); - int ret = 0; - bool new_pool = false; - - mutex_lock(&zswap_init_lock); - switch (zswap_init_state) { - case ZSWAP_UNINIT: - /* if this is load-time (pre-init) param setting, - * don't create a pool; that's done during init. - */ - ret = param_set_charp(s, kp); - break; - case ZSWAP_INIT_SUCCEED: - new_pool = zswap_pool_changed(s, kp); - break; - case ZSWAP_INIT_FAILED: - pr_err("can't set param, initialization failed\n"); - ret = -ENODEV; - } - mutex_unlock(&zswap_init_lock); - - /* no need to create a new pool, return directly */ - if (!new_pool) - return ret; - - if (!type) { - if (!zpool_has_pool(s)) { - pr_err("zpool %s not available\n", s); - return -ENOENT; - } - type = s; - } else if (!compressor) { - if (!crypto_has_acomp(s, 0, 0)) { - pr_err("compressor %s not available\n", s); - return -ENOENT; - } - compressor = s; - } else { - WARN_ON(1); - return -EINVAL; - } - - spin_lock(&zswap_pools_lock); - - pool = zswap_pool_find_get(type, compressor); - if (pool) { - zswap_pool_debug("using existing", pool); - WARN_ON(pool == zswap_pool_current()); - list_del_rcu(&pool->list); - } - - spin_unlock(&zswap_pools_lock); - - if (!pool) - pool = zswap_pool_create(type, compressor); - - if (pool) - ret = param_set_charp(s, kp); - else - ret = -EINVAL; - - spin_lock(&zswap_pools_lock); - - if (!ret) { - put_pool = zswap_pool_current(); - list_add_rcu(&pool->list, &zswap_pools); - zswap_has_pool = true; - } else if (pool) { - /* add the possibly pre-existing pool to the end of the pools - * list; if it's new (and empty) then it'll be removed and - * destroyed by the put after we drop the lock - */ - list_add_tail_rcu(&pool->list, &zswap_pools); - put_pool = pool; - } - - spin_unlock(&zswap_pools_lock); - - if (!zswap_has_pool && !pool) { - /* if initial pool creation failed, and this pool creation also - * failed, maybe both compressor and zpool params were bad. - * Allow changing this param, so pool creation will succeed - * when the other param is changed. We already verified this - * param is ok in the zpool_has_pool() or crypto_has_acomp() - * checks above. 
- */ - ret = param_set_charp(s, kp); - } - - /* drop the ref from either the old current pool, - * or the new pool we failed to add - */ - if (put_pool) - zswap_pool_put(put_pool); - - return ret; -} - -static int zswap_compressor_param_set(const char *val, - const struct kernel_param *kp) -{ - return __zswap_param_set(val, kp, zswap_zpool_type, NULL); -} - -static int zswap_zpool_param_set(const char *val, - const struct kernel_param *kp) -{ - return __zswap_param_set(val, kp, NULL, zswap_compressor); -} - -static int zswap_enabled_param_set(const char *val, - const struct kernel_param *kp) -{ - int ret = -ENODEV; - - /* if this is load-time (pre-init) param setting, only set param. */ - if (system_state != SYSTEM_RUNNING) - return param_set_bool(val, kp); - - mutex_lock(&zswap_init_lock); - switch (zswap_init_state) { - case ZSWAP_UNINIT: - if (zswap_setup()) - break; - fallthrough; - case ZSWAP_INIT_SUCCEED: - if (!zswap_has_pool) - pr_err("can't enable, no pool configured\n"); - else - ret = param_set_bool(val, kp); - break; - case ZSWAP_INIT_FAILED: - pr_err("can't enable, initialization failed\n"); - } - mutex_unlock(&zswap_init_lock); - - return ret; -} - static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) { struct crypto_acomp_ctx *acomp_ctx; From 4383002e966305d86447509460ad7c487d0e995a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:50 -0500 Subject: [PATCH 1077/1406] mm: zswap: function ordering: public lru api The zswap entry section sits awkwardly in the middle of LRU-related functions. Group the external LRU API functions first. Link: https://lkml.kernel.org/r/20240130014208.565554-15-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 98a9cd0a827a3c..74b128c3a0edf1 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -746,6 +746,10 @@ static int zswap_enabled_param_set(const char *val, return ret; } +/********************************* +* lru functions +**********************************/ + /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) @@ -764,6 +768,21 @@ static inline int entry_to_nid(struct zswap_entry *entry) return page_to_nid(virt_to_page(entry)); } +void zswap_lruvec_state_init(struct lruvec *lruvec) +{ + atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); +} + +void zswap_folio_swapin(struct folio *folio) +{ + struct lruvec *lruvec; + + if (folio) { + lruvec = folio_lruvec(folio); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + } +} + void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) { struct zswap_pool *pool; @@ -798,23 +817,6 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) kmem_cache_free(zswap_entry_cache, entry); } -/********************************* -* zswap lruvec functions -**********************************/ -void zswap_lruvec_state_init(struct lruvec *lruvec) -{ - atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); -} - -void zswap_folio_swapin(struct folio *folio) -{ - struct lruvec *lruvec; - - VM_WARN_ON_ONCE(!folio_test_locked(folio)); - lruvec = folio_lruvec(folio); - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); -} - /********************************* * lru functions 
**********************************/ From 9553064a900937c25be8feec09f09c467e389571 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:51 -0500 Subject: [PATCH 1078/1406] mm: zswap: function ordering: move entry sections out of LRU section This completes consolidation of the LRU section. Link: https://lkml.kernel.org/r/20240130014208.565554-16-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 101 ++++++++++++++++++++++++++--------------------------- 1 file changed, 49 insertions(+), 52 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 74b128c3a0edf1..b8834c4fb7dbae 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -768,58 +768,6 @@ static inline int entry_to_nid(struct zswap_entry *entry) return page_to_nid(virt_to_page(entry)); } -void zswap_lruvec_state_init(struct lruvec *lruvec) -{ - atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); -} - -void zswap_folio_swapin(struct folio *folio) -{ - struct lruvec *lruvec; - - if (folio) { - lruvec = folio_lruvec(folio); - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); - } -} - -void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) -{ - struct zswap_pool *pool; - - /* lock out zswap pools list modification */ - spin_lock(&zswap_pools_lock); - list_for_each_entry(pool, &zswap_pools, list) { - if (pool->next_shrink == memcg) - pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); - } - spin_unlock(&zswap_pools_lock); -} - -/********************************* -* zswap entry functions -**********************************/ -static struct kmem_cache *zswap_entry_cache; - -static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) -{ - struct zswap_entry *entry; - entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); - if (!entry) - return NULL; - entry->refcount = 1; - RB_CLEAR_NODE(&entry->rbnode); - return entry; -} - -static void zswap_entry_cache_free(struct zswap_entry *entry) -{ - kmem_cache_free(zswap_entry_cache, entry); -} - -/********************************* -* lru functions -**********************************/ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) { atomic_long_t *nr_zswap_protected; @@ -872,6 +820,55 @@ static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) rcu_read_unlock(); } +void zswap_lruvec_state_init(struct lruvec *lruvec) +{ + atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); +} + +void zswap_folio_swapin(struct folio *folio) +{ + struct lruvec *lruvec; + + if (folio) { + lruvec = folio_lruvec(folio); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + } +} + +void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) +{ + struct zswap_pool *pool; + + /* lock out zswap pools list modification */ + spin_lock(&zswap_pools_lock); + list_for_each_entry(pool, &zswap_pools, list) { + if (pool->next_shrink == memcg) + pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); + } + spin_unlock(&zswap_pools_lock); +} + +/********************************* +* zswap entry functions +**********************************/ +static struct kmem_cache *zswap_entry_cache; + +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) +{ + struct zswap_entry *entry; + entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); + if (!entry) + return NULL; + entry->refcount = 1; + RB_CLEAR_NODE(&entry->rbnode); + return entry; +} + +static 
void zswap_entry_cache_free(struct zswap_entry *entry) +{ + kmem_cache_free(zswap_entry_cache, entry); +} + /********************************* * rbtree functions **********************************/ From fcdd3ba3acef3527fd96dd6e4a3297fab3c52245 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:52 -0500 Subject: [PATCH 1079/1406] mm: zswap: function ordering: move entry section out of tree section The higher-level entry operations modify the tree, so move the entry API after the tree section. Link: https://lkml.kernel.org/r/20240130014208.565554-17-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index b8834c4fb7dbae..19d3482fd035b1 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -848,27 +848,6 @@ void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) spin_unlock(&zswap_pools_lock); } -/********************************* -* zswap entry functions -**********************************/ -static struct kmem_cache *zswap_entry_cache; - -static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) -{ - struct zswap_entry *entry; - entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); - if (!entry) - return NULL; - entry->refcount = 1; - RB_CLEAR_NODE(&entry->rbnode); - return entry; -} - -static void zswap_entry_cache_free(struct zswap_entry *entry) -{ - kmem_cache_free(zswap_entry_cache, entry); -} - /********************************* * rbtree functions **********************************/ @@ -930,6 +909,27 @@ static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) return false; } +/********************************* +* zswap entry functions +**********************************/ +static struct kmem_cache *zswap_entry_cache; + +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) +{ + struct zswap_entry *entry; + entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); + if (!entry) + return NULL; + entry->refcount = 1; + RB_CLEAR_NODE(&entry->rbnode); + return entry; +} + +static void zswap_entry_cache_free(struct zswap_entry *entry) +{ + kmem_cache_free(zswap_entry_cache, entry); +} + static struct zpool *zswap_find_zpool(struct zswap_entry *entry) { int i = 0; From 7fbed64e8d5a610b9e4d4e5fda0cf43f86112d2c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:53 -0500 Subject: [PATCH 1080/1406] mm: zswap: function ordering: compress & decompress functions Writeback needs to decompress. Move the (de)compression API above what will be the consolidated shrinking/writeback code. 
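Worth noting while these move: both zswap_compress() and
zswap_decompress() drive the async acomp API through the same
submit-then-wait idiom. A minimal sketch of that pattern, using the
fields of this file's struct crypto_acomp_ctx (an illustration with a
hypothetical function name, not part of the patch):

	/* Submit an acomp request and wait for it synchronously. */
	static int zswap_comp_op_sketch(struct crypto_acomp_ctx *ctx, bool compress)
	{
		/* kick off the (possibly asynchronous) operation... */
		int ret = compress ? crypto_acomp_compress(ctx->req) :
				     crypto_acomp_decompress(ctx->req);

		/*
		 * ...and block until crypto_req_done() signals completion;
		 * for synchronous scomp backends, crypto_wait_req() simply
		 * passes the return value through without blocking.
		 */
		return crypto_wait_req(ret, &ctx->wait);
	}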
Link: https://lkml.kernel.org/r/20240130014208.565554-18-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 207 +++++++++++++++++++++++++++-------------------------- 1 file changed, 105 insertions(+), 102 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 19d3482fd035b1..6d1e5843c4dd0a 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -992,6 +992,111 @@ static void zswap_invalidate_entry(struct zswap_tree *tree, zswap_entry_put(entry); } +/********************************* +* compressed storage functions +**********************************/ +static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) +{ + struct crypto_acomp_ctx *acomp_ctx; + struct scatterlist input, output; + unsigned int dlen = PAGE_SIZE; + unsigned long handle; + struct zpool *zpool; + char *buf; + gfp_t gfp; + int ret; + u8 *dst; + + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + + mutex_lock(&acomp_ctx->mutex); + + dst = acomp_ctx->buffer; + sg_init_table(&input, 1); + sg_set_page(&input, &folio->page, PAGE_SIZE, 0); + + /* + * We need PAGE_SIZE * 2 here since there maybe over-compression case, + * and hardware-accelerators may won't check the dst buffer size, so + * giving the dst buffer with enough length to avoid buffer overflow. + */ + sg_init_one(&output, dst, PAGE_SIZE * 2); + acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); + + /* + * it maybe looks a little bit silly that we send an asynchronous request, + * then wait for its completion synchronously. This makes the process look + * synchronous in fact. + * Theoretically, acomp supports users send multiple acomp requests in one + * acomp instance, then get those requests done simultaneously. but in this + * case, zswap actually does store and load page by page, there is no + * existing method to send the second page before the first page is done + * in one thread doing zwap. + * but in different threads running on different cpu, we have different + * acomp instance, so multiple threads can do (de)compression in parallel. 
+ */ + ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; + if (ret) { + zswap_reject_compress_fail++; + goto unlock; + } + + zpool = zswap_find_zpool(entry); + gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + if (zpool_malloc_support_movable(zpool)) + gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; + ret = zpool_malloc(zpool, dlen, gfp, &handle); + if (ret == -ENOSPC) { + zswap_reject_compress_poor++; + goto unlock; + } + if (ret) { + zswap_reject_alloc_fail++; + goto unlock; + } + + buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); + memcpy(buf, dst, dlen); + zpool_unmap_handle(zpool, handle); + + entry->handle = handle; + entry->length = dlen; + +unlock: + mutex_unlock(&acomp_ctx->mutex); + return ret == 0; +} + +static void zswap_decompress(struct zswap_entry *entry, struct page *page) +{ + struct zpool *zpool = zswap_find_zpool(entry); + struct scatterlist input, output; + struct crypto_acomp_ctx *acomp_ctx; + u8 *src; + + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + mutex_lock(&acomp_ctx->mutex); + + src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); + if (!zpool_can_sleep_mapped(zpool)) { + memcpy(acomp_ctx->buffer, src, entry->length); + src = acomp_ctx->buffer; + zpool_unmap_handle(zpool, entry->handle); + } + + sg_init_one(&input, src, entry->length); + sg_init_table(&output, 1); + sg_set_page(&output, page, PAGE_SIZE, 0); + acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); + BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); + BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); + mutex_unlock(&acomp_ctx->mutex); + + if (zpool_can_sleep_mapped(zpool)) + zpool_unmap_handle(zpool, entry->handle); +} + /********************************* * shrinker functions **********************************/ @@ -1317,108 +1422,6 @@ static void shrink_worker(struct work_struct *w) zswap_pool_put(pool); } -static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) -{ - struct crypto_acomp_ctx *acomp_ctx; - struct scatterlist input, output; - unsigned int dlen = PAGE_SIZE; - unsigned long handle; - struct zpool *zpool; - char *buf; - gfp_t gfp; - int ret; - u8 *dst; - - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - - mutex_lock(&acomp_ctx->mutex); - - dst = acomp_ctx->buffer; - sg_init_table(&input, 1); - sg_set_page(&input, &folio->page, PAGE_SIZE, 0); - - /* - * We need PAGE_SIZE * 2 here since there maybe over-compression case, - * and hardware-accelerators may won't check the dst buffer size, so - * giving the dst buffer with enough length to avoid buffer overflow. - */ - sg_init_one(&output, dst, PAGE_SIZE * 2); - acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); - - /* - * it maybe looks a little bit silly that we send an asynchronous request, - * then wait for its completion synchronously. This makes the process look - * synchronous in fact. - * Theoretically, acomp supports users send multiple acomp requests in one - * acomp instance, then get those requests done simultaneously. but in this - * case, zswap actually does store and load page by page, there is no - * existing method to send the second page before the first page is done - * in one thread doing zwap. - * but in different threads running on different cpu, we have different - * acomp instance, so multiple threads can do (de)compression in parallel. 
- */ - ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); - dlen = acomp_ctx->req->dlen; - if (ret) { - zswap_reject_compress_fail++; - goto unlock; - } - - zpool = zswap_find_zpool(entry); - gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - if (zpool_malloc_support_movable(zpool)) - gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; - ret = zpool_malloc(zpool, dlen, gfp, &handle); - if (ret == -ENOSPC) { - zswap_reject_compress_poor++; - goto unlock; - } - if (ret) { - zswap_reject_alloc_fail++; - goto unlock; - } - - buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); - memcpy(buf, dst, dlen); - zpool_unmap_handle(zpool, handle); - - entry->handle = handle; - entry->length = dlen; - -unlock: - mutex_unlock(&acomp_ctx->mutex); - return ret == 0; -} - -static void zswap_decompress(struct zswap_entry *entry, struct page *page) -{ - struct zpool *zpool = zswap_find_zpool(entry); - struct scatterlist input, output; - struct crypto_acomp_ctx *acomp_ctx; - u8 *src; - - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - mutex_lock(&acomp_ctx->mutex); - - src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); - if (!zpool_can_sleep_mapped(zpool)) { - memcpy(acomp_ctx->buffer, src, entry->length); - src = acomp_ctx->buffer; - zpool_unmap_handle(zpool, entry->handle); - } - - sg_init_one(&input, src, entry->length); - sg_init_table(&output, 1); - sg_set_page(&output, page, PAGE_SIZE, 0); - acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); - BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); - BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); - mutex_unlock(&acomp_ctx->mutex); - - if (zpool_can_sleep_mapped(zpool)) - zpool_unmap_handle(zpool, entry->handle); -} - /********************************* * writeback code **********************************/ From 9edaaf49f4e873a4b2f53eec3c2f8e07796ee3a4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:54 -0500 Subject: [PATCH 1081/1406] mm: zswap: function ordering: per-cpu compression infra The per-cpu compression init/exit callbacks are awkwardly in the middle of the shrinker code. Move them up to the compression section. 
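For context, zswap_cpu_comp_prepare() and zswap_cpu_comp_dead() plug
into the kernel's multi-instance CPU-hotplug API. A sketch of the
registration side, using the state and callbacks named in the diff (a
hedged reconstruction of what zswap_setup() does; the state name string
is an assumption):

	static int __init zswap_cpuhp_sketch(void)
	{
		/*
		 * Register the state once. zswap_pool_create() later adds each
		 * pool as an instance via cpuhp_state_add_instance(), which runs
		 * zswap_cpu_comp_prepare() on every online CPU and arranges for
		 * zswap_cpu_comp_dead() to run when a CPU goes down.
		 */
		return cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
					       "mm/zswap_pool:prepare",
					       zswap_cpu_comp_prepare,
					       zswap_cpu_comp_dead);
	}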
Link: https://lkml.kernel.org/r/20240130014208.565554-19-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 135 ++++++++++++++++++++++++++--------------------------- 1 file changed, 66 insertions(+), 69 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 6d1e5843c4dd0a..680e5a4c1af43b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -995,6 +995,72 @@ static void zswap_invalidate_entry(struct zswap_tree *tree, /********************************* * compressed storage functions **********************************/ +static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) +{ + struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); + struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + struct crypto_acomp *acomp; + struct acomp_req *req; + int ret; + + mutex_init(&acomp_ctx->mutex); + + acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + if (!acomp_ctx->buffer) + return -ENOMEM; + + acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); + if (IS_ERR(acomp)) { + pr_err("could not alloc crypto acomp %s : %ld\n", + pool->tfm_name, PTR_ERR(acomp)); + ret = PTR_ERR(acomp); + goto acomp_fail; + } + acomp_ctx->acomp = acomp; + + req = acomp_request_alloc(acomp_ctx->acomp); + if (!req) { + pr_err("could not alloc crypto acomp_request %s\n", + pool->tfm_name); + ret = -ENOMEM; + goto req_fail; + } + acomp_ctx->req = req; + + crypto_init_wait(&acomp_ctx->wait); + /* + * if the backend of acomp is async zip, crypto_req_done() will wakeup + * crypto_wait_req(); if the backend of acomp is scomp, the callback + * won't be called, crypto_wait_req() will return without blocking. 
+ */ + acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &acomp_ctx->wait); + + return 0; + +req_fail: + crypto_free_acomp(acomp_ctx->acomp); +acomp_fail: + kfree(acomp_ctx->buffer); + return ret; +} + +static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) +{ + struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); + struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + + if (!IS_ERR_OR_NULL(acomp_ctx)) { + if (!IS_ERR_OR_NULL(acomp_ctx->req)) + acomp_request_free(acomp_ctx->req); + if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) + crypto_free_acomp(acomp_ctx->acomp); + kfree(acomp_ctx->buffer); + } + + return 0; +} + static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) { struct crypto_acomp_ctx *acomp_ctx; @@ -1201,75 +1267,6 @@ static void zswap_alloc_shrinker(struct zswap_pool *pool) pool->shrinker->seeks = DEFAULT_SEEKS; } -/********************************* -* per-cpu code -**********************************/ -static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) -{ - struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); - struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - struct crypto_acomp *acomp; - struct acomp_req *req; - int ret; - - mutex_init(&acomp_ctx->mutex); - - acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!acomp_ctx->buffer) - return -ENOMEM; - - acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); - if (IS_ERR(acomp)) { - pr_err("could not alloc crypto acomp %s : %ld\n", - pool->tfm_name, PTR_ERR(acomp)); - ret = PTR_ERR(acomp); - goto acomp_fail; - } - acomp_ctx->acomp = acomp; - - req = acomp_request_alloc(acomp_ctx->acomp); - if (!req) { - pr_err("could not alloc crypto acomp_request %s\n", - pool->tfm_name); - ret = -ENOMEM; - goto req_fail; - } - acomp_ctx->req = req; - - crypto_init_wait(&acomp_ctx->wait); - /* - * if the backend of acomp is async zip, crypto_req_done() will wakeup - * crypto_wait_req(); if the backend of acomp is scomp, the callback - * won't be called, crypto_wait_req() will return without blocking. - */ - acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &acomp_ctx->wait); - - return 0; - -req_fail: - crypto_free_acomp(acomp_ctx->acomp); -acomp_fail: - kfree(acomp_ctx->buffer); - return ret; -} - -static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) -{ - struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); - struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - - if (!IS_ERR_OR_NULL(acomp_ctx)) { - if (!IS_ERR_OR_NULL(acomp_ctx->req)) - acomp_request_free(acomp_ctx->req); - if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) - crypto_free_acomp(acomp_ctx->acomp); - kfree(acomp_ctx->buffer); - } - - return 0; -} - static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg) { From 6ec5786559d6e8c7f6d4c2102d712490412f0c82 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:55 -0500 Subject: [PATCH 1082/1406] mm: zswap: function ordering: writeback Shrinking needs writeback. Naturally, move the writeback code above the shrinking code. Delete the forward decl. 
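The subtle part of the moved function is its invalidation recheck. A
condensed sketch of that idiom, distilled from zswap_writeback_entry()
in the diff below (names from this file; the helper is hypothetical and
error handling is simplified):

	/*
	 * After securing the folio in the swapcache, retake the tree lock and
	 * verify the slot still maps to our entry: the swap entry may have
	 * been invalidated and recycled while no lock was held, and holding
	 * a zswap_entry pointer alone does not pin it.
	 */
	static bool zswap_entry_still_live(struct zswap_tree *tree,
					   struct zswap_entry *entry,
					   pgoff_t offset)
	{
		bool live;

		spin_lock(&tree->lock);
		live = zswap_rb_search(&tree->rbroot, offset) == entry;
		if (live)
			zswap_entry_get(entry);	/* pin only after the recheck */
		spin_unlock(&tree->lock);
		return live;
	}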
Link: https://lkml.kernel.org/r/20240130014208.565554-20-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 183 ++++++++++++++++++++++++++--------------------------- 1 file changed, 90 insertions(+), 93 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 680e5a4c1af43b..667ed3e193406a 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -276,9 +276,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ zpool_get_type((p)->zpools[0])) -static int zswap_writeback_entry(struct zswap_entry *entry, - swp_entry_t swpentry); - static bool zswap_is_full(void) { return totalram_pages() * zswap_max_pool_percent / 100 < @@ -1163,6 +1160,96 @@ static void zswap_decompress(struct zswap_entry *entry, struct page *page) zpool_unmap_handle(zpool, entry->handle); } +/********************************* +* writeback code +**********************************/ +/* + * Attempts to free an entry by adding a folio to the swap cache, + * decompressing the entry data into the folio, and issuing a + * bio write to write the folio back to the swap device. + * + * This can be thought of as a "resumed writeback" of the folio + * to the swap device. We are basically resuming the same swap + * writeback path that was intercepted with the zswap_store() + * in the first place. After the folio has been decompressed into + * the swap cache, the compressed version stored by zswap can be + * freed. + */ +static int zswap_writeback_entry(struct zswap_entry *entry, + swp_entry_t swpentry) +{ + struct zswap_tree *tree; + struct folio *folio; + struct mempolicy *mpol; + bool folio_was_allocated; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + }; + + /* try to allocate swap cache folio */ + mpol = get_task_policy(current); + folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, + NO_INTERLEAVE_INDEX, &folio_was_allocated, true); + if (!folio) + return -ENOMEM; + + /* + * Found an existing folio, we raced with swapin or concurrent + * shrinker. We generally writeback cold folios from zswap, and + * swapin means the folio just became hot, so skip this folio. + * For unlikely concurrent shrinker case, it will be unlinked + * and freed when invalidated by the concurrent shrinker anyway. + */ + if (!folio_was_allocated) { + folio_put(folio); + return -EEXIST; + } + + /* + * folio is locked, and the swapcache is now secured against + * concurrent swapping to and from the slot. Verify that the + * swap entry hasn't been invalidated and recycled behind our + * backs (our zswap_entry reference doesn't prevent that), to + * avoid overwriting a new swap folio with old compressed data. + */ + tree = swap_zswap_tree(swpentry); + spin_lock(&tree->lock); + if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) { + spin_unlock(&tree->lock); + delete_from_swap_cache(folio); + folio_unlock(folio); + folio_put(folio); + return -ENOMEM; + } + + /* Safe to deref entry after the entry is verified above. 
*/ + zswap_entry_get(entry); + spin_unlock(&tree->lock); + + zswap_decompress(entry, &folio->page); + + count_vm_event(ZSWPWB); + if (entry->objcg) + count_objcg_event(entry->objcg, ZSWPWB); + + spin_lock(&tree->lock); + zswap_invalidate_entry(tree, entry); + zswap_entry_put(entry); + spin_unlock(&tree->lock); + + /* folio is up to date */ + folio_mark_uptodate(folio); + + /* move it to the tail of the inactive list after end_writeback */ + folio_set_reclaim(folio); + + /* start writeback */ + __swap_writepage(folio, &wbc); + folio_put(folio); + + return 0; +} + /********************************* * shrinker functions **********************************/ @@ -1419,96 +1506,6 @@ static void shrink_worker(struct work_struct *w) zswap_pool_put(pool); } -/********************************* -* writeback code -**********************************/ -/* - * Attempts to free an entry by adding a folio to the swap cache, - * decompressing the entry data into the folio, and issuing a - * bio write to write the folio back to the swap device. - * - * This can be thought of as a "resumed writeback" of the folio - * to the swap device. We are basically resuming the same swap - * writeback path that was intercepted with the zswap_store() - * in the first place. After the folio has been decompressed into - * the swap cache, the compressed version stored by zswap can be - * freed. - */ -static int zswap_writeback_entry(struct zswap_entry *entry, - swp_entry_t swpentry) -{ - struct zswap_tree *tree; - struct folio *folio; - struct mempolicy *mpol; - bool folio_was_allocated; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - }; - - /* try to allocate swap cache folio */ - mpol = get_task_policy(current); - folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, - NO_INTERLEAVE_INDEX, &folio_was_allocated, true); - if (!folio) - return -ENOMEM; - - /* - * Found an existing folio, we raced with swapin or concurrent - * shrinker. We generally writeback cold folios from zswap, and - * swapin means the folio just became hot, so skip this folio. - * For unlikely concurrent shrinker case, it will be unlinked - * and freed when invalidated by the concurrent shrinker anyway. - */ - if (!folio_was_allocated) { - folio_put(folio); - return -EEXIST; - } - - /* - * folio is locked, and the swapcache is now secured against - * concurrent swapping to and from the slot. Verify that the - * swap entry hasn't been invalidated and recycled behind our - * backs (our zswap_entry reference doesn't prevent that), to - * avoid overwriting a new swap folio with old compressed data. - */ - tree = swap_zswap_tree(swpentry); - spin_lock(&tree->lock); - if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) { - spin_unlock(&tree->lock); - delete_from_swap_cache(folio); - folio_unlock(folio); - folio_put(folio); - return -ENOMEM; - } - - /* Safe to deref entry after the entry is verified above. 
*/ - zswap_entry_get(entry); - spin_unlock(&tree->lock); - - zswap_decompress(entry, &folio->page); - - count_vm_event(ZSWPWB); - if (entry->objcg) - count_objcg_event(entry->objcg, ZSWPWB); - - spin_lock(&tree->lock); - zswap_invalidate_entry(tree, entry); - zswap_entry_put(entry); - spin_unlock(&tree->lock); - - /* folio is up to date */ - folio_mark_uptodate(folio); - - /* move it to the tail of the inactive list after end_writeback */ - folio_set_reclaim(folio); - - /* start writeback */ - __swap_writepage(folio, &wbc); - folio_put(folio); - - return 0; -} - static int zswap_is_page_same_filled(void *ptr, unsigned long *value) { unsigned long *page; From 5d32274c24218fbc0ac42158bc484cf1eef71f3e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:56 -0500 Subject: [PATCH 1083/1406] mm: zswap: function ordering: shrink_memcg_cb shrink_memcg_cb() is called by the shrinker and is based on zswap_writeback_entry(). Move it in between. Save one fwd decl. Link: https://lkml.kernel.org/r/20240130014208.565554-21-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 125 ++++++++++++++++++++++++++--------------------------- 1 file changed, 61 insertions(+), 64 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 667ed3e193406a..2bf4bf1d356cfe 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1254,7 +1254,67 @@ static int zswap_writeback_entry(struct zswap_entry *entry, * shrinker functions **********************************/ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, - spinlock_t *lock, void *arg); + spinlock_t *lock, void *arg) +{ + struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); + bool *encountered_page_in_swapcache = (bool *)arg; + swp_entry_t swpentry; + enum lru_status ret = LRU_REMOVED_RETRY; + int writeback_result; + + /* + * Rotate the entry to the tail before unlocking the LRU, + * so that in case of an invalidation race concurrent + * reclaimers don't waste their time on it. + * + * If writeback succeeds, or failure is due to the entry + * being invalidated by the swap subsystem, the invalidation + * will unlink and free it. + * + * Temporary failures, where the same entry should be tried + * again immediately, almost never happen for this shrinker. + * We don't do any trylocking; -ENOMEM comes closest, + * but that's extremely rare and doesn't happen spuriously + * either. Don't bother distinguishing this case. + * + * But since they do exist in theory, the entry cannot just + * be unlinked, or we could leak it. Hence, rotate. + */ + list_move_tail(item, &l->list); + + /* + * Once the lru lock is dropped, the entry might get freed. The + * swpentry is copied to the stack, and entry isn't deref'd again + * until the entry is verified to still be alive in the tree. + */ + swpentry = entry->swpentry; + + /* + * It's safe to drop the lock here because we return either + * LRU_REMOVED_RETRY or LRU_RETRY. + */ + spin_unlock(lock); + + writeback_result = zswap_writeback_entry(entry, swpentry); + + if (writeback_result) { + zswap_reject_reclaim_fail++; + ret = LRU_RETRY; + + /* + * Encountering a page already in swap cache is a sign that we are shrinking + * into the warmer region. We should terminate shrinking (if we're in the dynamic + * shrinker context). 
+	 */
+		if (writeback_result == -EEXIST && encountered_page_in_swapcache)
+			*encountered_page_in_swapcache = true;
+	} else {
+		zswap_written_back_pages++;
+	}
+
+	spin_lock(lock);
+	return ret;
+}

 static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
 		struct shrink_control *sc)
@@ -1354,69 +1414,6 @@ static void zswap_alloc_shrinker(struct zswap_pool *pool)
 	pool->shrinker->seeks = DEFAULT_SEEKS;
 }

-static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
-				       spinlock_t *lock, void *arg)
-{
-	struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
-	bool *encountered_page_in_swapcache = (bool *)arg;
-	swp_entry_t swpentry;
-	enum lru_status ret = LRU_REMOVED_RETRY;
-	int writeback_result;
-
-	/*
-	 * Rotate the entry to the tail before unlocking the LRU,
-	 * so that in case of an invalidation race concurrent
-	 * reclaimers don't waste their time on it.
-	 *
-	 * If writeback succeeds, or failure is due to the entry
-	 * being invalidated by the swap subsystem, the invalidation
-	 * will unlink and free it.
-	 *
-	 * Temporary failures, where the same entry should be tried
-	 * again immediately, almost never happen for this shrinker.
-	 * We don't do any trylocking; -ENOMEM comes closest,
-	 * but that's extremely rare and doesn't happen spuriously
-	 * either. Don't bother distinguishing this case.
-	 *
-	 * But since they do exist in theory, the entry cannot just
-	 * be unlinked, or we could leak it. Hence, rotate.
-	 */
-	list_move_tail(item, &l->list);
-
-	/*
-	 * Once the lru lock is dropped, the entry might get freed. The
-	 * swpentry is copied to the stack, and entry isn't deref'd again
-	 * until the entry is verified to still be alive in the tree.
-	 */
-	swpentry = entry->swpentry;
-
-	/*
-	 * It's safe to drop the lock here because we return either
-	 * LRU_REMOVED_RETRY or LRU_RETRY.
-	 */
-	spin_unlock(lock);
-
-	writeback_result = zswap_writeback_entry(entry, swpentry);
-
-	if (writeback_result) {
-		zswap_reject_reclaim_fail++;
-		ret = LRU_RETRY;
-
-		/*
-		 * Encountering a page already in swap cache is a sign that we are shrinking
-		 * into the warmer region. We should terminate shrinking (if we're in the dynamic
-		 * shrinker context).
-		 */
-		if (writeback_result == -EEXIST && encountered_page_in_swapcache)
-			*encountered_page_in_swapcache = true;
-	} else {
-		zswap_written_back_pages++;
-	}
-
-	spin_lock(lock);
-	return ret;
-}
-
 static int shrink_memcg(struct mem_cgroup *memcg)
 {
 	struct zswap_pool *pool;

From 16c489e04938113fc8d28435a7f8055c901b713f Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 29 Jan 2024 17:35:40 -0800
Subject: [PATCH 1084/1406] Docs/admin-guide/mm/damon/usage: use sysfs interface for tracepoints example

Patch series "mm/damon: make DAMON debugfs interface deprecation unignorable".

DAMON debugfs interface was deprecated in February 2023, by commit
5445fcbc4cda ("Docs/admin-guide/mm/damon/usage: add DAMON debugfs
interface deprecation notice").  Make the fact harder to ignore by
removing an example usage from the document (patch 1), renaming the
config (patch 2), adding a deprecation notice file to the debugfs
directory (patches 3-5), and renaming the debugfs file that is essential
for real use of DAMON (patches 6-9).

This patch (of 9):

The DAMON tracepoints example in the DAMON usage document uses the DAMON
debugfs interface, which is deprecated.  Use its alternative, the DAMON
sysfs interface.
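For reference, the resulting example drives the monitoring via the DAMON
sysfs interface, whose files live under /sys/kernel/mm/damon/admin/ (a
sketch, assuming a kdamond and a monitoring target have already been set
up there):

    # cd /sys/kernel/mm/damon/admin
    # echo on > kdamonds/0/state
    # perf record -e damon:damon_aggregated &
    # sleep 5
    # kill 9 $(pidof perf)
    # echo off > kdamonds/0/state
    # perf script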
Link: https://lkml.kernel.org/r/20240130013549.89538-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20240130013549.89538-2-sj@kernel.org
Signed-off-by: SeongJae Park
Cc: Alex Shi
Cc: Hu Haowen <2023002089@link.tyut.edu.cn>
Cc: Jonathan Corbet
Cc: Shuah Khan
Cc: Yanteng Si
Signed-off-by: Andrew Morton
---
 Documentation/admin-guide/mm/damon/usage.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst
index 9d23144bf98501..f2feabb4bd35c7 100644
--- a/Documentation/admin-guide/mm/damon/usage.rst
+++ b/Documentation/admin-guide/mm/damon/usage.rst
@@ -579,11 +579,11 @@ monitoring results recording.
 While the monitoring is turned on, you could record the tracepoint events and
 show results using tracepoint supporting tools like ``perf``.  For example::

-    # echo on > monitor_on
+    # echo on > kdamonds/0/state
     # perf record -e damon:damon_aggregated &
     # sleep 5
     # kill 9 $(pidof perf)
-    # echo off > monitor_on
+    # echo off > kdamonds/0/state
     # perf script
     kdamond.0 46568 [027] 79357.842179: damon:damon_aggregated: target_id=0 nr_regions=11 122509119488-135708762112: 0 864
     [...]

From 3232d3c79256c12f810ccf7829bd69e3303b633d Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 29 Jan 2024 17:35:41 -0800
Subject: [PATCH 1085/1406] mm/damon: rename CONFIG_DAMON_DBGFS to DAMON_DBGFS_DEPRECATED

DAMON debugfs interface is deprecated.  The fact has been documented by
commit 5445fcbc4cda ("Docs/admin-guide/mm/damon/usage: add DAMON debugfs
interface deprecation notice").  Commit 620932cd2852 ("mm/damon/dbgfs:
print DAMON debugfs interface deprecation message") further started
printing a warning message when users still use it.  Many people don't
read the documentation or the kernel log, though.

Make the deprecation harder to ignore using the approach of commit
eb07c4f39c3e ("mm/slab: rename CONFIG_SLAB to CONFIG_SLAB_DEPRECATED").
'make oldconfig' with 'CONFIG_DAMON_DBGFS=y' will get a new prompt with
the explicit deprecation notice on the name.  'make olddefconfig' with
'CONFIG_DAMON_DBGFS=y' will result in not building DAMON debugfs
interface.  If there is a real user of DAMON debugfs interface, they
will complain about the change to the builder.

Link: https://lkml.kernel.org/r/20240130013549.89538-3-sj@kernel.org
Signed-off-by: SeongJae Park
Cc: Alex Shi
Cc: Hu Haowen <2023002089@link.tyut.edu.cn>
Cc: Jonathan Corbet
Cc: Shuah Khan
Cc: Yanteng Si
Signed-off-by: Andrew Morton
---
 mm/damon/Kconfig | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index 29f43fbc2eff13..fecb8172410c54 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -71,7 +71,7 @@ config DAMON_SYSFS_KUNIT_TEST

 	  If unsure, say N.

-config DAMON_DBGFS
+config DAMON_DBGFS_DEPRECATED
 	bool "DAMON debugfs interface (DEPRECATED!)"
 	depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS
 	help
@@ -84,6 +84,11 @@
 	  (DAMON_SYSFS).  If you depend on this and cannot move, please report
 	  your usecase to damon@lists.linux.dev and linux-mm@kvack.org.

+config DAMON_DBGFS
+	bool
+	default y
+	depends on DAMON_DBGFS_DEPRECATED
+
 config DAMON_DBGFS_KUNIT_TEST
 	bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS
 	depends on DAMON_DBGFS && KUNIT=y

From 24cb6c91bb8f646e07a6d13a5f00c7d0bca647fb Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 29 Jan 2024 17:35:42 -0800
Subject: [PATCH 1086/1406] mm/damon/dbgfs: implement deprecation notice file

Implement a read-only file for DAMON debugfs interface deprecation
notice, to let users who manually read/write the DAMON debugfs files
from their shell command line easily notice the fact.

Link: https://lkml.kernel.org/r/20240130013549.89538-4-sj@kernel.org
Signed-off-by: SeongJae Park
Cc: Alex Shi
Cc: Hu Haowen <2023002089@link.tyut.edu.cn>
Cc: Jonathan Corbet
Cc: Shuah Khan
Cc: Yanteng Si
Cc: Arnd Bergmann
Signed-off-by: Andrew Morton
---
 mm/damon/dbgfs.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index 7dac24e69e3b95..fc6ece5a9f37cc 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -805,6 +805,18 @@ static void dbgfs_destroy_ctx(struct damon_ctx *ctx)
 	damon_destroy_ctx(ctx);
 }

+static ssize_t damon_dbgfs_deprecated_read(struct file *file,
+		char __user *buf, size_t count, loff_t *ppos)
+{
+	char kbuf[512] = "DAMON debugfs interface is deprecated, "
+		     "so users should move to DAMON_SYSFS. If you cannot, "
+		     "please report your usecase to damon@lists.linux.dev and "
+		     "linux-mm@kvack.org.\n";
+	int len = strnlen(kbuf, 1024);
+
+	return simple_read_from_buffer(buf, count, ppos, kbuf, len);
+}
+
 /*
  * Make a context of @name and create a debugfs directory for it.
  *
@@ -1056,6 +1068,10 @@ static int damon_dbgfs_static_file_open(struct inode *inode, struct file *file)
 	return nonseekable_open(inode, file);
 }

+static const struct file_operations deprecated_fops = {
+	.read = damon_dbgfs_deprecated_read,
+};
+
 static const struct file_operations mk_contexts_fops = {
 	.open = damon_dbgfs_static_file_open,
 	.write = dbgfs_mk_context_write,
@@ -1076,9 +1092,9 @@ static int __init __damon_dbgfs_init(void)
 {
 	struct dentry *dbgfs_root;
 	const char * const file_names[] = {"mk_contexts", "rm_contexts",
-			"monitor_on"};
+			"monitor_on", "DEPRECATED"};
 	const struct file_operations *fops[] = {&mk_contexts_fops,
-			&rm_contexts_fops, &monitor_on_fops};
+			&rm_contexts_fops, &monitor_on_fops, &deprecated_fops};
 	int i;

 	dbgfs_root = debugfs_create_dir("damon", NULL);

From c1ff995e70440f6bf39744784f7b0ac433f061fd Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Fri, 2 Feb 2024 13:43:26 +0100
Subject: [PATCH 1087/1406] mm/damon/dbgfs: fix bogus string length

gcc correctly points out that using strnlen() on a fixed size array is
nonsense with an overlong limit:

mm/damon/dbgfs.c: In function 'damon_dbgfs_deprecated_read':
mm/damon/dbgfs.c:814:19: error: 'strnlen' specified bound 1024 exceeds source size 512 [-Werror=stringop-overread]
  814 |         int len = strnlen(kbuf, 1024);
      |                   ^~~~~~~~~~~~~~~~~~~
mm/damon/dbgfs.c:813:14: note: source object allocated here
  813 |         char kbuf[512] = DAMON_DBGFS_DEPRECATION_NOTICE;
      |              ^~~~

In fact, neither of the arbitrary limits is needed here: the first one
can just be a static const string and avoid wasting any more space than
necessary, and the strnlen() can be either strlen() or sizeof(kbuf)-1,
both of which the compiler turns into the same constant here.
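As a self-contained illustration of this warning class (a hypothetical
userspace sketch, not code from this patch), recent gcc (11 or later)
emits the same kind of -Wstringop-overread diagnostic for the old
pattern when built with -O2 -Wall:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		/* bound 1024 exceeds the 512-byte source: -Wstringop-overread */
		char kbuf[512] = "some deprecation notice\n";
		int len = strnlen(kbuf, 1024);

		/* the fixed pattern: static const string, plain strlen() */
		static const char notice[] = "some deprecation notice\n";

		printf("%d %zu\n", len, strlen(notice));
		return 0;
	}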
Link: https://lkml.kernel.org/r/20240202124339.892862-1-arnd@kernel.org Fixes: adf9047adfff ("mm/damon/dbgfs: implement deprecation notice file") Signed-off-by: Arnd Bergmann Reviewed-by: SeongJae Park Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- mm/damon/dbgfs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index fc6ece5a9f37cc..f66865acb523dd 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -808,13 +808,12 @@ static void dbgfs_destroy_ctx(struct damon_ctx *ctx) static ssize_t damon_dbgfs_deprecated_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - char kbuf[512] = "DAMON debugfs interface is deprecated, " + static const char kbuf[512] = "DAMON debugfs interface is deprecated, " "so users should move to DAMON_SYSFS. If you cannot, " "please report your usecase to damon@lists.linux.dev and " "linux-mm@kvack.org.\n"; - int len = strnlen(kbuf, 1024); - return simple_read_from_buffer(buf, count, ppos, kbuf, len); + return simple_read_from_buffer(buf, count, ppos, kbuf, strlen(kbuf)); } /* From 11665a1fdb390c2400b646574b0eeeb38469a7a6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:43 -0800 Subject: [PATCH 1088/1406] mm/damon/dbgfs: make debugfs interface deprecation message a macro DAMON debugfs interface deprecation message is written twice, once for the warning, and again for DEPRECATED file's read output. De-duplicate those by defining the message as a macro and reuse. Link: https://lkml.kernel.org/r/20240130013549.89538-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- mm/damon/dbgfs.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index f66865acb523dd..10f2af6522952a 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -15,6 +15,11 @@ #include #include +#define DAMON_DBGFS_DEPRECATION_NOTICE \ + "DAMON debugfs interface is deprecated, so users should move " \ + "to DAMON_SYSFS. If you cannot, please report your usecase to " \ + "damon@lists.linux.dev and linux-mm@kvack.org.\n" + static struct damon_ctx **dbgfs_ctxs; static int dbgfs_nr_ctxs; static struct dentry **dbgfs_dirs; @@ -22,10 +27,7 @@ static DEFINE_MUTEX(damon_dbgfs_lock); static void damon_dbgfs_warn_deprecation(void) { - pr_warn_once("DAMON debugfs interface is deprecated, " - "so users should move to DAMON_SYSFS. If you cannot, " - "please report your usecase to damon@lists.linux.dev and " - "linux-mm@kvack.org.\n"); + pr_warn_once(DAMON_DBGFS_DEPRECATION_NOTICE); } /* @@ -808,10 +810,7 @@ static void dbgfs_destroy_ctx(struct damon_ctx *ctx) static ssize_t damon_dbgfs_deprecated_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - static const char kbuf[512] = "DAMON debugfs interface is deprecated, " - "so users should move to DAMON_SYSFS. 
If you cannot, "
-		"please report your usecase to damon@lists.linux.dev and "
-		"linux-mm@kvack.org.\n";
+	static comnst char kbuf[512] = DAMON_DBGFS_DEPRECATION_NOTICE;

 	return simple_read_from_buffer(buf, count, ppos, kbuf, strlen(kbuf));
 }

From 06996cb93186ce4025243290d43540e914612441 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Sat, 3 Feb 2024 06:10:51 -0800
Subject: [PATCH 1089/1406] mm-damon-dbgfs-make-debugfs-interface-deprecation-message-a-macro-fix

s/comnst/const/

Cc: Alex Shi
Cc: Hu Haowen <2023002089@link.tyut.edu.cn>
Cc: Jonathan Corbet
Cc: SeongJae Park
Cc: Shuah Khan
Cc: Yanteng Si
Signed-off-by: Andrew Morton
---
 mm/damon/dbgfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index 10f2af6522952a..45d769984daa38 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -810,7 +810,7 @@ static void dbgfs_destroy_ctx(struct damon_ctx *ctx)
 static ssize_t damon_dbgfs_deprecated_read(struct file *file,
 		char __user *buf, size_t count, loff_t *ppos)
 {
-	static comnst char kbuf[512] = DAMON_DBGFS_DEPRECATION_NOTICE;
+	static const char kbuf[512] = DAMON_DBGFS_DEPRECATION_NOTICE;

 	return simple_read_from_buffer(buf, count, ppos, kbuf, strlen(kbuf));
 }

From fa4760e9a297cf5d8d6582ffe23af04e0d77ee58 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 29 Jan 2024 17:35:44 -0800
Subject: [PATCH 1090/1406] Docs/admin-guide/mm/damon/usage: document 'DEPRECATED' file of DAMON debugfs interface

Document the newly added DAMON debugfs interface deprecation notice file
on the usage document.

Link: https://lkml.kernel.org/r/20240130013549.89538-6-sj@kernel.org
Signed-off-by: SeongJae Park
Cc: Alex Shi
Cc: Hu Haowen <2023002089@link.tyut.edu.cn>
Cc: Jonathan Corbet
Cc: Shuah Khan
Cc: Yanteng Si
Signed-off-by: Andrew Morton
---
 Documentation/admin-guide/mm/damon/usage.rst | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst
index f2feabb4bd35c7..5d3df18dfb9fc0 100644
--- a/Documentation/admin-guide/mm/damon/usage.rst
+++ b/Documentation/admin-guide/mm/damon/usage.rst
@@ -628,9 +628,16 @@ debugfs Interface (DEPRECATED!)
    move, please report your usecase to damon@lists.linux.dev and
    linux-mm@kvack.org.

-DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``,
-``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and
-``rm_contexts`` under its debugfs directory, ``/damon/``.
+DAMON exports nine files, ``DEPRECATED``, ``attrs``, ``target_ids``,
+``init_regions``, ``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts``
+and ``rm_contexts`` under its debugfs directory, ``/damon/``.
+
+
+``DEPRECATED`` is a read-only file for the DAMON debugfs interface deprecation
+notice.  Reading it returns the deprecation notice, as below::
+
+    # cat DEPRECATED
+    DAMON debugfs interface is deprecated, so users should move to DAMON_SYSFS. If you cannot, please report your usecase to damon@lists.linux.dev and linux-mm@kvack.org.


 Attributes

From a08b0b44ae1a5eefc33056da263b4e474fe5750e Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 29 Jan 2024 17:35:45 -0800
Subject: [PATCH 1091/1406] selftests/damon: prepare for monitor_on file renaming

The following change will rename the 'monitor_on' DAMON debugfs file to
'monitor_on_DEPRECATED', to make the deprecation unignorable at runtime.
Since it could make DAMON selftests fail and disturb future bisects,
update DAMON selftests to support the change.
Link: https://lkml.kernel.org/r/20240130013549.89538-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_chk_dependency.sh | 11 +++++++++-- tools/testing/selftests/damon/_debugfs_common.sh | 7 +++++++ .../testing/selftests/damon/debugfs_empty_targets.sh | 12 ++++++++++-- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/damon/_chk_dependency.sh b/tools/testing/selftests/damon/_chk_dependency.sh index 0328ac0b5a5ed0..350f8c2b071dbc 100644 --- a/tools/testing/selftests/damon/_chk_dependency.sh +++ b/tools/testing/selftests/damon/_chk_dependency.sh @@ -18,7 +18,14 @@ then exit $ksft_skip fi -for f in attrs target_ids monitor_on +if [ -f "$DBGFS/monitor_on_DEPRECATED" ] +then + monitor_on_file="monitor_on_DEPRECATED" +else + monitor_on_file="monitor_on" +fi + +for f in attrs target_ids "$monitor_on_file" do if [ ! -f "$DBGFS/$f" ] then @@ -28,7 +35,7 @@ do done permission_error="Operation not permitted" -for f in attrs target_ids monitor_on +for f in attrs target_ids "$monitor_on_file" do status=$( cat "$DBGFS/$f" 2>&1 ) if [ "${status#*$permission_error}" != "$status" ]; then diff --git a/tools/testing/selftests/damon/_debugfs_common.sh b/tools/testing/selftests/damon/_debugfs_common.sh index 48989d4813ae8b..aa995516870bc8 100644 --- a/tools/testing/selftests/damon/_debugfs_common.sh +++ b/tools/testing/selftests/damon/_debugfs_common.sh @@ -45,6 +45,13 @@ test_content() { source ./_chk_dependency.sh damon_onoff="$DBGFS/monitor_on" +if [ -f "$DBGFS/monitor_on_DEPRECATED" ] +then + damon_onoff="$DBGFS/monitor_on_DEPRECATED" +else + damon_onoff="$DBGFS/monitor_on" +fi + if [ $(cat "$damon_onoff") = "on" ] then echo "monitoring is on" diff --git a/tools/testing/selftests/damon/debugfs_empty_targets.sh b/tools/testing/selftests/damon/debugfs_empty_targets.sh index 87aff8083822f6..effbea33dc1640 100755 --- a/tools/testing/selftests/damon/debugfs_empty_targets.sh +++ b/tools/testing/selftests/damon/debugfs_empty_targets.sh @@ -8,6 +8,14 @@ source _debugfs_common.sh orig_target_ids=$(cat "$DBGFS/target_ids") echo "" > "$DBGFS/target_ids" -orig_monitor_on=$(cat "$DBGFS/monitor_on") -test_write_fail "$DBGFS/monitor_on" "on" "orig_monitor_on" "empty target ids" + +if [ -f "$DBGFS/monitor_on_DEPRECATED" ] +then + monitor_on_file="$DBGFS/monitor_on_DEPRECATED" +else + monitor_on_file="$DBGFS/monitor_on" +fi + +orig_monitor_on=$(cat "$monitor_on_file") +test_write_fail "$monitor_on_file" "on" "orig_monitor_on" "empty target ids" echo "$orig_target_ids" > "$DBGFS/target_ids" From 85b1bd1be0825d03dd408aef4770184e57292383 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:46 -0800 Subject: [PATCH 1092/1406] mm/damon/dbgfs: rename monitor_on file to monitor_on_DEPRECATED Kernel builders could silently enable CONFIG_DAMON_DBGFS_DEPRECATED. Users who manually check the files under the DAMON debugfs directory could notice the deprecation owing to the 'DEPRECATED' DAMON debugfs file, but there could be users who doesn't manually check the files. Make the deprecation cannot be ignored in the case by renaming 'monitor_on' file, which is essential for real use of DAMON on runtime, to 'monitor_on_DEPRECATED'. Still users who control DAMON via only user-space tool could ignore the deprecation, but that's what the tool developers should take care of. 
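For example, a user or script that still reads the old path will now get
an unmistakable error (a sketch, assuming debugfs is mounted at
/sys/kernel/debug):

    # cat /sys/kernel/debug/damon/monitor_on
    cat: /sys/kernel/debug/damon/monitor_on: No such file or directory
    # cat /sys/kernel/debug/damon/monitor_on_DEPRECATED
    off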
DAMON user-space tool, damo, has also made a change[1] for the purpose. [1] commit 935dae76f2aee ("_damon_args: Rename --damon_interface to --damon_interface_DEPRECATED") of https://github.com/awslabs/damo Link: https://lkml.kernel.org/r/20240130013549.89538-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- mm/damon/dbgfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 45d769984daa38..2461cfe2e96883 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -1090,7 +1090,7 @@ static int __init __damon_dbgfs_init(void) { struct dentry *dbgfs_root; const char * const file_names[] = {"mk_contexts", "rm_contexts", - "monitor_on", "DEPRECATED"}; + "monitor_on_DEPRECATED", "DEPRECATED"}; const struct file_operations *fops[] = {&mk_contexts_fops, &rm_contexts_fops, &monitor_on_fops, &deprecated_fops}; int i; From 5903c05260d6a978115b3997c37f8100d6582205 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:47 -0800 Subject: [PATCH 1093/1406] Docs/admin-guide/mm/damon/usage: update for monitor_on renaming Update DAMON debugfs interface sections on the usage document to reflect the fact that 'monitor_on' file has renamed to 'monitor_on_DEPRECATED'. Link: https://lkml.kernel.org/r/20240130013549.89538-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 29 ++++++++++---------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 5d3df18dfb9fc0..58c34e66b31b2b 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -629,8 +629,9 @@ debugfs Interface (DEPRECATED!) linux-mm@kvack.org. DAMON exports nine files, ``DEPRECATED``, ``attrs``, ``target_ids``, -``init_regions``, ``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` -and ``rm_contexts`` under its debugfs directory, ``/damon/``. +``init_regions``, ``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, +``mk_contexts`` and ``rm_contexts`` under its debugfs directory, +``/damon/``. ``DEPRECATED`` is a read-only file for the DAMON debugfs interface deprecation @@ -855,16 +856,16 @@ Turning On/Off Setting the files as described above doesn't incur effect unless you explicitly start the monitoring. You can start, stop, and check the current status of the -monitoring by writing to and reading from the ``monitor_on`` file. Writing -``on`` to the file starts the monitoring of the targets with the attributes. -Writing ``off`` to the file stops those. DAMON also stops if every target -process is terminated. Below example commands turn on, off, and check the -status of DAMON:: +monitoring by writing to and reading from the ``monitor_on_DEPRECATED`` file. +Writing ``on`` to the file starts the monitoring of the targets with the +attributes. Writing ``off`` to the file stops those. DAMON also stops if +every target process is terminated. 
Below example commands turn on, off, and +check the status of DAMON:: # cd /damon - # echo on > monitor_on - # echo off > monitor_on - # cat monitor_on + # echo on > monitor_on_DEPRECATED + # echo off > monitor_on_DEPRECATED + # cat monitor_on_DEPRECATED off Please note that you cannot write to the above-mentioned debugfs files while @@ -880,11 +881,11 @@ can get the pid of the thread by reading the ``kdamond_pid`` file. When the monitoring is turned off, reading the file returns ``none``. :: # cd /damon - # cat monitor_on + # cat monitor_on_DEPRECATED off # cat kdamond_pid none - # echo on > monitor_on + # echo on > monitor_on_DEPRECATED # cat kdamond_pid 18594 @@ -914,5 +915,5 @@ directory by putting the name of the context to the ``rm_contexts`` file. :: # ls foo # ls: cannot access 'foo': No such file or directory -Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on`` files are in the -root directory only. +Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on_DEPRECATED`` files +are in the root directory only. From 08a920584d29694ce21d45dd2f07e64ca2a27888 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:48 -0800 Subject: [PATCH 1094/1406] Docs/translations/damon/usage: update for monitor_on renaming Update DAMON debugfs interface sections on the translated usage documents to reflect the fact that 'monitor_on' file has renamed to 'monitor_on_DEPRECATED'. Link: https://lkml.kernel.org/r/20240130013549.89538-10-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- .../zh_CN/admin-guide/mm/damon/usage.rst | 20 +++++++++---------- .../zh_TW/admin-guide/mm/damon/usage.rst | 20 +++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst index 17b9949d9b4357..da2745464ece45 100644 --- a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst +++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst @@ -344,7 +344,7 @@ debugfs接口 :ref:`sysfs接口`。 DAMON导出了八个文件, ``attrs``, ``target_ids``, ``init_regions``, -``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` 和 +``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, ``mk_contexts`` 和 ``rm_contexts`` under its debugfs directory, ``/damon/``. 
@@ -521,15 +521,15 @@ DAMON导出了八个文件, ``attrs``, ``target_ids``, ``init_regions``, 开关 ---- -除非你明确地启动监测,否则如上所述的文件设置不会产生效果。你可以通过写入和读取 ``monitor_on`` +除非你明确地启动监测,否则如上所述的文件设置不会产生效果。你可以通过写入和读取 ``monitor_on_DEPRECATED`` 文件来启动、停止和检查监测的当前状态。写入 ``on`` 该文件可以启动对有属性的目标的监测。写入 ``off`` 该文件则停止这些目标。如果每个目标进程被终止,DAMON也会停止。下面的示例命令开启、关 闭和检查DAMON的状态:: # cd /damon - # echo on > monitor_on - # echo off > monitor_on - # cat monitor_on + # echo on > monitor_on_DEPRECATED + # echo off > monitor_on_DEPRECATED + # cat monitor_on_DEPRECATED off 请注意,当监测开启时,你不能写到上述的debugfs文件。如果你在DAMON运行时写到这些文件,将会返 @@ -543,11 +543,11 @@ DAMON通过一个叫做kdamond的内核线程来进行请求监测。你可以 得该线程的 ``pid`` 。当监测被 ``关闭`` 时,读取该文件不会返回任何信息:: # cd /damon - # cat monitor_on + # cat monitor_on_DEPRECATED off # cat kdamond_pid none - # echo on > monitor_on + # echo on > monitor_on_DEPRECATED # cat kdamond_pid 18594 @@ -574,7 +574,7 @@ DAMON通过一个叫做kdamond的内核线程来进行请求监测。你可以 # ls foo # ls: cannot access 'foo': No such file or directory -注意, ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on`` 文件只在根目录下。 +注意, ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on_DEPRECATED`` 文件只在根目录下。 监测结果的监测点 @@ -583,9 +583,9 @@ DAMON通过一个叫做kdamond的内核线程来进行请求监测。你可以 DAMON通过一个tracepoint ``damon:damon_aggregated`` 提供监测结果. 当监测开启时,你可 以记录追踪点事件,并使用追踪点支持工具如perf显示结果。比如说:: - # echo on > monitor_on + # echo on > monitor_on_DEPRECATED # perf record -e damon:damon_aggregated & # sleep 5 # kill 9 $(pidof perf) - # echo off > monitor_on + # echo off > monitor_on_DEPRECATED # perf script diff --git a/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst index 6dee719a32ea61..7464279f9b7de0 100644 --- a/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst +++ b/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst @@ -344,7 +344,7 @@ debugfs接口 :ref:`sysfs接口`。 DAMON導出了八個文件, ``attrs``, ``target_ids``, ``init_regions``, -``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` 和 +``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, ``mk_contexts`` 和 ``rm_contexts`` under its debugfs directory, ``/damon/``. @@ -521,15 +521,15 @@ DAMON導出了八個文件, ``attrs``, ``target_ids``, ``init_regions``, 開關 ---- -除非你明確地啓動監測,否則如上所述的文件設置不會產生效果。你可以通過寫入和讀取 ``monitor_on`` +除非你明確地啓動監測,否則如上所述的文件設置不會產生效果。你可以通過寫入和讀取 ``monitor_on_DEPRECATED`` 文件來啓動、停止和檢查監測的當前狀態。寫入 ``on`` 該文件可以啓動對有屬性的目標的監測。寫入 ``off`` 該文件則停止這些目標。如果每個目標進程被終止,DAMON也會停止。下面的示例命令開啓、關 閉和檢查DAMON的狀態:: # cd /damon - # echo on > monitor_on - # echo off > monitor_on - # cat monitor_on + # echo on > monitor_on_DEPRECATED + # echo off > monitor_on_DEPRECATED + # cat monitor_on_DEPRECATED off 請注意,當監測開啓時,你不能寫到上述的debugfs文件。如果你在DAMON運行時寫到這些文件,將會返 @@ -543,11 +543,11 @@ DAMON通過一個叫做kdamond的內核線程來進行請求監測。你可以 得該線程的 ``pid`` 。當監測被 ``關閉`` 時,讀取該文件不會返回任何信息:: # cd /damon - # cat monitor_on + # cat monitor_on_DEPRECATED off # cat kdamond_pid none - # echo on > monitor_on + # echo on > monitor_on_DEPRECATED # cat kdamond_pid 18594 @@ -574,7 +574,7 @@ DAMON通過一個叫做kdamond的內核線程來進行請求監測。你可以 # ls foo # ls: cannot access 'foo': No such file or directory -注意, ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on`` 文件只在根目錄下。 +注意, ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on_DEPRECATED`` 文件只在根目錄下。 監測結果的監測點 @@ -583,10 +583,10 @@ DAMON通過一個叫做kdamond的內核線程來進行請求監測。你可以 DAMON通過一個tracepoint ``damon:damon_aggregated`` 提供監測結果. 
當監測開啓時,你可 以記錄追蹤點事件,並使用追蹤點支持工具如perf顯示結果。比如說:: - # echo on > monitor_on + # echo on > monitor_on_DEPRECATED # perf record -e damon:damon_aggregated & # sleep 5 # kill 9 $(pidof perf) - # echo off > monitor_on + # echo off > monitor_on_DEPRECATED # perf script From 7339e99bb149414ef8546794b08b03dea13e0021 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Wed, 31 Jan 2024 11:19:13 +0800 Subject: [PATCH 1095/1406] mm/mmap: use SZ_{8K, 128K} helper macro Use SZ_{8K, 128K} helper macro instead of the number in init_user_reserve and reserve_mem_notifier. This is more readable. Link: https://lkml.kernel.org/r/20240131031913.2058597-1-yajun.deng@linux.dev Signed-off-by: Yajun Deng Signed-off-by: Andrew Morton --- mm/mmap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 476de5daf598d1..1f9e7024285866 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3845,7 +3845,7 @@ static int init_user_reserve(void) free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); - sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); + sysctl_user_reserve_kbytes = min(free_kbytes / 32, SZ_128K); return 0; } subsys_initcall(init_user_reserve); @@ -3866,7 +3866,7 @@ static int init_admin_reserve(void) free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); - sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); + sysctl_admin_reserve_kbytes = min(free_kbytes / 32, SZ_8K); return 0; } subsys_initcall(init_admin_reserve); @@ -3898,12 +3898,12 @@ static int reserve_mem_notifier(struct notifier_block *nb, case MEM_ONLINE: /* Default max is 128MB. Leave alone if modified by operator. */ tmp = sysctl_user_reserve_kbytes; - if (0 < tmp && tmp < (1UL << 17)) + if (tmp > 0 && tmp < SZ_128K) init_user_reserve(); /* Default max is 8MB. Leave alone if modified by operator. */ tmp = sysctl_admin_reserve_kbytes; - if (0 < tmp && tmp < (1UL << 13)) + if (tmp > 0 && tmp < SZ_8K) init_admin_reserve(); break; From 8b16a4129f944379e78bd82e42b78b9f2a1e7435 Mon Sep 17 00:00:00 2001 From: Rakie Kim Date: Fri, 2 Feb 2024 12:02:35 -0500 Subject: [PATCH 1096/1406] mm/mempolicy: implement the sysfs-based weighted_interleave interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm/mempolicy: weighted interleave mempolicy and sysfs extension", v5. Weighted interleave is a new interleave policy intended to make use of heterogeneous memory environments appearing with CXL. The existing interleave mechanism does an even round-robin distribution of memory across all nodes in a nodemask, while weighted interleave distributes memory across nodes according to a provided weight. (Weight = # of page allocations per round) Weighted interleave is intended to reduce average latency when bandwidth is pressured - therefore increasing total throughput. In other words: It allows greater use of the total available bandwidth in a heterogeneous hardware environment (different hardware provides different bandwidth capacity). As bandwidth is pressured, latency increases - first linearly and then exponentially. By keeping bandwidth usage distributed according to available bandwidth, we therefore can reduce the average latency of a cacheline fetch. A good explanation of the bandwidth vs latency response curve: https://mahmoudhatem.wordpress.com/2017/11/07/memory-bandwidth-vs-latency-response-curve/ From the article: ``` Constant region: The latency response is fairly constant for the first 40% of the sustained bandwidth. 
Linear region: In between 40% to 80% of the sustained bandwidth, the latency response increases almost linearly with the bandwidth demand of the system due to contention overhead by numerous memory requests. Exponential region: Between 80% to 100% of the sustained bandwidth, the memory latency is dominated by the contention latency which can be as much as twice the idle latency or more. Maximum sustained bandwidth : Is 65% to 75% of the theoretical maximum bandwidth. ``` As a general rule of thumb: * If bandwidth usage is low, latency does not increase. It is optimal to place data in the nearest (lowest latency) device. * If bandwidth usage is high, latency increases. It is optimal to place data such that bandwidth use is optimized per-device. This is the top line goal: Provide a user a mechanism to target using the "maximum sustained bandwidth" of each hardware component in a heterogenous memory system. For example, the stream benchmark demonstrates that 1:1 (default) interleave is actively harmful, while weighted interleave can be beneficial. Default interleave distributes data such that too much pressure is placed on devices with lower available bandwidth. Stream Benchmark (vs DRAM, 1 Socket + 1 CXL Device) Default interleave : -78% (slower than DRAM) Global weighting : -6% to +4% (workload dependant) Targeted weights : +2.5% to +4% (consistently better than DRAM) Global means the task-policy was set (set_mempolicy), while targeted means VMA policies were set (mbind2). We see weighted interleave is not always beneficial when applied globally, but is always beneficial when applied to bandwidth-driving memory regions. There are 4 patches in this set: 1) Implement system-global interleave weights as sysfs extension in mm/mempolicy.c. These weights are RCU protected, and a default weight set is provided (all weights are 1 by default). In future work, we intend to expose an interface for HMAT/CDAT code to set reasonable default values based on the memory configuration of the system discovered at boot/hotplug. 2) A mild refactor of some interleave-logic for re-use in the new weighted interleave logic. 3) MPOL_WEIGHTED_INTERLEAVE extension for set_mempolicy/mbind 4) Protect interleave logic (weighted and normal) with the mems_allowed seq cookie. If the nodemask changes while accessing it during a rebind, just retry the access. Included below are some performance and LTP test information, and a sample numactl branch which can be used for testing. = Performance summary = (tests may have different configurations, see extended info below) 1) MLC (W2) : +38% over DRAM. +264% over default interleave. MLC (W5) : +40% over DRAM. +226% over default interleave. 2) Stream : -6% to +4% over DRAM, +430% over default interleave. 3) XSBench : +19% over DRAM. +47% over default interleave. = LTP Testing Summary = existing mempolicy & mbind tests: pass mempolicy & mbind + weighted interleave (global weights): pass = version history v5: - style fixes - mems_allowed cookie protection to detect rebind issues, prevents spurious allocation failures and/or mis-allocations - sparse warning fixes related to __rcu on local variables ===================================================================== Performance tests - MLC From - Ravi Jonnalagadda Hardware: Single-socket, multiple CXL memory expanders. 
Workload: W2 Data Signature: 2:1 read:write DRAM only bandwidth (GBps): 298.8 DRAM + CXL (default interleave) (GBps): 113.04 DRAM + CXL (weighted interleave)(GBps): 412.5 Gain over DRAM only: 1.38x Gain over default interleave: 2.64x Workload: W5 Data Signature: 1:1 read:write DRAM only bandwidth (GBps): 273.2 DRAM + CXL (default interleave) (GBps): 117.23 DRAM + CXL (weighted interleave)(GBps): 382.7 Gain over DRAM only: 1.4x Gain over default interleave: 2.26x ===================================================================== Performance test - Stream From - Gregory Price Hardware: Single socket, single CXL expander numactl extension: https://github.com/gmprice/numactl/tree/weighted_interleave_master Summary: 64 threads, ~18GB workload, 3GB per array, executed 100 times Default interleave : -78% (slower than DRAM) Global weighting : -6% to +4% (workload dependant) mbind2 weights : +2.5% to +4% (consistently better than DRAM) dram only: numactl --cpunodebind=1 --membind=1 ./stream_c.exe --ntimes 100 --array-size 400M --malloc Function Direction BestRateMBs AvgTime MinTime MaxTime Copy: 0->0 200923.2 0.032662 0.031853 0.033301 Scale: 0->0 202123.0 0.032526 0.031664 0.032970 Add: 0->0 208873.2 0.047322 0.045961 0.047884 Triad: 0->0 208523.8 0.047262 0.046038 0.048414 CXL-only: numactl --cpunodebind=1 -w --membind=2 ./stream_c.exe --ntimes 100 --array-size 400M --malloc Copy: 0->0 22209.7 0.288661 0.288162 0.289342 Scale: 0->0 22288.2 0.287549 0.287147 0.288291 Add: 0->0 24419.1 0.393372 0.393135 0.393735 Triad: 0->0 24484.6 0.392337 0.392083 0.394331 Based on the above, the optimal weights are ~9:1 echo 9 > /sys/kernel/mm/mempolicy/weighted_interleave/node1 echo 1 > /sys/kernel/mm/mempolicy/weighted_interleave/node2 default interleave: numactl --cpunodebind=1 --interleave=1,2 ./stream_c.exe --ntimes 100 --array-size 400M --malloc Copy: 0->0 44666.2 0.143671 0.143285 0.144174 Scale: 0->0 44781.6 0.143256 0.142916 0.143713 Add: 0->0 48600.7 0.197719 0.197528 0.197858 Triad: 0->0 48727.5 0.197204 0.197014 0.197439 global weighted interleave: numactl --cpunodebind=1 -w --interleave=1,2 ./stream_c.exe --ntimes 100 --array-size 400M --malloc Copy: 0->0 190085.9 0.034289 0.033669 0.034645 Scale: 0->0 207677.4 0.031909 0.030817 0.033061 Add: 0->0 202036.8 0.048737 0.047516 0.053409 Triad: 0->0 217671.5 0.045819 0.044103 0.046755 targted regions w/ global weights (modified stream to mbind2 malloc'd regions)) numactl --cpunodebind=1 --membind=1 ./stream_c.exe -b --ntimes 100 --array-size 400M --malloc Copy: 0->0 205827.0 0.031445 0.031094 0.031984 Scale: 0->0 208171.8 0.031320 0.030744 0.032505 Add: 0->0 217352.0 0.045087 0.044168 0.046515 Triad: 0->0 216884.8 0.045062 0.044263 0.046982 ===================================================================== Performance tests - XSBench From - Hyeongtak Ji Hardware: Single socket, Single CXL memory Expander NUMA node 0: 56 logical cores, 128 GB memory NUMA node 2: 96 GB CXL memory Threads: 56 Lookups: 170,000,000 Summary: +19% over DRAM. +47% over default interleave. Performance tests - XSBench 1. dram only $ numactl -m 0 ./XSBench -s XL –p 5000000 Runtime: 36.235 seconds Lookups/s: 4,691,618 2. default interleave $ numactl –i 0,2 ./XSBench –s XL –p 5000000 Runtime: 55.243 seconds Lookups/s: 3,077,293 3. 
weighted interleave numactl –w –i 0,2 ./XSBench –s XL –p 5000000 Runtime: 29.262 seconds Lookups/s: 5,809,513 ===================================================================== LTP Tests: https://github.com/gmprice/ltp/tree/mempolicy2 = Existing tests set_mempolicy, get_mempolicy, mbind MPOL_WEIGHTED_INTERLEAVE added manually to test basic functionality but did not adjust tests for weighting. Basically the weights were set to 1, which is the default, and it should behave the same as MPOL_INTERLEAVE if logic is correct. == set_mempolicy01 : passed 18, failed 0 == set_mempolicy02 : passed 10, failed 0 == set_mempolicy03 : passed 64, failed 0 == set_mempolicy04 : passed 32, failed 0 == set_mempolicy05 - n/a on non-x86 == set_mempolicy06 : passed 10, failed 0 this is set_mempolicy02 + MPOL_WEIGHTED_INTERLEAVE == set_mempolicy07 : passed 32, failed 0 set_mempolicy04 + MPOL_WEIGHTED_INTERLEAVE == get_mempolicy01 : passed 12, failed 0 change: added MPOL_WEIGHTED_INTERLEAVE == get_mempolicy02 : passed 2, failed 0 == mbind01 : passed 15, failed 0 added MPOL_WEIGHTED_INTERLEAVE == mbind02 : passed 4, failed 0 added MPOL_WEIGHTED_INTERLEAVE == mbind03 : passed 16, failed 0 added MPOL_WEIGHTED_INTERLEAVE == mbind04 : passed 48, failed 0 added MPOL_WEIGHTED_INTERLEAVE ===================================================================== numactl (set_mempolicy) w/ global weighting test numactl fork: https://github.com/gmprice/numactl/tree/weighted_interleave_master command: numactl -w --interleave=0,1 ./eatmem result (weights 1:1): 0176a000 weighted interleave:0-1 heap anon=65793 dirty=65793 active=0 N0=32897 N1=32896 kernelpagesize_kB=4 7fceeb9ff000 weighted interleave:0-1 anon=65537 dirty=65537 active=0 N0=32768 N1=32769 kernelpagesize_kB=4 50% distribution is correct result (weights 5:1): 01b14000 weighted interleave:0-1 heap anon=65793 dirty=65793 active=0 N0=54828 N1=10965 kernelpagesize_kB=4 7f47a1dff000 weighted interleave:0-1 anon=65537 dirty=65537 active=0 N0=54614 N1=10923 kernelpagesize_kB=4 16.666% distribution is correct result (weights 1:5): 01f07000 weighted interleave:0-1 heap anon=65793 dirty=65793 active=0 N0=10966 N1=54827 kernelpagesize_kB=4 7f17b1dff000 weighted interleave:0-1 anon=65537 dirty=65537 active=0 N0=10923 N1=54614 kernelpagesize_kB=4 16.666% distribution is correct #include #include #include int main (void) { char* mem = malloc(1024*1024*256); memset(mem, 1, 1024*1024*256); for (int i = 0; i < ((1024*1024*256)/4096); i++) { mem = malloc(4096); mem[0] = 1; } printf("done\n"); getchar(); return 0; } This patch (of 4): This patch provides a way to set interleave weight information under sysfs at /sys/kernel/mm/mempolicy/weighted_interleave/nodeN The sysfs structure is designed as follows. $ tree /sys/kernel/mm/mempolicy/ /sys/kernel/mm/mempolicy/ [1] └── weighted_interleave [2] ├── node0 [3] └── node1 Each file above can be explained as follows. [1] mm/mempolicy: configuration interface for mempolicy subsystem [2] weighted_interleave/: config interface for weighted interleave policy [3] weighted_interleave/nodeN: weight for nodeN If a node value is set to `0`, the system-default value will be used. As of this patch, the system-default for all nodes is always 1. 
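For example, reading a node's weight, updating it, and then resetting it
back to the system default looks like this (assuming node0 exists and the
default weight is still 1):

    # cat /sys/kernel/mm/mempolicy/weighted_interleave/node0
    1
    # echo 4 > /sys/kernel/mm/mempolicy/weighted_interleave/node0
    # cat /sys/kernel/mm/mempolicy/weighted_interleave/node0
    4
    # echo "" > /sys/kernel/mm/mempolicy/weighted_interleave/node0
    # cat /sys/kernel/mm/mempolicy/weighted_interleave/node0
    1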
Link: https://lkml.kernel.org/r/20240202170238.90004-1-gregory.price@memverge.com Link: https://lkml.kernel.org/r/20240202170238.90004-2-gregory.price@memverge.com Suggested-by: "Huang, Ying" Signed-off-by: Rakie Kim Signed-off-by: Honggyu Kim Co-developed-by: Gregory Price Signed-off-by: Gregory Price Co-developed-by: Hyeongtak Ji Signed-off-by: Hyeongtak Ji Reviewed-by: "Huang, Ying" Cc: Dan Williams Cc: Gregory Price Cc: Hasan Al Maruf Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Cc: Srinivasulu Thanneeru Signed-off-by: Andrew Morton --- .../ABI/testing/sysfs-kernel-mm-mempolicy | 4 + ...fs-kernel-mm-mempolicy-weighted-interleave | 25 ++ mm/mempolicy.c | 223 ++++++++++++++++++ 3 files changed, 252 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy new file mode 100644 index 00000000000000..8ac327fd7fb6e3 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy @@ -0,0 +1,4 @@ +What: /sys/kernel/mm/mempolicy/ +Date: January 2024 +Contact: Linux memory management mailing list +Description: Interface for Mempolicy diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave new file mode 100644 index 00000000000000..0b7972de04e939 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave @@ -0,0 +1,25 @@ +What: /sys/kernel/mm/mempolicy/weighted_interleave/ +Date: January 2024 +Contact: Linux memory management mailing list +Description: Configuration Interface for the Weighted Interleave policy + +What: /sys/kernel/mm/mempolicy/weighted_interleave/nodeN +Date: January 2024 +Contact: Linux memory management mailing list +Description: Weight configuration interface for nodeN + + The interleave weight for a memory node (N). These weights are + utilized by tasks which have set their mempolicy to + MPOL_WEIGHTED_INTERLEAVE. + + These weights only affect new allocations, and changes at runtime + will not cause migrations on already allocated pages. + + The minimum weight for a node is always 1. + + Minimum weight: 1 + Maximum weight: 255 + + Writing an empty string or `0` will reset the weight to the + system default. The system default may be set by the kernel + or drivers at boot or during hotplug events. diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5e519163c4dcb6..b4fccc921b6238 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -131,6 +131,32 @@ static struct mempolicy default_policy = { static struct mempolicy preferred_node_policy[MAX_NUMNODES]; +/* + * iw_table is the sysfs-set interleave weight table, a value of 0 denotes + * system-default value should be used. A NULL iw_table also denotes that + * system-default values should be used. Until the system-default table + * is implemented, the system-default is always 1. + * + * iw_table is RCU protected + */ +static u8 __rcu *iw_table; +static DEFINE_MUTEX(iw_table_lock); + +static u8 get_il_weight(int node) +{ + u8 *table; + u8 weight; + + rcu_read_lock(); + table = rcu_dereference(iw_table); + /* if no iw_table, use system default */ + weight = table ? table[node] : 1; + /* if value in iw_table is 0, use system default */ + weight = weight ? 
weight : 1; + rcu_read_unlock(); + return weight; +} + /** * numa_nearest_node - Find nearest node by state * @node: Node id to start the search @@ -3063,3 +3089,200 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); } + +#ifdef CONFIG_SYSFS +struct iw_node_attr { + struct kobj_attribute kobj_attr; + int nid; +}; + +static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct iw_node_attr *node_attr; + u8 weight; + + node_attr = container_of(attr, struct iw_node_attr, kobj_attr); + weight = get_il_weight(node_attr->nid); + return sysfs_emit(buf, "%d\n", weight); +} + +static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct iw_node_attr *node_attr; + u8 *new; + u8 *old; + u8 weight = 0; + + node_attr = container_of(attr, struct iw_node_attr, kobj_attr); + if (count == 0 || sysfs_streq(buf, "")) + weight = 0; + else if (kstrtou8(buf, 0, &weight)) + return -EINVAL; + + new = kzalloc(nr_node_ids, GFP_KERNEL); + if (!new) + return -ENOMEM; + + mutex_lock(&iw_table_lock); + old = rcu_dereference_protected(iw_table, + lockdep_is_held(&iw_table_lock)); + if (old) + memcpy(new, old, nr_node_ids); + new[node_attr->nid] = weight; + rcu_assign_pointer(iw_table, new); + mutex_unlock(&iw_table_lock); + synchronize_rcu(); + kfree(old); + return count; +} + +static struct iw_node_attr **node_attrs; + +static void sysfs_wi_node_release(struct iw_node_attr *node_attr, + struct kobject *parent) +{ + if (!node_attr) + return; + sysfs_remove_file(parent, &node_attr->kobj_attr.attr); + kfree(node_attr->kobj_attr.attr.name); + kfree(node_attr); +} + +static void sysfs_wi_release(struct kobject *wi_kobj) +{ + int i; + + for (i = 0; i < nr_node_ids; i++) + sysfs_wi_node_release(node_attrs[i], wi_kobj); + kobject_put(wi_kobj); +} + +static const struct kobj_type wi_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = sysfs_wi_release, +}; + +static int add_weight_node(int nid, struct kobject *wi_kobj) +{ + struct iw_node_attr *node_attr; + char *name; + + node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL); + if (!node_attr) + return -ENOMEM; + + name = kasprintf(GFP_KERNEL, "node%d", nid); + if (!name) { + kfree(node_attr); + return -ENOMEM; + } + + sysfs_attr_init(&node_attr->kobj_attr.attr); + node_attr->kobj_attr.attr.name = name; + node_attr->kobj_attr.attr.mode = 0644; + node_attr->kobj_attr.show = node_show; + node_attr->kobj_attr.store = node_store; + node_attr->nid = nid; + + if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) { + kfree(node_attr->kobj_attr.attr.name); + kfree(node_attr); + pr_err("failed to add attribute to weighted_interleave\n"); + return -ENOMEM; + } + + node_attrs[nid] = node_attr; + return 0; +} + +static int add_weighted_interleave_group(struct kobject *root_kobj) +{ + struct kobject *wi_kobj; + int nid, err; + + wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!wi_kobj) + return -ENOMEM; + + err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj, + "weighted_interleave"); + if (err) { + kfree(wi_kobj); + return err; + } + + for_each_node_state(nid, N_POSSIBLE) { + err = add_weight_node(nid, wi_kobj); + if (err) { + pr_err("failed to add sysfs [node%d]\n", nid); + break; + } + } + if (err) + kobject_put(wi_kobj); + return 0; +} + +static void mempolicy_kobj_release(struct kobject *kobj) +{ + u8 *old; + + mutex_lock(&iw_table_lock); + old = 
rcu_dereference_protected(iw_table, + lockdep_is_held(&iw_table_lock)); + rcu_assign_pointer(iw_table, NULL); + mutex_unlock(&iw_table_lock); + synchronize_rcu(); + kfree(old); + kfree(node_attrs); + kfree(kobj); +} + +static const struct kobj_type mempolicy_ktype = { + .release = mempolicy_kobj_release +}; + +static int __init mempolicy_sysfs_init(void) +{ + int err; + static struct kobject *mempolicy_kobj; + + mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL); + if (!mempolicy_kobj) { + err = -ENOMEM; + goto err_out; + } + + node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *), + GFP_KERNEL); + if (!node_attrs) { + err = -ENOMEM; + goto mempol_out; + } + + err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj, + "mempolicy"); + if (err) + goto node_out; + + err = add_weighted_interleave_group(mempolicy_kobj); + if (err) { + pr_err("mempolicy sysfs structure failed to initialize\n"); + kobject_put(mempolicy_kobj); + return err; + } + + return err; +node_out: + kfree(node_attrs); +mempol_out: + kfree(mempolicy_kobj); +err_out: + pr_err("failed to add mempolicy kobject to the system\n"); + return err; +} + +late_initcall(mempolicy_sysfs_init); +#endif /* CONFIG_SYSFS */ From 76fc110e70ef0eb701b138d620eeaf64f134c778 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Fri, 2 Feb 2024 12:02:36 -0500 Subject: [PATCH 1097/1406] mm/mempolicy: refactor a read-once mechanism into a function for re-use Move the use of barrier() to force policy->nodemask onto the stack into a function `read_once_policy_nodemask` so that it may be re-used. Link: https://lkml.kernel.org/r/20240202170238.90004-3-gregory.price@memverge.com Signed-off-by: Gregory Price Suggested-by: "Huang, Ying" Reviewed-by: "Huang, Ying" Cc: Dan Williams Cc: Hasan Al Maruf Cc: Honggyu Kim Cc: Hyeongtak Ji Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Cc: Rakie Kim Cc: Ravi Jonnalagadda Cc: Srinivasulu Thanneeru Signed-off-by: Andrew Morton --- mm/mempolicy.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b4fccc921b6238..1bdc7d0d1b0b22 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1905,6 +1905,20 @@ unsigned int mempolicy_slab_node(void) } } +static unsigned int read_once_policy_nodemask(struct mempolicy *pol, + nodemask_t *mask) +{ + /* + * barrier stabilizes the nodemask locally so that it can be iterated + * over safely without concern for changes. Allocators validate node + * selection does not violate mems_allowed, so this is safe. + */ + barrier(); + memcpy(mask, &pol->nodes, sizeof(nodemask_t)); + barrier(); + return nodes_weight(*mask); +} + /* * Do static interleaving for interleave index @ilx. Returns the ilx'th * node in pol->nodes (starting from ilx=0), wrapping around if ilx @@ -1912,20 +1926,12 @@ unsigned int mempolicy_slab_node(void) */ static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) { - nodemask_t nodemask = pol->nodes; + nodemask_t nodemask; unsigned int target, nnodes; int i; int nid; - /* - * The barrier will stabilize the nodemask in a register or on - * the stack so that it will stop changing under the code. - * - * Between first_node() and next_node(), pol->nodes could be changed - * by other threads. So we put pol->nodes in a local stack. 
-	 */
-	barrier();
-	nnodes = nodes_weight(nodemask);
+	nnodes = read_once_policy_nodemask(pol, &nodemask);
 	if (!nnodes)
 		return numa_node_id();
 	target = ilx % nnodes;

From b33ab1eb822e1d4ea69cc20df2ba4b3037c5586a Mon Sep 17 00:00:00 2001
From: Gregory Price
Date: Fri, 2 Feb 2024 12:02:37 -0500
Subject: [PATCH 1098/1406] mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving

When a system has multiple NUMA nodes and it becomes bandwidth hungry,
using the current MPOL_INTERLEAVE could be a wise option.

However, if those NUMA nodes consist of different types of memory such as
socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based
interleave policy does not optimally distribute data to make use of their
different bandwidth characteristics.

Instead, interleave is more effective when the allocation policy follows
each NUMA node's bandwidth weight rather than a simple 1:1 distribution.

This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE,
enabling weighted interleave between NUMA nodes.  Weighted interleave
allows for proportional distribution of memory across multiple numa
nodes, preferably apportioned to match the bandwidth of each node.

For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1),
with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight
distribution is (2:1).

Weights for each node can be assigned via the new sysfs extension:
/sys/kernel/mm/mempolicy/weighted_interleave/

For now, the default value of all nodes will be `1`, which matches the
behavior of standard 1:1 round-robin interleave.  An extension will be
added in the future to allow default values to be registered at kernel
and device bringup time.

The policy allocates pages in proportion to the set weights.  For
example, if the weights are (2,1), then 2 pages will be allocated on
node0 for every 1 page allocated on node1.

The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2)
and mbind(2).

Some high level notes about the pieces of weighted interleave:

current->il_prev:
    Tracks the node previously allocated from.

current->il_weight:
    The active weight of the current node (current->il_prev).
    When this reaches 0, current->il_prev is set to the next node
    and current->il_weight is set to the next weight.

weighted_interleave_nodes:
    Counts the number of allocations as they occur, and applies the
    weight for the current node.  When the weight reaches 0, switch
    to the next node.  Operates only on task->mempolicy.

weighted_interleave_nid:
    Gets the total weight of the nodemask as well as each individual
    node weight, then calculates the node based on the given index.
    Operates on VMA policies.

bulk_array_weighted_interleave:
    Gets the total weight of the nodemask as well as each individual
    node weight, then calculates the number of "interleave rounds" as
    well as any delta ("partial round").  Calculates the number of
    pages for each node and allocates them.

    If a node was scheduled for interleave via interleave_nodes, the
    current weight will be allocated first.

    Operates only on the task->mempolicy.

One piece of complexity is the interaction between a recent refactor
which split the logic to acquire the "ilx" (interleave index) of an
allocation and the actual application of the interleave.  If a call to
alloc_pages_mpol() were made with a weighted-interleave policy and ilx
set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on
a VMA policy - violating the description above.
An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf Signed-off-by: Gregory Price Co-developed-by: Rakie Kim Signed-off-by: Rakie Kim Co-developed-by: Honggyu Kim Signed-off-by: Honggyu Kim Co-developed-by: Hyeongtak Ji Signed-off-by: Hyeongtak Ji Co-developed-by: Srinivasulu Thanneeru Signed-off-by: Srinivasulu Thanneeru Co-developed-by: Ravi Jonnalagadda Signed-off-by: Ravi Jonnalagadda Reviewed-by: "Huang, Ying" Cc: Dan Williams Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Signed-off-by: Andrew Morton --- .../admin-guide/mm/numa_memory_policy.rst | 9 + include/linux/sched.h | 1 + include/uapi/linux/mempolicy.h | 1 + mm/mempolicy.c | 218 +++++++++++++++++- 4 files changed, 225 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst index eca38fa81e0f98..a70f20ce1ffb4f 100644 --- a/Documentation/admin-guide/mm/numa_memory_policy.rst +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst @@ -250,6 +250,15 @@ MPOL_PREFERRED_MANY can fall back to all existing numa nodes. This is effectively MPOL_PREFERRED allowed for a mask rather than a single node. +MPOL_WEIGHTED_INTERLEAVE + This mode operates the same as MPOL_INTERLEAVE, except that + interleaving behavior is executed based on weights set in + /sys/kernel/mm/mempolicy/weighted_interleave/ + + Weighted interleave allocates pages on nodes according to a + weight. For example if nodes [0,1] are weighted [5,2], 5 pages + will be allocated on node0 for every 2 pages allocated on node1. + NUMA memory policy supports the following optional mode flags: MPOL_F_STATIC_NODES diff --git a/include/linux/sched.h b/include/linux/sched.h index ffe8f618ab8697..b9ce285d8c9c81 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1259,6 +1259,7 @@ struct task_struct { /* Protected by alloc_lock: */ struct mempolicy *mempolicy; short il_prev; + u8 il_weight; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index a8963f7ef4c279..1f9bb10d1a473f 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -23,6 +23,7 @@ enum { MPOL_INTERLEAVE, MPOL_LOCAL, MPOL_PREFERRED_MANY, + MPOL_WEIGHTED_INTERLEAVE, MPOL_MAX, /* always last member of enum */ }; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1bdc7d0d1b0b22..a8db92c236974d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -19,6 +19,13 @@ * for anonymous memory. For process policy an process counter * is used. * + * weighted interleave + * Allocate memory interleaved over a set of nodes based on + * a set of weights (per-node), with normal fallback if it + * fails. Otherwise operates the same as interleave. + * Example: nodeset(0,1) & weights (2,1) - 2 pages allocated + * on node 0 for every 1 page allocated on node 1. + * * bind Only allocate memory on a specific set of nodes, * no fallback. 
* FIXME: memory is allocated starting with the first node @@ -441,6 +448,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_preferred, }, + [MPOL_WEIGHTED_INTERLEAVE] = { + .create = mpol_new_nodemask, + .rebind = mpol_rebind_nodemask, + }, }; static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, @@ -858,8 +869,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, old = current->mempolicy; current->mempolicy = new; - if (new && new->mode == MPOL_INTERLEAVE) + if (new && (new->mode == MPOL_INTERLEAVE || + new->mode == MPOL_WEIGHTED_INTERLEAVE)) { current->il_prev = MAX_NUMNODES-1; + current->il_weight = 0; + } task_unlock(current); mpol_put(old); ret = 0; @@ -884,6 +898,7 @@ static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes) case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: + case MPOL_WEIGHTED_INTERLEAVE: *nodes = pol->nodes; break; case MPOL_LOCAL: @@ -968,6 +983,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { *policy = next_node_in(current->il_prev, pol->nodes); + } else if (pol == current->mempolicy && + pol->mode == MPOL_WEIGHTED_INTERLEAVE) { + if (current->il_weight) + *policy = current->il_prev; + else + *policy = next_node_in(current->il_prev, + pol->nodes); } else { err = -EINVAL; goto out; @@ -1332,7 +1354,8 @@ static long do_mbind(unsigned long start, unsigned long len, * VMAs, the nodes will still be interleaved from the targeted * nodemask, but one by one may be selected differently. */ - if (new->mode == MPOL_INTERLEAVE) { + if (new->mode == MPOL_INTERLEAVE || + new->mode == MPOL_WEIGHTED_INTERLEAVE) { struct page *page; unsigned int order; unsigned long addr = -EFAULT; @@ -1780,7 +1803,8 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup * @order: 0, or appropriate huge_page_order for interleaving - * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE + * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or + * MPOL_WEIGHTED_INTERLEAVE * * Returns effective policy for a VMA at specified address. * Falls back to current->mempolicy or system default policy, as necessary. 
@@ -1797,7 +1821,8 @@ struct mempolicy *get_vma_policy(struct vm_area_struct *vma, pol = __get_vma_policy(vma, addr, ilx); if (!pol) pol = get_task_policy(current); - if (pol->mode == MPOL_INTERLEAVE) { + if (pol->mode == MPOL_INTERLEAVE || + pol->mode == MPOL_WEIGHTED_INTERLEAVE) { *ilx += vma->vm_pgoff >> order; *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); } @@ -1847,6 +1872,22 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) return zone >= dynamic_policy_zone; } +static unsigned int weighted_interleave_nodes(struct mempolicy *policy) +{ + unsigned int node = current->il_prev; + + if (!current->il_weight || !node_isset(node, policy->nodes)) { + node = next_node_in(node, policy->nodes); + /* can only happen if nodemask is being rebound */ + if (node == MAX_NUMNODES) + return node; + current->il_prev = node; + current->il_weight = get_il_weight(node); + } + current->il_weight--; + return node; +} + /* Do dynamic interleaving for a process */ static unsigned int interleave_nodes(struct mempolicy *policy) { @@ -1881,6 +1922,9 @@ unsigned int mempolicy_slab_node(void) case MPOL_INTERLEAVE: return interleave_nodes(policy); + case MPOL_WEIGHTED_INTERLEAVE: + return weighted_interleave_nodes(policy); + case MPOL_BIND: case MPOL_PREFERRED_MANY: { @@ -1919,6 +1963,45 @@ static unsigned int read_once_policy_nodemask(struct mempolicy *pol, return nodes_weight(*mask); } +static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) +{ + nodemask_t nodemask; + unsigned int target, nr_nodes; + u8 *table; + unsigned int weight_total = 0; + u8 weight; + int nid; + + nr_nodes = read_once_policy_nodemask(pol, &nodemask); + if (!nr_nodes) + return numa_node_id(); + + rcu_read_lock(); + table = rcu_dereference(iw_table); + /* calculate the total weight */ + for_each_node_mask(nid, nodemask) { + /* detect system default usage */ + weight = table ? table[nid] : 1; + weight = weight ? weight : 1; + weight_total += weight; + } + + /* Calculate the node offset based on totals */ + target = ilx % weight_total; + nid = first_node(nodemask); + while (target) { + /* detect system default usage */ + weight = table ? table[nid] : 1; + weight = weight ? weight : 1; + if (target < weight) + break; + target -= weight; + nid = next_node_in(nid, nodemask); + } + rcu_read_unlock(); + return nid; +} + /* * Do static interleaving for interleave index @ilx. Returns the ilx'th * node in pol->nodes (starting from ilx=0), wrapping around if ilx @@ -1979,6 +2062,11 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, *nid = (ilx == NO_INTERLEAVE_INDEX) ? interleave_nodes(pol) : interleave_nid(pol, ilx); break; + case MPOL_WEIGHTED_INTERLEAVE: + *nid = (ilx == NO_INTERLEAVE_INDEX) ? + weighted_interleave_nodes(pol) : + weighted_interleave_nid(pol, ilx); + break; } return nodemask; @@ -2040,6 +2128,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: *mask = mempolicy->nodes; break; @@ -2140,6 +2229,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, * node in its nodemask, we allocate the standard way. 
*/ if (pol->mode != MPOL_INTERLEAVE && + pol->mode != MPOL_WEIGHTED_INTERLEAVE && (!nodemask || node_isset(nid, *nodemask))) { /* * First, try to allocate THP only on local node, but @@ -2275,6 +2365,114 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, return total_allocated; } +static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, + struct mempolicy *pol, unsigned long nr_pages, + struct page **page_array) +{ + struct task_struct *me = current; + unsigned long total_allocated = 0; + unsigned long nr_allocated = 0; + unsigned long rounds; + unsigned long node_pages, delta; + u8 *table, *weights, weight; + unsigned int weight_total = 0; + unsigned long rem_pages = nr_pages; + nodemask_t nodes; + int nnodes, node; + int resume_node = MAX_NUMNODES - 1; + u8 resume_weight = 0; + int prev_node; + int i; + + if (!nr_pages) + return 0; + + nnodes = read_once_policy_nodemask(pol, &nodes); + if (!nnodes) + return 0; + + /* Continue allocating from most recent node and adjust the nr_pages */ + node = me->il_prev; + weight = me->il_weight; + if (weight && node_isset(node, nodes)) { + node_pages = min(rem_pages, weight); + nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, + NULL, page_array); + page_array += nr_allocated; + total_allocated += nr_allocated; + /* if that's all the pages, no need to interleave */ + if (rem_pages <= weight) { + me->il_weight -= rem_pages; + return total_allocated; + } + /* Otherwise we adjust remaining pages, continue from there */ + rem_pages -= weight; + } + /* clear active weight in case of an allocation failure */ + me->il_weight = 0; + prev_node = node; + + /* create a local copy of node weights to operate on outside rcu */ + weights = kzalloc(nr_node_ids, GFP_KERNEL); + if (!weights) + return total_allocated; + + rcu_read_lock(); + table = rcu_dereference(iw_table); + if (table) + memcpy(weights, table, nr_node_ids); + rcu_read_unlock(); + + /* calculate total, detect system default usage */ + for_each_node_mask(node, nodes) { + if (!weights[node]) + weights[node] = 1; + weight_total += weights[node]; + } + + /* + * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls. + * Track which node weighted interleave should resume from. + * + * if (rounds > 0) and (delta == 0), resume_node will always be + * the node following prev_node and its weight. 
+ */ + rounds = rem_pages / weight_total; + delta = rem_pages % weight_total; + resume_node = next_node_in(prev_node, nodes); + resume_weight = weights[resume_node]; + for (i = 0; i < nnodes; i++) { + node = next_node_in(prev_node, nodes); + weight = weights[node]; + node_pages = weight * rounds; + /* If a delta exists, add this node's portion of the delta */ + if (delta > weight) { + node_pages += weight; + delta -= weight; + } else if (delta) { + /* when delta is depleted, resume from that node */ + node_pages += delta; + resume_node = node; + resume_weight = weight - delta; + delta = 0; + } + /* node_pages can be 0 if an allocation fails and rounds == 0 */ + if (!node_pages) + break; + nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, + NULL, page_array); + page_array += nr_allocated; + total_allocated += nr_allocated; + if (total_allocated == nr_pages) + break; + prev_node = node; + } + me->il_prev = resume_node; + me->il_weight = resume_weight; + kfree(weights); + return total_allocated; +} + static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) @@ -2315,6 +2513,10 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, return alloc_pages_bulk_array_interleave(gfp, pol, nr_pages, page_array); + if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) + return alloc_pages_bulk_array_weighted_interleave( + gfp, pol, nr_pages, page_array); + if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_bulk_array_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); @@ -2390,6 +2592,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: + case MPOL_WEIGHTED_INTERLEAVE: return !!nodes_equal(a->nodes, b->nodes); case MPOL_LOCAL: return true; @@ -2526,6 +2729,10 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, polnid = interleave_nid(pol, ilx); break; + case MPOL_WEIGHTED_INTERLEAVE: + polnid = weighted_interleave_nid(pol, ilx); + break; + case MPOL_PREFERRED: if (node_isset(curnid, pol->nodes)) goto out; @@ -2900,6 +3107,7 @@ static const char * const policy_modes[] = [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", + [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave", [MPOL_LOCAL] = "local", [MPOL_PREFERRED_MANY] = "prefer (many)", }; @@ -2959,6 +3167,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) } break; case MPOL_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: /* * Default to online nodes with memory if no nodelist */ @@ -3069,6 +3278,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: nodes = pol->nodes; break; default: From 9ef2a5c8f57c8479ae240f7947957bc34821c937 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Fri, 2 Feb 2024 12:02:38 -0500 Subject: [PATCH 1099/1406] mm/mempolicy: protect task interleave functions with tsk->mems_allowed_seq In the event of rebind, pol->nodemask can change at the same time as an allocation occurs. We can detect this with tsk->mems_allowed_seq and prevent a miscount or an allocation failure from occurring. The same thing happens in the allocators to detect failure, but this can prevent spurious failures in a much smaller critical section. 
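The mems_allowed_seq cookie used below follows the standard seqcount read-side pattern: snapshot the sequence, read the data, and retry if the sequence changed. A rough userspace caricature of the idiom (illustrative only: a real seqcount adds the required memory barriers, and the plain data reads here are formally racy):

        #include <stdatomic.h>
        #include <stdio.h>

        static atomic_uint seq;         /* even: stable, odd: write in progress */
        static int mask[2];             /* stand-in for pol->nodes */

        static void writer_rebind(int a, int b)
        {
                atomic_fetch_add(&seq, 1);      /* begin write: seq goes odd */
                mask[0] = a;
                mask[1] = b;
                atomic_fetch_add(&seq, 1);      /* end write: seq even again */
        }

        static void reader_snapshot(int snap[2])
        {
                unsigned int cookie;

                do {
                        while ((cookie = atomic_load(&seq)) & 1)
                                ;               /* writer active: wait */
                        snap[0] = mask[0];      /* reads made under the cookie */
                        snap[1] = mask[1];
                } while (atomic_load(&seq) != cookie);  /* changed? retry */
        }

        int main(void)
        {
                int snap[2];

                writer_rebind(1, 3);
                reader_snapshot(snap);
                printf("%d %d\n", snap[0], snap[1]);    /* always a consistent pair */
                return 0;
        }

read_mems_allowed_begin() corresponds to taking the cookie and read_mems_allowed_retry() to the final comparison; the retry loops in the diff below have exactly this shape.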
Link: https://lkml.kernel.org/r/20240202170238.90004-5-gregory.price@memverge.com Signed-off-by: Gregory Price Suggested-by: "Huang, Ying" Cc: Dan Williams Cc: Hasan Al Maruf Cc: Honggyu Kim Cc: Hyeongtak Ji Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Cc: Rakie Kim Cc: Ravi Jonnalagadda Cc: Srinivasulu Thanneeru Signed-off-by: Andrew Morton --- mm/mempolicy.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a8db92c236974d..093ffd792d4e8a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1874,11 +1874,17 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) static unsigned int weighted_interleave_nodes(struct mempolicy *policy) { - unsigned int node = current->il_prev; - - if (!current->il_weight || !node_isset(node, policy->nodes)) { + unsigned int node; + unsigned int cpuset_mems_cookie; + +retry: + /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */ + cpuset_mems_cookie = read_mems_allowed_begin(); + node = current->il_prev; + if (!node || !node_isset(node, policy->nodes)) { node = next_node_in(node, policy->nodes); - /* can only happen if nodemask is being rebound */ + if (read_mems_allowed_retry(cpuset_mems_cookie)) + goto retry; if (node == MAX_NUMNODES) return node; current->il_prev = node; @@ -1892,8 +1898,14 @@ static unsigned int weighted_interleave_nodes(struct mempolicy *policy) static unsigned int interleave_nodes(struct mempolicy *policy) { unsigned int nid; + unsigned int cpuset_mems_cookie; + + /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */ + do { + cpuset_mems_cookie = read_mems_allowed_begin(); + nid = next_node_in(current->il_prev, policy->nodes); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); - nid = next_node_in(current->il_prev, policy->nodes); if (nid < MAX_NUMNODES) current->il_prev = nid; return nid; @@ -2370,6 +2382,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, struct page **page_array) { struct task_struct *me = current; + unsigned int cpuset_mems_cookie; unsigned long total_allocated = 0; unsigned long nr_allocated = 0; unsigned long rounds; @@ -2387,7 +2400,13 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, if (!nr_pages) return 0; - nnodes = read_once_policy_nodemask(pol, &nodes); + /* read the nodes onto the stack, retry if done during rebind */ + do { + cpuset_mems_cookie = read_mems_allowed_begin(); + nnodes = read_once_policy_nodemask(pol, &nodes); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); + + /* if the nodemask has become invalid, we cannot do anything */ if (!nnodes) return 0; From d7a0e598712c9812e8075616eb48ead5645e222d Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Tue, 6 Feb 2024 14:28:53 -0500 Subject: [PATCH 1100/1406] mm/mempolicy: weighted interleave checks wrong parameter weighted interleave presently checks (!node) when it should check (!il_weight). This causes a wrong distribution of memory. 
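The effect is easy to see with a toy model of weighted_interleave_nodes() - purely illustrative, with two nodes, next_node_in() reduced to modulo arithmetic, and no nodemask or rebind handling:

        #include <stdio.h>

        /* One allocation step; 'buggy' selects the (!node) check. */
        static int pick(int *prev, int *weight, const int *w, int buggy)
        {
                int node = *prev;
                int exhausted = buggy ? (node == 0)     /* the (!node) check */
                                      : (*weight == 0); /* the (!il_weight) check */

                if (exhausted) {
                        node = (node + 1) % 2;          /* next_node_in() */
                        *prev = node;
                        *weight = w[node];
                }
                (*weight)--;    /* note: the kernel's il_weight is a u8 and would wrap */
                return node;
        }

        int main(void)
        {
                const int w[2] = { 2, 1 };
                int variant, i;

                for (variant = 0; variant <= 1; variant++) {
                        int prev = 1, weight = 0;       /* state after a policy reset */

                        printf(variant ? "buggy: " : "fixed: ");
                        for (i = 0; i < 6; i++)
                                printf("%d ", pick(&prev, &weight, w, variant));
                        printf("\n");
                }
                return 0;
        }

This prints "fixed: 0 0 1 0 0 1" but "buggy: 1 1 1 1 1 1": the (!node) check conflates "previous node is node 0" with "weight exhausted", so node 0 forces an advance on every allocation while any other node never does, and the weights are never honored.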
Link: https://lkml.kernel.org/r/20240206192853.3589-1-gregory.price@memverge.com Signed-off-by: Gregory Price Reported-by: "Huang, Ying" Cc: Dan Williams Cc: Hasan Al Maruf Cc: Honggyu Kim Cc: Hyeongtak Ji Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Cc: Rakie Kim Cc: Ravi Jonnalagadda Cc: Srinivasulu Thanneeru Signed-off-by: Andrew Morton --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 093ffd792d4e8a..56f9a6ed939adf 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1881,7 +1881,7 @@ static unsigned int weighted_interleave_nodes(struct mempolicy *policy) /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */ cpuset_mems_cookie = read_mems_allowed_begin(); node = current->il_prev; - if (!node || !node_isset(node, policy->nodes)) { + if (!current->il_weight || !node_isset(node, policy->nodes)) { node = next_node_in(node, policy->nodes); if (read_mems_allowed_retry(cpuset_mems_cookie)) goto retry; From ccfdb251aac09d1028b7d179cd1e7186077f24b8 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Jan 2024 11:34:32 +0100 Subject: [PATCH 1101/1406] arm: ptdump: rename CONFIG_DEBUG_WX to CONFIG_ARM_DEBUG_WX Patch series "mm: ptdump: Refactor CONFIG_DEBUG_WX and check_wx_pages debugfs attribute", v2. This series refactors CONFIG_DEBUG_WX for the 5 architectures implementing CONFIG_GENERIC_PTDUMP. First, rename stuff in ARM which uses similar names while not implementing CONFIG_GENERIC_PTDUMP. Then define a generic version of debug_checkwx() that calls ptdump_check_wx() when CONFIG_DEBUG_WX is set. Call it immediately after calling mark_rodata_ro() instead of calling it at the end of every mark_rodata_ro(). Then implement a debugfs attribute that can be used to trigger a W^X test at any time, regardless of CONFIG_DEBUG_WX. CONFIG_DEBUG_WX is a core option defined in mm/Kconfig.debug. To avoid any future conflict, rename the ARM version to CONFIG_ARM_DEBUG_WX. Link: https://lore.kernel.org/lkml/20200422152656.GF676@willie-the-truck/T/#m802eaf33efd6f8d575939d157301b35ac0d4a64f Link: https://github.com/KSPP/linux/issues/35 Link: https://lkml.kernel.org/r/cover.1706610398.git.christophe.leroy@csgroup.eu Link: https://lkml.kernel.org/r/fa297aa90caeb61eee2b70c6c5897a2ab58a9562.1706610398.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Cc: Albert Ou Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V (IBM)" Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: Gerald Schaefer Cc: Greg KH Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Mark Rutland Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Phong Tran Cc: Russell King Cc: Steven Price Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Cc: Alexandre Ghiti Signed-off-by: Andrew Morton --- arch/arm/Kconfig.debug | 2 +- arch/arm/configs/aspeed_g4_defconfig | 2 +- arch/arm/configs/aspeed_g5_defconfig | 2 +- arch/arm/include/asm/ptdump.h | 6 +++--- arch/arm/mm/init.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug index 5fbbac1b708b0a..f1fc278081d035 100644 --- a/arch/arm/Kconfig.debug +++ b/arch/arm/Kconfig.debug @@ -17,7 +17,7 @@ config ARM_PTDUMP_DEBUGFS kernel.
If in doubt, say "N" -config DEBUG_WX +config ARM_DEBUG_WX bool "Warn on W+X mappings at boot" depends on MMU select ARM_PTDUMP_CORE diff --git a/arch/arm/configs/aspeed_g4_defconfig b/arch/arm/configs/aspeed_g4_defconfig index b3dc0465796f9a..28b724d59e7e23 100644 --- a/arch/arm/configs/aspeed_g4_defconfig +++ b/arch/arm/configs/aspeed_g4_defconfig @@ -252,7 +252,7 @@ CONFIG_DEBUG_INFO_REDUCED=y CONFIG_GDB_SCRIPTS=y CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_FS=y -CONFIG_DEBUG_WX=y +CONFIG_ARM_DEBUG_WX=y CONFIG_SCHED_STACK_END_CHECK=y CONFIG_PANIC_ON_OOPS=y CONFIG_PANIC_TIMEOUT=-1 diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig index 3fdf4dbfdea5db..61cee1e7ebea61 100644 --- a/arch/arm/configs/aspeed_g5_defconfig +++ b/arch/arm/configs/aspeed_g5_defconfig @@ -302,7 +302,7 @@ CONFIG_DEBUG_INFO_REDUCED=y CONFIG_GDB_SCRIPTS=y CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_FS=y -CONFIG_DEBUG_WX=y +CONFIG_ARM_DEBUG_WX=y CONFIG_SCHED_STACK_END_CHECK=y CONFIG_PANIC_ON_OOPS=y CONFIG_PANIC_TIMEOUT=-1 diff --git a/arch/arm/include/asm/ptdump.h b/arch/arm/include/asm/ptdump.h index aad1d034136cea..46a4575146ee85 100644 --- a/arch/arm/include/asm/ptdump.h +++ b/arch/arm/include/asm/ptdump.h @@ -32,10 +32,10 @@ void ptdump_check_wx(void); #endif /* CONFIG_ARM_PTDUMP_CORE */ -#ifdef CONFIG_DEBUG_WX -#define debug_checkwx() ptdump_check_wx() +#ifdef CONFIG_ARM_DEBUG_WX +#define arm_debug_checkwx() ptdump_check_wx() #else -#define debug_checkwx() do { } while (0) +#define arm_debug_checkwx() do { } while (0) #endif #endif /* __ASM_PTDUMP_H */ diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index a42e4cd11db294..4c3d78691279d3 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -458,7 +458,7 @@ static int __mark_rodata_ro(void *unused) void mark_rodata_ro(void) { stop_machine(__mark_rodata_ro, NULL, NULL); - debug_checkwx(); + arm_debug_checkwx(); } #else From 4728b74d19926810e2845fe735869aece5ad28ee Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Jan 2024 11:34:33 +0100 Subject: [PATCH 1102/1406] arm64, powerpc, riscv, s390, x86: ptdump: refactor CONFIG_DEBUG_WX All architectures using the core ptdump functionality also implement CONFIG_DEBUG_WX, and they all do it more or less the same way, with a function called debug_checkwx() that is called by mark_rodata_ro(), which is a substitute for ptdump_check_wx() when CONFIG_DEBUG_WX is set and a no-op otherwise. Refactor by centrally defining debug_checkwx() in linux/ptdump.h and calling debug_checkwx() immediately after calling mark_rodata_ro() instead of calling it at the end of every mark_rodata_ro(). On x86_32, mark_rodata_ro() first checks that __supported_pte_mask has _PAGE_NX before calling debug_checkwx(). Now the check is inside the callee ptdump_walk_pgd_level_checkwx(). On powerpc_64, mark_rodata_ro() bails out early before calling ptdump_check_wx() when the MMU doesn't have the KERNEL_RO feature. The check is now also done in ptdump_check_wx() as it is called outside mark_rodata_ro(). Link: https://lkml.kernel.org/r/a59b102d7964261d31ead0316a9f18628e4e7a8e.1706610398.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Reviewed-by: Alexandre Ghiti Cc: Albert Ou Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V (IBM)" Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: Gerald Schaefer Cc: Greg KH Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Mark Rutland Cc: Michael Ellerman Cc: "Naveen N.
Rao" Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Phong Tran Cc: Russell King Cc: Steven Price Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/ptdump.h | 7 ------- arch/arm64/mm/mmu.c | 2 -- arch/powerpc/mm/mmu_decl.h | 6 ------ arch/powerpc/mm/pgtable_32.c | 4 ---- arch/powerpc/mm/pgtable_64.c | 3 --- arch/powerpc/mm/ptdump/ptdump.c | 3 +++ arch/riscv/include/asm/ptdump.h | 22 ---------------------- arch/riscv/mm/init.c | 3 --- arch/riscv/mm/ptdump.c | 1 - arch/s390/include/asm/ptdump.h | 14 -------------- arch/s390/mm/dump_pagetables.c | 1 - arch/s390/mm/init.c | 2 -- arch/x86/include/asm/pgtable.h | 3 +-- arch/x86/mm/dump_pagetables.c | 3 +++ arch/x86/mm/init_32.c | 2 -- arch/x86/mm/init_64.c | 2 -- include/linux/ptdump.h | 7 +++++++ init/main.c | 2 ++ 18 files changed, 16 insertions(+), 71 deletions(-) delete mode 100644 arch/riscv/include/asm/ptdump.h delete mode 100644 arch/s390/include/asm/ptdump.h diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h index 581caac525b03a..5b1701c76d1cec 100644 --- a/arch/arm64/include/asm/ptdump.h +++ b/arch/arm64/include/asm/ptdump.h @@ -29,13 +29,6 @@ void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name); static inline void ptdump_debugfs_register(struct ptdump_info *info, const char *name) { } #endif -void ptdump_check_wx(void); #endif /* CONFIG_PTDUMP_CORE */ -#ifdef CONFIG_DEBUG_WX -#define debug_checkwx() ptdump_check_wx() -#else -#define debug_checkwx() do { } while (0) -#endif - #endif /* __ASM_PTDUMP_H */ diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 1ac7467d34c9c3..3a27d887f7dd71 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -632,8 +632,6 @@ void mark_rodata_ro(void) section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata; update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata, section_size, PAGE_KERNEL_RO); - - debug_checkwx(); } static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end, diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 72341b9fb5521f..90dcc284405629 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -171,12 +171,6 @@ static inline void mmu_mark_rodata_ro(void) { } void __init mmu_mapin_immr(void); #endif -#ifdef CONFIG_DEBUG_WX -void ptdump_check_wx(void); -#else -static inline void ptdump_check_wx(void) { } -#endif - static inline bool debug_pagealloc_enabled_or_kfence(void) { return IS_ENABLED(CONFIG_KFENCE) || debug_pagealloc_enabled(); diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 5c02fd08d61eff..12498017da8e43 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -153,7 +153,6 @@ void mark_rodata_ro(void) if (v_block_mapped((unsigned long)_stext + 1)) { mmu_mark_rodata_ro(); - ptdump_check_wx(); return; } @@ -166,9 +165,6 @@ void mark_rodata_ro(void) PFN_DOWN((unsigned long)_stext); set_memory_ro((unsigned long)_stext, numpages); - - // mark_initmem_nx() should have already run by now - ptdump_check_wx(); } #endif diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 5ac1fd30341bb2..1b366526f4f21e 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -150,9 +150,6 @@ void mark_rodata_ro(void) radix__mark_rodata_ro(); else hash__mark_rodata_ro(); - - // mark_initmem_nx() should have already run by now 
- ptdump_check_wx(); } void mark_initmem_nx(void) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 2313053fe679ed..620d4917ebe8a6 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -343,6 +343,9 @@ void ptdump_check_wx(void) } }; + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !mmu_has_feature(MMU_FTR_KERNEL_RO)) + return; + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); if (st.wx_pages) diff --git a/arch/riscv/include/asm/ptdump.h b/arch/riscv/include/asm/ptdump.h deleted file mode 100644 index 3c9ea6dd5af7eb..00000000000000 --- a/arch/riscv/include/asm/ptdump.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2019 SiFive - */ - -#ifndef _ASM_RISCV_PTDUMP_H -#define _ASM_RISCV_PTDUMP_H - -void ptdump_check_wx(void); - -#ifdef CONFIG_DEBUG_WX -static inline void debug_checkwx(void) -{ - ptdump_check_wx(); -} -#else -static inline void debug_checkwx(void) -{ -} -#endif - -#endif /* _ASM_RISCV_PTDUMP_H */ diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index eea8adae058be2..b5ffb2ef54ad22 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include @@ -723,8 +722,6 @@ void mark_rodata_ro(void) if (IS_ENABLED(CONFIG_64BIT)) set_kernel_memory(lm_alias(__start_rodata), lm_alias(_data), set_memory_ro); - - debug_checkwx(); } #else static __init pgprot_t pgprot_from_va(uintptr_t va) diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c index 657c27bc07a769..07526560331366 100644 --- a/arch/riscv/mm/ptdump.c +++ b/arch/riscv/mm/ptdump.c @@ -9,7 +9,6 @@ #include #include -#include #include #include diff --git a/arch/s390/include/asm/ptdump.h b/arch/s390/include/asm/ptdump.h deleted file mode 100644 index f960b2896606a1..00000000000000 --- a/arch/s390/include/asm/ptdump.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _ASM_S390_PTDUMP_H -#define _ASM_S390_PTDUMP_H - -void ptdump_check_wx(void); - -static inline void debug_checkwx(void) -{ - if (IS_ENABLED(CONFIG_DEBUG_WX)) - ptdump_check_wx(); -} - -#endif /* _ASM_S390_PTDUMP_H */ diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index d37a8f607b7188..8dcb4e0c71bde6 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 8d9a60ccb7771a..f6391442c0c2ad 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -109,7 +108,6 @@ void mark_rodata_ro(void) __set_memory_ro(__start_ro_after_init, __end_ro_after_init); pr_info("Write protected read-only-after-init data: %luk\n", size >> 10); - debug_checkwx(); } int set_memory_encrypted(unsigned long vaddr, int numpages) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 9d077bca6a103e..6c979028e5212f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -32,6 +32,7 @@ void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm); void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, bool user); void ptdump_walk_pgd_level_checkwx(void); +#define ptdump_check_wx ptdump_walk_pgd_level_checkwx void ptdump_walk_user_pgd_level_checkwx(void); /* @@ -41,10 +42,8 @@ void ptdump_walk_user_pgd_level_checkwx(void); 
#define pgprot_decrypted(prot) __pgprot(cc_mkdec(pgprot_val(prot))) #ifdef CONFIG_DEBUG_WX -#define debug_checkwx() ptdump_walk_pgd_level_checkwx() #define debug_checkwx_user() ptdump_walk_user_pgd_level_checkwx() #else -#define debug_checkwx() do { } while (0) #define debug_checkwx_user() do { } while (0) #endif diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index e1b599ecbbc26d..0008524eebe9af 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -433,6 +433,9 @@ void ptdump_walk_user_pgd_level_checkwx(void) void ptdump_walk_pgd_level_checkwx(void) { + if (!(__supported_pte_mask & _PAGE_NX)) + return; + ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false); } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index b63403d7179df4..5c736b707caea0 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -800,6 +800,4 @@ void mark_rodata_ro(void) set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); #endif mark_nxdata_nx(); - if (__supported_pte_mask & _PAGE_NX) - debug_checkwx(); } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a0dffaca6d2bfc..ebdbcae48011d4 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1412,8 +1412,6 @@ void mark_rodata_ro(void) (void *)text_end, (void *)rodata_start); free_kernel_image_pages("unused kernel image (rodata/data gap)", (void *)rodata_end, (void *)_sdata); - - debug_checkwx(); } /* diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index 2a3a955864259a..c10513739bf951 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -19,5 +19,12 @@ struct ptdump_state { }; void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd); +void ptdump_check_wx(void); + +static inline void debug_checkwx(void) +{ + if (IS_ENABLED(CONFIG_DEBUG_WX)) + ptdump_check_wx(); +} #endif /* _LINUX_PTDUMP_H */ diff --git a/init/main.c b/init/main.c index e24b0780fdff7a..749a9f8d2c9b0d 100644 --- a/init/main.c +++ b/init/main.c @@ -99,6 +99,7 @@ #include #include #include +#include #include #include @@ -1408,6 +1409,7 @@ static void mark_readonly(void) */ rcu_barrier(); mark_rodata_ro(); + debug_checkwx(); rodata_test(); } else pr_info("Kernel memory protection disabled.\n"); From 4c3783260b00cd6604c2dc0d5144a86937960442 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Jan 2024 11:34:34 +0100 Subject: [PATCH 1103/1406] powerpc,s390: ptdump: define ptdump_check_wx() regardless of CONFIG_DEBUG_WX Following patch will use ptdump_check_wx() regardless of CONFIG_DEBUG_WX, so define it at all times on powerpc and s390 just like other architectures. Though keep the WARN_ON_ONCE() only when CONFIG_DEBUG_WX is set. Link: https://lkml.kernel.org/r/07bfb04c7fec58e84413e91d2533581be357a696.1706610398.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V (IBM)" Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: Gerald Schaefer Cc: Greg KH Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Mark Rutland Cc: Michael Ellerman Cc: "Naveen N. 
Rao" Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Phong Tran Cc: Russell King Cc: Steven Price Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/mm/ptdump/ptdump.c | 7 +++---- arch/s390/mm/dump_pagetables.c | 7 ++----- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 620d4917ebe8a6..b835c80371cd28 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -184,13 +184,14 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) { pte_t pte = __pte(st->current_flags); - if (!IS_ENABLED(CONFIG_DEBUG_WX) || !st->check_wx) + if (!st->check_wx) return; if (!pte_write(pte) || !pte_exec(pte)) return; - WARN_ONCE(1, "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n", + WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX), + "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n", (void *)st->start_address, (void *)st->start_address); st->wx_pages += (addr - st->start_address) / PAGE_SIZE; @@ -326,7 +327,6 @@ static void __init build_pgtable_complete_mask(void) pg_level[i].mask |= pg_level[i].flag[j].mask; } -#ifdef CONFIG_DEBUG_WX void ptdump_check_wx(void) { struct pg_state st = { @@ -354,7 +354,6 @@ void ptdump_check_wx(void) else pr_info("Checked W+X mappings: passed, no W+X pages found\n"); } -#endif static int __init ptdump_init(void) { diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 8dcb4e0c71bde6..99da5a5602a8ae 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -121,7 +121,6 @@ static void print_prot(struct seq_file *m, unsigned int pr, int level) static void note_prot_wx(struct pg_state *st, unsigned long addr) { -#ifdef CONFIG_DEBUG_WX if (!st->check_wx) return; if (st->current_prot & _PAGE_INVALID) @@ -138,10 +137,10 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) */ if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear))) return; - WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n", + WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX), + "s390/mm: Found insecure W+X mapping at address %pS\n", (void *)st->start_address); st->wx_pages += (addr - st->start_address) / PAGE_SIZE; -#endif /* CONFIG_DEBUG_WX */ } static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) @@ -193,7 +192,6 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } } -#ifdef CONFIG_DEBUG_WX void ptdump_check_wx(void) { struct pg_state st = { @@ -226,7 +224,6 @@ void ptdump_check_wx(void) (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ? "unexpected " : ""); } -#endif /* CONFIG_DEBUG_WX */ #ifdef CONFIG_PTDUMP_DEBUGFS static int ptdump_show(struct seq_file *m, void *v) From 2e6e3dbafb81cdcd2f3567cbe71e53490d1aaab5 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Jan 2024 11:34:35 +0100 Subject: [PATCH 1104/1406] mm: ptdump: have ptdump_check_wx() return bool Have ptdump_check_wx() return true when the check is successful or false otherwise. 
Link: https://lkml.kernel.org/r/7943149fe955458cb7b57cd483bf41a3aad94684.1706610398.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V (IBM)" Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: Gerald Schaefer Cc: Greg KH Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Mark Rutland Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Phong Tran Cc: Russell King Cc: Steven Price Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/mm/ptdump.c | 11 ++++++++--- arch/powerpc/mm/ptdump/ptdump.c | 13 +++++++++---- arch/riscv/mm/ptdump.c | 11 ++++++++--- arch/s390/mm/dump_pagetables.c | 13 +++++++++---- arch/x86/include/asm/pgtable.h | 2 +- arch/x86/mm/dump_pagetables.c | 19 ++++++++++++------- include/linux/ptdump.h | 2 +- 7 files changed, 48 insertions(+), 23 deletions(-) diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index e305b6593c4e23..696822f755827e 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -345,7 +345,7 @@ static struct ptdump_info kernel_ptdump_info = { .base_addr = PAGE_OFFSET, }; -void ptdump_check_wx(void) +bool ptdump_check_wx(void) { struct pg_state st = { .seq = NULL, @@ -366,11 +366,16 @@ void ptdump_check_wx(void) ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); - if (st.wx_pages || st.uxn_pages) + if (st.wx_pages || st.uxn_pages) { pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n", st.wx_pages, st.uxn_pages); - else + + return false; + } else { pr_info("Checked W+X mappings: passed, no W+X pages found\n"); + + return true; + } } static int __init ptdump_init(void) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index b835c80371cd28..9dc239967b77f7 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -327,7 +327,7 @@ static void __init build_pgtable_complete_mask(void) pg_level[i].mask |= pg_level[i].flag[j].mask; } -void ptdump_check_wx(void) +bool ptdump_check_wx(void) { struct pg_state st = { .seq = NULL, @@ -344,15 +344,20 @@ void ptdump_check_wx(void) }; if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !mmu_has_feature(MMU_FTR_KERNEL_RO)) - return; + return true; ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); - if (st.wx_pages) + if (st.wx_pages) { pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages); - else + + return false; + } else { pr_info("Checked W+X mappings: passed, no W+X pages found\n"); + + return true; + } } static int __init ptdump_init(void) diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c index 07526560331366..1289cc6d3700cd 100644 --- a/arch/riscv/mm/ptdump.c +++ b/arch/riscv/mm/ptdump.c @@ -335,7 +335,7 @@ static void ptdump_walk(struct seq_file *s, struct ptd_mm_info *pinfo) ptdump_walk_pgd(&st.ptdump, pinfo->mm, NULL); } -void ptdump_check_wx(void) +bool ptdump_check_wx(void) { struct pg_state st = { .seq = NULL, @@ -356,11 +356,16 @@ void ptdump_check_wx(void) ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); - if (st.wx_pages) + if (st.wx_pages) { pr_warn("Checked W+X mappings: failed, %lu W+X pages found\n", st.wx_pages); - else + + return false; + } else { pr_info("Checked W+X mappings: passed, no W+X pages found\n"); + + return true; + } } static int ptdump_show(struct seq_file *m, void *v) diff 
--git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 99da5a5602a8ae..ffd07ed7b4af88 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -192,7 +192,7 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } } -void ptdump_check_wx(void) +bool ptdump_check_wx(void) { struct pg_state st = { .ptdump = { @@ -215,14 +215,19 @@ void ptdump_check_wx(void) }; if (!MACHINE_HAS_NX) - return; + return true; ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); - if (st.wx_pages) + if (st.wx_pages) { pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages); - else + + return false; + } else { pr_info("Checked W+X mappings: passed, no %sW+X pages found\n", (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ? "unexpected " : ""); + + return true; + } } #ifdef CONFIG_PTDUMP_DEBUGFS diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6c979028e5212f..b50b2ef63672f4 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -31,7 +31,7 @@ struct seq_file; void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm); void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, bool user); -void ptdump_walk_pgd_level_checkwx(void); +bool ptdump_walk_pgd_level_checkwx(void); #define ptdump_check_wx ptdump_walk_pgd_level_checkwx void ptdump_walk_user_pgd_level_checkwx(void); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 0008524eebe9af..c58c01f560fd87 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -362,7 +362,7 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } } -static void ptdump_walk_pgd_level_core(struct seq_file *m, +bool void ptdump_walk_pgd_level_core(struct seq_file *m, struct mm_struct *mm, pgd_t *pgd, bool checkwx, bool dmesg) { @@ -391,12 +391,17 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, ptdump_walk_pgd(&st.ptdump, mm, pgd); if (!checkwx) - return; - if (st.wx_pages) + return true; + if (st.wx_pages) { pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n", st.wx_pages); - else + + return false; + } else { pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n"); + + return true; + } } void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm) @@ -431,12 +436,12 @@ void ptdump_walk_user_pgd_level_checkwx(void) #endif } -void ptdump_walk_pgd_level_checkwx(void) +bool ptdump_walk_pgd_level_checkwx(void) { if (!(__supported_pte_mask & _PAGE_NX)) - return; + return true; - ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false); + return ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false); } static int __init pt_dump_init(void) diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index c10513739bf951..953b61696ccf7b 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -19,7 +19,7 @@ struct ptdump_state { }; void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd); -void ptdump_check_wx(void); +bool ptdump_check_wx(void); static inline void debug_checkwx(void) { From 6770f7bc4ae848885a3441b875f68153da94ed6e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 31 Jan 2024 01:13:01 -0800 Subject: [PATCH 1105/1406] mm-ptdump-have-ptdump_check_wx-return-bool-fix fix a couple of build issues (x86_64 allmodconfig) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andy Lutomirski 
Cc: "Aneesh Kumar K.V (IBM)" Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Dave Hansen Cc: Gerald Schaefer Cc: Greg KH Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Mark Rutland Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Phong Tran Cc: Russell King Cc: Steven Price Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/mm/dump_pagetables.c | 6 +++--- include/linux/ptdump.h | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index c58c01f560fd87..35b2cfd4791418 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -362,9 +362,9 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } } -bool void ptdump_walk_pgd_level_core(struct seq_file *m, - struct mm_struct *mm, pgd_t *pgd, - bool checkwx, bool dmesg) +bool ptdump_walk_pgd_level_core(struct seq_file *m, + struct mm_struct *mm, pgd_t *pgd, + bool checkwx, bool dmesg) { const struct ptdump_range ptdump_ranges[] = { #ifdef CONFIG_X86_64 diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index 953b61696ccf7b..8dbd51ea862678 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -18,6 +18,9 @@ struct ptdump_state { const struct ptdump_range *range; }; +bool ptdump_walk_pgd_level_core(struct seq_file *m, + struct mm_struct *mm, pgd_t *pgd, + bool checkwx, bool dmesg); void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd); bool ptdump_check_wx(void); From c004b314913b10025172a2d1728f58d7b923da7e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Jan 2024 11:34:36 +0100 Subject: [PATCH 1106/1406] mm: ptdump: add check_wx_pages debugfs attribute Add a readable attribute in debugfs to trigger a W^X pages check at any time. To trigger the test, just read /sys/kernel/debug/check_wx_pages It will report FAILED if the test failed, SUCCESS otherwise. Detailed result is provided into dmesg. Link: https://lkml.kernel.org/r/e947fb1a9f3f5466344823e532d343ff194ae03d.1706610398.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V (IBM)" Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: Gerald Schaefer Cc: Greg KH Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Mark Rutland Cc: Michael Ellerman Cc: "Naveen N. 
Rao" Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Phong Tran Cc: Russell King Cc: Steven Price Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/ptdump.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/mm/ptdump.c b/mm/ptdump.c index 03c1bdae4a4368..106e1d66e9f9ee 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include @@ -163,3 +164,24 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd) /* Flush out the last page */ st->note_page(st, 0, -1, 0); } + +static int check_wx_show(struct seq_file *m, void *v) +{ + if (ptdump_check_wx()) + seq_puts(m, "SUCCESS\n"); + else + seq_puts(m, "FAILED\n"); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(check_wx); + +static int ptdump_debugfs_init(void) +{ + debugfs_create_file("check_wx_pages", 0400, NULL, NULL, &check_wx_fops); + + return 0; +} + +device_initcall(ptdump_debugfs_init); From c5bbadf42399360ca1560b1f238f57e0fba869c0 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Mon, 29 Jan 2024 10:03:04 +0800 Subject: [PATCH 1107/1406] modules: wait do_free_init correctly commit 1a7b7d922081 ("modules: Use vmalloc special flag") moves do_free_init() into a global workqueue instead of call_rcu(). So now rcu_barrier() can not ensure that do_free_init has completed. We should wait it via flush_work(). Without this fix, we still could encounter false positive reports in W+X checking, and rcu synchronization is unnecessary. Link: https://lkml.kernel.org/r/20240129020304.1981372-1-changbin.du@huawei.com Fixes: 1a7b7d922081 ("modules: Use vmalloc special flag") Signed-off-by: Changbin Du Tested-by: Eric Chanudet Cc: Xiaoyi Su Cc: Luis Chamberlain Signed-off-by: Andrew Morton --- include/linux/moduleloader.h | 8 ++++++++ init/main.c | 5 +++-- kernel/module/main.c | 5 +++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 001b2ce83832ed..89b1e0ed981144 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -115,6 +115,14 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod); +#ifdef CONFIG_MODULES +void flush_module_init_free_work(void); +#else +static inline void flush_module_init_free_work(void) +{ +} +#endif + /* Any cleanup needed when module leaves. */ void module_arch_cleanup(struct module *mod); diff --git a/init/main.c b/init/main.c index 749a9f8d2c9b0d..504d417ab9f0f3 100644 --- a/init/main.c +++ b/init/main.c @@ -87,6 +87,7 @@ #include #include #include +#include #include #include #include @@ -1403,11 +1404,11 @@ static void mark_readonly(void) if (rodata_enabled) { /* * load_module() results in W+X mappings, which are cleaned - * up with call_rcu(). Let's make sure that queued work is + * up with init_free_wq. Let's make sure that queued work is * flushed so that we don't hit false positives looking for * insecure pages which are W+X. 
*/ - rcu_barrier(); + flush_module_init_free_work(); mark_rodata_ro(); debug_checkwx(); rodata_test(); diff --git a/kernel/module/main.c b/kernel/module/main.c index 36681911c05acd..ea66b5c2a2a157 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2489,6 +2489,11 @@ static void do_free_init(struct work_struct *w) } } +void flush_module_init_free_work(void) +{ + flush_work(&init_free_wq); +} + #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "module." /* Default value for module->async_probe_requested */ From ce15a24ca527b8f2d0c648a116330f5d2cc6d0a4 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Sat, 17 Feb 2024 16:18:10 +0800 Subject: [PATCH 1108/1406] modules: wait do_free_init correctly amend comment in do_init_module() and update commit msg Link: https://lkml.kernel.org/r/20240217081810.4155871-1-changbin.du@huawei.com Fixes: 1a7b7d922081 ("modules: Use vmalloc special flag") Signed-off-by: Changbin Du Tested-by: Eric Chanudet Cc: Xiaoyi Su Cc: Luis Chamberlain Signed-off-by: Andrew Morton --- kernel/module/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/module/main.c b/kernel/module/main.c index ea66b5c2a2a157..b0b99348e1a8b3 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2598,8 +2598,8 @@ static noinline int do_init_module(struct module *mod) * Note that module_alloc() on most architectures creates W+X page * mappings which won't be cleaned up until do_free_init() runs. Any * code such as mark_rodata_ro() which depends on those mappings to - * be cleaned up needs to sync with the queued work - ie - * rcu_barrier() + * be cleaned up needs to sync with the queued work by invoking + * flush_module_init_free_work(). */ if (llist_add(&freeinit->node, &init_free_list)) schedule_work(&init_free_wq); From 43d973c9e0d62c95e0bcaf3c7be6b4805b4693d8 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 1 Feb 2024 08:07:14 +0530 Subject: [PATCH 1109/1406] mm/cma: Don't treat bad input arguments for cma_alloc() as its failure Invalid cma_alloc() input scenarios - including an excess allocation request - should neither be counted as CMA_ALLOC_FAIL nor cause 'cma->nr_pages_failed' to be updated when CONFIG_CMA_SYSFS is enabled. This also drops the 'out' jump label, which has become redundant.
Link: https://lkml.kernel.org/r/20240201023714.3871061-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Kalesh Singh Cc: Minchan Kim Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- mm/cma.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index e12cf41d83549a..b6720930312df2 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -441,13 +441,13 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, trace_cma_alloc_start(name, count, align); if (!cma || !cma->count || !cma->bitmap) - goto out; + return page; pr_debug("%s(cma %p, name: %s, count %lu, align %d)\n", __func__, (void *)cma, cma->name, count, align); if (!count) - goto out; + return page; mask = cma_bitmap_aligned_mask(cma, align); offset = cma_bitmap_aligned_offset(cma, align); @@ -455,7 +455,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, bitmap_count = cma_bitmap_pages_to_bits(cma, count); if (bitmap_count > bitmap_maxno) - goto out; + return page; for (;;) { spin_lock_irq(&cma->lock); @@ -514,15 +514,13 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, } pr_debug("%s(): returned %p\n", __func__, page); -out: trace_cma_alloc_finish(name, pfn, page, count, align, ret); if (page) { count_vm_event(CMA_ALLOC_SUCCESS); cma_sysfs_account_success_pages(cma, count); } else { count_vm_event(CMA_ALLOC_FAIL); - if (cma) - cma_sysfs_account_fail_pages(cma, count); + cma_sysfs_account_fail_pages(cma, count); } return page; From 38abc42bfc246bc696a46db46af43177c6b22007 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 31 Jan 2024 14:51:24 -0800 Subject: [PATCH 1110/1406] test_xarray: add tests for advanced multi-index use Patch series "test_xarray: advanced API multi-index tests", v2. This is a respin of the test_xarray multi-index tests [0] which use and demonstrate the advanced API which is used by the page cache. This should let folks more easily follow how we use multi-index to support for example a min order later in the page cache. It also lets us grow the selftests to mimic more of what we do in the page cache. This patch (of 2): The multi index selftests are great but they don't replicate how we deal with the page cache exactly, which makes it a bit hard to follow as the page cache uses the advanced API. Add tests which use the advanced API, mimicking what we do in the page cache, while at it, extend the example to do what is needed for min order support. Link: https://lkml.kernel.org/r/20240131225125.1370598-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20240131225125.1370598-2-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Tested-by: Daniel Gomez Cc: Darrick J. 
Wong Cc: Dave Chinner Cc: Hannes Reinecke Cc: Matthew Wilcox Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- lib/test_xarray.c | 164 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) diff --git a/lib/test_xarray.c b/lib/test_xarray.c index e77d4856442c3f..8b23481f0e8ff4 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -674,6 +674,169 @@ static noinline void check_multi_store(struct xarray *xa) #endif } +#ifdef CONFIG_XARRAY_MULTI +/* mimics page cache __filemap_add_folio() */ +static noinline void check_xa_multi_store_adv_add(struct xarray *xa, + unsigned long index, + unsigned int order, + void *p) +{ + XA_STATE(xas, xa, index); + unsigned int nrpages = 1UL << order; + + /* users are responsible for index alignment to the order when adding */ + XA_BUG_ON(xa, index & (nrpages - 1)); + + xas_set_order(&xas, index, order); + + do { + xas_lock_irq(&xas); + + xas_store(&xas, p); + XA_BUG_ON(xa, xas_error(&xas)); + XA_BUG_ON(xa, xa_load(xa, index) != p); + + xas_unlock_irq(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + XA_BUG_ON(xa, xas_error(&xas)); +} + +/* mimics page_cache_delete() */ +static noinline void check_xa_multi_store_adv_del_entry(struct xarray *xa, + unsigned long index, + unsigned int order) +{ + XA_STATE(xas, xa, index); + + xas_set_order(&xas, index, order); + xas_store(&xas, NULL); + xas_init_marks(&xas); +} + +static noinline void check_xa_multi_store_adv_delete(struct xarray *xa, + unsigned long index, + unsigned int order) +{ + xa_lock_irq(xa); + check_xa_multi_store_adv_del_entry(xa, index, order); + xa_unlock_irq(xa); +} + +/* mimics page cache filemap_get_entry() */ +static noinline void *test_get_entry(struct xarray *xa, unsigned long index) +{ + XA_STATE(xas, xa, index); + void *p; + + rcu_read_lock(); +repeat: + xas_reset(&xas); + p = xas_load(&xas); + if (xas_retry(&xas, p)) + goto repeat; + rcu_read_unlock(); + + return p; +} + +static unsigned long some_val = 0xdeadbeef; +static unsigned long some_val_2 = 0xdeaddead; + +/* mimics the page cache usage */ +static noinline void check_xa_multi_store_adv(struct xarray *xa, + unsigned long pos, + unsigned int order) +{ + unsigned int nrpages = 1UL << order; + unsigned long index, base, next_index, next_next_index; + unsigned int i; + + index = pos >> PAGE_SHIFT; + base = round_down(index, nrpages); + next_index = round_down(base + nrpages, nrpages); + next_next_index = round_down(next_index + nrpages, nrpages); + + check_xa_multi_store_adv_add(xa, base, order, &some_val); + + for (i = 0; i < nrpages; i++) + XA_BUG_ON(xa, test_get_entry(xa, base + i) != &some_val); + + XA_BUG_ON(xa, test_get_entry(xa, next_index) != NULL); + + /* Use order 0 for the next item */ + check_xa_multi_store_adv_add(xa, next_index, 0, &some_val_2); + XA_BUG_ON(xa, test_get_entry(xa, next_index) != &some_val_2); + + /* Remove the next item */ + check_xa_multi_store_adv_delete(xa, next_index, 0); + + /* Now use order for a new pointer */ + check_xa_multi_store_adv_add(xa, next_index, order, &some_val_2); + + for (i = 0; i < nrpages; i++) + XA_BUG_ON(xa, test_get_entry(xa, next_index + i) != &some_val_2); + + check_xa_multi_store_adv_delete(xa, next_index, order); + check_xa_multi_store_adv_delete(xa, base, order); + XA_BUG_ON(xa, !xa_empty(xa)); + + /* starting fresh again */ + + /* let's test some holes now */ + + /* hole at base and next_next */ + check_xa_multi_store_adv_add(xa, next_index, order, &some_val_2); + + for (i = 0; i < nrpages; i++) + XA_BUG_ON(xa, test_get_entry(xa, base + i) !=
NULL); + + for (i = 0; i < nrpages; i++) + XA_BUG_ON(xa, test_get_entry(xa, next_index + i) != &some_val_2); + + for (i = 0; i < nrpages; i++) + XA_BUG_ON(xa, test_get_entry(xa, next_next_index + i) != NULL); + + check_xa_multi_store_adv_delete(xa, next_index, order); + XA_BUG_ON(xa, !xa_empty(xa)); + + /* hole at base and next */ + + check_xa_multi_store_adv_add(xa, next_next_index, order, &some_val_2); + + for (i = 0; i < nrpages; i++) + XA_BUG_ON(xa, test_get_entry(xa, base + i) != NULL); + + for (i = 0; i < nrpages; i++) + XA_BUG_ON(xa, test_get_entry(xa, next_index + i) != NULL); + + for (i = 0; i < nrpages; i++) + XA_BUG_ON(xa, test_get_entry(xa, next_next_index + i) != &some_val_2); + + check_xa_multi_store_adv_delete(xa, next_next_index, order); + XA_BUG_ON(xa, !xa_empty(xa)); +} +#endif + +static noinline void check_multi_store_advanced(struct xarray *xa) +{ +#ifdef CONFIG_XARRAY_MULTI + unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 20 : 1; + unsigned long end = ULONG_MAX/2; + unsigned long pos, i; + + /* + * About 117 million tests below. + */ + for (pos = 7; pos < end; pos = (pos * pos) + 564) { + for (i = 0; i < max_order; i++) { + check_xa_multi_store_adv(xa, pos, i); + check_xa_multi_store_adv(xa, pos + 157, i); + } + } +#endif +} + static noinline void check_xa_alloc_1(struct xarray *xa, unsigned int base) { int i; @@ -1804,6 +1967,7 @@ static int xarray_checks(void) check_reserve(&array); check_reserve(&xa0); check_multi_store(&array); + check_multi_store_advanced(&array); check_get_order(&array); check_xa_alloc(); check_find(&array); From 3f353920642d7695af697fd23966c2f9774981b2 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 16 Feb 2024 11:43:29 -0800 Subject: [PATCH 1111/1406] test_xarray: fix soft lockup for advanced-api tests The new advanced API tests want to vet that the xarray API is doing what it promises by manually iterating over a set of possible indexes on their own, and using a query operation which holds the RCU lock and then releases it. So they purposely do not use the helper loop options which xarray provides. Any loop which iterates over 1 million entries (which is possible with order 20, so emulating, say, a 4 GiB block size) just to RCU lock and unlock will eventually end up triggering a soft lockup on systems which don't preempt and have lock proving and RCU proving enabled. xarray users already use XA_CHECK_SCHED for loops which may take a long time. In our case we don't want to RCU unlock and lock, as the caller does that already, but rather just force a schedule every XA_CHECK_SCHED iterations, since the test deliberately does not trust xarray and instead verifies that it is doing the right thing.
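A minimal sketch of the pattern being applied, assuming a caller-managed walk in process context where sleeping is allowed (note that test_get_entry() drops the RCU lock before returning, which is what makes the schedule() call safe), would look like this; check_entry() is a hypothetical per-entry assertion, not a real helper:

    /*
     * Illustrative only, not part of the patch: a long manual index
     * walk that yields every XA_CHECK_SCHED iterations so the soft
     * lockup detector stays quiet on non-preemptible kernels.
     */
    unsigned long index;
    unsigned int loops = 0;

    for (index = 0; index < (1UL << 20); index++) {
        void *entry = test_get_entry(xa, index); /* takes and drops RCU */

        check_entry(entry);
        if (++loops % XA_CHECK_SCHED == 0)
            schedule();
    }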
Link: https://lkml.kernel.org/r/20240216194329.840555-1-mcgrof@kernel.org Reported-by: kernel test robot Closes: https://lkml.kernel.org/r/202402071613.70f28243-lkp@intel.com Signed-off-by: Luis Chamberlain Cc: Daniel Gomez Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- lib/test_xarray.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 8b23481f0e8ff4..95027e0d3700d1 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -728,6 +728,7 @@ static noinline void *test_get_entry(struct xarray *xa, unsigned long index) { XA_STATE(xas, xa, index); void *p; + static unsigned int i = 0; rcu_read_lock(); repeat: @@ -737,6 +738,17 @@ static noinline void *test_get_entry(struct xarray *xa, unsigned long index) goto repeat; rcu_read_unlock(); + /* + * This is not part of the page cache, this selftest is pretty + * aggressive and does not want to trust the xarray API but rather + * test it, and for order 20 (4 GiB block size) we can loop over + * more than a million entries which can cause a soft lockup. Page cache + * APIs won't be stupid, proper page cache APIs loop over the proper + * order so when using a larger order we skip shared entries. + */ + if (++i % XA_CHECK_SCHED == 0) + schedule(); + return p; } From 8fce0d4525759865bd0af5d6cf43eda64528e42e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 19 Feb 2024 18:23:18 -0800 Subject: [PATCH 1112/1406] test_xarray-fix-soft-lockup-for-advanced-api-tests-fix s/i/loops/, make non-static Cc: Luis Chamberlain Cc: Daniel Gomez Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- lib/test_xarray.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 95027e0d3700d1..97558774825c3b 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -728,7 +728,7 @@ static noinline void *test_get_entry(struct xarray *xa, unsigned long index) { XA_STATE(xas, xa, index); void *p; - static unsigned int i = 0; + unsigned int loops = 0; rcu_read_lock(); repeat: @@ -746,7 +746,7 @@ static noinline void *test_get_entry(struct xarray *xa, unsigned long index) * APIs won't be stupid, proper page cache APIs loop over the proper * order so when using a larger order we skip shared entries.
*/ - if (++i % XA_CHECK_SCHED == 0) + if (++loops % XA_CHECK_SCHED == 0) schedule(); return p; From dcc5f9b4304a12d5d0d1af520edc27fef21f972a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 20 Feb 2024 10:59:48 -0800 Subject: [PATCH 1113/1406] test_xarray-add-tests-for-advanced-multi-index-use-fix-fix-fix restore static storage for loop counter Cc: Daniel Gomez Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- lib/test_xarray.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 97558774825c3b..1050e9113d2a5b 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -728,7 +728,7 @@ static noinline void *test_get_entry(struct xarray *xa, unsigned long index) { XA_STATE(xas, xa, index); void *p; - unsigned int loops = 0; + static unsigned int loops = 0; rcu_read_lock(); repeat: From b396f83e3c324cdb89c9040a2c0fbb878bcac8c1 Mon Sep 17 00:00:00 2001 From: Daniel Gomez Date: Wed, 31 Jan 2024 14:51:25 -0800 Subject: [PATCH 1114/1406] XArray: add cmpxchg order test XArray multi-index entries do not keep track of the order stored once the entry is being marked as used with cmpxchg (conditionally replaced with NULL). Add a test to check the order is actually lost. The test also verifies the order and entries for all the tied indexes before and after the NULL replacement with xa_cmpxchg. Add another entry at 1 << order that keeps the node around and the order information for the NULL-entry after xa_cmpxchg. Link: https://lkml.kernel.org/r/20240131225125.1370598-3-mcgrof@kernel.org Signed-off-by: Daniel Gomez Signed-off-by: Luis Chamberlain Cc: Darrick J. Wong Cc: Dave Chinner Cc: Hannes Reinecke Cc: Matthew Wilcox Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- lib/test_xarray.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 1050e9113d2a5b..ebe2af2e072db3 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -423,6 +423,59 @@ static noinline void check_cmpxchg(struct xarray *xa) XA_BUG_ON(xa, !xa_empty(xa)); } +static noinline void check_cmpxchg_order(struct xarray *xa) +{ +#ifdef CONFIG_XARRAY_MULTI + void *FIVE = xa_mk_value(5); + unsigned int i, order = 3; + + XA_BUG_ON(xa, xa_store_order(xa, 0, order, FIVE, GFP_KERNEL)); + + /* Check entry FIVE has the order saved */ + XA_BUG_ON(xa, xa_get_order(xa, xa_to_value(FIVE)) != order); + + /* Check all the tied indexes have the same entry and order */ + for (i = 0; i < (1 << order); i++) { + XA_BUG_ON(xa, xa_load(xa, i) != FIVE); + XA_BUG_ON(xa, xa_get_order(xa, i) != order); + } + + /* Ensure that nothing is stored at index '1 << order' */ + XA_BUG_ON(xa, xa_load(xa, 1 << order) != NULL); + + /* + * Additionally, keep the node information and the order at + * '1 << order' + */ + XA_BUG_ON(xa, xa_store_order(xa, 1 << order, order, FIVE, GFP_KERNEL)); + for (i = (1 << order); i < (1 << order) + (1 << order) - 1; i++) { + XA_BUG_ON(xa, xa_load(xa, i) != FIVE); + XA_BUG_ON(xa, xa_get_order(xa, i) != order); + } + + /* Conditionally replace FIVE entry at index '0' with NULL */ + XA_BUG_ON(xa, xa_cmpxchg(xa, 0, FIVE, NULL, GFP_KERNEL) != FIVE); + + /* Verify the order is lost at FIVE (and old) entries */ + XA_BUG_ON(xa, xa_get_order(xa, xa_to_value(FIVE)) != 0); + + /* Verify the order and entries are lost in all the tied indexes */ + for (i = 0; i < (1 << order); i++) { + XA_BUG_ON(xa, xa_load(xa, i) != NULL); + XA_BUG_ON(xa, 
xa_get_order(xa, i) != 0); + } + + /* Verify node and order are kept at '1 << order' */ + for (i = (1 << order); i < (1 << order) + (1 << order) - 1; i++) { + XA_BUG_ON(xa, xa_load(xa, i) != FIVE); + XA_BUG_ON(xa, xa_get_order(xa, i) != order); + } + + xa_store_order(xa, 0, BITS_PER_LONG - 1, NULL, GFP_KERNEL); + XA_BUG_ON(xa, !xa_empty(xa)); +#endif +} + static noinline void check_reserve(struct xarray *xa) { void *entry; @@ -1976,6 +2029,7 @@ static int xarray_checks(void) check_xas_erase(&array); check_insert(&array); check_cmpxchg(&array); + check_cmpxchg_order(&array); check_reserve(&array); check_reserve(&xa0); check_multi_store(&array); From 432e4834006b4acd731a6ce3bd50bfd309aeee4a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 31 Jan 2024 09:56:18 -0800 Subject: [PATCH 1115/1406] userfaultfd: handle zeropage moves by UFFDIO_MOVE Current implementation of UFFDIO_MOVE fails to move zeropages and returns EBUSY when it encounters one. We can handle them by mapping a zeropage at the destination and clearing the mapping at the source. This is done both for ordinary and for huge zeropages. Link: https://lkml.kernel.org/r/20240131175618.2417291-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202401300107.U8iMAkTl-lkp@intel.com/ Cc: Alexander Viro Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Brian Geffon Cc: Christian Brauner Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jann Horn Cc: Kalesh Singh Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (IBM) Cc: Nicolas Geoffray Cc: Peter Xu Cc: Ryan Roberts Cc: Shuah Khan Cc: ZhangPeng Signed-off-by: Andrew Morton --- mm/huge_memory.c | 105 +++++++++++++++++++++++++++-------------------- mm/userfaultfd.c | 44 ++++++++++++++++---- 2 files changed, 98 insertions(+), 51 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f005f04247355f..016e20bd813eaf 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2200,13 +2200,18 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm } src_page = pmd_page(src_pmdval); - if (unlikely(!PageAnonExclusive(src_page))) { - spin_unlock(src_ptl); - return -EBUSY; - } - src_folio = page_folio(src_page); - folio_get(src_folio); + if (!is_huge_zero_pmd(src_pmdval)) { + if (unlikely(!PageAnonExclusive(src_page))) { + spin_unlock(src_ptl); + return -EBUSY; + } + + src_folio = page_folio(src_page); + folio_get(src_folio); + } else + src_folio = NULL; + spin_unlock(src_ptl); flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE); @@ -2214,19 +2219,22 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm src_addr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); - folio_lock(src_folio); + if (src_folio) { + folio_lock(src_folio); - /* - * split_huge_page walks the anon_vma chain without the page - * lock. Serialize against it with the anon_vma lock, the page - * lock is not enough. - */ - src_anon_vma = folio_get_anon_vma(src_folio); - if (!src_anon_vma) { - err = -EAGAIN; - goto unlock_folio; - } - anon_vma_lock_write(src_anon_vma); + /* + * split_huge_page walks the anon_vma chain without the page + * lock. Serialize against it with the anon_vma lock, the page + * lock is not enough. 
+ */ + src_anon_vma = folio_get_anon_vma(src_folio); + if (!src_anon_vma) { + err = -EAGAIN; + goto unlock_folio; + } + anon_vma_lock_write(src_anon_vma); + } else + src_anon_vma = NULL; dst_ptl = pmd_lockptr(mm, dst_pmd); double_pt_lock(src_ptl, dst_ptl); @@ -2235,45 +2243,54 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm err = -EAGAIN; goto unlock_ptls; } - if (folio_maybe_dma_pinned(src_folio) || - !PageAnonExclusive(&src_folio->page)) { - err = -EBUSY; - goto unlock_ptls; - } + if (src_folio) { + if (folio_maybe_dma_pinned(src_folio) || + !PageAnonExclusive(&src_folio->page)) { + err = -EBUSY; + goto unlock_ptls; + } - if (WARN_ON_ONCE(!folio_test_head(src_folio)) || - WARN_ON_ONCE(!folio_test_anon(src_folio))) { - err = -EBUSY; - goto unlock_ptls; - } + if (WARN_ON_ONCE(!folio_test_head(src_folio)) || + WARN_ON_ONCE(!folio_test_anon(src_folio))) { + err = -EBUSY; + goto unlock_ptls; + } - folio_move_anon_rmap(src_folio, dst_vma); - WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); + folio_move_anon_rmap(src_folio, dst_vma); + WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); - src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); - /* Folio got pinned from under us. Put it back and fail the move. */ - if (folio_maybe_dma_pinned(src_folio)) { - set_pmd_at(mm, src_addr, src_pmd, src_pmdval); - err = -EBUSY; - goto unlock_ptls; - } + src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); + /* Folio got pinned from under us. Put it back and fail the move. */ + if (folio_maybe_dma_pinned(src_folio)) { + set_pmd_at(mm, src_addr, src_pmd, src_pmdval); + err = -EBUSY; + goto unlock_ptls; + } - _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); - /* Follow mremap() behavior and treat the entry dirty after the move */ - _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); + _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); + /* Follow mremap() behavior and treat the entry dirty after the move */ + _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); + } else { + src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); + _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot); + } set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd); src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd); pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable); unlock_ptls: double_pt_unlock(src_ptl, dst_ptl); - anon_vma_unlock_write(src_anon_vma); - put_anon_vma(src_anon_vma); + if (src_anon_vma) { + anon_vma_unlock_write(src_anon_vma); + put_anon_vma(src_anon_vma); + } unlock_folio: /* unblock rmap walks */ - folio_unlock(src_folio); + if (src_folio) + folio_unlock(src_folio); mmu_notifier_invalidate_range_end(&range); - folio_put(src_folio); + if (src_folio) + folio_put(src_folio); return err; } #endif /* CONFIG_USERFAULTFD */ diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index ae80c37148290a..9cc93cc1330b1c 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -959,6 +959,33 @@ static int move_swap_pte(struct mm_struct *mm, return 0; } +static int move_zeropage_pte(struct mm_struct *mm, + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, + unsigned long dst_addr, unsigned long src_addr, + pte_t *dst_pte, pte_t *src_pte, + pte_t orig_dst_pte, pte_t orig_src_pte, + spinlock_t *dst_ptl, spinlock_t *src_ptl) +{ + pte_t zero_pte; + + double_pt_lock(dst_ptl, src_ptl); + if (!pte_same(ptep_get(src_pte), orig_src_pte) || + !pte_same(ptep_get(dst_pte), orig_dst_pte)) { + 
double_pt_unlock(dst_ptl, src_ptl); + return -EAGAIN; + } + + zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), + dst_vma->vm_page_prot)); + ptep_clear_flush(src_vma, src_addr, src_pte); + set_pte_at(mm, dst_addr, dst_pte, zero_pte); + double_pt_unlock(dst_ptl, src_ptl); + + return 0; +} + + /* * The mmap_lock for reading is held by the caller. Just move the page * from src_pmd to dst_pmd if possible, and return true if succeeded @@ -1041,6 +1068,14 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, } if (pte_present(orig_src_pte)) { + if (is_zero_pfn(pte_pfn(orig_src_pte))) { + err = move_zeropage_pte(mm, dst_vma, src_vma, + dst_addr, src_addr, dst_pte, src_pte, + orig_dst_pte, orig_src_pte, + dst_ptl, src_ptl); + goto out; + } + /* * Pin and lock both source folio and anon_vma. Since we are in * RCU read section, we can't block, so on contention have to @@ -1404,19 +1439,14 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, err = -ENOENT; break; } - /* Avoid moving zeropages for now */ - if (is_huge_zero_pmd(*src_pmd)) { - spin_unlock(ptl); - err = -EBUSY; - break; - } /* Check if we can move the pmd without splitting it. */ if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) || !pmd_none(dst_pmdval)) { struct folio *folio = pfn_folio(pmd_pfn(*src_pmd)); - if (!folio || !PageAnonExclusive(&folio->page)) { + if (!folio || (!is_huge_zero_page(&folio->page) && + !PageAnonExclusive(&folio->page))) { spin_unlock(ptl); err = -EBUSY; break; From 546a59d570236eb23b69804b42cbadf976ff0f22 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:08 +0500 Subject: [PATCH 1116/1406] selftests/mm: map_fixed_noreplace: conform test to TAP format output Patch series "conform tests to TAP format output", v2. This patch (of 12): Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. While at it, convert commenting style from // to /**/. 
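To show the shape this series converts the tests into, here is a minimal kselftest sketch; the helper names come from tools/testing/selftests/kselftest.h, while the mmap() check itself is just a placeholder test body:

    #include <sys/mman.h>
    #include <string.h>
    #include <errno.h>
    #include "../kselftest.h"

    int main(void)
    {
        void *p;

        ksft_print_header();  /* emits the TAP version line */
        ksft_set_plan(1);     /* declares how many results will follow */

        p = mmap(NULL, 4096, PROT_NONE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)  /* fatal setup errors abort the whole run */
            ksft_exit_fail_msg("mmap: %s\n", strerror(errno));

        ksft_test_result_pass("mmap() works\n");  /* prints "ok 1 ..." */
        munmap(p, 4096);
        ksft_finished();      /* prints totals, exits by pass/fail count */
    }

The key difference from the old style is that per-check status goes through ksft_test_result_*() so the TAP harness can count it, while unrecoverable setup failures use ksft_exit_fail_msg() instead of printf() plus a bare return code.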
Link: https://lkml.kernel.org/r/20240202113119.2047740-1-usama.anjum@collabora.com Link: https://lkml.kernel.org/r/20240202113119.2047740-2-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../selftests/mm/map_fixed_noreplace.c | 96 ++++++------------- 1 file changed, 31 insertions(+), 65 deletions(-) diff --git a/tools/testing/selftests/mm/map_fixed_noreplace.c b/tools/testing/selftests/mm/map_fixed_noreplace.c index 598159f3df1f24..b74813fdc95143 100644 --- a/tools/testing/selftests/mm/map_fixed_noreplace.c +++ b/tools/testing/selftests/mm/map_fixed_noreplace.c @@ -12,6 +12,7 @@ #include #include #include +#include "../kselftest.h" static void dump_maps(void) { @@ -28,15 +29,12 @@ static unsigned long find_base_addr(unsigned long size) flags = MAP_PRIVATE | MAP_ANONYMOUS; addr = mmap(NULL, size, PROT_NONE, flags, -1, 0); - if (addr == MAP_FAILED) { - printf("Error: couldn't map the space we need for the test\n"); - return 0; - } + if (addr == MAP_FAILED) + ksft_exit_fail_msg("Error: couldn't map the space we need for the test\n"); + + if (munmap(addr, size) != 0) + ksft_exit_fail_msg("Error: munmap failed\n"); - if (munmap(addr, size) != 0) { - printf("Error: couldn't map the space we need for the test\n"); - return 0; - } return (unsigned long)addr; } @@ -46,51 +44,39 @@ int main(void) unsigned long flags, addr, size, page_size; char *p; + ksft_print_header(); + ksft_set_plan(9); + page_size = sysconf(_SC_PAGE_SIZE); - //let's find a base addr that is free before we start the tests + /* let's find a base addr that is free before we start the tests */ size = 5 * page_size; base_addr = find_base_addr(size); - if (!base_addr) { - printf("Error: couldn't map the space we need for the test\n"); - return 1; - } flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE; - // Check we can map all the areas we need below - errno = 0; + /* Check we can map all the areas we need below */ addr = base_addr; size = 5 * page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - if (p == MAP_FAILED) { dump_maps(); - printf("Error: couldn't map the space we need for the test\n"); - return 1; + ksft_exit_fail_msg("Error: couldn't map the space we need for the test\n"); } - - errno = 0; if (munmap((void *)addr, 5 * page_size) != 0) { dump_maps(); - printf("Error: munmap failed!?\n"); - return 1; + ksft_exit_fail_msg("Error: munmap failed!?\n"); } - printf("unmap() successful\n"); + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - errno = 0; addr = base_addr + page_size; size = 3 * page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - if (p == MAP_FAILED) { dump_maps(); - printf("Error: first mmap() failed unexpectedly\n"); - return 1; + ksft_exit_fail_msg("Error: first mmap() failed unexpectedly\n"); } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); /* * Exact same mapping again: @@ -100,17 +86,14 @@ int main(void) * +3 | mapped | new * +4 | free | new */ - errno = 0; addr = base_addr; size = 5 * page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - if (p != MAP_FAILED) { dump_maps(); - printf("Error:1: mmap() succeeded when it shouldn't have\n"); - return 1; + ksft_exit_fail_msg("Error:1: mmap() succeeded when it 
shouldn't have\n"); } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); /* * Second mapping contained within first: @@ -121,17 +104,14 @@ int main(void) * +3 | mapped | * +4 | free | */ - errno = 0; addr = base_addr + (2 * page_size); size = page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - if (p != MAP_FAILED) { dump_maps(); - printf("Error:2: mmap() succeeded when it shouldn't have\n"); - return 1; + ksft_exit_fail_msg("Error:2: mmap() succeeded when it shouldn't have\n"); } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); /* * Overlap end of existing mapping: @@ -141,17 +121,14 @@ int main(void) * +3 | mapped | new * +4 | free | new */ - errno = 0; addr = base_addr + (3 * page_size); size = 2 * page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - if (p != MAP_FAILED) { dump_maps(); - printf("Error:3: mmap() succeeded when it shouldn't have\n"); - return 1; + ksft_exit_fail_msg("Error:3: mmap() succeeded when it shouldn't have\n"); } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); /* * Overlap start of existing mapping: @@ -161,17 +138,14 @@ int main(void) * +3 | mapped | * +4 | free | */ - errno = 0; addr = base_addr; size = 2 * page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - if (p != MAP_FAILED) { dump_maps(); - printf("Error:4: mmap() succeeded when it shouldn't have\n"); - return 1; + ksft_exit_fail_msg("Error:4: mmap() succeeded when it shouldn't have\n"); } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); /* * Adjacent to start of existing mapping: @@ -181,17 +155,14 @@ int main(void) * +3 | mapped | * +4 | free | */ - errno = 0; addr = base_addr; size = page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - if (p == MAP_FAILED) { dump_maps(); - printf("Error:5: mmap() failed when it shouldn't have\n"); - return 1; + ksft_exit_fail_msg("Error:5: mmap() failed when it shouldn't have\n"); } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); /* * Adjacent to end of existing mapping: @@ -201,27 +172,22 @@ int main(void) * +3 | mapped | * +4 | free | new */ - errno = 0; addr = base_addr + (4 * page_size); size = page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - if (p == MAP_FAILED) { dump_maps(); - printf("Error:6: mmap() failed when it shouldn't have\n"); - return 1; + ksft_exit_fail_msg("Error:6: mmap() failed when it shouldn't have\n"); } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); addr = base_addr; size = 5 * page_size; if (munmap((void *)addr, size) != 0) { dump_maps(); - printf("Error: munmap failed!?\n"); - return 1; + ksft_exit_fail_msg("Error: munmap failed!?\n"); } - printf("unmap() successful\n"); + ksft_test_result_pass("Base Address unmap() successful\n"); - printf("OK\n"); - return 0; + ksft_finished(); } From c3ef7d10955d0c1726966455679456b479d02416 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:09 +0500 Subject: [PATCH 1117/1406] selftests/mm: map_hugetlb: conform 
test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Link: https://lkml.kernel.org/r/20240202113119.2047740-3-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/map_hugetlb.c | 42 +++++++++++------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/mm/map_hugetlb.c b/tools/testing/selftests/mm/map_hugetlb.c index 86e8f2048a4090..a1f005a90a4f0d 100644 --- a/tools/testing/selftests/mm/map_hugetlb.c +++ b/tools/testing/selftests/mm/map_hugetlb.c @@ -16,6 +16,7 @@ #include #include #include "vm_util.h" +#include "../kselftest.h" #define LENGTH (256UL*1024*1024) #define PROTECTION (PROT_READ | PROT_WRITE) @@ -31,7 +32,7 @@ static void check_bytes(char *addr) { - printf("First hex is %x\n", *((unsigned int *)addr)); + ksft_print_msg("First hex is %x\n", *((unsigned int *)addr)); } static void write_bytes(char *addr, size_t length) @@ -42,23 +43,21 @@ static void write_bytes(char *addr, size_t length) *(addr + i) = (char)i; } -static int read_bytes(char *addr, size_t length) +static void read_bytes(char *addr, size_t length) { unsigned long i; check_bytes(addr); for (i = 0; i < length; i++) - if (*(addr + i) != (char)i) { - printf("Mismatch at %lu\n", i); - return 1; - } - return 0; + if (*(addr + i) != (char)i) + ksft_exit_fail_msg("Mismatch at %lu\n", i); + + ksft_test_result_pass("Read correct data\n"); } int main(int argc, char **argv) { void *addr; - int ret; size_t hugepage_size; size_t length = LENGTH; int flags = FLAGS; @@ -69,6 +68,9 @@ int main(int argc, char **argv) if (hugepage_size > length) length = hugepage_size; + ksft_print_header(); + ksft_set_plan(1); + if (argc > 1) length = atol(argv[1]) << 20; if (argc > 2) { @@ -78,27 +80,23 @@ int main(int argc, char **argv) } if (shift) - printf("%u kB hugepages\n", 1 << (shift - 10)); + ksft_print_msg("%u kB hugepages\n", 1 << (shift - 10)); else - printf("Default size hugepages\n"); - printf("Mapping %lu Mbytes\n", (unsigned long)length >> 20); + ksft_print_msg("Default size hugepages\n"); + ksft_print_msg("Mapping %lu Mbytes\n", (unsigned long)length >> 20); addr = mmap(ADDR, length, PROTECTION, flags, -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); - printf("Returned address is %p\n", addr); + ksft_print_msg("Returned address is %p\n", addr); check_bytes(addr); write_bytes(addr, length); - ret = read_bytes(addr, length); + read_bytes(addr, length); /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ - if (munmap(addr, length)) { - perror("munmap"); - exit(1); - } + if (munmap(addr, length)) + ksft_exit_fail_msg("munmap: %s\n", strerror(errno)); - return ret; + ksft_finished(); } From 8cfa63acba47c5c99cb53f895d146759095b62e0 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:10 +0500 Subject: [PATCH 1118/1406] selftests/mm: map_populate: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Minor cleanups have also been included. 
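One wrinkle in the conversion below: the ksft pass/fail counters are plain globals, so results recorded in a forked child are invisible to the parent. A condensed sketch of the workaround used, assuming the child runs exactly TESTS_IN_CHILD checks so the pass count fits in the 8-bit exit status:

    /* child: after its ksft_test_result() calls, report the pass count */
    exit(ksft_cnt.ksft_pass);

    /* parent: rebuild the counters from the child's exit status */
    waitpid(child, &status, 0);
    ksft_cnt.ksft_pass = WEXITSTATUS(status);
    ksft_cnt.ksft_fail = TESTS_IN_CHILD - WEXITSTATUS(status);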
Link: https://lkml.kernel.org/r/20240202113119.2047740-4-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/map_populate.c | 37 ++++++++++++++--------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/mm/map_populate.c b/tools/testing/selftests/mm/map_populate.c index 7945d07548751b..5c8a53869b1bd2 100644 --- a/tools/testing/selftests/mm/map_populate.c +++ b/tools/testing/selftests/mm/map_populate.c @@ -16,19 +16,21 @@ #include #include #include +#include "../kselftest.h" #define MMAP_SZ 4096 -#define BUG_ON(condition, description) \ - do { \ - if (condition) { \ - fprintf(stderr, "[FAIL]\t%s:%d\t%s:%s\n", __func__, \ - __LINE__, (description), strerror(errno)); \ - exit(1); \ - } \ +#define BUG_ON(condition, description) \ + do { \ + if (condition) \ + ksft_exit_fail_msg("[FAIL]\t%s:%d\t%s:%s\n", \ + __func__, __LINE__, (description), \ + strerror(errno)); \ } while (0) -static int parent_f(int sock, unsigned long *smap, int child) +#define TESTS_IN_CHILD 2 + +static void parent_f(int sock, unsigned long *smap, int child) { int status, ret; @@ -43,9 +45,10 @@ static int parent_f(int sock, unsigned long *smap, int child) BUG_ON(ret <= 0, "write(sock)"); waitpid(child, &status, 0); - BUG_ON(!WIFEXITED(status), "child in unexpected state"); - return WEXITSTATUS(status); + /* The ksft macros don't keep counters between processes */ + ksft_cnt.ksft_pass = WEXITSTATUS(status); + ksft_cnt.ksft_fail = TESTS_IN_CHILD - WEXITSTATUS(status); } static int child_f(int sock, unsigned long *smap, int fd) @@ -64,10 +67,11 @@ static int child_f(int sock, unsigned long *smap, int fd) ret = read(sock, &buf, sizeof(int)); BUG_ON(ret <= 0, "read(sock)"); - BUG_ON(*smap == 0x22222BAD, "MAP_POPULATE didn't COW private page"); - BUG_ON(*smap != 0xdeadbabe, "mapping was corrupted"); + ksft_test_result(*smap != 0x22222BAD, "MAP_POPULATE COW private page\n"); + ksft_test_result(*smap == 0xdeadbabe, "The mapping state\n"); - return 0; + /* The ksft macros don't keep counters between processes */ + return ksft_cnt.ksft_pass; } int main(int argc, char **argv) @@ -76,6 +80,9 @@ int main(int argc, char **argv) FILE *ftmp; unsigned long *smap; + ksft_print_header(); + ksft_set_plan(TESTS_IN_CHILD); + ftmp = tmpfile(); BUG_ON(!ftmp, "tmpfile()"); @@ -101,7 +108,9 @@ int main(int argc, char **argv) ret = close(sock[0]); BUG_ON(ret, "close()"); - return parent_f(sock[1], smap, child); + parent_f(sock[1], smap, child); + + ksft_finished(); } ret = close(sock[1]); From f0b40931a6930fb67beb3b596a3b0c7d04b423bf Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:11 +0500 Subject: [PATCH 1119/1406] selftests/mm: mlock-random-test: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. 
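For readers unfamiliar with TAP, a successful run of a converted test emits output roughly like the following (illustrative only; the exact test names, counts, and summary line depend on the test and the kselftest header version):

    TAP version 13
    1..2
    ok 1 test_mlock_within_limit
    ok 2 test_mlock_outof_limit
    # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0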
Link: https://lkml.kernel.org/r/20240202113119.2047740-5-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../testing/selftests/mm/mlock-random-test.c | 136 +++++++----------- 1 file changed, 54 insertions(+), 82 deletions(-) diff --git a/tools/testing/selftests/mm/mlock-random-test.c b/tools/testing/selftests/mm/mlock-random-test.c index 1fba77df7f628e..1cd80b0f76c33f 100644 --- a/tools/testing/selftests/mm/mlock-random-test.c +++ b/tools/testing/selftests/mm/mlock-random-test.c @@ -13,6 +13,7 @@ #include #include #include +#include "../kselftest.h" #include "mlock2.h" #define CHUNK_UNIT (128 * 1024) @@ -31,14 +32,14 @@ int set_cap_limits(rlim_t max) new.rlim_cur = max; new.rlim_max = max; if (setrlimit(RLIMIT_MEMLOCK, &new)) { - perror("setrlimit() returns error\n"); + ksft_perror("setrlimit() returns error\n"); return -1; } /* drop capabilities including CAP_IPC_LOCK */ if (cap_set_proc(cap)) { - perror("cap_set_proc() returns error\n"); - return -2; + ksft_perror("cap_set_proc() returns error\n"); + return -1; } return 0; @@ -52,27 +53,24 @@ int get_proc_locked_vm_size(void) unsigned long lock_size = 0; f = fopen("/proc/self/status", "r"); - if (!f) { - perror("fopen"); - return -1; - } + if (!f) + ksft_exit_fail_msg("fopen: %s\n", strerror(errno)); while (fgets(line, 1024, f)) { if (strstr(line, "VmLck")) { ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size); if (ret <= 0) { - printf("sscanf() on VmLck error: %s: %d\n", - line, ret); fclose(f); - return -1; + ksft_exit_fail_msg("sscanf() on VmLck error: %s: %d\n", + line, ret); } fclose(f); return (int)(lock_size << 10); } } - perror("cannot parse VmLck in /proc/self/status\n"); fclose(f); + ksft_exit_fail_msg("cannot parse VmLck in /proc/self/status: %s\n", strerror(errno)); return -1; } @@ -91,10 +89,8 @@ int get_proc_page_size(unsigned long addr) size_t size; smaps = seek_to_smaps_entry(addr); - if (!smaps) { - printf("Unable to parse /proc/self/smaps\n"); - return 0; - } + if (!smaps) + ksft_exit_fail_msg("Unable to parse /proc/self/smaps\n"); while (getline(&line, &size, smaps) > 0) { if (!strstr(line, "MMUPageSize")) { @@ -105,12 +101,9 @@ int get_proc_page_size(unsigned long addr) } /* found the MMUPageSize of this section */ - if (sscanf(line, "MMUPageSize: %8lu kB", - &mmupage_size) < 1) { - printf("Unable to parse smaps entry for Size:%s\n", - line); - break; - } + if (sscanf(line, "MMUPageSize: %8lu kB", &mmupage_size) < 1) + ksft_exit_fail_msg("Unable to parse smaps entry for Size:%s\n", + line); } free(line); @@ -136,7 +129,7 @@ int get_proc_page_size(unsigned long addr) * return value: 0 - success * else: failure */ -int test_mlock_within_limit(char *p, int alloc_size) +static void test_mlock_within_limit(char *p, int alloc_size) { int i; int ret = 0; @@ -145,11 +138,9 @@ int test_mlock_within_limit(char *p, int alloc_size) int page_size = 0; getrlimit(RLIMIT_MEMLOCK, &cur); - if (cur.rlim_cur < alloc_size) { - printf("alloc_size[%d] < %u rlimit,lead to mlock failure\n", - alloc_size, (unsigned int)cur.rlim_cur); - return -1; - } + if (cur.rlim_cur < alloc_size) + ksft_exit_fail_msg("alloc_size[%d] < %u rlimit,lead to mlock failure\n", + alloc_size, (unsigned int)cur.rlim_cur); srand(time(NULL)); for (i = 0; i < TEST_LOOP; i++) { @@ -169,13 +160,11 @@ int test_mlock_within_limit(char *p, int alloc_size) ret = mlock2_(p + start_offset, lock_size, MLOCK_ONFAULT); - if (ret) { - printf("%s() failure at |%p(%d)| mlock:|%p(%d)|\n", - is_mlock ? 
"mlock" : "mlock2", - p, alloc_size, - p + start_offset, lock_size); - return ret; - } + if (ret) + ksft_exit_fail_msg("%s() failure at |%p(%d)| mlock:|%p(%d)|\n", + is_mlock ? "mlock" : "mlock2", + p, alloc_size, + p + start_offset, lock_size); } /* @@ -183,18 +172,12 @@ int test_mlock_within_limit(char *p, int alloc_size) */ locked_vm_size = get_proc_locked_vm_size(); page_size = get_proc_page_size((unsigned long)p); - if (page_size == 0) { - printf("cannot get proc MMUPageSize\n"); - return -1; - } - if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) { - printf("test_mlock_within_limit() left VmLck:%d on %d chunk\n", - locked_vm_size, alloc_size); - return -1; - } + if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) + ksft_exit_fail_msg("%s left VmLck:%d on %d chunk\n", + __func__, locked_vm_size, alloc_size); - return 0; + ksft_test_result_pass("%s\n", __func__); } @@ -213,7 +196,7 @@ int test_mlock_within_limit(char *p, int alloc_size) * return value: 0 - success * else: failure */ -int test_mlock_outof_limit(char *p, int alloc_size) +static void test_mlock_outof_limit(char *p, int alloc_size) { int i; int ret = 0; @@ -221,11 +204,9 @@ int test_mlock_outof_limit(char *p, int alloc_size) struct rlimit cur; getrlimit(RLIMIT_MEMLOCK, &cur); - if (cur.rlim_cur >= alloc_size) { - printf("alloc_size[%d] >%u rlimit, violates test condition\n", - alloc_size, (unsigned int)cur.rlim_cur); - return -1; - } + if (cur.rlim_cur >= alloc_size) + ksft_exit_fail_msg("alloc_size[%d] >%u rlimit, violates test condition\n", + alloc_size, (unsigned int)cur.rlim_cur); old_locked_vm_size = get_proc_locked_vm_size(); srand(time(NULL)); @@ -240,56 +221,47 @@ int test_mlock_outof_limit(char *p, int alloc_size) else ret = mlock2_(p + start_offset, lock_size, MLOCK_ONFAULT); - if (ret == 0) { - printf("%s() succeeds? on %p(%d) mlock%p(%d)\n", - is_mlock ? "mlock" : "mlock2", - p, alloc_size, - p + start_offset, lock_size); - return -1; - } + if (ret == 0) + ksft_exit_fail_msg("%s() succeeds? on %p(%d) mlock%p(%d)\n", + is_mlock ? 
"mlock" : "mlock2", + p, alloc_size, p + start_offset, lock_size); } locked_vm_size = get_proc_locked_vm_size(); - if (locked_vm_size != old_locked_vm_size) { - printf("tests leads to new mlocked page: old[%d], new[%d]\n", - old_locked_vm_size, - locked_vm_size); - return -1; - } + if (locked_vm_size != old_locked_vm_size) + ksft_exit_fail_msg("tests leads to new mlocked page: old[%d], new[%d]\n", + old_locked_vm_size, + locked_vm_size); - return 0; + ksft_test_result_pass("%s\n", __func__); } int main(int argc, char **argv) { char *p = NULL; - int ret = 0; + + ksft_print_header(); if (set_cap_limits(MLOCK_RLIMIT_SIZE)) - return -1; + ksft_finished(); + + ksft_set_plan(2); p = malloc(MLOCK_WITHIN_LIMIT_SIZE); - if (p == NULL) { - perror("malloc() failure\n"); - return -1; - } - ret = test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE); - if (ret) - return ret; + if (p == NULL) + ksft_exit_fail_msg("malloc() failure: %s\n", strerror(errno)); + + test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE); munlock(p, MLOCK_WITHIN_LIMIT_SIZE); free(p); - p = malloc(MLOCK_OUTOF_LIMIT_SIZE); - if (p == NULL) { - perror("malloc() failure\n"); - return -1; - } - ret = test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE); - if (ret) - return ret; + if (p == NULL) + ksft_exit_fail_msg("malloc() failure: %s\n", strerror(errno)); + + test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE); munlock(p, MLOCK_OUTOF_LIMIT_SIZE); free(p); - return 0; + ksft_finished(); } From 8d0276ec79b3684101de2b50e36265a5c3ead793 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:12 +0500 Subject: [PATCH 1120/1406] selftests/mm: mlock2-tests: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. I've done some cleanups as well. 
Link: https://lkml.kernel.org/r/20240202113119.2047740-6-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mlock2-tests.c | 282 +++++++++------------- tools/testing/selftests/mm/mlock2.h | 11 +- 2 files changed, 118 insertions(+), 175 deletions(-) diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c index 80cddc0de20610..26f744188ad0c8 100644 --- a/tools/testing/selftests/mm/mlock2-tests.c +++ b/tools/testing/selftests/mm/mlock2-tests.c @@ -7,9 +7,8 @@ #include #include #include -#include "mlock2.h" - #include "../kselftest.h" +#include "mlock2.h" struct vm_boundaries { unsigned long start; @@ -40,14 +39,14 @@ static int get_vm_area(unsigned long addr, struct vm_boundaries *area) while(fgets(line, 1024, file)) { end_addr = strchr(line, '-'); if (!end_addr) { - printf("cannot parse /proc/self/maps\n"); + ksft_print_msg("cannot parse /proc/self/maps\n"); goto out; } *end_addr = '\0'; end_addr++; stop = strchr(end_addr, ' '); if (!stop) { - printf("cannot parse /proc/self/maps\n"); + ksft_print_msg("cannot parse /proc/self/maps\n"); goto out; } @@ -78,7 +77,7 @@ static bool is_vmflag_set(unsigned long addr, const char *vmflag) smaps = seek_to_smaps_entry(addr); if (!smaps) { - printf("Unable to parse /proc/self/smaps\n"); + ksft_print_msg("Unable to parse /proc/self/smaps\n"); goto out; } @@ -115,7 +114,7 @@ static unsigned long get_value_for_name(unsigned long addr, const char *name) smaps = seek_to_smaps_entry(addr); if (!smaps) { - printf("Unable to parse /proc/self/smaps\n"); + ksft_print_msg("Unable to parse /proc/self/smaps\n"); goto out; } @@ -129,7 +128,7 @@ static unsigned long get_value_for_name(unsigned long addr, const char *name) value_ptr = line + strlen(name); if (sscanf(value_ptr, "%lu kB", &value) < 1) { - printf("Unable to parse smaps entry for Size\n"); + ksft_print_msg("Unable to parse smaps entry for Size\n"); goto out; } break; @@ -180,57 +179,45 @@ static int lock_check(unsigned long addr) static int unlock_lock_check(char *map) { if (is_vmflag_set((unsigned long)map, LOCKED)) { - printf("VMA flag %s is present on page 1 after unlock\n", LOCKED); + ksft_print_msg("VMA flag %s is present on page 1 after unlock\n", LOCKED); return 1; } return 0; } -static int test_mlock_lock() +static void test_mlock_lock(void) { char *map; - int ret = 1; unsigned long page_size = getpagesize(); map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (map == MAP_FAILED) { - perror("test_mlock_locked mmap"); - goto out; - } + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap error: %s", strerror(errno)); if (mlock2_(map, 2 * page_size, 0)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("mlock2(0)"); - goto unmap; + munmap(map, 2 * page_size); + ksft_exit_fail_msg("mlock2(0): %s\n", strerror(errno)); } - if (!lock_check((unsigned long)map)) - goto unmap; + ksft_test_result(lock_check((unsigned long)map), "%s: Locked\n", __func__); /* Now unlock and recheck attributes */ if (munlock(map, 2 * page_size)) { - perror("munlock()"); - goto unmap; + munmap(map, 2 * page_size); + ksft_exit_fail_msg("munlock(): %s\n", strerror(errno)); } - ret = unlock_lock_check(map); - -unmap: + ksft_test_result(!unlock_lock_check(map), "%s: Locked\n", __func__); munmap(map, 2 * page_size); -out: - return ret; } static int onfault_check(char *map) { *map = 'a'; if 
(!is_vma_lock_on_fault((unsigned long)map)) { - printf("VMA is not marked for lock on fault\n"); + ksft_print_msg("VMA is not marked for lock on fault\n"); return 1; } @@ -243,172 +230,131 @@ static int unlock_onfault_check(char *map) if (is_vma_lock_on_fault((unsigned long)map) || is_vma_lock_on_fault((unsigned long)map + page_size)) { - printf("VMA is still lock on fault after unlock\n"); + ksft_print_msg("VMA is still lock on fault after unlock\n"); return 1; } return 0; } -static int test_mlock_onfault() +static void test_mlock_onfault(void) { char *map; - int ret = 1; unsigned long page_size = getpagesize(); map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (map == MAP_FAILED) { - perror("test_mlock_locked mmap"); - goto out; - } + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap error: %s", strerror(errno)); if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("mlock2(MLOCK_ONFAULT)"); - goto unmap; + munmap(map, 2 * page_size); + ksft_exit_fail_msg("mlock2(MLOCK_ONFAULT): %s\n", strerror(errno)); } - if (onfault_check(map)) - goto unmap; + ksft_test_result(!onfault_check(map), "%s: VMA marked for lock on fault\n", __func__); /* Now unlock and recheck attributes */ if (munlock(map, 2 * page_size)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("munlock()"); - goto unmap; + munmap(map, 2 * page_size); + ksft_exit_fail_msg("munlock(): %s\n", strerror(errno)); } - ret = unlock_onfault_check(map); -unmap: + ksft_test_result(!unlock_onfault_check(map), "VMA open lock after fault\n"); munmap(map, 2 * page_size); -out: - return ret; } -static int test_lock_onfault_of_present() +static void test_lock_onfault_of_present(void) { char *map; - int ret = 1; unsigned long page_size = getpagesize(); map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (map == MAP_FAILED) { - perror("test_mlock_locked mmap"); - goto out; - } + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap error: %s", strerror(errno)); *map = 'a'; if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("mlock2(MLOCK_ONFAULT)"); - goto unmap; + munmap(map, 2 * page_size); + ksft_test_result_fail("mlock2(MLOCK_ONFAULT) error: %s", strerror(errno)); } - if (!is_vma_lock_on_fault((unsigned long)map) || - !is_vma_lock_on_fault((unsigned long)map + page_size)) { - printf("VMA with present pages is not marked lock on fault\n"); - goto unmap; - } - ret = 0; -unmap: + ksft_test_result(is_vma_lock_on_fault((unsigned long)map) || + is_vma_lock_on_fault((unsigned long)map + page_size), + "VMA with present pages is not marked lock on fault\n"); munmap(map, 2 * page_size); -out: - return ret; } -static int test_munlockall() +static void test_munlockall0(void) { char *map; - int ret = 1; unsigned long page_size = getpagesize(); map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - - if (map == MAP_FAILED) { - perror("test_munlockall mmap"); - goto out; - } + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap error: %s\n", strerror(errno)); if (mlockall(MCL_CURRENT)) { - perror("mlockall(MCL_CURRENT)"); - goto out; + munmap(map, 2 * page_size); + ksft_exit_fail_msg("mlockall(MCL_CURRENT): %s\n", strerror(errno)); } - if 
(!lock_check((unsigned long)map)) - goto unmap; + ksft_test_result(lock_check((unsigned long)map), "%s: Locked memory area\n", __func__); if (munlockall()) { - perror("munlockall()"); - goto unmap; + munmap(map, 2 * page_size); + ksft_exit_fail_msg("munlockall(): %s\n", strerror(errno)); } - if (unlock_lock_check(map)) - goto unmap; - + ksft_test_result(!unlock_lock_check(map), "%s: No locked memory\n", __func__); munmap(map, 2 * page_size); +} + +static void test_munlockall1(void) +{ + char *map; + unsigned long page_size = getpagesize(); map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - - if (map == MAP_FAILED) { - perror("test_munlockall second mmap"); - goto out; - } + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap error: %s", strerror(errno)); if (mlockall(MCL_CURRENT | MCL_ONFAULT)) { - perror("mlockall(MCL_CURRENT | MCL_ONFAULT)"); - goto unmap; + munmap(map, 2 * page_size); + ksft_exit_fail_msg("mlockall(MCL_CURRENT | MCL_ONFAULT): %s\n", strerror(errno)); } - if (onfault_check(map)) - goto unmap; + ksft_test_result(!onfault_check(map), "%s: VMA marked for lock on fault\n", __func__); if (munlockall()) { - perror("munlockall()"); - goto unmap; + munmap(map, 2 * page_size); + ksft_exit_fail_msg("munlockall(): %s\n", strerror(errno)); } - if (unlock_onfault_check(map)) - goto unmap; + ksft_test_result(!unlock_onfault_check(map), "%s: Unlocked\n", __func__); if (mlockall(MCL_CURRENT | MCL_FUTURE)) { - perror("mlockall(MCL_CURRENT | MCL_FUTURE)"); - goto out; + munmap(map, 2 * page_size); + ksft_exit_fail_msg("mlockall(MCL_CURRENT | MCL_FUTURE): %s\n", strerror(errno)); } - if (!lock_check((unsigned long)map)) - goto unmap; + ksft_test_result(lock_check((unsigned long)map), "%s: Locked\n", __func__); if (munlockall()) { - perror("munlockall()"); - goto unmap; + munmap(map, 2 * page_size); + ksft_exit_fail_msg("munlockall() %s\n", strerror(errno)); } - ret = unlock_lock_check(map); - -unmap: + ksft_test_result(!unlock_lock_check(map), "%s: No locked memory\n", __func__); munmap(map, 2 * page_size); -out: - munlockall(); - return ret; } -static int test_vma_management(bool call_mlock) +static void test_vma_management(bool call_mlock) { - int ret = 1; void *map; unsigned long page_size = getpagesize(); struct vm_boundaries page1; @@ -417,25 +363,19 @@ static int test_vma_management(bool call_mlock) map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (map == MAP_FAILED) { - perror("mmap()"); - return ret; - } + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap error: %s", strerror(errno)); if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("mlock(ONFAULT)\n"); - goto out; + munmap(map, 3 * page_size); + ksft_test_result_fail("mlock error: %s", strerror(errno)); } if (get_vm_area((unsigned long)map, &page1) || get_vm_area((unsigned long)map + page_size, &page2) || get_vm_area((unsigned long)map + page_size * 2, &page3)) { - printf("couldn't find mapping in /proc/self/maps\n"); - goto out; + munmap(map, 3 * page_size); + ksft_test_result_fail("couldn't find mapping in /proc/self/maps"); } /* @@ -444,76 +384,86 @@ static int test_vma_management(bool call_mlock) * not a failure) */ if (page1.start != page2.start || page2.start != page3.start) { - printf("VMAs are not merged to start, aborting test\n"); - ret = 0; - goto out; + munmap(map, 3 * page_size); + 
ksft_test_result_fail("VMAs are not merged to start, aborting test"); } if (munlock(map + page_size, page_size)) { - perror("munlock()"); - goto out; + munmap(map, 3 * page_size); + ksft_test_result_fail("munlock(): %s", strerror(errno)); } if (get_vm_area((unsigned long)map, &page1) || get_vm_area((unsigned long)map + page_size, &page2) || get_vm_area((unsigned long)map + page_size * 2, &page3)) { - printf("couldn't find mapping in /proc/self/maps\n"); - goto out; + munmap(map, 3 * page_size); + ksft_test_result_fail("couldn't find mapping in /proc/self/maps"); } /* All three VMAs should be different */ if (page1.start == page2.start || page2.start == page3.start) { - printf("failed to split VMA for munlock\n"); - goto out; + munmap(map, 3 * page_size); + ksft_test_result_fail("failed to split VMA for munlock"); } /* Now unlock the first and third page and check the VMAs again */ if (munlock(map, page_size * 3)) { - perror("munlock()"); - goto out; + munmap(map, 3 * page_size); + ksft_test_result_fail("munlock(): %s", strerror(errno)); } if (get_vm_area((unsigned long)map, &page1) || get_vm_area((unsigned long)map + page_size, &page2) || get_vm_area((unsigned long)map + page_size * 2, &page3)) { - printf("couldn't find mapping in /proc/self/maps\n"); - goto out; + munmap(map, 3 * page_size); + ksft_test_result_fail("couldn't find mapping in /proc/self/maps"); } /* Now all three VMAs should be the same */ if (page1.start != page2.start || page2.start != page3.start) { - printf("failed to merge VMAs after munlock\n"); - goto out; + munmap(map, 3 * page_size); + ksft_test_result_fail("failed to merge VMAs after munlock"); } - ret = 0; -out: + ksft_test_result_pass("%s call_mlock %d\n", __func__, call_mlock); munmap(map, 3 * page_size); - return ret; } -static int test_mlockall(int (test_function)(bool call_mlock)) +static void test_mlockall(void) { - int ret = 1; + if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) + ksft_exit_fail_msg("mlockall failed: %s\n", strerror(errno)); - if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) { - perror("mlockall"); - return ret; - } - - ret = test_function(false); + test_vma_management(false); munlockall(); - return ret; } int main(int argc, char **argv) { - int ret = 0; - ret += test_mlock_lock(); - ret += test_mlock_onfault(); - ret += test_munlockall(); - ret += test_lock_onfault_of_present(); - ret += test_vma_management(true); - ret += test_mlockall(test_vma_management); - return ret; + int ret, size = 3 * getpagesize(); + void *map; + + ksft_print_header(); + + map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap error: %s", strerror(errno)); + + ret = mlock2_(map, size, MLOCK_ONFAULT); + if (ret && errno == ENOSYS) + ksft_finished(); + + munmap(map, size); + + ksft_set_plan(13); + + test_mlock_lock(); + test_mlock_onfault(); + test_munlockall0(); + test_munlockall1(); + test_lock_onfault_of_present(); + test_vma_management(true); + test_mlockall(); + + ksft_finished(); } diff --git a/tools/testing/selftests/mm/mlock2.h b/tools/testing/selftests/mm/mlock2.h index 8e02991b313c8e..4417eaa5cfb78b 100644 --- a/tools/testing/selftests/mm/mlock2.h +++ b/tools/testing/selftests/mm/mlock2.h @@ -6,12 +6,7 @@ static int mlock2_(void *start, size_t len, int flags) { -#ifdef __NR_mlock2 return syscall(__NR_mlock2, start, len, flags); -#else - errno = ENOSYS; - return -1; -#endif } static FILE *seek_to_smaps_entry(unsigned long addr) @@ -27,10 +22,8 @@ static FILE 
*seek_to_smaps_entry(unsigned long addr) char path[BUFSIZ]; file = fopen("/proc/self/smaps", "r"); - if (!file) { - perror("fopen smaps"); - _exit(1); - } + if (!file) + ksft_exit_fail_msg("fopen smaps: %s\n", strerror(errno)); while (getline(&line, &size, file) > 0) { if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n", From ef66b7d32e0281c4af5f1a67fdaa55bbb3d4a72c Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:13 +0500 Subject: [PATCH 1121/1406] selftests/mm: mrelease_test: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Link: https://lkml.kernel.org/r/20240202113119.2047740-7-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mrelease_test.c | 80 +++++++++------------- 1 file changed, 33 insertions(+), 47 deletions(-) diff --git a/tools/testing/selftests/mm/mrelease_test.c b/tools/testing/selftests/mm/mrelease_test.c index d822004a374e9d..100370a7111df5 100644 --- a/tools/testing/selftests/mm/mrelease_test.c +++ b/tools/testing/selftests/mm/mrelease_test.c @@ -26,19 +26,15 @@ static int alloc_noexit(unsigned long nr_pages, int pipefd) buf = (char *)mmap(NULL, nr_pages * psize(), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, 0, 0); - if (buf == MAP_FAILED) { - perror("mmap failed, halting the test"); - return KSFT_FAIL; - } + if (buf == MAP_FAILED) + ksft_exit_fail_msg("mmap failed, halting the test: %s\n", strerror(errno)); for (i = 0; i < nr_pages; i++) *((unsigned long *)(buf + (i * psize()))) = i; /* Signal the parent that the child is ready */ - if (write(pipefd, "", 1) < 0) { - perror("write"); - return KSFT_FAIL; - } + if (write(pipefd, "", 1) < 0) + ksft_exit_fail_msg("write: %s\n", strerror(errno)); /* Wait to be killed (when reparenting happens) */ while (getppid() == ppid && timeout > 0) { @@ -54,23 +50,17 @@ static int alloc_noexit(unsigned long nr_pages, int pipefd) /* The process_mrelease calls in this test are expected to fail */ static void run_negative_tests(int pidfd) { - int res; /* Test invalid flags. Expect to fail with EINVAL error code. */ if (!syscall(__NR_process_mrelease, pidfd, (unsigned int)-1) || errno != EINVAL) { - res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); - perror("process_mrelease with wrong flags"); - exit(res); + ksft_exit_fail_msg("process_mrelease with wrong flags: %s\n", strerror(errno)); } /* * Test reaping while process is alive with no pending SIGKILL. * Expect to fail with EINVAL error code. */ - if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) { - res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); - perror("process_mrelease on a live process"); - exit(res); - } + if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) + ksft_exit_fail_msg("process_mrelease on a live process: %s\n", strerror(errno)); } static int child_main(int pipefd[], size_t size) @@ -93,11 +83,18 @@ int main(void) char byte; int res; + ksft_print_header(); + ksft_set_plan(1); + /* Test a wrong pidfd */ if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) { - res = (errno == ENOSYS ? 
KSFT_SKIP : KSFT_FAIL); - perror("process_mrelease with wrong pidfd"); - exit(res); + if (errno == ENOSYS) { + ksft_test_result_skip("process_mrelease not implemented\n"); + ksft_finished(); + } else { + ksft_exit_fail_msg("process_mrelease with wrong pidfd: %s", + strerror(errno)); + } } /* Start the test with 1MB child memory allocation */ @@ -107,16 +104,14 @@ int main(void) * Pipe for the child to signal when it's done allocating * memory */ - if (pipe(pipefd)) { - perror("pipe"); - exit(KSFT_FAIL); - } + if (pipe(pipefd)) + ksft_exit_fail_msg("pipe: %s\n", strerror(errno)); + pid = fork(); if (pid < 0) { - perror("fork"); close(pipefd[0]); close(pipefd[1]); - exit(KSFT_FAIL); + ksft_exit_fail_msg("fork: %s\n", strerror(errno)); } if (pid == 0) { @@ -134,28 +129,23 @@ int main(void) res = read(pipefd[0], &byte, 1); close(pipefd[0]); if (res < 0) { - perror("read"); if (!kill(pid, SIGKILL)) waitpid(pid, NULL, 0); - exit(KSFT_FAIL); + ksft_exit_fail_msg("read: %s\n", strerror(errno)); } pidfd = syscall(__NR_pidfd_open, pid, 0); if (pidfd < 0) { - perror("pidfd_open"); if (!kill(pid, SIGKILL)) waitpid(pid, NULL, 0); - exit(KSFT_FAIL); + ksft_exit_fail_msg("pidfd_open: %s\n", strerror(errno)); } /* Run negative tests which require a live child */ run_negative_tests(pidfd); - if (kill(pid, SIGKILL)) { - res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); - perror("kill"); - exit(res); - } + if (kill(pid, SIGKILL)) + ksft_exit_fail_msg("kill: %s\n", strerror(errno)); success = (syscall(__NR_process_mrelease, pidfd, 0) == 0); if (!success) { @@ -169,18 +159,15 @@ int main(void) if (errno == ESRCH) { retry = (size <= MAX_SIZE_MB); } else { - res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); - perror("process_mrelease"); waitpid(pid, NULL, 0); - exit(res); + ksft_exit_fail_msg("process_mrelease: %s\n", strerror(errno)); } } /* Cleanup to prevent zombies */ - if (waitpid(pid, NULL, 0) < 0) { - perror("waitpid"); - exit(KSFT_FAIL); - } + if (waitpid(pid, NULL, 0) < 0) + ksft_exit_fail_msg("waitpid: %s\n", strerror(errno)); + close(pidfd); if (!success) { @@ -188,11 +175,10 @@ int main(void) size *= 2; goto retry; } - printf("All process_mrelease attempts failed!\n"); - exit(KSFT_FAIL); + ksft_exit_fail_msg("All process_mrelease attempts failed!\n"); } - printf("Success reaping a child with %zuMB of memory allocations\n", - size); - return KSFT_PASS; + ksft_test_result_pass("Success reaping a child with %zuMB of memory allocations\n", + size); + ksft_finished(); } From 9622aa3c4e2e64f4dfeb789fceb95f81a1219584 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:14 +0500 Subject: [PATCH 1122/1406] selftests/mm: mremap_dontunmap: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. 
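For context, MREMAP_DONTUNMAP (what this test exercises) moves the pages to the new location but leaves the old range mapped and empty, so for private anonymous memory later reads there fault in fresh zero pages. A hedged sketch of the core call, where src and len stand for an existing anonymous private mapping and its size; the flag must be combined with MREMAP_MAYMOVE:

    #define _GNU_SOURCE
    #include <sys/mman.h>

    /* 'src' keeps its VMA but loses its pages; 'dst' receives them. */
    void *dst = mremap(src, len, len,
                       MREMAP_MAYMOVE | MREMAP_DONTUNMAP);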
Link: https://lkml.kernel.org/r/20240202113119.2047740-8-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mremap_dontunmap.c | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/mm/mremap_dontunmap.c b/tools/testing/selftests/mm/mremap_dontunmap.c index a06e73ec856823..1d75084b9ca56b 100644 --- a/tools/testing/selftests/mm/mremap_dontunmap.c +++ b/tools/testing/selftests/mm/mremap_dontunmap.c @@ -27,14 +27,14 @@ static void dump_maps(void) system(cmd); } -#define BUG_ON(condition, description) \ - do { \ - if (condition) { \ - fprintf(stderr, "[FAIL]\t%s():%d\t%s:%s\n", __func__, \ - __LINE__, (description), strerror(errno)); \ - dump_maps(); \ - exit(1); \ - } \ +#define BUG_ON(condition, description) \ + do { \ + if (condition) { \ + dump_maps(); \ + ksft_exit_fail_msg("[FAIL]\t%s:%d\t%s:%s\n", \ + __func__, __LINE__, (description), \ + strerror(errno)); \ + } \ } while (0) // Try a simple operation for to "test" for kernel support this prevents @@ -122,6 +122,7 @@ static void mremap_dontunmap_simple() "unable to unmap destination mapping"); BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, "unable to unmap source mapping"); + ksft_test_result_pass("%s\n", __func__); } // This test validates that MREMAP_DONTUNMAP on a shared mapping works as expected. @@ -173,6 +174,7 @@ static void mremap_dontunmap_simple_shmem() "unable to unmap destination mapping"); BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, "unable to unmap source mapping"); + ksft_test_result_pass("%s\n", __func__); } // This test validates MREMAP_DONTUNMAP will move page tables to a specific @@ -219,6 +221,7 @@ static void mremap_dontunmap_simple_fixed() "unable to unmap destination mapping"); BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, "unable to unmap source mapping"); + ksft_test_result_pass("%s\n", __func__); } // This test validates that we can MREMAP_DONTUNMAP for a portion of an @@ -269,6 +272,7 @@ static void mremap_dontunmap_partial_mapping() "unable to unmap destination mapping"); BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, "unable to unmap source mapping"); + ksft_test_result_pass("%s\n", __func__); } // This test validates that we can remap over only a portion of a mapping. @@ -328,19 +332,24 @@ static void mremap_dontunmap_partial_mapping_overwrite(void) "unable to unmap destination mapping"); BUG_ON(munmap(source_mapping, 5 * page_size) == -1, "unable to unmap source mapping"); + ksft_test_result_pass("%s\n", __func__); } int main(void) { + ksft_print_header(); + page_size = sysconf(_SC_PAGE_SIZE); // test for kernel support for MREMAP_DONTUNMAP skipping the test if // not. if (kernel_support_for_mremap_dontunmap() != 0) { - printf("No kernel support for MREMAP_DONTUNMAP\n"); - return KSFT_SKIP; + ksft_print_msg("No kernel support for MREMAP_DONTUNMAP\n"); + ksft_finished(); } + ksft_set_plan(5); + // Keep a page sized buffer around for when we need it. 
page_buffer = mmap(NULL, page_size, PROT_READ | PROT_WRITE, @@ -356,6 +365,5 @@ int main(void) BUG_ON(munmap(page_buffer, page_size) == -1, "unable to unmap page buffer"); - printf("OK\n"); - return 0; + ksft_finished(); } From 0670297c8d516c429368235af3b6a030f293effb Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:15 +0500 Subject: [PATCH 1123/1406] selftests/mm: split_huge_page_test: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Link: https://lkml.kernel.org/r/20240202113119.2047740-9-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../selftests/mm/split_huge_page_test.c | 161 ++++++++---------- 1 file changed, 69 insertions(+), 92 deletions(-) diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 0e74635c8c3d97..7b698a848babf1 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -17,6 +17,7 @@ #include #include #include "vm_util.h" +#include "../kselftest.h" uint64_t pagesize; unsigned int pageshift; @@ -50,21 +51,19 @@ int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file) return 0; } -static int write_file(const char *path, const char *buf, size_t buflen) +static void write_file(const char *path, const char *buf, size_t buflen) { int fd; ssize_t numwritten; fd = open(path, O_WRONLY); if (fd == -1) - return 0; + ksft_exit_fail_msg("%s open failed: %s\n", path, strerror(errno)); numwritten = write(fd, buf, buflen - 1); close(fd); if (numwritten < 1) - return 0; - - return (unsigned int) numwritten; + ksft_exit_fail_msg("Write failed\n"); } static void write_debugfs(const char *fmt, ...) @@ -77,15 +76,10 @@ static void write_debugfs(const char *fmt, ...) 
ret = vsnprintf(input, INPUT_MAX, fmt, argp); va_end(argp); - if (ret >= INPUT_MAX) { - printf("%s: Debugfs input is too long\n", __func__); - exit(EXIT_FAILURE); - } + if (ret >= INPUT_MAX) + ksft_exit_fail_msg("%s: Debugfs input is too long\n", __func__); - if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) { - perror(SPLIT_DEBUGFS); - exit(EXIT_FAILURE); - } + write_file(SPLIT_DEBUGFS, input, ret + 1); } void split_pmd_thp(void) @@ -95,39 +89,30 @@ void split_pmd_thp(void) size_t i; one_page = memalign(pmd_pagesize, len); - - if (!one_page) { - printf("Fail to allocate memory\n"); - exit(EXIT_FAILURE); - } + if (!one_page) + ksft_exit_fail_msg("Fail to allocate memory: %s\n", strerror(errno)); madvise(one_page, len, MADV_HUGEPAGE); for (i = 0; i < len; i++) one_page[i] = (char)i; - if (!check_huge_anon(one_page, 4, pmd_pagesize)) { - printf("No THP is allocated\n"); - exit(EXIT_FAILURE); - } + if (!check_huge_anon(one_page, 4, pmd_pagesize)) + ksft_exit_fail_msg("No THP is allocated\n"); /* split all THPs */ write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, (uint64_t)one_page + len); for (i = 0; i < len; i++) - if (one_page[i] != (char)i) { - printf("%ld byte corrupted\n", i); - exit(EXIT_FAILURE); - } + if (one_page[i] != (char)i) + ksft_exit_fail_msg("%ld byte corrupted\n", i); - if (!check_huge_anon(one_page, 0, pmd_pagesize)) { - printf("Still AnonHugePages not split\n"); - exit(EXIT_FAILURE); - } + if (!check_huge_anon(one_page, 0, pmd_pagesize)) + ksft_exit_fail_msg("Still AnonHugePages not split\n"); - printf("Split huge pages successful\n"); + ksft_test_result_pass("Split huge pages successful\n"); free(one_page); } @@ -143,36 +128,29 @@ void split_pte_mapped_thp(void) int pagemap_fd; int kpageflags_fd; - if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) { - perror("get pagemap proc error"); - exit(EXIT_FAILURE); - } - pagemap_fd = open(pagemap_proc, O_RDONLY); + if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) + ksft_exit_fail_msg("get pagemap proc error: %s\n", strerror(errno)); - if (pagemap_fd == -1) { - perror("read pagemap:"); - exit(EXIT_FAILURE); - } + pagemap_fd = open(pagemap_proc, O_RDONLY); + if (pagemap_fd == -1) + ksft_exit_fail_msg("read pagemap: %s\n", strerror(errno)); kpageflags_fd = open(kpageflags_proc, O_RDONLY); - - if (kpageflags_fd == -1) { - perror("read kpageflags:"); - exit(EXIT_FAILURE); - } + if (kpageflags_fd == -1) + ksft_exit_fail_msg("read kpageflags: %s\n", strerror(errno)); one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (one_page == MAP_FAILED) + ksft_exit_fail_msg("Fail to allocate memory: %s\n", strerror(errno)); madvise(one_page, len, MADV_HUGEPAGE); for (i = 0; i < len; i++) one_page[i] = (char)i; - if (!check_huge_anon(one_page, 4, pmd_pagesize)) { - printf("No THP is allocated\n"); - exit(EXIT_FAILURE); - } + if (!check_huge_anon(one_page, 4, pmd_pagesize)) + ksft_exit_fail_msg("No THP is allocated\n"); /* remap the first pagesize of first THP */ pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE); @@ -183,10 +161,8 @@ void split_pte_mapped_thp(void) pagesize, pagesize, MREMAP_MAYMOVE|MREMAP_FIXED, pte_mapped + pagesize * i); - if (pte_mapped2 == (char *)-1) { - perror("mremap failed"); - exit(EXIT_FAILURE); - } + if (pte_mapped2 == MAP_FAILED) + ksft_exit_fail_msg("mremap failed: %s\n", strerror(errno)); } /* smap does not show THPs after mremap, use kpageflags instead */ @@ -196,10 +172,8 @@ void split_pte_mapped_thp(void) 
is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) thp_size++; - if (thp_size != 4) { - printf("Some THPs are missing during mremap\n"); - exit(EXIT_FAILURE); - } + if (thp_size != 4) + ksft_exit_fail_msg("Some THPs are missing during mremap\n"); /* split all remapped THPs */ write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped, @@ -208,21 +182,18 @@ void split_pte_mapped_thp(void) /* smap does not show THPs after mremap, use kpageflags instead */ thp_size = 0; for (i = 0; i < pagesize * 4; i++) { - if (pte_mapped[i] != (char)i) { - printf("%ld byte corrupted\n", i); - exit(EXIT_FAILURE); - } + if (pte_mapped[i] != (char)i) + ksft_exit_fail_msg("%ld byte corrupted\n", i); + if (i % pagesize == 0 && is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) thp_size++; } - if (thp_size) { - printf("Still %ld THPs not split\n", thp_size); - exit(EXIT_FAILURE); - } + if (thp_size) + ksft_exit_fail_msg("Still %ld THPs not split\n", thp_size); - printf("Split PTE-mapped huge pages successful\n"); + ksft_test_result_pass("Split PTE-mapped huge pages successful\n"); munmap(one_page, len); close(pagemap_fd); close(kpageflags_fd); @@ -238,24 +209,21 @@ void split_file_backed_thp(void) char testfile[INPUT_MAX]; uint64_t pgoff_start = 0, pgoff_end = 1024; - printf("Please enable pr_debug in split_huge_pages_in_file() if you need more info.\n"); + ksft_print_msg("Please enable pr_debug in split_huge_pages_in_file() for more info.\n"); status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m"); - if (status) { - printf("Unable to create a tmpfs for testing\n"); - exit(EXIT_FAILURE); - } + if (status) + ksft_exit_fail_msg("Unable to create a tmpfs for testing\n"); status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc); if (status >= INPUT_MAX) { - printf("Fail to create file-backed THP split testing file\n"); - goto cleanup; + ksft_exit_fail_msg("Fail to create file-backed THP split testing file\n"); } fd = open(testfile, O_CREAT|O_WRONLY); if (fd == -1) { - perror("Cannot open testing file\n"); + ksft_perror("Cannot open testing file"); goto cleanup; } @@ -264,7 +232,7 @@ void split_file_backed_thp(void) close(fd); if (num_written < 1) { - printf("Fail to write data to testing file\n"); + ksft_perror("Fail to write data to testing file"); goto cleanup; } @@ -272,42 +240,51 @@ void split_file_backed_thp(void) write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end); status = unlink(testfile); - if (status) - perror("Cannot remove testing file\n"); + if (status) { + ksft_perror("Cannot remove testing file"); + goto cleanup; + } -cleanup: status = umount(tmpfs_loc); if (status) { - printf("Unable to umount %s\n", tmpfs_loc); - exit(EXIT_FAILURE); + rmdir(tmpfs_loc); + ksft_exit_fail_msg("Unable to umount %s\n", tmpfs_loc); } + status = rmdir(tmpfs_loc); - if (status) { - perror("cannot remove tmp dir"); - exit(EXIT_FAILURE); - } + if (status) + ksft_exit_fail_msg("cannot remove tmp dir: %s\n", strerror(errno)); - printf("file-backed THP split test done, please check dmesg for more information\n"); + ksft_print_msg("Please check dmesg for more information\n"); + ksft_test_result_pass("File-backed THP split test done\n"); + return; + +cleanup: + umount(tmpfs_loc); + rmdir(tmpfs_loc); + ksft_exit_fail_msg("Error occurred\n"); } int main(int argc, char **argv) { + ksft_print_header(); + if (geteuid() != 0) { - printf("Please run the benchmark as root\n"); - exit(EXIT_FAILURE); + ksft_print_msg("Please run the benchmark as root\n"); + ksft_finished(); } + ksft_set_plan(3); + 
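+	/* Plan of 3: one result line each from split_pmd_thp(), split_pte_mapped_thp() and split_file_backed_thp() */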
pagesize = getpagesize(); pageshift = ffs(pagesize) - 1; pmd_pagesize = read_pmd_pagesize(); - if (!pmd_pagesize) { - printf("Reading PMD pagesize failed\n"); - exit(EXIT_FAILURE); - } + if (!pmd_pagesize) + ksft_exit_fail_msg("Reading PMD pagesize failed\n"); split_pmd_thp(); split_pte_mapped_thp(); split_file_backed_thp(); - return 0; + ksft_finished(); } From b4a44c6f675361993e56747f95b5cec7d1811078 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:17 +0500 Subject: [PATCH 1124/1406] selftests/mm: thuge-gen: conform to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Also remove unneeded logging which isn't enabled. Skip a hugepage size if it has fewer free pages than the test needs, to avoid unnecessary failures. For example, some systems may not have any free 1GB hugepages, so skip the 1GB runs in this test instead of failing the entire test. Link: https://lkml.kernel.org/r/20240202113119.2047740-11-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/thuge-gen.c | 147 +++++++++++++------------ 1 file changed, 75 insertions(+), 72 deletions(-) diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c index 622987f12c89a3..ea7fd8fe287630 100644 --- a/tools/testing/selftests/mm/thuge-gen.c +++ b/tools/testing/selftests/mm/thuge-gen.c @@ -4,7 +4,7 @@ Before running this huge pages for each huge page size must have been reserved. For large pages beyond MAX_PAGE_ORDER (like 1GB on x86) boot options must - be used. + be used. 1GB wouldn't be tested if it isn't available. Also shmmax must be increased. And you need to run as root to work around some weird permissions in shm. And nothing using huge pages should run in parallel. @@ -26,8 +26,7 @@ #include #include #include "vm_util.h" - -#define err(x) perror(x), exit(1) +#include "../kselftest.h" #define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) #define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) @@ -44,11 +43,8 @@ #define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) #define NUM_PAGESIZES 5 - #define NUM_PAGES 4 -#define Dprintf(fmt...) // printf(fmt) - unsigned long page_sizes[NUM_PAGESIZES]; int num_page_sizes; @@ -60,28 +56,15 @@ int ilog2(unsigned long v) return l; } -void find_pagesizes(void) -{ - glob_t g; - int i; - glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g); - assert(g.gl_pathc <= NUM_PAGESIZES); - for (i = 0; i < g.gl_pathc; i++) { - sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB", - &page_sizes[i]); - page_sizes[i] <<= 10; - printf("Found %luMB\n", page_sizes[i] >> 20); - } - num_page_sizes = g.gl_pathc; - globfree(&g); -} - void show(unsigned long ps) { char buf[100]; + if (ps == getpagesize()) return; - printf("%luMB: ", ps >> 20); + + ksft_print_msg("%luMB: ", ps >> 20); + fflush(stdout); snprintf(buf, sizeof buf, "cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", @@ -105,7 +88,7 @@ unsigned long read_sysfs(int warn, char *fmt, ...) f = fopen(buf, "r"); if (!f) { if (warn) - printf("missing %s\n", buf); + ksft_print_msg("missing %s\n", buf); return 0; } if (getline(&line, &linelen, f) > 0) { @@ -119,123 +102,143 @@
unsigned long read_free(unsigned long ps) { return read_sysfs(ps != getpagesize(), - "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", - ps >> 10); + "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", + ps >> 10); } void test_mmap(unsigned long size, unsigned flags) { char *map; unsigned long before, after; - int err; before = read_free(size); map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); - if (map == (char *)-1) err("mmap"); memset(map, 0xff, size*NUM_PAGES); after = read_free(size); - Dprintf("before %lu after %lu diff %ld size %lu\n", - before, after, before - after, size); - assert(size == getpagesize() || (before - after) == NUM_PAGES); + show(size); - err = munmap(map, size * NUM_PAGES); - assert(!err); + ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES, + "%s mmap\n", __func__); + + if (munmap(map, size * NUM_PAGES)) + ksft_exit_fail_msg("%s: unmap %s\n", __func__, strerror(errno)); } void test_shmget(unsigned long size, unsigned flags) { int id; unsigned long before, after; - int err; + struct shm_info i; + char *map; before = read_free(size); id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags); - if (id < 0) err("shmget"); - - struct shm_info i; - if (shmctl(id, SHM_INFO, (void *)&i) < 0) err("shmctl"); - Dprintf("alloc %lu res %lu\n", i.shm_tot, i.shm_rss); + if (id < 0) { + if (errno == EPERM) { + ksft_test_result_skip("shmget requires root privileges: %s\n", + strerror(errno)); + return; + } + ksft_exit_fail_msg("shmget: %s\n", strerror(errno)); + } + if (shmctl(id, SHM_INFO, (void *)&i) < 0) + ksft_exit_fail_msg("shmctl: %s\n", strerror(errno)); - Dprintf("id %d\n", id); - char *map = shmat(id, NULL, 0600); - if (map == (char*)-1) err("shmat"); + map = shmat(id, NULL, 0600); + if (map == MAP_FAILED) + ksft_exit_fail_msg("shmat: %s\n", strerror(errno)); shmctl(id, IPC_RMID, NULL); memset(map, 0xff, size*NUM_PAGES); after = read_free(size); - Dprintf("before %lu after %lu diff %ld size %lu\n", - before, after, before - after, size); - assert(size == getpagesize() || (before - after) == NUM_PAGES); show(size); - err = shmdt(map); - assert(!err); + ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES, + "%s: mmap\n", __func__); + if (shmdt(map)) + ksft_exit_fail_msg("%s: shmdt: %s\n", __func__, strerror(errno)); } -void sanity_checks(void) +void find_pagesizes(void) { - int i; unsigned long largest = getpagesize(); + int i; + glob_t g; - for (i = 0; i < num_page_sizes; i++) { - if (page_sizes[i] > largest) + glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g); + assert(g.gl_pathc <= NUM_PAGESIZES); + for (i = 0; (i < g.gl_pathc) && (num_page_sizes < NUM_PAGESIZES); i++) { + sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB", + &page_sizes[num_page_sizes]); + page_sizes[num_page_sizes] <<= 10; + ksft_print_msg("Found %luMB\n", page_sizes[i] >> 20); + + if (page_sizes[num_page_sizes] > largest) largest = page_sizes[i]; - if (read_free(page_sizes[i]) < NUM_PAGES) { - printf("Not enough huge pages for page size %lu MB, need %u\n", - page_sizes[i] >> 20, - NUM_PAGES); - exit(0); - } + if (read_free(page_sizes[num_page_sizes]) >= NUM_PAGES) + num_page_sizes++; + else + ksft_print_msg("SKIP for size %lu MB as not enough huge pages, need %u\n", + page_sizes[num_page_sizes] >> 20, NUM_PAGES); } + globfree(&g); - if (read_sysfs(0, "/proc/sys/kernel/shmmax") < 
NUM_PAGES * largest) { - printf("Please do echo %lu > /proc/sys/kernel/shmmax", largest * NUM_PAGES); - exit(0); - } + if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) + ksft_exit_fail_msg("Please do echo %lu > /proc/sys/kernel/shmmax", + largest * NUM_PAGES); #if defined(__x86_64__) if (largest != 1U<<30) { - printf("No GB pages available on x86-64\n" - "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES); - exit(0); + ksft_exit_fail_msg("No GB pages available on x86-64\n" + "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES); } #endif } int main(void) { - int i; unsigned default_hps = default_huge_page_size(); + int i; + + ksft_print_header(); find_pagesizes(); - sanity_checks(); + if (!num_page_sizes) + ksft_finished(); + + ksft_set_plan(2 * num_page_sizes + 3); for (i = 0; i < num_page_sizes; i++) { unsigned long ps = page_sizes[i]; int arg = ilog2(ps) << MAP_HUGE_SHIFT; - printf("Testing %luMB mmap with shift %x\n", ps >> 20, arg); + + ksft_print_msg("Testing %luMB mmap with shift %x\n", ps >> 20, arg); test_mmap(ps, MAP_HUGETLB | arg); } - printf("Testing default huge mmap\n"); + + ksft_print_msg("Testing default huge mmap\n"); test_mmap(default_hps, MAP_HUGETLB); - puts("Testing non-huge shmget"); + ksft_print_msg("Testing non-huge shmget\n"); test_shmget(getpagesize(), 0); for (i = 0; i < num_page_sizes; i++) { unsigned long ps = page_sizes[i]; int arg = ilog2(ps) << SHM_HUGE_SHIFT; - printf("Testing %luMB shmget with shift %x\n", ps >> 20, arg); + ksft_print_msg("Testing %luMB shmget with shift %x\n", ps >> 20, arg); test_shmget(ps, SHM_HUGETLB | arg); } - puts("default huge shmget"); + + ksft_print_msg("default huge shmget\n"); test_shmget(default_hps, SHM_HUGETLB); - return 0; + ksft_finished(); } From 4f376a934d68e974fc56fdb66f133983aec0385a Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:18 +0500 Subject: [PATCH 1125/1406] selftests/mm: transhuge-stress: conform to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. 
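For context on the vm_util.c hunk below: allocate_transhuge() decides whether a huge page was actually installed by reading two consecutive /proc/self/pagemap entries and checking for present, physically consecutive PFNs. A rough sketch of that pagemap lookup (field layout per Documentation/admin-guide/mm/pagemap.rst; this is an illustration, not the vm_util.c code, and PM_PRESENT()/PM_PFN() are stand-ins for the PAGEMAP_* macros used there):

#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>

/* pagemap entries are 64 bits: bit 63 = page present, bits 0-54 = PFN */
#define PM_PRESENT(ent)	(((ent) >> 63) & 1)
#define PM_PFN(ent)	((ent) & ((1ULL << 55) - 1))

/* Read the pagemap entry covering one virtual address; 0 on success. */
static int read_pagemap(int pagemap_fd, void *vaddr, uint64_t *ent)
{
	long pgsz = sysconf(_SC_PAGESIZE);
	off_t off = (uintptr_t)vaddr / pgsz * sizeof(*ent);

	return pread(pagemap_fd, ent, sizeof(*ent), off) == sizeof(*ent) ? 0 : -1;
}

Two neighbouring entries that are both present, with PM_PFN(ent[0]) + 1 == PM_PFN(ent[1]), indicate physically contiguous backing, which is what the PAGEMAP_PRESENT()/PAGEMAP_PFN() checks in vm_util.c test for. Note that reading PFNs out of pagemap requires root.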
Link: https://lkml.kernel.org/r/20240202113119.2047740-12-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/transhuge-stress.c | 36 +++++++++++-------- tools/testing/selftests/mm/vm_util.c | 6 ++-- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/mm/transhuge-stress.c b/tools/testing/selftests/mm/transhuge-stress.c index c61fb9350b8c21..68201192e37c8d 100644 --- a/tools/testing/selftests/mm/transhuge-stress.c +++ b/tools/testing/selftests/mm/transhuge-stress.c @@ -16,6 +16,7 @@ #include #include #include "vm_util.h" +#include "../kselftest.h" int backing_fd = -1; int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE; @@ -34,6 +35,8 @@ int main(int argc, char **argv) int pagemap_fd; int duration = 0; + ksft_print_header(); + ram = sysconf(_SC_PHYS_PAGES); if (ram > SIZE_MAX / psize() / 4) ram = SIZE_MAX / 4; @@ -43,7 +46,8 @@ int main(int argc, char **argv) while (++i < argc) { if (!strcmp(argv[i], "-h")) - errx(1, "usage: %s [-f ] [-d ] [size in MiB]", argv[0]); + ksft_exit_fail_msg("usage: %s [-f ] [-d ] [size in MiB]\n", + argv[0]); else if (!strcmp(argv[i], "-f")) name = argv[++i]; else if (!strcmp(argv[i], "-d")) @@ -52,10 +56,12 @@ int main(int argc, char **argv) len = atoll(argv[i]) << 20; } + ksft_set_plan(1); + if (name) { backing_fd = open(name, O_RDWR); if (backing_fd == -1) - errx(2, "open %s", name); + ksft_exit_fail_msg("open %s\n", name); mmap_flags = MAP_SHARED; } @@ -65,21 +71,21 @@ int main(int argc, char **argv) pagemap_fd = open("/proc/self/pagemap", O_RDONLY); if (pagemap_fd < 0) - err(2, "open pagemap"); + ksft_exit_fail_msg("open pagemap\n"); len -= len % HPAGE_SIZE; ptr = mmap(NULL, len + HPAGE_SIZE, PROT_RW, mmap_flags, backing_fd, 0); if (ptr == MAP_FAILED) - err(2, "initial mmap"); + ksft_exit_fail_msg("initial mmap"); ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE; if (madvise(ptr, len, MADV_HUGEPAGE)) - err(2, "MADV_HUGEPAGE"); + ksft_exit_fail_msg("MADV_HUGEPAGE"); map_len = ram >> (HPAGE_SHIFT - 1); map = malloc(map_len); if (!map) - errx(2, "map malloc"); + ksft_exit_fail_msg("map malloc\n"); clock_gettime(CLOCK_MONOTONIC, &start); @@ -103,7 +109,7 @@ int main(int argc, char **argv) if (idx >= map_len) { map = realloc(map, idx + 1); if (!map) - errx(2, "map realloc"); + ksft_exit_fail_msg("map realloc\n"); memset(map + map_len, 0, idx + 1 - map_len); map_len = idx + 1; } @@ -114,17 +120,19 @@ int main(int argc, char **argv) /* split transhuge page, keep last page */ if (madvise(p, HPAGE_SIZE - psize(), MADV_DONTNEED)) - err(2, "MADV_DONTNEED"); + ksft_exit_fail_msg("MADV_DONTNEED"); } clock_gettime(CLOCK_MONOTONIC, &b); s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.; - warnx("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t" - "%4d succeed, %4d failed, %4d different pages", - s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20), - nr_succeed, nr_failed, nr_pages); + ksft_print_msg("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t" + "%4d succeed, %4d failed, %4d different pages\n", + s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20), + nr_succeed, nr_failed, nr_pages); - if (duration > 0 && b.tv_sec - start.tv_sec >= duration) - return 0; + if (duration > 0 && b.tv_sec - start.tv_sec >= duration) { + ksft_test_result_pass("Completed\n"); + ksft_finished(); + } } } diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index 05736c615734fe..5a62530da3b563 100644 --- 
a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -232,17 +232,17 @@ int64_t allocate_transhuge(void *ptr, int pagemap_fd) if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr) - errx(2, "mmap transhuge"); + ksft_exit_fail_msg("mmap transhuge\n"); if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE)) - err(2, "MADV_HUGEPAGE"); + ksft_exit_fail_msg("MADV_HUGEPAGE\n"); /* allocate transparent huge page */ *(volatile void **)ptr = ptr; if (pread(pagemap_fd, ent, sizeof(ent), (uintptr_t)ptr >> (pshift() - 3)) != sizeof(ent)) - err(2, "read pagemap"); + ksft_exit_fail_msg("read pagemap\n"); if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) && PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) && From 9bfffe4ee9e2a487168ae3e0a57b210878eac027 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:19 +0500 Subject: [PATCH 1126/1406] selftests/mm: virtual_address_range: conform to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Link: https://lkml.kernel.org/r/20240202113119.2047740-13-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../selftests/mm/virtual_address_range.c | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c index bae0ceaf95b13b..7bcf8d48256a66 100644 --- a/tools/testing/selftests/mm/virtual_address_range.c +++ b/tools/testing/selftests/mm/virtual_address_range.c @@ -12,6 +12,7 @@ #include #include #include +#include "../kselftest.h" /* * Maximum address range mapped with a single mmap() @@ -68,23 +69,15 @@ static char *hind_addr(void) return (char *) (1UL << bits); } -static int validate_addr(char *ptr, int high_addr) +static void validate_addr(char *ptr, int high_addr) { unsigned long addr = (unsigned long) ptr; - if (high_addr) { - if (addr < HIGH_ADDR_MARK) { - printf("Bad address %lx\n", addr); - return 1; - } - return 0; - } + if (high_addr && addr < HIGH_ADDR_MARK) + ksft_exit_fail_msg("Bad address %lx\n", addr); - if (addr > HIGH_ADDR_MARK) { - printf("Bad address %lx\n", addr); - return 1; - } - return 0; + if (addr > HIGH_ADDR_MARK) + ksft_exit_fail_msg("Bad address %lx\n", addr); } static int validate_lower_address_hint(void) @@ -107,23 +100,29 @@ int main(int argc, char *argv[]) char *hint; unsigned long i, lchunks, hchunks; + ksft_print_header(); + ksft_set_plan(1); + for (i = 0; i < NR_CHUNKS_LOW; i++) { ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (ptr[i] == MAP_FAILED) { - if (validate_lower_address_hint()) - return 1; + if (validate_lower_address_hint()) { + ksft_test_result_skip("Memory constraint not fulfilled\n"); + ksft_finished(); + } break; } - if (validate_addr(ptr[i], 0)) - return 1; + validate_addr(ptr[i], 0); } lchunks = i; hptr = (char **) calloc(NR_CHUNKS_HIGH, sizeof(char *)); - if (hptr == NULL) - return 1; + if (hptr == NULL) { + ksft_test_result_skip("Memory constraint not fulfilled\n"); + ksft_finished(); + } for (i = 0; i < NR_CHUNKS_HIGH; i++) { hint = hind_addr(); @@ -133,8 +132,7 @@ int main(int argc, char *argv[]) if (hptr[i] == MAP_FAILED) break; - if (validate_addr(hptr[i], 1)) - return 1; + validate_addr(hptr[i], 1); } hchunks = i; @@ -145,5 +143,7 @@ int main(int 
argc, char *argv[]) munmap(hptr[i], MAP_CHUNK_SIZE); free(hptr); - return 0; + + ksft_test_result_pass("Test\n"); + ksft_finished(); } From be9803e36c129af2a560dcb1f458ad4ba214d917 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 20 Feb 2024 14:16:31 +0800 Subject: [PATCH 1127/1406] mm: compaction: update the cc->nr_migratepages when allocating or freeing the freepages Currently we use the 'cc->nr_freepages >= cc->nr_migratepages' comparison to ensure that enough freepages are isolated in isolate_freepages(); however, compaction_alloc() just decreases cc->nr_freepages without updating cc->nr_migratepages, which wastes CPU cycles and causes too many freepages to be isolated. So we should also update cc->nr_migratepages when allocating or freeing the freepages, to avoid isolating excess freepages. And I can see fewer free pages are scanned and isolated when running thpcompact on my Arm64 server:

                                            k6.7      k6.7_patched
Ops Compaction pages isolated      120692036.00     118160797.00
Ops Compaction migrate scanned     131210329.00     154093268.00
Ops Compaction free scanned       1090587971.00    1080632536.00
Ops Compact scan efficiency              12.03            14.26

Moreover, I did not see obvious latency improvements; this is likely because isolating freepages is not the bottleneck in the thpcompact test case.

                               k6.7                  k6.7_patched
Amean fault-both-1        1089.76 (   0.00%)     1080.16 *   0.88%*
Amean fault-both-3        1616.48 (   0.00%)     1636.65 *  -1.25%*
Amean fault-both-5        2266.66 (   0.00%)     2219.20 *   2.09%*
Amean fault-both-7        2909.84 (   0.00%)     2801.90 *   3.71%*
Amean fault-both-12       4861.26 (   0.00%)     4733.25 *   2.63%*
Amean fault-both-18       7351.11 (   0.00%)     6950.51 *   5.45%*
Amean fault-both-24       9059.30 (   0.00%)     9159.99 *  -1.11%*
Amean fault-both-30      10685.68 (   0.00%)    11399.02 *  -6.68%*

Link: https://lkml.kernel.org/r/6440493f18da82298152b6305d6b41c2962a3ce6.1708409245.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: Mel Gorman Reviewed-by: Vlastimil Babka Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Zi Yan Signed-off-by: Andrew Morton --- include/trace/events/compaction.h | 6 +++--- mm/compaction.c | 12 ++++++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 2b2a975efd2077..d05759d1853896 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -78,10 +78,10 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_fast_isolate_freepage #ifdef CONFIG_COMPACTION TRACE_EVENT(mm_compaction_migratepages, - TP_PROTO(struct compact_control *cc, + TP_PROTO(unsigned int nr_migratepages, unsigned int nr_succeeded), - TP_ARGS(cc, nr_succeeded), + TP_ARGS(nr_migratepages, nr_succeeded), TP_STRUCT__entry( __field(unsigned long, nr_migrated) @@ -90,7 +90,7 @@ TRACE_EVENT(mm_compaction_migratepages, TP_fast_assign( __entry->nr_migrated = nr_succeeded; - __entry->nr_failed = cc->nr_migratepages - nr_succeeded; + __entry->nr_failed = nr_migratepages - nr_succeeded; ), TP_printk("nr_migrated=%lu nr_failed=%lu", diff --git a/mm/compaction.c b/mm/compaction.c index 4494b2914386c1..218089b29f1369 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1798,6 +1798,7 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data) dst = list_entry(cc->freepages.next, struct folio, lru); list_del(&dst->lru); cc->nr_freepages--; + cc->nr_migratepages--; return dst; } @@ -1813,6 +1814,7 @@ static void compaction_free(struct folio *dst, unsigned long data) list_add(&dst->lru,
&cc->freepages); cc->nr_freepages++; + cc->nr_migratepages++; } /* possible outcome of isolate_migratepages */ @@ -2435,7 +2437,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) unsigned long last_migrated_pfn; const bool sync = cc->mode != MIGRATE_ASYNC; bool update_cached; - unsigned int nr_succeeded = 0; + unsigned int nr_succeeded = 0, nr_migratepages; /* * These counters track activities during zone compaction. Initialize @@ -2553,11 +2555,17 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) pageblock_start_pfn(cc->migrate_pfn - 1)); } + /* + * Record the number of pages to migrate since the + * compaction_alloc/free() will update cc->nr_migratepages + * properly. + */ + nr_migratepages = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, compaction_free, (unsigned long)cc, cc->mode, MR_COMPACTION, &nr_succeeded); - trace_mm_compaction_migratepages(cc, nr_succeeded); + trace_mm_compaction_migratepages(nr_migratepages, nr_succeeded); /* All pages were either migrated or will be released */ cc->nr_migratepages = 0; From c867797bc0a55ad587bd5fbf3e40149b87d88e35 Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Wed, 31 Jan 2024 18:38:02 +0800 Subject: [PATCH 1128/1406] mm/vmscan: change the type of file from int to bool Change the type of 'file' from int to bool because is_file_lru() returns bool. Link: https://lkml.kernel.org/r/20240131103802.122920-1-gehao@kylinos.cn Signed-off-by: Hao Ge Signed-off-by: Andrew Morton --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1f139830b26f6c..8e52f8795d2028 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1998,7 +1998,7 @@ static void shrink_active_list(unsigned long nr_to_scan, LIST_HEAD(l_inactive); unsigned nr_deactivate, nr_activate; unsigned nr_rotated = 0; - int file = is_file_lru(lru); + bool file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); lru_add_drain(); @@ -2412,7 +2412,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, denominator = ap + fp; out: for_each_evictable_lru(lru) { - int file = is_file_lru(lru); + bool file = is_file_lru(lru); unsigned long lruvec_size; unsigned long low, min; unsigned long scan; From ee61fb001e7f9ddf50fcf39f46e79664fd98ac1b Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 29 Jan 2024 13:46:35 +0100 Subject: [PATCH 1129/1406] arm64/mm: Make set_ptes() robust when OAs cross 48-bit boundary Patch series "mm/memory: optimize fork() with PTE-mapped THP", v3. Now that the rmap overhaul[1], which provides a clean interface for rmap batching, is upstream, let's implement PTE batching during fork when processing PTE-mapped THPs. This series is partially based on Ryan's previous work[2] to implement cont-pte support on arm64, but it's a complete rewrite based on [1] to optimize all architectures independent of any such PTE bits, and to use the new rmap batching functions that simplify the code and prepare for further rmap accounting changes. We collect consecutive PTEs that map consecutive pages of the same large folio, making sure that the other PTE bits are compatible, and (a) adjust the refcount only once per batch, (b) call rmap handling functions only once per batch and (c) perform batch PTE setting/updates.
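As a rough illustration of the scenario being optimized (a hedged sketch, not the exact harness behind the numbers quoted below, and with error checking elided), such a measurement times fork() over a populated, THP-backed private mapping:

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	size_t len = 1UL << 30;		/* 1 GiB, as in the numbers below */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct timespec a, b;
	size_t i;
	pid_t pid;

	madvise(buf, len, MADV_HUGEPAGE);
	for (i = 0; i < len; i += 4096)	/* fault the folios in up front */
		buf[i] = 1;

	clock_gettime(CLOCK_MONOTONIC, &a);
	pid = fork();			/* copies 1 GiB worth of PTEs */
	if (pid == 0)
		_exit(0);
	clock_gettime(CLOCK_MONOTONIC, &b);

	waitpid(pid, NULL, 0);
	printf("fork: %.6f s\n", (b.tv_sec - a.tv_sec) +
	       (b.tv_nsec - a.tv_nsec) / 1e9);
	return 0;
}

Which folio sizes end up backing the range depends on the running kernel's transparent_hugepage configuration.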
While this series should be beneficial for adding cont-pte support on ARM64[2], it's one of the requirements for maintaining a total mapcount[3] for large folios with minimal added overhead and further changes[4] that build on top of the total mapcount. Independent of all that, this series results in a speedup during fork with PTE-mapped THP, which is the default with THPs that are smaller than a PMD (for example, 16KiB to 1024KiB mTHPs for anonymous memory[5]). On an Intel Xeon Silver 4210R CPU, fork'ing with 1GiB of PTE-mapped folios of the same size (stddev < 1%) results in the following runtimes for fork() (shorter is better):

Folio Size | v6.8-rc1 | New      | Change
------------------------------------------
4KiB       | 0.014328 | 0.014035 |  - 2%
16KiB      | 0.014263 | 0.01196  |  -16%
32KiB      | 0.014334 | 0.01094  |  -24%
64KiB      | 0.014046 | 0.010444 |  -26%
128KiB     | 0.014011 | 0.010063 |  -28%
256KiB     | 0.013993 | 0.009938 |  -29%
512KiB     | 0.013983 | 0.00985  |  -30%
1024KiB    | 0.013986 | 0.00982  |  -30%
2048KiB    | 0.014305 | 0.010076 |  -30%

Note that these numbers are even better than the ones from v1 (verified over multiple reboots), even though there were only minimal code changes. Well, I removed a pte_mkclean() call for anon folios, maybe that also plays a role. But my experience is that fork() is extremely sensitive to code size, inlining, ... so I suspect that on other architectures we'll see a change closer to -20% than -30%, and it will be easy to "lose" some of that speedup in the future by subtle code changes. Next up is PTE batching when unmapping. Only tested on x86-64. Compile-tested on most other architectures. [1] https://lkml.kernel.org/r/20231220224504.646757-1-david@redhat.com [2] https://lkml.kernel.org/r/20231218105100.172635-1-ryan.roberts@arm.com [3] https://lkml.kernel.org/r/20230809083256.699513-1-david@redhat.com [4] https://lkml.kernel.org/r/20231124132626.235350-1-david@redhat.com [5] https://lkml.kernel.org/r/20231207161211.2374093-1-ryan.roberts@arm.com This patch (of 15): Since the high bits [51:48] of an OA are not stored contiguously in the PTE, there is a theoretical bug in set_ptes(), which just adds PAGE_SIZE to the pte to get the pte with the next pfn. This works until the pfn crosses the 48-bit boundary, at which point we overflow into the upper attributes. Of course one could argue (and Matthew Wilcox has :) that we will never see a folio cross this boundary because we only allow naturally aligned power-of-2 allocation, so this would require a half-petabyte folio. So it's only a theoretical bug. But it's better that the code is robust regardless. I've implemented pte_next_pfn() as part of the fix, which is an opt-in core-mm interface. So that is now available to the core-mm, which will be needed shortly to support forthcoming fork()-batching optimizations. Link: https://lkml.kernel.org/r/20240129124649.189745-1-david@redhat.com Link: https://lkml.kernel.org/r/20240125173534.1659317-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240129124649.189745-2-david@redhat.com Fixes: 4a169d61c2ed ("arm64: implement the new page table range API") Closes: https://lore.kernel.org/linux-mm/fdaeb9a5-d890-499a-92c8-d171df43ad01@arm.com/ Signed-off-by: Ryan Roberts Signed-off-by: David Hildenbrand Reviewed-by: Catalin Marinas Reviewed-by: David Hildenbrand Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S.
Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Alexandre Ghiti Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 79ce70fbb751c6..52d0b0a763f164 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -341,6 +341,22 @@ static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages) mte_sync_tags(pte, nr_pages); } +/* + * Select all bits except the pfn + */ +static inline pgprot_t pte_pgprot(pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + + return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); +} + +#define pte_next_pfn pte_next_pfn +static inline pte_t pte_next_pfn(pte_t pte) +{ + return pfn_pte(pte_pfn(pte) + 1, pte_pgprot(pte)); +} + static inline void set_ptes(struct mm_struct *mm, unsigned long __always_unused addr, pte_t *ptep, pte_t pte, unsigned int nr) @@ -354,7 +370,7 @@ static inline void set_ptes(struct mm_struct *mm, if (--nr == 0) break; ptep++; - pte_val(pte) += PAGE_SIZE; + pte = pte_next_pfn(pte); } } #define set_ptes set_ptes @@ -433,16 +449,6 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) return clear_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE)); } -/* - * Select all bits except the pfn - */ -static inline pgprot_t pte_pgprot(pte_t pte) -{ - unsigned long pfn = pte_pfn(pte); - - return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); -} - #ifdef CONFIG_NUMA_BALANCING /* * See the comment in include/linux/pgtable.h From 1751e68babc5fb2c51cceb7c24a83cb4e9933823 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:36 +0100 Subject: [PATCH 1130/1406] arm/pgtable: define PFN_PTE_SHIFT We want to make use of pte_next_pfn() outside of set_ptes(). Let's simply define PFN_PTE_SHIFT, required by pte_next_pfn(). Link: https://lkml.kernel.org/r/20240129124649.189745-3-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. 
Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index d657b84b6bf706..be91e376df79e4 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -209,6 +209,8 @@ static inline void __sync_icache_dcache(pte_t pteval) extern void __sync_icache_dcache(pte_t pteval); #endif +#define PFN_PTE_SHIFT PAGE_SHIFT + void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval, unsigned int nr); #define set_ptes set_ptes From 6846f397d7a7baa1674c0b0db3bb65b7bc0edb27 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:37 +0100 Subject: [PATCH 1131/1406] nios2/pgtable: define PFN_PTE_SHIFT We want to make use of pte_next_pfn() outside of set_ptes(). Let's simply define PFN_PTE_SHIFT, required by pte_next_pfn(). Link: https://lkml.kernel.org/r/20240129124649.189745-4-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/nios2/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index 5144506dfa6932..d052dfcbe8d3a0 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -178,6 +178,8 @@ static inline void set_pte(pte_t *ptep, pte_t pteval) *ptep = pteval; } +#define PFN_PTE_SHIFT 0 + static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { From 2c862977317cd8401a3ccfb2c140fa208c3c233f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:38 +0100 Subject: [PATCH 1132/1406] powerpc/pgtable: define PFN_PTE_SHIFT We want to make use of pte_next_pfn() outside of set_ptes(). Let's simply define PFN_PTE_SHIFT, required by pte_next_pfn(). Link: https://lkml.kernel.org/r/20240129124649.189745-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Christophe Leroy Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. 
Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 9224f23065fff9..7a1ba8889aeaea 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -41,6 +41,8 @@ struct mm_struct; #ifndef __ASSEMBLY__ +#define PFN_PTE_SHIFT PTE_RPN_SHIFT + void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr); #define set_ptes set_ptes From 7efb7645930fa28499798155d4c53fb7acd7a38a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:39 +0100 Subject: [PATCH 1133/1406] riscv/pgtable: define PFN_PTE_SHIFT We want to make use of pte_next_pfn() outside of set_ptes(). Let's simply define PFN_PTE_SHIFT, required by pte_next_pfn(). Link: https://lkml.kernel.org/r/20240129124649.189745-6-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Alexandre Ghiti Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/riscv/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 0c94260b5d0c12..add5cd30ab34d8 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -523,6 +523,8 @@ static inline void __set_pte_at(pte_t *ptep, pte_t pteval) set_pte(ptep, pteval); } +#define PFN_PTE_SHIFT _PAGE_PFN_SHIFT + static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval, unsigned int nr) { From ef0ac7709f93fd8cf5261321b2b1243aff941275 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:40 +0100 Subject: [PATCH 1134/1406] s390/pgtable: define PFN_PTE_SHIFT We want to make use of pte_next_pfn() outside of set_ptes(). Let's simply define PFN_PTE_SHIFT, required by pte_next_pfn(). Link: https://lkml.kernel.org/r/20240129124649.189745-7-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. 
Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/s390/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 1299b56e43f6f9..4b91e65c85d97a 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1316,6 +1316,8 @@ pgprot_t pgprot_writecombine(pgprot_t prot); #define pgprot_writethrough pgprot_writethrough pgprot_t pgprot_writethrough(pgprot_t prot); +#define PFN_PTE_SHIFT PAGE_SHIFT + /* * Set multiple PTEs to consecutive pages with a single call. All PTEs * are within the same folio, PMD and VMA. From 050200bf5740c98dec55d4d8fd2b144d0600b2ff Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:41 +0100 Subject: [PATCH 1135/1406] sparc/pgtable: define PFN_PTE_SHIFT We want to make use of pte_next_pfn() outside of set_ptes(). Let's simply define PFN_PTE_SHIFT, required by pte_next_pfn(). Link: https://lkml.kernel.org/r/20240129124649.189745-8-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/sparc/include/asm/pgtable_64.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index a8c871b7d78608..652af9d63fa29e 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -929,6 +929,8 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, maybe_tlb_batch_add(mm, addr, ptep, orig, fullmm, PAGE_SHIFT); } +#define PFN_PTE_SHIFT PAGE_SHIFT + static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { From 23ae2e3e030369d1e67dab09b0f2d38b5d2402d5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:42 +0100 Subject: [PATCH 1136/1406] mm/pgtable: make pte_next_pfn() independent of set_ptes() Let's provide pte_next_pfn(), independently of set_ptes(). This allows for using the generic pte_next_pfn() version in some arch-specific set_ptes() implementations, and prepares for reusing pte_next_pfn() in other context. Link: https://lkml.kernel.org/r/20240129124649.189745-9-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Christophe Leroy Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. 
Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index f6d0e3513948ac..351cd9dc7194f8 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,7 +212,6 @@ static inline int pmd_dirty(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif -#ifndef set_ptes #ifndef pte_next_pfn static inline pte_t pte_next_pfn(pte_t pte) @@ -221,6 +220,7 @@ static inline pte_t pte_next_pfn(pte_t pte) } #endif +#ifndef set_ptes /** * set_ptes - Map consecutive pages to a contiguous range of addresses. * @mm: Address space to map the pages into. From d0a24baa0d956bba207cf3c9e9b5f4e81b81a791 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:43 +0100 Subject: [PATCH 1137/1406] arm/mm: use pte_next_pfn() in set_ptes() Let's use our handy helper now that it's available on all archs. Link: https://lkml.kernel.org/r/20240129124649.189745-10-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm/mm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 674ed71573a84c..c24e29c0b9a48e 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -1814,6 +1814,6 @@ void set_ptes(struct mm_struct *mm, unsigned long addr, if (--nr == 0) break; ptep++; - pte_val(pteval) += PAGE_SIZE; + pteval = pte_next_pfn(pteval); } } From 93bda38b7beeeb553dfeac449e97086bc0726b65 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:44 +0100 Subject: [PATCH 1138/1406] powerpc/mm: use pte_next_pfn() in set_ptes() Let's use our handy new helper. Note that the implementation is slightly different, but shouldn't really make a difference in practice. Link: https://lkml.kernel.org/r/20240129124649.189745-11-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Christophe Leroy Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/mm/pgtable.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index a04ae4449a0257..549a440ed7f652 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -220,10 +220,7 @@ void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, break; ptep++; addr += PAGE_SIZE; - /* - * increment the pfn. 
- */ - pte = pfn_pte(pte_pfn(pte) + 1, pte_pgprot((pte))); + pte = pte_next_pfn(pte); } } From 49a35950042173a095af1133b412ef231eae65ee Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:45 +0100 Subject: [PATCH 1139/1406] mm/memory: factor out copying the actual PTE in copy_present_pte() Let's prepare for further changes. Link: https://lkml.kernel.org/r/20240129124649.189745-12-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/memory.c | 63 ++++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 00f3f4fbd131d0..bef6fd925d0466 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -930,6 +930,29 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma return 0; } +static inline void __copy_present_pte(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, + pte_t pte, unsigned long addr) +{ + struct mm_struct *src_mm = src_vma->vm_mm; + + /* If it's a COW mapping, write protect it both processes. */ + if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) { + ptep_set_wrprotect(src_mm, addr, src_pte); + pte = pte_wrprotect(pte); + } + + /* If it's a shared mapping, mark it clean in the child. */ + if (src_vma->vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + + if (!userfaultfd_wp(dst_vma)) + pte = pte_clear_uffd_wp(pte); + + set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); +} + /* * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page * is required to copy this pte. @@ -939,23 +962,23 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, struct folio **prealloc) { - struct mm_struct *src_mm = src_vma->vm_mm; - unsigned long vm_flags = src_vma->vm_flags; pte_t pte = ptep_get(src_pte); struct page *page; struct folio *folio; page = vm_normal_page(src_vma, addr, pte); - if (page) - folio = page_folio(page); - if (page && folio_test_anon(folio)) { + if (unlikely(!page)) + goto copy_pte; + + folio = page_folio(page); + folio_get(folio); + if (folio_test_anon(folio)) { /* * If this page may have been pinned by the parent process, * copy the page immediately for the child so that we'll always * guarantee the pinned page won't be randomly replaced in the * future. */ - folio_get(folio); if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) { /* Page may be pinned, we have to copy. 
*/ folio_put(folio); @@ -963,34 +986,14 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, addr, rss, prealloc, page); } rss[MM_ANONPAGES]++; - } else if (page) { - folio_get(folio); + VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); + } else { folio_dup_file_rmap_pte(folio, page); rss[mm_counter_file(folio)]++; } - /* - * If it's a COW mapping, write protect it both - * in the parent and the child - */ - if (is_cow_mapping(vm_flags) && pte_write(pte)) { - ptep_set_wrprotect(src_mm, addr, src_pte); - pte = pte_wrprotect(pte); - } - VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page)); - - /* - * If it's a shared mapping, mark it clean in - * the child - */ - if (vm_flags & VM_SHARED) - pte = pte_mkclean(pte); - pte = pte_mkold(pte); - - if (!userfaultfd_wp(dst_vma)) - pte = pte_clear_uffd_wp(pte); - - set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); +copy_pte: + __copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, pte, addr); return 0; } From dec7bf6ca96be3900cd1a7973e3c1c103c54c52a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:46 +0100 Subject: [PATCH 1140/1406] mm/memory: pass PTE to copy_present_pte() We already read it, let's just forward it. This patch is based on work by Ryan Roberts. Link: https://lkml.kernel.org/r/20240129124649.189745-13-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/memory.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index bef6fd925d0466..f7276a6bb3c782 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -959,10 +959,9 @@ static inline void __copy_present_pte(struct vm_area_struct *dst_vma, */ static inline int copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, - pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, - struct folio **prealloc) + pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, + int *rss, struct folio **prealloc) { - pte_t pte = ptep_get(src_pte); struct page *page; struct folio *folio; @@ -1103,7 +1102,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, } /* copy_present_pte() will clear `*prealloc' if consumed */ ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, - addr, rss, &prealloc); + ptent, addr, rss, &prealloc); /* * If we need a pre-allocated page for this pte, drop the * locks, allocate, and try again. From 1733196542654992b8281845b6e8809088393332 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 23:40:22 +0100 Subject: [PATCH 1141/1406] fixup: mm/memory: pass PTE to copy_present_pte() For device-exclusive nonswp entries (is_device_exclusive_entry()), copy_nonpresent_pte() can turn the PTEs into actual present PTEs while holding the page table lock. We have to re-read the PTE after that operation, so that we don't keep working on the stale non-present PTE value while assuming it is present. This fixes the hmm "exclusive_cow" selftest.
./run_vmtests.sh -t hmm # # RUN hmm.hmm_device_private.exclusive_cow ... # # OK hmm.hmm_device_private.exclusive_cow # ok 23 hmm.hmm_device_private.exclusive_cow Link: https://lkml.kernel.org/r/13f296b8-e882-47fd-b939-c2141dc28717@redhat.com Signed-off-by: David Hildenbrand Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Mike Rapoport (IBM) Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Ryan Roberts Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/memory.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/memory.c b/mm/memory.c index f7276a6bb3c782..ca888431680cbd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1093,6 +1093,8 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, progress += 8; continue; } + ptent = ptep_get(src_pte); + VM_WARN_ON_ONCE(!pte_present(ptent)); /* * Device exclusive entry restored, continue by copying From 20f5b61c03485b3703f36d2dcb2ab8f573fa1a16 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:47 +0100 Subject: [PATCH 1142/1406] mm/memory: optimize fork() with PTE-mapped THP Let's implement PTE batching when consecutive (present) PTEs map consecutive pages of the same large folio, and all other PTE bits besides the PFNs are equal. We will optimize folio_pte_batch() separately, to ignore selected PTE bits. This patch is based on work by Ryan Roberts. Use __always_inline for __copy_present_ptes() and keep the handling for single PTEs completely separate from the multi-PTE case: we really want the compiler to optimize for the single-PTE case with small folios, to not degrade performance. Note that PTE batching will never exceed a single page table and will always stay within VMA boundaries. Further, processing PTE-mapped THP that may be pinned and have PageAnonExclusive set on at least one subpage should work as expected, but there is room for improvement: We will repeatedly (1) detect a PTE batch (2) detect that we have to copy a page (3) fall back and allocate a single page to copy a single page. For now we won't care as pinned pages are a corner case, and we should rather look into maintaining only a single PageAnonExclusive bit for large folios. Link: https://lkml.kernel.org/r/20240129124649.189745-14-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N.
Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 31 +++++++++++ mm/memory.c | 112 +++++++++++++++++++++++++++++++++------- 2 files changed, 124 insertions(+), 19 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 351cd9dc7194f8..aab227e12493fb 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -650,6 +650,37 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres } #endif +#ifndef wrprotect_ptes +/** + * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same + * folio. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to write-protect. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_set_wrprotect(). + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + for (;;) { + ptep_set_wrprotect(mm, addr, ptep); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} +#endif + /* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibility of software setting this bit. It brings diff --git a/mm/memory.c b/mm/memory.c index ca888431680cbd..a7eb2301a1d15a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -930,15 +930,15 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma return 0; } -static inline void __copy_present_pte(struct vm_area_struct *dst_vma, +static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, - pte_t pte, unsigned long addr) + pte_t pte, unsigned long addr, int nr) { struct mm_struct *src_mm = src_vma->vm_mm; /* If it's a COW mapping, write protect it both processes. */ if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) { - ptep_set_wrprotect(src_mm, addr, src_pte); + wrprotect_ptes(src_mm, addr, src_pte, nr); pte = pte_wrprotect(pte); } @@ -950,26 +950,93 @@ static inline void __copy_present_pte(struct vm_area_struct *dst_vma, if (!userfaultfd_wp(dst_vma)) pte = pte_clear_uffd_wp(pte); - set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); + set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr); +} + +/* + * Detect a PTE batch: consecutive (present) PTEs that map consecutive + * pages of the same folio. + * + * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN. + */ +static inline int folio_pte_batch(struct folio *folio, unsigned long addr, + pte_t *start_ptep, pte_t pte, int max_nr) +{ + unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); + const pte_t *end_ptep = start_ptep + max_nr; + pte_t expected_pte = pte_next_pfn(pte); + pte_t *ptep = start_ptep + 1; + + VM_WARN_ON_FOLIO(!pte_present(pte), folio); + + while (ptep != end_ptep) { + pte = ptep_get(ptep); + + if (!pte_same(pte, expected_pte)) + break; + + /* + * Stop immediately once we reached the end of the folio. 
In + * corner cases the next PFN might fall into a different + * folio. + */ + if (pte_pfn(pte) == folio_end_pfn) + break; + + expected_pte = pte_next_pfn(expected_pte); + ptep++; + } + + return ptep - start_ptep; } /* - * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page - * is required to copy this pte. + * Copy one present PTE, trying to batch-process subsequent PTEs that map + * consecutive pages of the same folio by copying them as well. + * + * Returns -EAGAIN if one preallocated page is required to copy the next PTE. + * Otherwise, returns the number of copied PTEs (at least 1). */ static inline int -copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, +copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, - int *rss, struct folio **prealloc) + int max_nr, int *rss, struct folio **prealloc) { struct page *page; struct folio *folio; + int err, nr; page = vm_normal_page(src_vma, addr, pte); if (unlikely(!page)) goto copy_pte; folio = page_folio(page); + + /* + * If we likely have to copy, just don't bother with batching. Make + * sure that the common "small folio" case is as fast as possible + * by keeping the batching logic separate. + */ + if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) { + nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr); + folio_ref_add(folio, nr); + if (folio_test_anon(folio)) { + if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, + nr, src_vma))) { + folio_ref_sub(folio, nr); + return -EAGAIN; + } + rss[MM_ANONPAGES] += nr; + VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); + } else { + folio_dup_file_rmap_ptes(folio, page, nr); + rss[mm_counter_file(folio)] += nr; + } + __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, + addr, nr); + return nr; + } + folio_get(folio); if (folio_test_anon(folio)) { /* @@ -981,8 +1048,9 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) { /* Page may be pinned, we have to copy. */ folio_put(folio); - return copy_present_page(dst_vma, src_vma, dst_pte, src_pte, - addr, rss, prealloc, page); + err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, + addr, rss, prealloc, page); + return err ? err : 1; } rss[MM_ANONPAGES]++; VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); @@ -992,8 +1060,8 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, } copy_pte: - __copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, pte, addr); - return 0; + __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1); + return 1; } static inline struct folio *folio_prealloc(struct mm_struct *src_mm, @@ -1030,10 +1098,11 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *src_pte, *dst_pte; pte_t ptent; spinlock_t *src_ptl, *dst_ptl; - int progress, ret = 0; + int progress, max_nr, ret = 0; int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; struct folio *prealloc = NULL; + int nr; again: progress = 0; @@ -1064,6 +1133,8 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, arch_enter_lazy_mmu_mode(); do { + nr = 1; + /* * We are holding two locks at this point - either of them * could generate latencies in another task on another CPU. 
@@ -1102,9 +1173,10 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, */ WARN_ON_ONCE(ret != -ENOENT); } - /* copy_present_pte() will clear `*prealloc' if consumed */ - ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, - ptent, addr, rss, &prealloc); + /* copy_present_ptes() will clear `*prealloc' if consumed */ + max_nr = (end - addr) / PAGE_SIZE; + ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, + ptent, addr, max_nr, rss, &prealloc); /* * If we need a pre-allocated page for this pte, drop the * locks, allocate, and try again. @@ -1121,8 +1193,10 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, folio_put(prealloc); prealloc = NULL; } - progress += 8; - } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + nr = ret; + progress += 8 * nr; + } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr, + addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(orig_src_pte, src_ptl); @@ -1143,7 +1217,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, prealloc = folio_prealloc(src_mm, src_vma, addr, false); if (!prealloc) return -ENOMEM; - } else if (ret) { + } else if (ret < 0) { VM_WARN_ON_ONCE(1); } From fee7a6115e1e249a7ca8434cd22368e43a4aa061 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:48 +0100 Subject: [PATCH 1143/1406] mm/memory: ignore dirty/accessed/soft-dirty bits in folio_pte_batch() Let's always ignore the accessed/young bit: we'll always mark the PTE as old in our child process during fork, and upcoming users will similarly not care. Ignore the dirty bit only if we don't want to duplicate the dirty bit into the child process during fork. Maybe, we could just set all PTEs in the child dirty if any PTE is dirty. For now, let's keep the behavior unchanged, this can be optimized later if required. Ignore the soft-dirty bit only if the bit doesn't have any meaning in the src vma, and similarly won't have any in the copied dst vma. For now, we won't bother with the uffd-wp bit. Link: https://lkml.kernel.org/r/20240129124649.189745-15-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/memory.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index a7eb2301a1d15a..ec5741f37c77f2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -953,24 +953,44 @@ static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma, set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr); } +/* Flags for folio_pte_batch(). */ +typedef int __bitwise fpb_t; + +/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */ +#define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0)) + +/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. 
*/ +#define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1)) + +static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) +{ + if (flags & FPB_IGNORE_DIRTY) + pte = pte_mkclean(pte); + if (likely(flags & FPB_IGNORE_SOFT_DIRTY)) + pte = pte_clear_soft_dirty(pte); + return pte_mkold(pte); +} + /* * Detect a PTE batch: consecutive (present) PTEs that map consecutive * pages of the same folio. * - * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN. + * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, + * the accessed bit, dirty bit (with FPB_IGNORE_DIRTY) and soft-dirty bit + * (with FPB_IGNORE_SOFT_DIRTY). */ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, - pte_t *start_ptep, pte_t pte, int max_nr) + pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags) { unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); const pte_t *end_ptep = start_ptep + max_nr; - pte_t expected_pte = pte_next_pfn(pte); + pte_t expected_pte = __pte_batch_clear_ignored(pte_next_pfn(pte), flags); pte_t *ptep = start_ptep + 1; VM_WARN_ON_FOLIO(!pte_present(pte), folio); while (ptep != end_ptep) { - pte = ptep_get(ptep); + pte = __pte_batch_clear_ignored(ptep_get(ptep), flags); if (!pte_same(pte, expected_pte)) break; @@ -1004,6 +1024,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma { struct page *page; struct folio *folio; + fpb_t flags = 0; int err, nr; page = vm_normal_page(src_vma, addr, pte); @@ -1018,7 +1039,12 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * by keeping the batching logic separate. */ if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) { - nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr); + if (src_vma->vm_flags & VM_SHARED) + flags |= FPB_IGNORE_DIRTY; + if (!vma_soft_dirty_enabled(src_vma)) + flags |= FPB_IGNORE_SOFT_DIRTY; + + nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, From 12850631e3012729d3ee729344b3ffea7fc2ffe9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:49 +0100 Subject: [PATCH 1144/1406] mm/memory: ignore writable bit in folio_pte_batch() ... and conditionally return to the caller if any PTE except the first one is writable. fork() has to make sure to properly write-protect in case any PTE is writable. Other users (e.g., page unmaping) are expected to not care. Link: https://lkml.kernel.org/r/20240129124649.189745-16-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. 
Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/memory.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index ec5741f37c77f2..e5e5056cb53fe5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -968,7 +968,7 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) pte = pte_mkclean(pte); if (likely(flags & FPB_IGNORE_SOFT_DIRTY)) pte = pte_clear_soft_dirty(pte); - return pte_mkold(pte); + return pte_wrprotect(pte_mkold(pte)); } /* @@ -976,21 +976,32 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) * pages of the same folio. * * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, - * the accessed bit, dirty bit (with FPB_IGNORE_DIRTY) and soft-dirty bit - * (with FPB_IGNORE_SOFT_DIRTY). + * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and + * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY). + * + * If "any_writable" is set, it will indicate if any other PTE besides the + * first (given) PTE is writable. */ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, - pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags) + pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, + bool *any_writable) { unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); const pte_t *end_ptep = start_ptep + max_nr; pte_t expected_pte = __pte_batch_clear_ignored(pte_next_pfn(pte), flags); pte_t *ptep = start_ptep + 1; + bool writable; + + if (any_writable) + *any_writable = false; VM_WARN_ON_FOLIO(!pte_present(pte), folio); while (ptep != end_ptep) { - pte = __pte_batch_clear_ignored(ptep_get(ptep), flags); + pte = ptep_get(ptep); + if (any_writable) + writable = !!pte_write(pte); + pte = __pte_batch_clear_ignored(pte, flags); if (!pte_same(pte, expected_pte)) break; @@ -1003,6 +1014,9 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, if (pte_pfn(pte) == folio_end_pfn) break; + if (any_writable) + *any_writable |= writable; + expected_pte = pte_next_pfn(expected_pte); ptep++; } @@ -1024,6 +1038,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma { struct page *page; struct folio *folio; + bool any_writable; fpb_t flags = 0; int err, nr; @@ -1044,7 +1059,8 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma if (!vma_soft_dirty_enabled(src_vma)) flags |= FPB_IGNORE_SOFT_DIRTY; - nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags); + nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, + &any_writable); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, @@ -1058,6 +1074,8 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma folio_dup_file_rmap_ptes(folio, page, nr); rss[mm_counter_file(folio)] += nr; } + if (any_writable) + pte = pte_mkwrite(pte, src_vma); __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, nr); return nr; From 5ca74542683228f3f6bdd8d2175830ebe8474296 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 29 Jan 2024 03:52:46 -0800 Subject: [PATCH 1145/1406] selftests/mm: run_vmtests.sh: add hugetlb test category The usage of run_vmtests.sh does not include hugetlb, which is a valid test category. Add the 'hugetlb' to the usage of run_vmtests.sh. 
Link: https://lkml.kernel.org/r/20240129115246.1234253-1-leitao@debian.org Signed-off-by: Breno Leitao Reviewed-by: Muhammad Usama Anjum Reviewed-by: Joel Savitz Cc: Ryan Roberts Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 040f27e21f47a3..81b5980886dafc 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -64,6 +64,8 @@ separated by spaces: test copy-on-write semantics - thp test transparent huge pages +- hugetlb + test hugetlbfs huge pages - migration invoke move_pages(2) to exercise the migration entry code paths in the kernel From 48e546d6064e5844e1f25fca3c6caabe4e691d2f Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Sat, 3 Feb 2024 09:46:32 +0800 Subject: [PATCH 1146/1406] mm/mmap: pass vma to vma_merge() These vma_merge() callers will pass mm, anon_vma and file, they all from the same vma. There is no need to pass three parameters at the same time. Pass vma instead of mm, anon_vma and file to vma_merge(), so that it can save two parameters. Link: https://lkml.kernel.org/r/20240203014632.2726545-1-yajun.deng@linux.dev Link: https://lore.kernel.org/lkml/20240125034922.1004671-2-yajun.deng@linux.dev/ Signed-off-by: Yajun Deng Reviewed-by: Liam R. Howlett Cc: Yajun Deng Signed-off-by: Andrew Morton --- mm/mmap.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 1f9e7024285866..ccf377ee319f70 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -860,13 +860,15 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * area is returned, or the function will return NULL */ static struct vm_area_struct -*vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, - struct vm_area_struct *prev, unsigned long addr, unsigned long end, - unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, - pgoff_t pgoff, struct mempolicy *policy, +*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev, + struct vm_area_struct *src, unsigned long addr, unsigned long end, + unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, struct anon_vma_name *anon_name) { + struct mm_struct *mm = src->vm_mm; + struct anon_vma *anon_vma = src->anon_vma; + struct file *file = src->vm_file; struct vm_area_struct *curr, *next, *res; struct vm_area_struct *vma, *adjust, *remove, *remove2; struct vm_area_struct *anon_dup = NULL; @@ -2426,9 +2428,8 @@ struct vm_area_struct *vma_modify(struct vma_iterator *vmi, pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); struct vm_area_struct *merged; - merged = vma_merge(vmi, vma->vm_mm, prev, start, end, vm_flags, - vma->anon_vma, vma->vm_file, pgoff, policy, - uffd_ctx, anon_name); + merged = vma_merge(vmi, prev, vma, start, end, vm_flags, + pgoff, policy, uffd_ctx, anon_name); if (merged) return merged; @@ -2458,9 +2459,8 @@ static struct vm_area_struct struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff) { - return vma_merge(vmi, vma->vm_mm, prev, start, end, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff, + vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); } /* @@ -2474,10 +2474,9 @@ 
struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma); /* vma is specified as prev, so case 1 or 2 will apply. */ - return vma_merge(vmi, vma->vm_mm, vma, vma->vm_end, vma->vm_end + delta, - vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, - vma_policy(vma), vma->vm_userfaultfd_ctx, - anon_vma_name(vma)); + return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta, + vma->vm_flags, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); } /* From 7d37afbc4d6954395eb3bc7edec0f276f4832693 Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Fri, 2 Feb 2024 23:38:54 +0000 Subject: [PATCH 1147/1406] mm: memcg: use larger batches for proactive reclaim Before 0388536ac291 ("mm:vmscan: fix inaccurate reclaim during proactive reclaim") we passed the number of pages for the reclaim request directly to try_to_free_mem_cgroup_pages, which could lead to significant overreclaim. After 0388536ac291 the number of pages was limited to a maximum of 32 (SWAP_CLUSTER_MAX) to reduce the amount of overreclaim. However such a small batch size caused a regression in reclaim performance due to many more reclaim start/stop cycles inside memory_reclaim. The restart cost is amortized over more pages with larger batch sizes, and becomes a significant component of the runtime if the batch size is too small. Reclaim tries to balance nr_to_reclaim fidelity with fairness across nodes and cgroups over which the pages are spread. As such, the bigger the request, the bigger the absolute overreclaim error. Historic in-kernel users of reclaim have used fixed, small-sized requests to approach an appropriate reclaim rate over time. When we reclaim a user request of arbitrary size, use decaying batch sizes to manage error while maintaining reasonable throughput.

MGLRU enabled - memcg LRU used
root - full reclaim       pages/sec   time (sec)
pre-0388536ac291        :   68047       10.46
post-0388536ac291       :   13742         inf
(reclaim-reclaimed)/4   :   67352       10.51

MGLRU enabled - memcg LRU not used
/uid_0 - 1G reclaim       pages/sec   time (sec)   overreclaim (MiB)
pre-0388536ac291        :  258822        1.12          107.8
post-0388536ac291       :  105174        2.49            3.5
(reclaim-reclaimed)/4   :  233396        1.12           -7.4

MGLRU enabled - memcg LRU not used
/uid_0 - full reclaim     pages/sec   time (sec)
pre-0388536ac291        :   72334        7.09
post-0388536ac291       :   38105       14.45
(reclaim-reclaimed)/4   :   72914        6.96

Link: https://lkml.kernel.org/r/20240202233855.1236422-1-tjmercier@google.com Fixes: 0388536ac291 ("mm:vmscan: fix inaccurate reclaim during proactive reclaim") Signed-off-by: T.J.
Mercier Reviewed-by: Yosry Ahmed Acked-by: Johannes Weiner Reviewed-by: Michal Koutny Acked-by: Shakeel Butt Acked-by: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Muchun Song Cc: Efly Young Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/memcontrol.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 484a9d2862d4fe..c01615b34df86a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6994,9 +6994,11 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, if (!nr_retries) lru_add_drain_all(); + /* Will converge on zero, but reclaim enforces a minimum */ + unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4; + reclaimed = try_to_free_mem_cgroup_pages(memcg, - min(nr_to_reclaim - nr_reclaimed, SWAP_CLUSTER_MAX), - GFP_KERNEL, reclaim_options); + batch_size, GFP_KERNEL, reclaim_options); if (!reclaimed && !nr_retries--) return -EAGAIN; From ee1ab9c6f7d15eed319d2e1221d2e057ea9adb13 Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Tue, 6 Feb 2024 17:52:50 +0000 Subject: [PATCH 1148/1406] mm-memcg-use-larger-batches-for-proactive-reclaim-v4 Add additional info to commit message and move definition of batch_size per Michal Hocko. No functional changes. Link: https://lkml.kernel.org/r/20240206175251.3364296-1-tjmercier@google.com Fixes: 0388536ac291 ("mm:vmscan: fix inaccurate reclaim during proactive reclaim") Signed-off-by: T.J. Mercier Reviewed-by: Yosry Ahmed Acked-by: Johannes Weiner Acked-by: Shakeel Butt Reviewed-by: Michal Koutny Cc: Efly Young Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/memcontrol.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c01615b34df86a..cb216d30a22152 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6981,6 +6981,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; while (nr_reclaimed < nr_to_reclaim) { + /* Will converge on zero, but reclaim enforces a minimum */ + unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4; unsigned long reclaimed; if (signal_pending(current)) @@ -6994,9 +6996,6 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, if (!nr_retries) lru_add_drain_all(); - /* Will converge on zero, but reclaim enforces a minimum */ - unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4; - reclaimed = try_to_free_mem_cgroup_pages(memcg, batch_size, GFP_KERNEL, reclaim_options); From 05c1741e296d1a356a8b4cccbe3a797036325039 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 2 Feb 2024 22:23:18 +0100 Subject: [PATCH 1149/1406] mm: reduce dependencies on "page_counter.h" <linux/page_counter.h> does not need <linux/kernel.h>. <linux/limits.h> is enough to get LONG_MAX. Files that include page_counter.h are limited. They have been compile tested or checked.
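For context, the only symbol the header pulls from the heavier include is the LONG_MAX constant used for the counter ceiling; a hedged sketch of the dependency (the SKETCH_ name is hypothetical and the ceiling definition is paraphrased from the upstream header, not taken from this patch):

	#include <linux/limits.h>	/* LONG_MAX: all page_counter.h needs */
	#include <asm/page.h>		/* PAGE_SIZE */

	/* Roughly how the header caps a counter (paraphrased): */
	#define SKETCH_PAGE_COUNTER_MAX	(LONG_MAX / PAGE_SIZE)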
$ git grep page_counter\.h
include/linux/hugetlb_cgroup.h: struct page_counter hugepage[HUGE_MAX_HSTATE]; --> all files that include it have been compile tested
include/linux/memcontrol.h:#include <linux/page_counter.h> --> <linux/kernel.h> has been added, to be safe
include/net/sock.h:#include <linux/page_counter.h> --> already include <linux/kernel.h>
mm/hugetlb_cgroup.c:#include <linux/page_counter.h>
mm/memcontrol.c:#include <linux/page_counter.h>
mm/page_counter.c:#include <linux/page_counter.h> --> compile tested
Link: https://lkml.kernel.org/r/adfdbe21c4d06400d7bd802868762deb85cae8b6.1706908921.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 1 + include/linux/page_counter.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 20ff87f8e001d2..4e4caeaea4041b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -14,6 +14,7 @@ #include #include #include +#include <linux/kernel.h> #include #include #include diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index c141ea9a95ef86..8cd858d912c4b9 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -4,7 +4,7 @@ #include #include -#include <linux/kernel.h> +#include <linux/limits.h> #include struct page_counter { From c820515e2fc14ca30a8132cb553751280d4b953c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paul=20Heidekr=C3=BCger?= Date: Fri, 2 Feb 2024 11:32:59 +0000 Subject: [PATCH 1150/1406] kasan: add atomic tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test that KASan can detect some unsafe atomic accesses. As discussed in the linked thread below, these tests attempt to cover the most common uses of atomics and, therefore, aren't exhaustive. Link: https://lkml.kernel.org/r/20240202113259.3045705-1-paul.heidekrueger@tum.de Link: https://lore.kernel.org/all/20240131210041.686657-1-paul.heidekrueger@tum.de/T/#u Signed-off-by: Paul Heidekrüger Closes: https://bugzilla.kernel.org/show_bug.cgi?id=214055 Acked-by: Mark Rutland Cc: Marco Elver Cc: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/kasan_test.c | 79 +++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index 971cfff4ca0b78..318d9cec111aad 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -697,6 +697,84 @@ static void kmalloc_uaf3(struct kunit *test) { KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[8]); } +static void kasan_atomics_helper(struct kunit *test, void *unsafe, void *safe) +{ + int *i_unsafe = (int *)unsafe; + + KUNIT_EXPECT_KASAN_FAIL(test, READ_ONCE(*i_unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, WRITE_ONCE(*i_unsafe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, smp_load_acquire(i_unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, smp_store_release(i_unsafe, 42)); + + KUNIT_EXPECT_KASAN_FAIL(test, atomic_read(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_set(unsafe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_add(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_sub(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_and(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_andnot(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_or(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_xor(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_xchg(unsafe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test,
atomic_cmpxchg(unsafe, 21, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(unsafe, safe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(safe, unsafe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_sub_and_test(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_and_test(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_and_test(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_negative(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_unless(unsafe, 21, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_not_zero(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_unless_negative(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_unless_positive(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_if_positive(unsafe)); + + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_read(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_set(unsafe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_sub(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_and(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_andnot(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_or(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_xor(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_xchg(unsafe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_cmpxchg(unsafe, 21, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(unsafe, safe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(safe, unsafe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_sub_and_test(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_and_test(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_and_test(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_negative(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_unless(unsafe, 21, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_not_zero(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_unless_negative(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_unless_positive(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_if_positive(unsafe)); +} + +static void kasan_atomics(struct kunit *test) +{ + void *a1, *a2; + + /* + * Just as with kasan_bitops_tags(), we allocate 48 bytes of memory such + * that the following 16 bytes will make up the redzone. + */ + a1 = kzalloc(48, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, a1); + a2 = kzalloc(sizeof(int), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, a2); + + /* Use atomics to access the redzone. */ + kasan_atomics_helper(test, a1 + 48, a2); + + kfree(a1); + kfree(a2); +} + static void kmalloc_double_kzfree(struct kunit *test) { char *ptr; @@ -1883,6 +1961,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kasan_strings), KUNIT_CASE(kasan_bitops_generic), KUNIT_CASE(kasan_bitops_tags), + KUNIT_CASE(kasan_atomics), KUNIT_CASE(vmalloc_helpers_tags), KUNIT_CASE(vmalloc_oob), KUNIT_CASE(vmap_tags), From e4e16ed6eb8c256107aef0ad70e257473b4bf53d Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 5 Feb 2024 11:18:41 -0800 Subject: [PATCH 1151/1406] mm/hugetlb: Restore the reservation if needed Patch series "mm/hugetlb: Restore the reservation", v2. This is a fix for a case where a backing huge page could be stolen after madvise(MADV_DONTNEED). A full reproducer is in selftest.
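As a quick orientation before the detailed write-up that follows, here is a hedged, self-contained sketch of that reproducer sequence (the authoritative version is the linked selftest; this sketch assumes a 2 MiB default huge page size and the single-page setup from step 1 below):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>

	#define HPAGE_SIZE	(2UL << 20)	/* assumed 2 MiB huge pages */

	int main(void)
	{
		void *ptr1, *ptr2;

		/* Steps 2-3: map the only huge page and fault it in,
		 * which also consumes (unreserves) the reservation. */
		ptr1 = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
		if (ptr1 == MAP_FAILED) {
			perror("mmap ptr1");
			return 1;
		}
		memset(ptr1, 'x', HPAGE_SIZE);

		/* Step 4: frees the page without restoring the reservation. */
		madvise(ptr1, HPAGE_SIZE, MADV_DONTNEED);

		/* Step 5: should fail for lack of a reservable page, but
		 * succeeds on unfixed kernels. */
		ptr2 = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
		if (ptr2 != MAP_FAILED)
			memset(ptr2, 'y', HPAGE_SIZE);	/* steals the page */

		/* Step 6: faulting ptr1 again now raises SIGBUS. */
		memset(ptr1, 'x', HPAGE_SIZE);
		return 0;
	}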
See https://lore.kernel.org/all/20240105155419.1939484-1-leitao@debian.org/ In order to test this patch, I instrumented the kernel with LOCKDEP and KASAN, and ran the following tests, without any regression:
* The self test that reproduces the problem
* All mm hugetlb selftests SUMMARY: PASS=9 SKIP=0 FAIL=0
* All libhugetlbfs tests PASS: 0 86 FAIL: 0 0
This patch (of 2): Currently there is a bug where a huge page can be stolen, and when the original owner tries to fault it in, it causes a page fault. You can achieve that by:
1) Creating a single page: echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
2) mmap() the page above with MAP_HUGETLB into (void *ptr1).
   * This will mark the page as reserved.
3) Touch the page, which causes a page fault and allocates the page.
   * This will move the page out of the free list.
   * It will also unreserve the page, since there is no more free page.
4) madvise(MADV_DONTNEED) the page.
   * This will free the page, but not mark it as reserved.
5) Allocate a secondary page with mmap(MAP_HUGETLB) into (void *ptr2).
   * It should fail, since there is no more available page; but, since the page above is not reserved, this mmap() succeeds.
6) Faulting at ptr1 will cause a SIGBUS.
   * It will try to allocate a huge page, but there is none available.
A full reproducer is in selftest. See https://lore.kernel.org/all/20240105155419.1939484-1-leitao@debian.org/ Fix this by restoring the reserved page if necessary. These are the conditions for the page restore:
* The system is not using surplus pages. The goal is to reduce the surplus usage for this case.
* The VMA has the HPAGE_RESV_OWNER flag set, and is PRIVATE. This is safely checked using __vma_private_lock().
* The page is anonymous.
Once this scenario is found, set the `hugetlb_restore_reserve` bit in the folio. Then check whether the resv reservations need to be adjusted; this is done later, after the spin lock, since vma_xxxx_reservation() might touch the file system lock. Link: https://lkml.kernel.org/r/20240205191843.4009640-1-leitao@debian.org Link: https://lkml.kernel.org/r/20240205191843.4009640-2-leitao@debian.org Signed-off-by: Breno Leitao Suggested-by: Rik van Riel Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9d996fe4ecd9cc..edb2b2bf6f53df 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5665,6 +5665,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); + bool adjust_reservation = false; unsigned long last_addr_mask; bool force_flush = false; @@ -5757,7 +5758,31 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, hugetlb_count_sub(pages_per_huge_page(h), mm); hugetlb_remove_rmap(page_folio(page)); + /* + * Restore the reservation for anonymous page, otherwise the + * backing page could be stolen by someone. + * If we are freeing a surplus, do not set the restore + * reservation bit.
+ */ + if (!h->surplus_huge_pages && __vma_private_lock(vma) && + folio_test_anon(page_folio(page))) { + folio_set_hugetlb_restore_reserve(page_folio(page)); + /* Reservation to be adjusted after the spin lock */ + adjust_reservation = true; + } + spin_unlock(ptl); + + /* + * Adjust the reservation for the region that will have the + * reserve restored. Keep in mind that vma_needs_reservation() changes + * resv->adds_in_progress if it succeeds. If this is not done, + * do_exit() will not see it, and will keep the reservation + * forever. + */ + if (adjust_reservation && vma_needs_reservation(h, vma, address)) + vma_add_reservation(h, vma, address); + tlb_remove_page_size(tlb, page, huge_page_size(h)); /* * Bail out after unmapping reference page if supplied From 3bdbf24b4785d837a4b17273080026159cbd149a Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 5 Feb 2024 11:18:42 -0800 Subject: [PATCH 1152/1406] selftests/mm: run_vmtests.sh: add hugetlb_madv_vs_map hugetlb_madv_vs_map selftest was not part of the mm test-suite since we didn't have a fix for the problem it found. Now that the problem is already fixed (see previous commit), let's enable this selftest in the default test-suite. Link: https://lkml.kernel.org/r/20240205191843.4009640-3-leitao@debian.org Signed-off-by: Breno Leitao Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Rik van Riel Cc: Roman Gushchin Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 81b5980886dafc..de03d38907d62b 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -264,6 +264,7 @@ nr_hugepages_tmp=$(cat /proc/sys/vm/nr_hugepages) # For this test, we need one and just one huge page echo 1 > /proc/sys/vm/nr_hugepages CATEGORY="hugetlb" run_test ./hugetlb_fault_after_madv +CATEGORY="hugetlb" run_test ./hugetlb_madv_vs_map # Restore the previous number of huge pages, since further tests rely on it echo "$nr_hugepages_tmp" > /proc/sys/vm/nr_hugepages From 010b68a5cc5f89aedb62c9112e7acde8eebec34b Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Mon, 5 Feb 2024 14:56:06 -0800 Subject: [PATCH 1153/1406] selftests: zswap: add zswap selftest file to zswap maintainer entry Patch series "fix and extend zswap kselftests", v3. Fix a broken zswap kselftest due to cgroup zswap writeback counter renaming, and add 2 zswap kselftests, one to cover the (z)swapin case, and another to check that no zswapping happens when the cgroup limit is 0. Also, add the zswap kselftest file to zswap maintainer entry so that get_maintainers script can find zswap maintainers. This patch (of 3): Make it easier for contributors to find the zswap maintainers when they update the zswap tests. 
Link: https://lkml.kernel.org/r/20240205225608.3083251-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20240205225608.3083251-2-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Yosry Ahmed Cc: Johannes Weiner Cc: Rik van Riel Cc: Roman Gushchin Cc: Shuah Khan Cc: Tejun Heo Cc: Zefan Li Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f7c81cea9b69e5..f3f5981ced2961 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24423,6 +24423,7 @@ F: include/linux/zpool.h F: include/linux/zswap.h F: mm/zpool.c F: mm/zswap.c +F: tools/testing/selftests/cgroup/test_zswap.c THE REST M: Linus Torvalds From 09489a78132656ae1a7eed181ed77bf8cbcb2dce Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Mon, 5 Feb 2024 14:56:07 -0800 Subject: [PATCH 1154/1406] selftests: fix the zswap invasive shrink test The zswap no invasive shrink selftest breaks because we rename the zswap writeback counter (see [1]). Fix the test. [1]: https://patchwork.kernel.org/project/linux-kselftest/patch/20231205193307.2432803-1-nphamcs@gmail.com/ Link: https://lkml.kernel.org/r/20240205225608.3083251-3-nphamcs@gmail.com Fixes: a697dc2be925 ("selftests: cgroup: update per-memcg zswap writeback selftest") Signed-off-by: Nhat Pham Acked-by: Yosry Ahmed Cc: Johannes Weiner Cc: Rik van Riel Cc: Roman Gushchin Cc: Shuah Khan Cc: Tejun Heo Cc: Zefan Li Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_zswap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index 47fdaa14644300..32ce975b21d1f5 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -52,7 +52,7 @@ static int get_zswap_stored_pages(size_t *value) static int get_cg_wb_count(const char *cg) { - return cg_read_key_long(cg, "memory.stat", "zswp_wb"); + return cg_read_key_long(cg, "memory.stat", "zswpwb"); } static long get_zswpout(const char *cgroup) From e760dd577d1ec101720694c39b6eb377e61845a7 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Mon, 5 Feb 2024 14:56:08 -0800 Subject: [PATCH 1155/1406] selftests: add zswapin and no zswap tests Add a selftest to cover the zswapin code path, allocating more memory than the cgroup limit to trigger swapout/zswapout, then reading the pages back in memory several times. This is inspired by a recently encountered kernel crash on the zswapin path in our internal kernel, which went undetected because of a lack of test coverage for this path. Add a selftest to verify that when memory.zswap.max = 0, no pages can go to the zswap pool for the cgroup. 
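A detail worth noting when reading the diff below: the stat keys are passed with a trailing space ("zswpin ", "zswpout ") because the helper matches the key as a prefix inside memory.stat, and the space keeps a key from matching a longer counter name sharing that prefix. A hedged userspace sketch of the lookup pattern (the real helper is cg_read_key_long() in the suite's cgroup_util.c; this is an illustration, not its actual implementation):

	#include <stdlib.h>
	#include <string.h>

	/* Find "key value" in a memory.stat buffer; keys like "zswpin "
	 * keep the trailing space so prefixes cannot collide. */
	static long read_stat_key(const char *stat_buf, const char *key)
	{
		const char *p = strstr(stat_buf, key);

		return p ? atol(p + strlen(key)) : -1;
	}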
Link: https://lkml.kernel.org/r/20240205225608.3083251-4-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Rik van Riel Suggested-by: Yosry Ahmed Acked-by: Yosry Ahmed Cc: Johannes Weiner Cc: Roman Gushchin Cc: Shuah Khan Cc: Tejun Heo Cc: Zefan Li Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_zswap.c | 120 +++++++++++++++++++- 1 file changed, 119 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index 32ce975b21d1f5..c263610a4a60b6 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -60,6 +60,27 @@ static long get_zswpout(const char *cgroup) return cg_read_key_long(cgroup, "memory.stat", "zswpout "); } +static int allocate_and_read_bytes(const char *cgroup, void *arg) +{ + size_t size = (size_t)arg; + char *mem = (char *)malloc(size); + int ret = 0; + + if (!mem) + return -1; + for (int i = 0; i < size; i += 4095) + mem[i] = 'a'; + + /* go through the allocated memory to (z)swap in and out pages */ + for (int i = 0; i < size; i += 4095) { + if (mem[i] != 'a') + ret = -1; + } + + free(mem); + return ret; +} + static int allocate_bytes(const char *cgroup, void *arg) { size_t size = (size_t)arg; @@ -100,7 +121,6 @@ static int test_zswap_usage(const char *root) int ret = KSFT_FAIL; char *test_group; - /* Set up */ test_group = cg_name(root, "no_shrink_test"); if (!test_group) goto out; @@ -133,6 +153,102 @@ static int test_zswap_usage(const char *root) return ret; } +/* + * Check that when memory.zswap.max = 0, no pages can go to the zswap pool for + * the cgroup. + */ +static int test_swapin_nozswap(const char *root) +{ + int ret = KSFT_FAIL; + char *test_group; + long swap_peak, zswpout; + + test_group = cg_name(root, "no_zswap_test"); + if (!test_group) + goto out; + if (cg_create(test_group)) + goto out; + if (cg_write(test_group, "memory.max", "8M")) + goto out; + if (cg_write(test_group, "memory.zswap.max", "0")) + goto out; + + /* Allocate and read more than memory.max to trigger swapin */ + if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32))) + goto out; + + /* Verify that pages are swapped out, but no zswap happened */ + swap_peak = cg_read_long(test_group, "memory.swap.peak"); + if (swap_peak < 0) { + ksft_print_msg("failed to get cgroup's swap_peak\n"); + goto out; + } + + if (swap_peak == 0) { + ksft_print_msg("pages should be swapped out\n"); + goto out; + } + + zswpout = get_zswpout(test_group); + if (zswpout < 0) { + ksft_print_msg("failed to get zswpout\n"); + goto out; + } + + if (zswpout > 0) { + ksft_print_msg("zswapout > 0 when memory.zswap.max = 0\n"); + goto out; + } + + ret = KSFT_PASS; + +out: + cg_destroy(test_group); + free(test_group); + return ret; +} + +/* Simple test to verify the (z)swapin code paths */ +static int test_zswapin(const char *root) +{ + int ret = KSFT_FAIL; + char *test_group; + long zswpin; + + /* Set up */ + test_group = cg_name(root, "zswapin_test"); + if (!test_group) + goto out; + if (cg_create(test_group)) + goto out; + if (cg_write(test_group, "memory.max", "8M")) + goto out; + if (cg_write(test_group, "memory.zswap.max", "max")) + goto out; + + /* Allocate and read more than memory.max to trigger (z)swap in */ + if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32))) + goto out; + + zswpin = cg_read_key_long(test_group, "memory.stat", "zswpin "); + if (zswpin < 0) { + ksft_print_msg("failed to get zswpin\n"); + goto out; + } + + if (zswpin == 0) { + 
ksft_print_msg("zswpin should not be 0\n"); + goto out; + } + + ret = KSFT_PASS; + +out: + cg_destroy(test_group); + free(test_group); + return ret; +} + /* * When trying to store a memcg page in zswap, if the memcg hits its memory * limit in zswap, writeback should affect only the zswapped pages of that @@ -309,6 +425,8 @@ struct zswap_test { const char *name; } tests[] = { T(test_zswap_usage), + T(test_swapin_nozswap), + T(test_zswapin), T(test_no_kmem_bypass), T(test_no_invasive_cgroup_shrink), }; From 88c526fb64cc7173669703f27eb3b40bec26ce34 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 5 Feb 2024 01:37:29 -0800 Subject: [PATCH 1156/1406] ubsan: reintroduce signed overflow sanitizer In order to mitigate unexpected signed wrap-around[1], bring back the signed integer overflow sanitizer. It was removed in commit 6aaa31aeb9cf ("ubsan: remove overflow checks") because it was effectively a no-op when combined with -fno-strict-overflow (which correctly changes signed overflow from being "undefined" to being explicitly "wrap around"). Compilers are adjusting their sanitizers to trap wrap-around and to detecting common code patterns that should not be instrumented (e.g. "var + offset < var"). Prepare for this and explicitly rename the option from "OVERFLOW" to "WRAP". To annotate intentional wrap-around arithmetic, the add/sub/mul_wrap() helpers can be used for individual statements. At the function level, the __signed_wrap attribute can be used to mark an entire function as expecting its signed arithmetic to wrap around. For a single object file the Makefile can use "UBSAN_WRAP_SIGNED_target.o := n" to mark it as wrapping, and for an entire directory, "UBSAN_WRAP_SIGNED := n" can be used. Additionally keep these disabled under CONFIG_COMPILE_TEST for now. Link: https://github.com/KSPP/linux/issues/26 [1] Link: https://lkml.kernel.org/r/20240205093725.make.582-kees@kernel.org Signed-off-by: Kees Cook Reviewed-by: Marco Elver Cc: Justin Stitt Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Peter Zijlstra Cc: Hao Luo Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Masahiro Yamada Cc: Nick Desaulniers Cc: Nicolas Schier Cc: Przemek Kitszel Signed-off-by: Andrew Morton --- include/linux/compiler_types.h | 9 ++++- lib/Kconfig.ubsan | 14 +++++++ lib/test_ubsan.c | 37 ++++++++++++++++++ lib/ubsan.c | 68 ++++++++++++++++++++++++++++++++++ lib/ubsan.h | 4 ++ scripts/Makefile.lib | 3 ++ scripts/Makefile.ubsan | 3 ++ 7 files changed, 137 insertions(+), 1 deletion(-) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 0caf354cb94b5a..3e64ec0f7ac80b 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -282,11 +282,18 @@ struct ftrace_likely_data { #define __no_sanitize_or_inline __always_inline #endif +/* Do not trap wrapping arithmetic within an annotated function. 
*/ +#ifdef CONFIG_UBSAN_SIGNED_WRAP +# define __signed_wrap __attribute__((no_sanitize("signed-integer-overflow"))) +#else +# define __signed_wrap +#endif + /* Section for code which can't be instrumented at all */ #define __noinstr_section(section) \ noinline notrace __attribute((__section__(section))) \ __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage \ - __no_sanitize_memory + __no_sanitize_memory __signed_wrap #define noinstr __noinstr_section(".noinstr.text") diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index 59e21bfec188cc..4b6241723d3cd4 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -116,6 +116,20 @@ config UBSAN_UNREACHABLE This option enables -fsanitize=unreachable which checks for control flow reaching an expected-to-be-unreachable position. +config UBSAN_SIGNED_WRAP + bool "Perform checking for signed arithmetic wrap-around" + default UBSAN + depends on !COMPILE_TEST + depends on $(cc-option,-fsanitize=signed-integer-overflow) + help + This option enables -fsanitize=signed-integer-overflow which checks + for wrap-around of any arithmetic operations with signed integers. + This currently performs nearly no instrumentation due to the + kernel's use of -fno-strict-overflow which converts all would-be + arithmetic undefined behavior into wrap-around arithmetic. Future + sanitizer versions will allow for wrap-around checking (rather than + exclusively undefined behavior). + config UBSAN_BOOL bool "Perform checking for non-boolean values used as boolean" default UBSAN diff --git a/lib/test_ubsan.c b/lib/test_ubsan.c index 2062be1f2e80f6..c3587b6a482ab9 100644 --- a/lib/test_ubsan.c +++ b/lib/test_ubsan.c @@ -11,6 +11,39 @@ typedef void(*test_ubsan_fp)(void); #config, IS_ENABLED(config) ? "y" : "n"); \ } while (0) +static void test_ubsan_add_overflow(void) +{ + volatile int val = INT_MAX; + + UBSAN_TEST(CONFIG_UBSAN_SIGNED_WRAP); + val += 2; +} + +static void test_ubsan_sub_overflow(void) +{ + volatile int val = INT_MIN; + volatile int val2 = 2; + + UBSAN_TEST(CONFIG_UBSAN_SIGNED_WRAP); + val -= val2; +} + +static void test_ubsan_mul_overflow(void) +{ + volatile int val = INT_MAX / 2; + + UBSAN_TEST(CONFIG_UBSAN_SIGNED_WRAP); + val *= 3; +} + +static void test_ubsan_negate_overflow(void) +{ + volatile int val = INT_MIN; + + UBSAN_TEST(CONFIG_UBSAN_SIGNED_WRAP); + val = -val; +} + static void test_ubsan_divrem_overflow(void) { volatile int val = 16; @@ -90,6 +123,10 @@ static void test_ubsan_misaligned_access(void) } static const test_ubsan_fp test_ubsan_array[] = { + test_ubsan_add_overflow, + test_ubsan_sub_overflow, + test_ubsan_mul_overflow, + test_ubsan_negate_overflow, test_ubsan_shift_out_of_bounds, test_ubsan_out_of_bounds, test_ubsan_load_invalid_value, diff --git a/lib/ubsan.c b/lib/ubsan.c index df4f8d1354bbf4..5fc107f61934c2 100644 --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -222,6 +222,74 @@ static void ubsan_epilogue(void) check_panic_on_warn("UBSAN"); } +static void handle_overflow(struct overflow_data *data, void *lhs, + void *rhs, char op) +{ + + struct type_descriptor *type = data->type; + char lhs_val_str[VALUE_LENGTH]; + char rhs_val_str[VALUE_LENGTH]; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, type_is_signed(type) ? 
+ "signed-integer-overflow" : + "unsigned-integer-overflow"); + + val_to_string(lhs_val_str, sizeof(lhs_val_str), type, lhs); + val_to_string(rhs_val_str, sizeof(rhs_val_str), type, rhs); + pr_err("%s %c %s cannot be represented in type %s\n", + lhs_val_str, + op, + rhs_val_str, + type->type_name); + + ubsan_epilogue(); +} + +void __ubsan_handle_add_overflow(void *data, + void *lhs, void *rhs) +{ + + handle_overflow(data, lhs, rhs, '+'); +} +EXPORT_SYMBOL(__ubsan_handle_add_overflow); + +void __ubsan_handle_sub_overflow(void *data, + void *lhs, void *rhs) +{ + handle_overflow(data, lhs, rhs, '-'); +} +EXPORT_SYMBOL(__ubsan_handle_sub_overflow); + +void __ubsan_handle_mul_overflow(void *data, + void *lhs, void *rhs) +{ + handle_overflow(data, lhs, rhs, '*'); +} +EXPORT_SYMBOL(__ubsan_handle_mul_overflow); + +void __ubsan_handle_negate_overflow(void *_data, void *old_val) +{ + struct overflow_data *data = _data; + char old_val_str[VALUE_LENGTH]; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, "negation-overflow"); + + val_to_string(old_val_str, sizeof(old_val_str), data->type, old_val); + + pr_err("negation of %s cannot be represented in type %s:\n", + old_val_str, data->type->type_name); + + ubsan_epilogue(); +} +EXPORT_SYMBOL(__ubsan_handle_negate_overflow); + + void __ubsan_handle_divrem_overflow(void *_data, void *lhs, void *rhs) { struct overflow_data *data = _data; diff --git a/lib/ubsan.h b/lib/ubsan.h index 5d99ab81913bbd..0abbbac8700d19 100644 --- a/lib/ubsan.h +++ b/lib/ubsan.h @@ -124,6 +124,10 @@ typedef s64 s_max; typedef u64 u_max; #endif +void __ubsan_handle_add_overflow(void *data, void *lhs, void *rhs); +void __ubsan_handle_sub_overflow(void *data, void *lhs, void *rhs); +void __ubsan_handle_mul_overflow(void *data, void *lhs, void *rhs); +void __ubsan_handle_negate_overflow(void *_data, void *old_val); void __ubsan_handle_divrem_overflow(void *_data, void *lhs, void *rhs); void __ubsan_handle_type_mismatch(struct type_mismatch_data *data, void *ptr); void __ubsan_handle_type_mismatch_v1(void *_data, void *ptr); diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index cd5b181060f151..4cbe4a0698f2d6 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -177,6 +177,9 @@ ifeq ($(CONFIG_UBSAN),y) _c_flags += $(if $(patsubst n%,, \ $(UBSAN_SANITIZE_$(basetarget).o)$(UBSAN_SANITIZE)$(CONFIG_UBSAN_SANITIZE_ALL)), \ $(CFLAGS_UBSAN)) +_c_flags += $(if $(patsubst n%,, \ + $(UBSAN_WRAP_SIGNED_$(basetarget).o)$(UBSAN_SANITIZE_$(basetarget).o)$(UBSAN_WRAP_SIGNED)$(UBSAN_SANITIZE)y), \ + $(CFLAGS_UBSAN_WRAP_SIGNED)) endif ifeq ($(CONFIG_KCOV),y) diff --git a/scripts/Makefile.ubsan b/scripts/Makefile.ubsan index 4749865c1b2c2e..ee7dd0ba4f82b9 100644 --- a/scripts/Makefile.ubsan +++ b/scripts/Makefile.ubsan @@ -13,3 +13,6 @@ ubsan-cflags-$(CONFIG_UBSAN_ENUM) += -fsanitize=enum ubsan-cflags-$(CONFIG_UBSAN_TRAP) += -fsanitize-undefined-trap-on-error export CFLAGS_UBSAN := $(ubsan-cflags-y) + +ubsan-wrap-signed-cflags-$(CONFIG_UBSAN_SIGNED_WRAP) += -fsanitize=signed-integer-overflow +export CFLAGS_UBSAN_WRAP_SIGNED := $(ubsan-wrap-signed-cflags-y) From f7705114b93d1d9507b6ea28a8a1cc5e9eb61be7 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Mon, 5 Feb 2024 14:09:21 +0800 Subject: [PATCH 1157/1406] kasan: docs: update descriptions about test file and module After commit f7e01ab828fd ("kasan: move tests to mm/kasan/"), the test file is renamed to mm/kasan/kasan_test.c and the test module is renamed to kasan_test.ko, so update the 
descriptions in the document. While at it, update the line number and testcase number when the tests kmalloc_large_oob_right and kmalloc_double_kzfree failed to sync with the current code in mm/kasan/kasan_test.c. Link: https://lkml.kernel.org/r/20240205060925.15594-2-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Acked-by: Marco Elver Reviewed-by: Andrey Konovalov Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/dev-tools/kasan.rst | 20 +++++++++---------- .../translations/zh_CN/dev-tools/kasan.rst | 20 +++++++++---------- .../translations/zh_TW/dev-tools/kasan.rst | 20 +++++++++---------- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 858c77fe7dc46c..a5a6dbe9029f49 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -169,7 +169,7 @@ Error reports A typical KASAN report looks like this:: ================================================================== - BUG: KASAN: slab-out-of-bounds in kmalloc_oob_right+0xa8/0xbc [test_kasan] + BUG: KASAN: slab-out-of-bounds in kmalloc_oob_right+0xa8/0xbc [kasan_test] Write of size 1 at addr ffff8801f44ec37b by task insmod/2760 CPU: 1 PID: 2760 Comm: insmod Not tainted 4.19.0-rc3+ #698 @@ -179,8 +179,8 @@ A typical KASAN report looks like this:: print_address_description+0x73/0x280 kasan_report+0x144/0x187 __asan_report_store1_noabort+0x17/0x20 - kmalloc_oob_right+0xa8/0xbc [test_kasan] - kmalloc_tests_init+0x16/0x700 [test_kasan] + kmalloc_oob_right+0xa8/0xbc [kasan_test] + kmalloc_tests_init+0x16/0x700 [kasan_test] do_one_initcall+0xa5/0x3ae do_init_module+0x1b6/0x547 load_module+0x75df/0x8070 @@ -200,8 +200,8 @@ A typical KASAN report looks like this:: save_stack+0x43/0xd0 kasan_kmalloc+0xa7/0xd0 kmem_cache_alloc_trace+0xe1/0x1b0 - kmalloc_oob_right+0x56/0xbc [test_kasan] - kmalloc_tests_init+0x16/0x700 [test_kasan] + kmalloc_oob_right+0x56/0xbc [kasan_test] + kmalloc_tests_init+0x16/0x700 [kasan_test] do_one_initcall+0xa5/0x3ae do_init_module+0x1b6/0x547 load_module+0x75df/0x8070 @@ -510,15 +510,15 @@ When a test passes:: When a test fails due to a failed ``kmalloc``:: - # kmalloc_large_oob_right: ASSERTION FAILED at lib/test_kasan.c:163 + # kmalloc_large_oob_right: ASSERTION FAILED at mm/kasan/kasan_test.c:245 Expected ptr is not null, but is - not ok 4 - kmalloc_large_oob_right + not ok 5 - kmalloc_large_oob_right When a test fails due to a missing KASAN report:: - # kmalloc_double_kzfree: EXPECTATION FAILED at lib/test_kasan.c:974 + # kmalloc_double_kzfree: EXPECTATION FAILED at mm/kasan/kasan_test.c:709 KASAN failure expected in "kfree_sensitive(ptr)", but none occurred - not ok 44 - kmalloc_double_kzfree + not ok 28 - kmalloc_double_kzfree At the end the cumulative status of all KASAN tests is printed. On success:: @@ -534,7 +534,7 @@ There are a few ways to run KUnit-compatible KASAN tests. 1. Loadable module With ``CONFIG_KUNIT`` enabled, KASAN-KUnit tests can be built as a loadable - module and run by loading ``test_kasan.ko`` with ``insmod`` or ``modprobe``. + module and run by loading ``kasan_test.ko`` with ``insmod`` or ``modprobe``. 2. 
Built-In diff --git a/Documentation/translations/zh_CN/dev-tools/kasan.rst b/Documentation/translations/zh_CN/dev-tools/kasan.rst index 8fdb20c9665b4b..2b1e8f74904b01 100644 --- a/Documentation/translations/zh_CN/dev-tools/kasan.rst +++ b/Documentation/translations/zh_CN/dev-tools/kasan.rst @@ -137,7 +137,7 @@ KASAN受到通用 ``panic_on_warn`` 命令行参数的影响。当它被启用 典型的KASAN报告如下所示:: ================================================================== - BUG: KASAN: slab-out-of-bounds in kmalloc_oob_right+0xa8/0xbc [test_kasan] + BUG: KASAN: slab-out-of-bounds in kmalloc_oob_right+0xa8/0xbc [kasan_test] Write of size 1 at addr ffff8801f44ec37b by task insmod/2760 CPU: 1 PID: 2760 Comm: insmod Not tainted 4.19.0-rc3+ #698 @@ -147,8 +147,8 @@ KASAN受到通用 ``panic_on_warn`` 命令行参数的影响。当它被启用 print_address_description+0x73/0x280 kasan_report+0x144/0x187 __asan_report_store1_noabort+0x17/0x20 - kmalloc_oob_right+0xa8/0xbc [test_kasan] - kmalloc_tests_init+0x16/0x700 [test_kasan] + kmalloc_oob_right+0xa8/0xbc [kasan_test] + kmalloc_tests_init+0x16/0x700 [kasan_test] do_one_initcall+0xa5/0x3ae do_init_module+0x1b6/0x547 load_module+0x75df/0x8070 @@ -168,8 +168,8 @@ KASAN受到通用 ``panic_on_warn`` 命令行参数的影响。当它被启用 save_stack+0x43/0xd0 kasan_kmalloc+0xa7/0xd0 kmem_cache_alloc_trace+0xe1/0x1b0 - kmalloc_oob_right+0x56/0xbc [test_kasan] - kmalloc_tests_init+0x16/0x700 [test_kasan] + kmalloc_oob_right+0x56/0xbc [kasan_test] + kmalloc_tests_init+0x16/0x700 [kasan_test] do_one_initcall+0xa5/0x3ae do_init_module+0x1b6/0x547 load_module+0x75df/0x8070 @@ -421,15 +421,15 @@ KASAN连接到vmap基础架构以懒清理未使用的影子内存。 当由于 ``kmalloc`` 失败而导致测试失败时:: - # kmalloc_large_oob_right: ASSERTION FAILED at lib/test_kasan.c:163 + # kmalloc_large_oob_right: ASSERTION FAILED at mm/kasan/kasan_test.c:245 Expected ptr is not null, but is - not ok 4 - kmalloc_large_oob_right + not ok 5 - kmalloc_large_oob_right 当由于缺少KASAN报告而导致测试失败时:: - # kmalloc_double_kzfree: EXPECTATION FAILED at lib/test_kasan.c:974 + # kmalloc_double_kzfree: EXPECTATION FAILED at mm/kasan/kasan_test.c:709 KASAN failure expected in "kfree_sensitive(ptr)", but none occurred - not ok 44 - kmalloc_double_kzfree + not ok 28 - kmalloc_double_kzfree 最后打印所有KASAN测试的累积状态。成功:: @@ -445,7 +445,7 @@ KASAN连接到vmap基础架构以懒清理未使用的影子内存。 1. 可加载模块 启用 ``CONFIG_KUNIT`` 后,KASAN-KUnit测试可以构建为可加载模块,并通过使用 - ``insmod`` 或 ``modprobe`` 加载 ``test_kasan.ko`` 来运行。 + ``insmod`` 或 ``modprobe`` 加载 ``kasan_test.ko`` 来运行。 2. 
内置 diff --git a/Documentation/translations/zh_TW/dev-tools/kasan.rst b/Documentation/translations/zh_TW/dev-tools/kasan.rst index 979eb84bc58f1a..ed342e67d8ed02 100644 --- a/Documentation/translations/zh_TW/dev-tools/kasan.rst +++ b/Documentation/translations/zh_TW/dev-tools/kasan.rst @@ -137,7 +137,7 @@ KASAN受到通用 ``panic_on_warn`` 命令行參數的影響。當它被啓用 典型的KASAN報告如下所示:: ================================================================== - BUG: KASAN: slab-out-of-bounds in kmalloc_oob_right+0xa8/0xbc [test_kasan] + BUG: KASAN: slab-out-of-bounds in kmalloc_oob_right+0xa8/0xbc [kasan_test] Write of size 1 at addr ffff8801f44ec37b by task insmod/2760 CPU: 1 PID: 2760 Comm: insmod Not tainted 4.19.0-rc3+ #698 @@ -147,8 +147,8 @@ KASAN受到通用 ``panic_on_warn`` 命令行參數的影響。當它被啓用 print_address_description+0x73/0x280 kasan_report+0x144/0x187 __asan_report_store1_noabort+0x17/0x20 - kmalloc_oob_right+0xa8/0xbc [test_kasan] - kmalloc_tests_init+0x16/0x700 [test_kasan] + kmalloc_oob_right+0xa8/0xbc [kasan_test] + kmalloc_tests_init+0x16/0x700 [kasan_test] do_one_initcall+0xa5/0x3ae do_init_module+0x1b6/0x547 load_module+0x75df/0x8070 @@ -168,8 +168,8 @@ KASAN受到通用 ``panic_on_warn`` 命令行參數的影響。當它被啓用 save_stack+0x43/0xd0 kasan_kmalloc+0xa7/0xd0 kmem_cache_alloc_trace+0xe1/0x1b0 - kmalloc_oob_right+0x56/0xbc [test_kasan] - kmalloc_tests_init+0x16/0x700 [test_kasan] + kmalloc_oob_right+0x56/0xbc [kasan_test] + kmalloc_tests_init+0x16/0x700 [kasan_test] do_one_initcall+0xa5/0x3ae do_init_module+0x1b6/0x547 load_module+0x75df/0x8070 @@ -421,15 +421,15 @@ KASAN連接到vmap基礎架構以懶清理未使用的影子內存。 當由於 ``kmalloc`` 失敗而導致測試失敗時:: - # kmalloc_large_oob_right: ASSERTION FAILED at lib/test_kasan.c:163 + # kmalloc_large_oob_right: ASSERTION FAILED at mm/kasan/kasan_test.c:245 Expected ptr is not null, but is - not ok 4 - kmalloc_large_oob_right + not ok 5 - kmalloc_large_oob_right 當由於缺少KASAN報告而導致測試失敗時:: - # kmalloc_double_kzfree: EXPECTATION FAILED at lib/test_kasan.c:974 + # kmalloc_double_kzfree: EXPECTATION FAILED at mm/kasan/kasan_test.c:709 KASAN failure expected in "kfree_sensitive(ptr)", but none occurred - not ok 44 - kmalloc_double_kzfree + not ok 28 - kmalloc_double_kzfree 最後打印所有KASAN測試的累積狀態。成功:: @@ -445,7 +445,7 @@ KASAN連接到vmap基礎架構以懶清理未使用的影子內存。 1. 可加載模塊 啓用 ``CONFIG_KUNIT`` 後,KASAN-KUnit測試可以構建爲可加載模塊,並通過使用 - ``insmod`` 或 ``modprobe`` 加載 ``test_kasan.ko`` 來運行。 + ``insmod`` 或 ``modprobe`` 加載 ``kasan_test.ko`` 來運行。 2. 內置 From a5225cf195a5e9e165c39efe00fac76e95f2c0c6 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Mon, 5 Feb 2024 14:09:22 +0800 Subject: [PATCH 1158/1406] kasan: rename test_kasan_module_init to kasan_test_module_init After commit f7e01ab828fd ("kasan: move tests to mm/kasan/"), the test module file is renamed from lib/test_kasan_module.c to mm/kasan/kasan_test_module.c, in order to keep consistent, rename test_kasan_module_init to kasan_test_module_init. 
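For readers unfamiliar with the idiom being renamed: module_init() only records which function the module loader invokes, so the function name is free to match the file's new naming. A minimal sketch of the skeleton (illustrative; the -EAGAIN return mirrors the real test module, which never stays loaded):

  #include <linux/init.h>
  #include <linux/module.h>

  /* Skeleton of the module_init() idiom touched by this rename. */
  static int __init kasan_test_module_init(void)
  {
          /* ... run the tests here ... */
          return -EAGAIN; /* like the real module: do not stay loaded */
  }

  module_init(kasan_test_module_init);
  MODULE_LICENSE("GPL");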
Link: https://lkml.kernel.org/r/20240205060925.15594-3-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Acked-by: Marco Elver Reviewed-by: Andrey Konovalov Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/kasan/kasan_test_module.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c index 8b7b3ea2c74ea7..27ec22767e4222 100644 --- a/mm/kasan/kasan_test_module.c +++ b/mm/kasan/kasan_test_module.c @@ -62,7 +62,7 @@ static noinline void __init copy_user_test(void) kfree(kmem); } -static int __init test_kasan_module_init(void) +static int __init kasan_test_module_init(void) { /* * Temporarily enable multi-shot mode. Otherwise, KASAN would only @@ -77,5 +77,5 @@ static int __init test_kasan_module_init(void) return -EAGAIN; } -module_init(test_kasan_module_init); +module_init(kasan_test_module_init); MODULE_LICENSE("GPL"); From abf780dbd2afa0f0c266f6afe88a6dc28e6aa544 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 5 Feb 2024 08:46:47 +0530 Subject: [PATCH 1159/1406] mm/cma: drop CONFIG_CMA_DEBUG All pr_debug() prints in (mm/cma.c) could be enabled via standard Makefile based method. Besides cma_debug_show_areas() should always be called during cma_alloc() failure path. This seemingly redundant config, CONFIG_CMA_DEBUG can be dropped without any problem. Link: https://lkml.kernel.org/r/20240205031647.283510-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Christoph Hellwig Cc: Lukas Bulwahn Cc: Marek Szyprowski Cc: Robin Murphy Signed-off-by: Andrew Morton --- mm/Kconfig | 9 --------- mm/cma.c | 9 --------- 2 files changed, 18 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index ffc3a2ba3a8cd8..35fa9940e61fe4 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -901,15 +901,6 @@ config CMA If unsure, say "n". -config CMA_DEBUG - bool "CMA debug messages (DEVELOPMENT)" - depends on DEBUG_KERNEL && CMA - help - Turns on debug messages in CMA. This produces KERN_DEBUG - messages for every CMA call as well as various messages while - processing calls such as dma_alloc_from_contiguous(). - This option does not affect warning and error messages. - config CMA_DEBUGFS bool "CMA debugfs interface" depends on CMA && DEBUG_FS diff --git a/mm/cma.c b/mm/cma.c index b6720930312df2..4902bbfe24f121 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -14,11 +14,6 @@ #define pr_fmt(fmt) "cma: " fmt -#ifdef CONFIG_CMA_DEBUG -#ifndef DEBUG -# define DEBUG -#endif -#endif #define CREATE_TRACE_POINTS #include @@ -387,7 +382,6 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, return ret; } -#ifdef CONFIG_CMA_DEBUG static void cma_debug_show_areas(struct cma *cma) { unsigned long next_zero_bit, next_set_bit, nr_zero; @@ -412,9 +406,6 @@ static void cma_debug_show_areas(struct cma *cma) pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count); spin_unlock_irq(&cma->lock); } -#else -static inline void cma_debug_show_areas(struct cma *cma) { } -#endif /** * cma_alloc() - allocate pages from contiguous area From 261356e0a43dd9f44fa6d89e4ff21ceb455c04fe Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 7 Feb 2024 15:38:25 +0100 Subject: [PATCH 1160/1406] dma-contiguous: remove debug code to removed CONFIG_CMA_DEBUG Commit acc2f3e42d4a ("mm/cma: drop CONFIG_CMA_DEBUG") removes the config CMA_DEBUG and the debug code in cma.c, but misses debug code in kernel/dma/contiguous.c. Remove this dead code from this removed config option. 
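The "standard Makefile based method" referred to by these two cleanups is kbuild's per-object flags (for example CFLAGS_cma.o += -DDEBUG in mm/Makefile); the removed #ifdef block achieved the same effect by defining DEBUG in the source file. A sketch of that compile-time switch, assuming dynamic debug is not in use:

  /*
   * Sketch: with DEBUG defined before linux/printk.h is pulled in,
   * pr_debug() compiles to an unconditional printk(KERN_DEBUG ...);
   * without it (and without CONFIG_DYNAMIC_DEBUG) it compiles away.
   */
  #define DEBUG
  #include <linux/printk.h>

  static void probe(void)
  {
          pr_debug("cma: debug path reached\n");
  }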
Link: https://lkml.kernel.org/r/20240207143825.986-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Reviewed-by: Anshuman Khandual Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Signed-off-by: Andrew Morton --- kernel/dma/contiguous.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index f005c66f378c32..055da410ac71d6 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -37,12 +37,6 @@ #define pr_fmt(fmt) "cma: " fmt -#ifdef CONFIG_CMA_DEBUG -#ifndef DEBUG -# define DEBUG -#endif -#endif - #include #include From c6af9c7a03344a3a60b0a521c8410e2a23136ffd Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 5 Feb 2024 10:49:29 +0530 Subject: [PATCH 1161/1406] mm/cma: make MAX_CMA_AREAS = CONFIG_CMA_AREAS There is no real difference between the global area, and other additionally configured CMA areas via CONFIG_CMA_AREAS that always defaults without user input. This makes MAX_CMA_AREAS same as CONFIG_CMA_AREAS, also incrementing its default values, thus maintaining current default for MAX_CMA_AREAS both for UMA and NUMA systems. Link: https://lkml.kernel.org/r/20240205051929.298559-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Signed-off-by: Andrew Morton --- include/linux/cma.h | 6 +----- mm/Kconfig | 6 +++--- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/include/linux/cma.h b/include/linux/cma.h index 63873b93deaa62..9db877506ea836 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -6,12 +6,8 @@ #include #include -/* - * There is always at least global CMA area and a few optional - * areas configured in kernel .config. - */ #ifdef CONFIG_CMA_AREAS -#define MAX_CMA_AREAS (1 + CONFIG_CMA_AREAS) +#define MAX_CMA_AREAS CONFIG_CMA_AREAS #endif #define CMA_MAX_NAME 64 diff --git a/mm/Kconfig b/mm/Kconfig index 35fa9940e61fe4..88ba99d84ac315 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -917,14 +917,14 @@ config CMA_SYSFS config CMA_AREAS int "Maximum count of the CMA areas" depends on CMA - default 19 if NUMA - default 7 + default 20 if NUMA + default 8 help CMA allows to create CMA areas for particular purpose, mainly, used as device private area. This parameter sets the maximum number of CMA area in the system. - If unsure, leave the default value "7" in UMA and "19" in NUMA. + If unsure, leave the default value "8" in UMA and "20" in NUMA. config MEM_SOFT_DIRTY bool "Track memory changes" From 346ec53efe9dbccb4c4d23d61b9e3777b7b69103 Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Mon, 5 Feb 2024 12:26:18 +0800 Subject: [PATCH 1162/1406] mm/vmscan: make too_many_isolated return bool too_many_isolated() should return bool as does the similar too_many_isolated() in mm/compaction.c. Link: https://lkml.kernel.org/r/20240205042618.108140-1-gehao@kylinos.cn Signed-off-by: Hao Ge Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/vmscan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8e52f8795d2028..327bf904fdcdee 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1744,17 +1744,17 @@ bool folio_isolate_lru(struct folio *folio) * the LRU list will go small and be scanned faster than necessary, leading to * unnecessary swapping, thrashing and OOM. 
*/ -static int too_many_isolated(struct pglist_data *pgdat, int file, +static bool too_many_isolated(struct pglist_data *pgdat, int file, struct scan_control *sc) { unsigned long inactive, isolated; bool too_many; if (current_is_kswapd()) - return 0; + return false; if (!writeback_throttling_sane(sc)) - return 0; + return false; if (file) { inactive = node_page_state(pgdat, NR_INACTIVE_FILE); From bef3d230ec709fdb1e1e862baee09fd13ab3d13f Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Sun, 4 Feb 2024 10:56:44 -0300 Subject: [PATCH 1163/1406] memory tier: make memory_tier_subsys const Now that the driver core can properly handle constant struct bus_type, move the memory_tier_subsys variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Link: https://lkml.kernel.org/r/20240204-bus_cleanup-mm-v1-1-00f49286f164@marliere.net Signed-off-by: Ricardo B. Marliere Suggested-by: Greg Kroah-Hartman Reviewed-by: Greg Kroah-Hartman Signed-off-by: Andrew Morton --- mm/memory-tiers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 5462d9e3c84c7d..ed20f96bf89dc8 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -39,7 +39,7 @@ static LIST_HEAD(memory_tiers); static struct node_memory_type_map node_memory_types[MAX_NUMNODES]; struct memory_dev_type *default_dram_type; -static struct bus_type memory_tier_subsys = { +static const struct bus_type memory_tier_subsys = { .name = "memory_tiering", .dev_name = "memory_tier", }; From 54a716c1b2c9b2f1af3bcece689b5eeb2c7f036b Mon Sep 17 00:00:00 2001 From: Zhongkun He Date: Sun, 4 Feb 2024 21:15:43 +0800 Subject: [PATCH 1164/1406] mm/z3fold: remove unneeded spinlock in z3fold_alloc The spinlock in z3fold_alloc() is used to protect page->lru, but now it was removed in commit e774a7bc7f0ad ("mm: zswap: remove page reclaim logic from z3fold"), so remove the spinlock too. Link: https://lkml.kernel.org/r/20240204131543.1469661-1-hezhongkun.hzk@bytedance.com Signed-off-by: Zhongkun He Cc: Domenico Cerasuolo Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/z3fold.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index 7c76b396b74cfd..dfae0fa58f576a 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1070,9 +1070,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, add_to_unbuddied(pool, zhdr); headless: - spin_lock(&pool->lock); *handle = encode_handle(zhdr, bud); - spin_unlock(&pool->lock); if (bud != HEADLESS) z3fold_page_unlock(zhdr); From a3a2c9000ee2a044e230ce4c27b4ae5fca27bf5f Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 4 Feb 2024 03:05:59 +0000 Subject: [PATCH 1165/1406] mm/zswap: add more comments in shrink_memcg_cb() Patch series "mm/zswap: optimize zswap lru list", v2. This series is motivated when observe the zswap lru list shrinking, noted there are some unexpected cases in zswap_writeback_entry(). bpftrace -e 'kr:zswap_writeback_entry {@[(int32)retval]=count()}' There are some -ENOMEM because when the swap entry is freed to per-cpu swap pool, it doesn't invalidate/drop zswap entry. Then the shrinker encounter these trashy zswap entries, it can't be reclaimed and return -ENOMEM. So move the invalidation ahead to when swap entry freed to the per-cpu swap pool, since there is no any benefit to leave trashy zswap entries on the zswap tree and lru list. 
Another case is -EEXIST, which is seen more in the case of !zswap_exclusive_loads_enabled, in which case the swapin folio will leave compressed copy on the tree and lru list. And it can't be reclaimed until the folio is removed from swapcache. Changing to zswap_exclusive_loads_enabled mode will invalidate when folio swapin, which has its own drawback if that folio is still clean in swapcache and swapout again, we need to compress it again. Please see the commit for details on why we choose exclusive load as the default for zswap. Another optimization for -EEXIST is that we add LRU_STOP to support terminating the shrinking process to avoid evicting warmer region. Testing using kernel build in tmpfs, one 50GB swapfile and zswap shrinker_enabled, with memory.max set to 2GB. mm-unstable zswap-optimize real 63.90s 63.25s user 1064.05s 1063.40s sys 292.32s 270.94s The main optimization is in sys cpu, about 7% improvement. This patch (of 6): Add more comments in shrink_memcg_cb() to describe the deref dance which is implemented to fix race problem between lru writeback and swapoff, and the reason why we rotate the entry at the beginning. Also fix the stale comments in zswap_writeback_entry(), and add more comments to state that we only deref the tree after we get the swapcache reference. Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-0-99d4084260a0@bytedance.com Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-1-99d4084260a0@bytedance.com Signed-off-by: Johannes Weiner Signed-off-by: Chengming Zhou Suggested-by: Yosry Ahmed Suggested-by: Johannes Weiner Acked-by: Yosry Ahmed Reviewed-by: Nhat Pham Signed-off-by: Andrew Morton --- mm/zswap.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 2bf4bf1d356cfe..35da20d3617fec 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1207,10 +1207,12 @@ static int zswap_writeback_entry(struct zswap_entry *entry, /* * folio is locked, and the swapcache is now secured against - * concurrent swapping to and from the slot. Verify that the - * swap entry hasn't been invalidated and recycled behind our - * backs (our zswap_entry reference doesn't prevent that), to - * avoid overwriting a new swap folio with old compressed data. + * concurrent swapping to and from the slot, and concurrent + * swapoff so we can safely dereference the zswap tree here. + * Verify that the swap entry hasn't been invalidated and recycled + * behind our backs, to avoid overwriting a new swap folio with + * old compressed data. Only when this is successful can the entry + * be dereferenced. */ tree = swap_zswap_tree(swpentry); spin_lock(&tree->lock); @@ -1263,22 +1265,29 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o int writeback_result; /* - * Rotate the entry to the tail before unlocking the LRU, - * so that in case of an invalidation race concurrent - * reclaimers don't waste their time on it. + * As soon as we drop the LRU lock, the entry can be freed by + * a concurrent invalidation. This means the following: * - * If writeback succeeds, or failure is due to the entry - * being invalidated by the swap subsystem, the invalidation - * will unlink and free it. + * 1. We extract the swp_entry_t to the stack, allowing + * zswap_writeback_entry() to pin the swap entry and + * then validate the zwap entry against that swap entry's + * tree using pointer value comparison. 
Only when that + * is successful can the entry be dereferenced. * - * Temporary failures, where the same entry should be tried - * again immediately, almost never happen for this shrinker. - * We don't do any trylocking; -ENOMEM comes closest, - * but that's extremely rare and doesn't happen spuriously - * either. Don't bother distinguishing this case. + * 2. Usually, objects are taken off the LRU for reclaim. In + * this case this isn't possible, because if reclaim fails + * for whatever reason, we have no means of knowing if the + * entry is alive to put it back on the LRU. * - * But since they do exist in theory, the entry cannot just - * be unlinked, or we could leak it. Hence, rotate. + * So rotate it before dropping the lock. If the entry is + * written back or invalidated, the free path will unlink + * it. For failures, rotation is the right thing as well. + * + * Temporary failures, where the same entry should be tried + * again immediately, almost never happen for this shrinker. + * We don't do any trylocking; -ENOMEM comes closest, + * but that's extremely rare and doesn't happen spuriously + * either. Don't bother distinguishing this case. */ list_move_tail(item, &l->list); From 8565e483763b5d32fe77dcb363a864cda2dd269d Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 4 Feb 2024 03:06:00 +0000 Subject: [PATCH 1166/1406] mm/zswap: invalidate zswap entry when swap entry free During testing I found there are some times the zswap_writeback_entry() return -ENOMEM, which is not we expected: bpftrace -e 'kr:zswap_writeback_entry {@[(int32)retval]=count()}' @[-12]: 1563 @[0]: 277221 The reason is that __read_swap_cache_async() return NULL because swapcache_prepare() failed. The reason is that we won't invalidate zswap entry when swap entry freed to the per-cpu pool, these zswap entries are still on the zswap tree and lru list. This patch moves the invalidation ahead to when swap entry freed to the per-cpu pool, since there is no any benefit to leave trashy zswap entry on the tree and lru list. With this patch: bpftrace -e 'kr:zswap_writeback_entry {@[(int32)retval]=count()}' @[0]: 259744 Note: large folio can't have zswap entry for now, so don't bother to add zswap entry invalidation in the large folio swap free path. 
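The reworked hook takes the packed swap entry instead of a (type, offset) pair. A sketch of the calling convention, using the standard helpers from linux/swapops.h (the wrapper function below is illustrative, not part of the patch):

  #include <linux/swapops.h>
  #include <linux/zswap.h>

  /* Illustrative only: a swp_entry_t packs type and offset, and
   * swp_type()/swp_offset() recover them, so callers now hand zswap
   * the entry itself rather than its two halves. */
  static void drop_zswap_copy(int type, pgoff_t offset)
  {
          swp_entry_t entry = swp_entry(type, offset);

          zswap_invalidate(entry);
  }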
Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-2-99d4084260a0@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Nhat Pham Acked-by: Johannes Weiner Acked-by: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/zswap.h | 4 ++-- mm/swap_slots.c | 3 +++ mm/swapfile.c | 1 - mm/zswap.c | 5 +++-- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 91895ce1fdbc4f..341aea4900704c 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -29,7 +29,7 @@ struct zswap_lruvec_state { bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); -void zswap_invalidate(int type, pgoff_t offset); +void zswap_invalidate(swp_entry_t swp); int zswap_swapon(int type, unsigned long nr_pages); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); @@ -50,7 +50,7 @@ static inline bool zswap_load(struct folio *folio) return false; } -static inline void zswap_invalidate(int type, pgoff_t offset) {} +static inline void zswap_invalidate(swp_entry_t swp) {} static inline int zswap_swapon(int type, unsigned long nr_pages) { return 0; diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 0bec1f705f8e09..90973ce7881db2 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -273,6 +273,9 @@ void free_swap_slot(swp_entry_t entry) { struct swap_slots_cache *cache; + /* Large folio swap slot is not covered. */ + zswap_invalidate(entry); + cache = raw_cpu_ptr(&swp_slots); if (likely(use_swap_slot_cache && cache->slots_ret)) { spin_lock_irq(&cache->free_lock); diff --git a/mm/swapfile.c b/mm/swapfile.c index a8edaf4e5b8ad9..d1bd8d1e17bd30 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -744,7 +744,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, swap_slot_free_notify = NULL; while (offset <= end) { arch_swap_invalidate_page(si->type, offset); - zswap_invalidate(si->type, offset); if (swap_slot_free_notify) swap_slot_free_notify(si->bdev, offset); offset++; diff --git a/mm/zswap.c b/mm/zswap.c index 35da20d3617fec..ef41a7bd81f243 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1739,9 +1739,10 @@ bool zswap_load(struct folio *folio) return true; } -void zswap_invalidate(int type, pgoff_t offset) +void zswap_invalidate(swp_entry_t swp) { - struct zswap_tree *tree = swap_zswap_tree(swp_entry(type, offset)); + pgoff_t offset = swp_offset(swp); + struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry; spin_lock(&tree->lock); From cada23030c69a0c3e973ba6348e4f1130a11af3a Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 4 Feb 2024 03:06:01 +0000 Subject: [PATCH 1167/1406] mm/zswap: stop lru list shrinking when encounter warm region When the shrinker encounter an existing folio in swap cache, it means we are shrinking into the warmer region. We should terminate shrinking if we're in the dynamic shrinker context. This patch add LRU_STOP to support this, to avoid overshrinking. 
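A walk callback opts into the new behavior simply by returning the new status code. A hedged sketch of an isolate callback (my_object and its warmness test are invented for illustration; the signature follows the list_lru_walk_cb shape used by shrink_memcg_cb above):

  #include <linux/list_lru.h>

  struct my_object {                      /* hypothetical cached object */
          struct list_head lru;
          bool warm;
  };

  /* Hypothetical isolate callback: reclaim cold items, and terminate
   * the whole walk once a warm one is met instead of skipping it. */
  static enum lru_status my_isolate(struct list_head *item,
                                    struct list_lru_one *list,
                                    spinlock_t *lock, void *cb_arg)
  {
          struct my_object *obj = container_of(item, struct my_object, lru);

          if (obj->warm)
                  return LRU_STOP;        /* stop walking; return locked */

          list_lru_isolate(list, item);   /* detach under the lru lock */
          return LRU_REMOVED;
  }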
Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-3-99d4084260a0@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Acked-by: Nhat Pham Reviewed-by: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 2 ++ mm/list_lru.c | 3 +++ mm/zswap.c | 4 +++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index f2882a82069027..792b67ceb631b5 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -24,6 +24,8 @@ enum lru_status { LRU_SKIP, /* item cannot be locked, skip */ LRU_RETRY, /* item not freeable. May drop the lock internally, but has to return locked. */ + LRU_STOP, /* stop lru list walking. May drop the lock + internally, but has to return locked. */ }; struct list_lru_one { diff --git a/mm/list_lru.c b/mm/list_lru.c index 61f3b6b1134fbe..3fd64736bc4589 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -243,6 +243,9 @@ __list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, */ assert_spin_locked(&nlru->lock); goto restart; + case LRU_STOP: + assert_spin_locked(&nlru->lock); + goto out; default: BUG(); } diff --git a/mm/zswap.c b/mm/zswap.c index ef41a7bd81f243..f8a4ac389118a0 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1315,8 +1315,10 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o * into the warmer region. We should terminate shrinking (if we're in the dynamic * shrinker context). */ - if (writeback_result == -EEXIST && encountered_page_in_swapcache) + if (writeback_result == -EEXIST && encountered_page_in_swapcache) { + ret = LRU_STOP; *encountered_page_in_swapcache = true; + } } else { zswap_written_back_pages++; } From 65ca3b13fc4e6e96033fb91ffc8ca22b539190f5 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 4 Feb 2024 03:06:02 +0000 Subject: [PATCH 1168/1406] mm/zswap: remove duplicate_entry debug value cat /sys/kernel/debug/zswap/duplicate_entry 2086447 When testing, the duplicate_entry value is very high, but no warning message in the kernel log. From the comment of duplicate_entry "Duplicate store was encountered (rare)", it seems something goes wrong. Actually it's incremented in the beginning of zswap_store(), which found its zswap entry has already on the tree. And this is a normal case, since the folio could leave zswap entry on the tree after swapin, later it's dirtied and swapout/zswap_store again, found its original zswap entry. So duplicate_entry should be only incremented in the real bug case, which already have "WARN_ON(1)", it looks redundant to count bug case, so this patch just remove it. 
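For reference, the counters this file exposes are plain u64s registered through debugfs, so dropping one needs nothing beyond deleting its registration; the directory teardown removes the file. A minimal sketch of the idiom (names invented):

  #include <linux/debugfs.h>

  static u64 my_counter;  /* bumped wherever the event happens */

  /* Sketch: expose a read-only counter. The file simply reads back
   * the u64 it points at, so there is no per-file cleanup to do. */
  static void my_stats_init(struct dentry *root)
  {
          debugfs_create_u64("my_counter", 0444, root, &my_counter);
  }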
Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-4-99d4084260a0@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index f8a4ac389118a0..91df925359c879 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -71,8 +71,6 @@ static u64 zswap_reject_compress_poor; static u64 zswap_reject_alloc_fail; /* Store failed because the entry metadata could not be allocated (rare) */ static u64 zswap_reject_kmemcache_fail; -/* Duplicate store was encountered (rare) */ -static u64 zswap_duplicate_entry; /* Shrinker work queue */ static struct workqueue_struct *shrink_wq; @@ -1568,10 +1566,8 @@ bool zswap_store(struct folio *folio) */ spin_lock(&tree->lock); entry = zswap_rb_search(&tree->rbroot, offset); - if (entry) { + if (entry) zswap_invalidate_entry(tree, entry); - zswap_duplicate_entry++; - } spin_unlock(&tree->lock); if (!zswap_enabled) @@ -1662,7 +1658,6 @@ bool zswap_store(struct folio *folio) */ while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { WARN_ON(1); - zswap_duplicate_entry++; zswap_invalidate_entry(tree, dupentry); } if (entry->length) { @@ -1823,8 +1818,6 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, &zswap_reject_compress_poor); debugfs_create_u64("written_back_pages", 0444, zswap_debugfs_root, &zswap_written_back_pages); - debugfs_create_u64("duplicate_entry", 0444, - zswap_debugfs_root, &zswap_duplicate_entry); debugfs_create_u64("pool_total_size", 0444, zswap_debugfs_root, &zswap_pool_total_size); debugfs_create_atomic_t("stored_pages", 0444, From 80dcecd6eafbbb68c6a267f072835755ba8d3f53 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 4 Feb 2024 03:06:03 +0000 Subject: [PATCH 1169/1406] mm/zswap: only support zswap_exclusive_loads_enabled The !zswap_exclusive_loads_enabled mode will leave compressed copy in the zswap tree and lru list after the folio swapin. There are some disadvantages in this mode: 1. It's a waste of memory since there are two copies of data, one is folio, the other one is compressed data in zswap. And it's unlikely the compressed data is useful in the near future. 2. If that folio is dirtied, the compressed data must be not useful, but we don't know and don't invalidate the trashy memory in zswap. 3. It's not reclaimable from zswap shrinker since zswap_writeback_entry() will always return -EEXIST and terminate the shrinking process. On the other hand, the only downside of zswap_exclusive_loads_enabled is a little more cpu usage/latency when compression, and the same if the folio is removed from swapcache or dirtied. More explanation by Johannes on why we should consider exclusive load as the default for zswap: Caching "swapout work" is helpful when the system is thrashing. Then recently swapped in pages might get swapped out again very soon. It certainly makes sense with conventional swap, because keeping a clean copy on the disk saves IO work and doesn't cost any additional memory. But with zswap, it's different. It saves some compression work on a thrashing page. But the act of keeping compressed memory contributes to a higher rate of thrashing. And that can cause IO in other places like zswap writeback and file memory. 
And the A/B test results of the kernel build in tmpfs with limited memory can support this theory: !exclusive exclusive real 63.80 63.01 user 1063.83 1061.32 sys 290.31 266.15 workingset_refault_anon 2383084.40 1976397.40 workingset_refault_file 44134.00 45689.40 workingset_activate_anon 837878.00 728441.20 workingset_activate_file 4710.00 4085.20 workingset_restore_anon 732622.60 639428.40 workingset_restore_file 1007.00 926.80 workingset_nodereclaim 0.00 0.00 pgscan 14343003.40 12409570.20 pgscan_kswapd 0.00 0.00 pgscan_direct 14343003.40 12409570.20 pgscan_khugepaged 0.00 0.00 Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-5-99d4084260a0@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Yosry Ahmed Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Signed-off-by: Andrew Morton --- mm/Kconfig | 16 ---------------- mm/zswap.c | 14 +++----------- 2 files changed, 3 insertions(+), 27 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index 88ba99d84ac315..2b267553f793de 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -45,22 +45,6 @@ config ZSWAP_DEFAULT_ON The selection made here can be overridden by using the kernel command line 'zswap.enabled=' option. -config ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON - bool "Invalidate zswap entries when pages are loaded" - depends on ZSWAP - help - If selected, exclusive loads for zswap will be enabled at boot, - otherwise it will be disabled. - - If exclusive loads are enabled, when a page is loaded from zswap, - the zswap entry is invalidated at once, as opposed to leaving it - in zswap until the swap entry is freed. - - This avoids having two copies of the same page in memory - (compressed and uncompressed) after faulting in a page from zswap. - The cost is that if the page was never dirtied and needs to be - swapped out again, it will be re-compressed. - config ZSWAP_SHRINKER_DEFAULT_ON bool "Shrink the zswap pool on memory pressure" depends on ZSWAP diff --git a/mm/zswap.c b/mm/zswap.c index 91df925359c879..7a69142817cb37 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -139,10 +139,6 @@ static bool zswap_non_same_filled_pages_enabled = true; module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled, bool, 0644); -static bool zswap_exclusive_loads_enabled = IS_ENABLED( - CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON); -module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644); - /* Number of zpools in zswap_pool (empirically determined for scalability) */ #define ZSWAP_NR_ZPOOLS 32 @@ -1723,16 +1719,12 @@ bool zswap_load(struct folio *folio) count_objcg_event(entry->objcg, ZSWPIN); spin_lock(&tree->lock); - if (zswap_exclusive_loads_enabled) { - zswap_invalidate_entry(tree, entry); - folio_mark_dirty(folio); - } else if (entry->length) { - zswap_lru_del(&entry->pool->list_lru, entry); - zswap_lru_add(&entry->pool->list_lru, entry); - } + zswap_invalidate_entry(tree, entry); zswap_entry_put(entry); spin_unlock(&tree->lock); + folio_mark_dirty(folio); + return true; } From c5f7701cc2e532ae7651bcdd9c6cc3c438b45c98 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 4 Feb 2024 03:06:04 +0000 Subject: [PATCH 1170/1406] mm/zswap: zswap entry doesn't need refcount anymore Since we don't need to leave zswap entry on the zswap tree anymore, we should remove it from tree once we find it from the tree. Then after using it, we can directly free it, no concurrent path can find it from tree. Only the shrinker can see it from lru list, which will also double check under tree lock, so no race problem. 
So we don't need refcount in zswap entry anymore and don't need to take the spinlock for the second time to invalidate it. The side effect is that zswap_entry_free() maybe not happen in tree spinlock, but it's ok since nothing need to be protected by the lock. Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-6-99d4084260a0@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Nhat Pham Acked-by: Johannes Weiner Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 63 ++++++++++-------------------------------------------- 1 file changed, 11 insertions(+), 52 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 7a69142817cb37..96664cdee20782 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -193,12 +193,6 @@ struct zswap_pool { * * rbnode - links the entry into red-black tree for the appropriate swap type * swpentry - associated swap entry, the offset indexes into the red-black tree - * refcount - the number of outstanding reference to the entry. This is needed - * to protect against premature freeing of the entry by code - * concurrent calls to load, invalidate, and writeback. The lock - * for the zswap_tree structure that contains the entry must - * be held while changing the refcount. Since the lock must - * be held, there is no reason to also make refcount atomic. * length - the length in bytes of the compressed page data. Needed during * decompression. For a same value filled page length is 0, and both * pool and lru are invalid and must be ignored. @@ -211,7 +205,6 @@ struct zswap_pool { struct zswap_entry { struct rb_node rbnode; swp_entry_t swpentry; - int refcount; unsigned int length; struct zswap_pool *pool; union { @@ -222,11 +215,6 @@ struct zswap_entry { struct list_head lru; }; -/* - * The tree lock in the zswap_tree struct protects a few things: - * - the rbtree - * - the refcount field of each entry in the tree - */ struct zswap_tree { struct rb_root rbroot; spinlock_t lock; @@ -890,14 +878,10 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, return 0; } -static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) +static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) { - if (!RB_EMPTY_NODE(&entry->rbnode)) { - rb_erase(&entry->rbnode, root); - RB_CLEAR_NODE(&entry->rbnode); - return true; - } - return false; + rb_erase(&entry->rbnode, root); + RB_CLEAR_NODE(&entry->rbnode); } /********************************* @@ -911,7 +895,6 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); if (!entry) return NULL; - entry->refcount = 1; RB_CLEAR_NODE(&entry->rbnode); return entry; } @@ -954,33 +937,15 @@ static void zswap_entry_free(struct zswap_entry *entry) zswap_update_total_size(); } -/* caller must hold the tree lock */ -static void zswap_entry_get(struct zswap_entry *entry) -{ - WARN_ON_ONCE(!entry->refcount); - entry->refcount++; -} - -/* caller must hold the tree lock */ -static void zswap_entry_put(struct zswap_entry *entry) -{ - WARN_ON_ONCE(!entry->refcount); - if (--entry->refcount == 0) { - WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode)); - zswap_entry_free(entry); - } -} - /* - * If the entry is still valid in the tree, drop the initial ref and remove it - * from the tree. This function must be called with an additional ref held, - * otherwise it may race with another invalidation freeing the entry. 
+ * The caller hold the tree lock and search the entry from the tree, + * so it must be on the tree, remove it from the tree and free it. */ static void zswap_invalidate_entry(struct zswap_tree *tree, struct zswap_entry *entry) { - if (zswap_rb_erase(&tree->rbroot, entry)) - zswap_entry_put(entry); + zswap_rb_erase(&tree->rbroot, entry); + zswap_entry_free(entry); } /********************************* @@ -1219,7 +1184,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, } /* Safe to deref entry after the entry is verified above. */ - zswap_entry_get(entry); + zswap_rb_erase(&tree->rbroot, entry); spin_unlock(&tree->lock); zswap_decompress(entry, &folio->page); @@ -1228,10 +1193,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, if (entry->objcg) count_objcg_event(entry->objcg, ZSWPWB); - spin_lock(&tree->lock); - zswap_invalidate_entry(tree, entry); - zswap_entry_put(entry); - spin_unlock(&tree->lock); + zswap_entry_free(entry); /* folio is up to date */ folio_mark_uptodate(folio); @@ -1703,7 +1665,7 @@ bool zswap_load(struct folio *folio) spin_unlock(&tree->lock); return false; } - zswap_entry_get(entry); + zswap_rb_erase(&tree->rbroot, entry); spin_unlock(&tree->lock); if (entry->length) @@ -1718,10 +1680,7 @@ bool zswap_load(struct folio *folio) if (entry->objcg) count_objcg_event(entry->objcg, ZSWPIN); - spin_lock(&tree->lock); - zswap_invalidate_entry(tree, entry); - zswap_entry_put(entry); - spin_unlock(&tree->lock); + zswap_entry_free(entry); folio_mark_dirty(folio); From e11072fc005e3afee812c557dd627318d35a842f Mon Sep 17 00:00:00 2001 From: Paul Gofman Date: Tue, 6 Feb 2024 13:48:01 +0500 Subject: [PATCH 1171/1406] mm/migrate: preserve exact soft-dirty state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pte_mkdirty() sets both _PAGE_DIRTY and _PAGE_SOFT_DIRTY bits. The _PAGE_SOFT_DIRTY can get set even if it wasn't set on original page before migration. This makes non-soft-dirty pages soft-dirty just because of migration/compaction. Clear the _PAGE_SOFT_DIRTY flag if it wasn't set on original page. By definition of soft-dirty feature, there can be spurious soft-dirty pages because of kernel's internal activity such as VMA merging or migration/compaction. This patch is eliminating the spurious soft-dirty pages because of migration/compaction. 
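The effect is user-visible through the soft-dirty interface (gated on CONFIG_MEM_SOFT_DIRTY): bit 55 of each /proc/pid/pagemap entry reports the flag, and writing 4 to /proc/pid/clear_refs clears it. A minimal userspace sketch for checking one address, assuming a 4 KiB page size for brevity:

  #include <fcntl.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <unistd.h>

  /* Read the pagemap entry covering addr and test bit 55 (soft-dirty). */
  static int soft_dirty(uintptr_t addr)
  {
          uint64_t entry = 0;
          int fd = open("/proc/self/pagemap", O_RDONLY);

          if (fd < 0)
                  return -1;
          pread(fd, &entry, sizeof(entry), (addr / 4096) * sizeof(entry));
          close(fd);
          return (entry >> 55) & 1;
  }

  int main(void)
  {
          static char page[4096];

          page[0] = 1;    /* the write should set soft-dirty */
          printf("soft-dirty: %d\n", soft_dirty((uintptr_t)page));
          return 0;
  }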
Link: https://lkml.kernel.org/r/20240206084838.34560-1-usama.anjum@collabora.com Signed-off-by: Paul Gofman Signed-off-by: Muhammad Usama Anjum Acked-by: Andrei Vagin Cc: Michał Mirosław Signed-off-by: Andrew Morton --- mm/migrate.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index c27b1f8097d4a7..73a052a382f13a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -211,14 +211,17 @@ static bool remove_migration_pte(struct folio *folio, folio_get(folio); pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); old_pte = ptep_get(pvmw.pte); - if (pte_swp_soft_dirty(old_pte)) - pte = pte_mksoft_dirty(pte); entry = pte_to_swp_entry(old_pte); if (!is_migration_entry_young(entry)) pte = pte_mkold(pte); if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) pte = pte_mkdirty(pte); + if (pte_swp_soft_dirty(old_pte)) + pte = pte_mksoft_dirty(pte); + else + pte = pte_clear_soft_dirty(pte); + if (is_writable_migration_entry(entry)) pte = pte_mkwrite(pte, vma); else if (pte_swp_uffd_wp(old_pte)) From e613efeec318f4243541c3fc4730fc5ee4c9ffb7 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 6 Feb 2024 11:08:11 +0800 Subject: [PATCH 1172/1406] mm: hugetlb: improve the handling of hugetlb allocation failure for freed or in-use hugetlb alloc_and_dissolve_hugetlb_folio() preallocates a new hugetlb page before it takes hugetlb_lock. In 3 out of 4 cases the page is not really used and therefore the newly allocated page is just freed right away. This is wasteful and it might cause pre-mature failures in those cases. Address that by moving the allocation down to the only case (hugetlb page is really in the free pages pool). We need to drop hugetlb_lock to do so and therefore need to recheck the page state after regaining it. The patch is more of a cleanup than an actual fix to an existing problem. There are no known reports about pre-mature failures. Link: https://lkml.kernel.org/r/62890fd60b1ecd5bf1cdc476c973f60fe37aa0cb.1707181934.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: Michal Hocko Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/hugetlb.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index edb2b2bf6f53df..25069ca6ec2486 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3031,21 +3031,9 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nid = folio_nid(old_folio); - struct folio *new_folio; + struct folio *new_folio = NULL; int ret = 0; - /* - * Before dissolving the folio, we need to allocate a new one for the - * pool to remain stable. Here, we allocate the folio and 'prep' it - * by doing everything but actually updating counters and adding to - * the pool. This simplifies and let us do most of the processing - * under the lock. 
- */ - new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL); - if (!new_folio) - return -ENOMEM; - __prep_new_hugetlb_folio(h, new_folio); - retry: spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(old_folio)) { @@ -3075,6 +3063,16 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, cond_resched(); goto retry; } else { + if (!new_folio) { + spin_unlock_irq(&hugetlb_lock); + new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, + NULL, NULL); + if (!new_folio) + return -ENOMEM; + __prep_new_hugetlb_folio(h, new_folio); + goto retry; + } + /* * Ok, old_folio is still a genuine free hugepage. Remove it from * the freelist and decrease the counters. These will be @@ -3102,9 +3100,11 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, free_new: spin_unlock_irq(&hugetlb_lock); - /* Folio has a zero ref count, but needs a ref to be freed */ - folio_ref_unfreeze(new_folio, 1); - update_and_free_hugetlb_folio(h, new_folio, false); + if (new_folio) { + /* Folio has a zero ref count, but needs a ref to be freed */ + folio_ref_unfreeze(new_folio, 1); + update_and_free_hugetlb_folio(h, new_folio, false); + } return ret; } From 38c1c62adcaf7f055a32d8c5f8e5ded689194777 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Feb 2024 18:51:58 -0800 Subject: [PATCH 1173/1406] mm/damon/sysfs: handle 'state' file inputs for every sampling interval if possible DAMON sysfs interface need to access kdamond-touching data for some of kdamond user commands. It uses ->after_aggregation() kdamond callback to safely access the data in the case. It had to use the aggregation interval callback because that was the only callback that users can access complete monitoring results. Since patch series "mm/damon: provide pseudo-moving sum based access rate", which starts from commit 78fbfb155d20 ("mm/damon/core: define and use a dedicated function for region access rate update"), DAMON provides good-to-use quality moitoring results for every sampling interval. It aims to help users who need to quickly retrieve the monitoring results. When the aggregation interval is set too long and therefore waiting for the aggregation interval can degrade user experience, or when the access pattern is expected to be significantly changed[1] could be such cases. However, because DAMON sysfs interface is still handling the commands per aggregation interval, the end user cannot get the benefit. Update DAMON sysfs interface to handle kdamond commands for every sampling interval if applicable. Specifically, all kdamond data accessing commands except 'commit' command are applicable. 
[1] https://lore.kernel.org/r/20240129121316.GA9706@cuiyangpei Link: https://lkml.kernel.org/r/20240206025158.203097-1-sj@kernel.org Signed-off-by: SeongJae Park Cc: xiongping1 Signed-off-by: Andrew Morton --- mm/damon/sysfs-common.h | 2 ++ mm/damon/sysfs-schemes.c | 22 +++++++++------------- mm/damon/sysfs.c | 21 ++++++++++++++++++--- 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 4c37a166eb8180..ec0703e1e90b69 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -49,6 +49,8 @@ int damon_sysfs_schemes_update_regions_start( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx, bool total_bytes_only); +void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx); + bool damos_sysfs_regions_upd_done(void); int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index ae0f0b314f3a9a..f6c7f43f06cc07 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -127,17 +127,17 @@ static const struct kobj_type damon_sysfs_scheme_region_ktype = { * * Once the tried regions update request is received, the request handling * start function (damon_sysfs_scheme_update_regions_start()) sets the status - * of all schemes as 'idle' again, and register ->before_damos_apply() and - * ->after_sampling() callbacks. + * of all schemes as 'idle' again, and register ->before_damos_apply() + * callback. * * Then, the first followup ->before_damos_apply() callback * (damon_sysfs_before_damos_apply()) sets the status 'started'. The first - * ->after_sampling() callback (damon_sysfs_after_sampling()) after the call - * is called only after the scheme is completely applied - * to the given snapshot. Hence the callback knows the situation by showing - * 'started' status, and sets the status as 'finished'. Then, - * damon_sysfs_before_damos_apply() understands the situation by showing the - * 'finished' status and do nothing. + * ->after_sampling() or ->after_aggregation() callback + * (damon_sysfs_cmd_request_callback()) after the call is called only after + * the scheme is completely applied to the given snapshot. Hence the callback + * knows the situation by showing 'started' status, and sets the status as + * 'finished'. Then, damon_sysfs_before_damos_apply() understands the + * situation by showing the 'finished' status and do nothing. * * If DAMOS is not applied to any region due to any reasons including the * access pattern, the watermarks, the quotas, and the filters, @@ -2122,7 +2122,7 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, * callback is registered, damon_sysfs_lock should be held to ensure the * regions directories exist. 
*/ -static int damon_sysfs_after_sampling(struct damon_ctx *ctx) +void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx) { struct damon_sysfs_schemes *sysfs_schemes = damon_sysfs_schemes_for_damos_callback; @@ -2138,8 +2138,6 @@ static int damon_sysfs_after_sampling(struct damon_ctx *ctx) sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_FINISHED; } - - return 0; } /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ @@ -2212,7 +2210,6 @@ int damon_sysfs_schemes_update_regions_start( damos_tried_regions_init_upd_status(sysfs_schemes, ctx); damos_regions_upd_total_bytes_only = total_bytes_only; ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; - ctx->callback.after_sampling = damon_sysfs_after_sampling; return 0; } @@ -2241,7 +2238,6 @@ int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx) { damon_sysfs_schemes_for_damos_callback = NULL; ctx->callback.before_damos_apply = NULL; - ctx->callback.after_sampling = NULL; damon_sysfs_schemes_region_idx = 0; return 0; } diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 1f891e18b4ee10..678de97fcc888d 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1379,11 +1379,13 @@ static int damon_sysfs_commit_schemes_quota_goals( * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests. * @c: The DAMON context of the callback. * @active: Whether @c is not deactivated due to watermarks. + * @after_aggr: Whether this is called from after_aggregation() callback. * * This function is periodically called back from the kdamond thread for @c. * Then, it checks if there is a waiting DAMON sysfs request and handles it. */ -static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active) +static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active, + bool after_aggregation) { struct damon_sysfs_kdamond *kdamond; bool total_bytes_only = false; @@ -1401,6 +1403,8 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active) err = damon_sysfs_upd_schemes_stats(kdamond); break; case DAMON_SYSFS_CMD_COMMIT: + if (!after_aggregation) + goto out; err = damon_sysfs_commit_input(kdamond); break; case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS: @@ -1418,6 +1422,7 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active) goto keep_lock_out; } } else { + damos_sysfs_mark_finished_regions_updates(c); /* * Continue regions updating if DAMON is till * active and the update for all schemes is not @@ -1450,7 +1455,16 @@ static int damon_sysfs_after_wmarks_check(struct damon_ctx *c) * after_wmarks_check() is called back while the context is deactivated * by watermarks. */ - return damon_sysfs_cmd_request_callback(c, false); + return damon_sysfs_cmd_request_callback(c, false, false); +} + +static int damon_sysfs_after_sampling(struct damon_ctx *c) +{ + /* + * after_sampling() is called back only while the context is not + * deactivated by watermarks. + */ + return damon_sysfs_cmd_request_callback(c, true, false); } static int damon_sysfs_after_aggregation(struct damon_ctx *c) @@ -1459,7 +1473,7 @@ static int damon_sysfs_after_aggregation(struct damon_ctx *c) * after_aggregation() is called back only while the context is not * deactivated by watermarks. 
*/ - return damon_sysfs_cmd_request_callback(c, true); + return damon_sysfs_cmd_request_callback(c, true, true); } static struct damon_ctx *damon_sysfs_build_ctx( @@ -1478,6 +1492,7 @@ static struct damon_ctx *damon_sysfs_build_ctx( } ctx->callback.after_wmarks_check = damon_sysfs_after_wmarks_check; + ctx->callback.after_sampling = damon_sysfs_after_sampling; ctx->callback.after_aggregation = damon_sysfs_after_aggregation; ctx->callback.before_terminate = damon_sysfs_before_terminate; return ctx; From c5f88a2c7dc11d5fe75f63a642a44d94c05cc9ae Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Feb 2024 10:01:51 +0800 Subject: [PATCH 1174/1406] mm/demotion: print demotion targets Currently, when a demotion occurs, it will prioritize selecting a node from the preferred targets as the destination node for the demotion. If the preferred node does not meet the requirements, it will try from all the lower memory tier nodes until it finds a suitable demotion destination node or ultimately fails. However, the demotion target information isn't exposed to the users, especially the preferred target information, which relies on more factors. This makes it hard for users to understand the exact demotion behavior. Rather than having a new sysfs interface to expose this information, printing directly to kernel messages, just like the current page allocation fallback order does. A dmesg example with this patch is as follows: [ 0.704860] Demotion targets for Node 0: null [ 0.705456] Demotion targets for Node 1: null // node 2 is onlined [ 32.259775] Demotion targets for Node 0: perferred: 2, fallback: 2 [ 32.261290] Demotion targets for Node 1: perferred: 2, fallback: 2 [ 32.262726] Demotion targets for Node 2: null // node 3 is onlined [ 42.448809] Demotion targets for Node 0: perferred: 2, fallback: 2-3 [ 42.450704] Demotion targets for Node 1: perferred: 2, fallback: 2-3 [ 42.452556] Demotion targets for Node 2: perferred: 3, fallback: 3 [ 42.454136] Demotion targets for Node 3: null // node 4 is onlined [ 52.676833] Demotion targets for Node 0: perferred: 2, fallback: 2-4 [ 52.678735] Demotion targets for Node 1: perferred: 2, fallback: 2-4 [ 52.680493] Demotion targets for Node 2: perferred: 4, fallback: 3-4 [ 52.682154] Demotion targets for Node 3: null [ 52.683405] Demotion targets for Node 4: null // node 5 is onlined [ 62.931902] Demotion targets for Node 0: perferred: 2, fallback: 2-5 [ 62.938266] Demotion targets for Node 1: perferred: 5, fallback: 2-5 [ 62.943515] Demotion targets for Node 2: perferred: 4, fallback: 3-4 [ 62.947471] Demotion targets for Node 3: null [ 62.949908] Demotion targets for Node 4: null [ 62.952137] Demotion targets for Node 5: perferred: 3, fallback: 3-4 Regarding this requirement, we have previously discussed [1]. The initial proposal involved introducing a new sysfs interface. However, due to concerns about potential changes and compatibility issues with the interface in the future, a consensus was not reached with the community. Therefore, this time, we are directly printing out the information. 
[1] https://lore.kernel.org/all/d1d5add8-8f4a-4578-8bf0-2cbe79b09989@fujitsu.com/ Link: https://lkml.kernel.org/r/20240206020151.605516-1-lizhijian@fujitsu.com Signed-off-by: Li Zhijian Reviewed-by: "Huang, Ying" Cc: Aneesh Kumar K.V Signed-off-by: Andrew Morton --- mm/memory-tiers.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index ed20f96bf89dc8..0537664620e5f5 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -359,6 +359,26 @@ static void disable_all_demotion_targets(void) synchronize_rcu(); } +static void dump_demotion_targets(void) +{ + int node; + + for_each_node_state(node, N_MEMORY) { + struct memory_tier *memtier = __node_get_memory_tier(node); + nodemask_t preferred = node_demotion[node].preferred; + + if (!memtier) + continue; + + if (nodes_empty(preferred)) + pr_info("Demotion targets for Node %d: null\n", node); + else + pr_info("Demotion targets for Node %d: preferred: %*pbl, fallback: %*pbl\n", + node, nodemask_pr_args(&preferred), + nodemask_pr_args(&memtier->lower_tier_mask)); + } +} + /* * Find an automatic demotion target for all memory * nodes. Failing here is OK. It might just indicate @@ -443,7 +463,7 @@ static void establish_demotion_targets(void) * Now build the lower_tier mask for each node collecting node mask from * all memory tier below it. This allows us to fallback demotion page * allocation to a set of nodes that is closer the above selected - * perferred node. + * preferred node. */ lower_tier = node_states[N_MEMORY]; list_for_each_entry(memtier, &memory_tiers, list) { @@ -456,6 +476,8 @@ static void establish_demotion_targets(void) nodes_andnot(lower_tier, lower_tier, tier_nodes); memtier->lower_tier_mask = lower_tier; } + + dump_demotion_targets(); } #else From fe55173855f25de5e1427f73b13c0fa3106366aa Mon Sep 17 00:00:00 2001 From: Mark-PK Tsai Date: Sat, 7 Oct 2023 15:05:53 +0800 Subject: [PATCH 1175/1406] zram: use copy_page for full page copy Some architectures, such as arm, have implemented optimized copy_page for full page copying. Replace the full page memcpy with copy_page to take advantage of the optimization. Link: https://lkml.kernel.org/r/20231007070554.8657-1-mark-pk.tsai@mediatek.com Signed-off-by: Mark-PK Tsai Reviewed-by: Sergey Senozhatsky Cc: AngeloGioacchino Del Regno Cc: Jens Axboe Cc: Matthias Brugger Cc: Minchan Kim Cc: YJ Chiang Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 6772e0c654fa7f..242a1fece18d4e 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1337,7 +1337,7 @@ static int zram_read_from_zspool(struct zram *zram, struct page *page, src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); if (size == PAGE_SIZE) { dst = kmap_local_page(page); - memcpy(dst, src, PAGE_SIZE); + copy_page(dst, src); kunmap_local(dst); ret = 0; } else { From 1ceaa9bcbcc96447aedb0f71a355c7f4ee7be6f4 Mon Sep 17 00:00:00 2001 From: John Groves Date: Mon, 5 Feb 2024 18:57:37 -0600 Subject: [PATCH 1176/1406] memremap.h: correct an error in a comment It tried to send me off to memory_hotplug.h for an enum that is a few lines above... 
Link: https://lkml.kernel.org/r/dba0f5f01162d6fa16e4da2a9fede7f97080e92d.1707179960.git.john@groves.net Signed-off-by: John Groves Reviewed-by: Dan Williams Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/memremap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 9837f3e6fb9582..3f7143ade32c04 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -109,7 +109,7 @@ struct dev_pagemap_ops { * @altmap: pre-allocated/reserved memory for vmemmap allocations * @ref: reference count that pins the devm_memremap_pages() mapping * @done: completion for @ref - * @type: memory type: see MEMORY_* in memory_hotplug.h + * @type: memory type: see MEMORY_* above in memremap.h * @flags: PGMAP_* flags to specify defailed behavior * @vmemmap_shift: structural definition of how the vmemmap page metadata * is populated, specifically the metadata page order. From 2f06003ee470a9e33f9fe72800e1ce2d279a3005 Mon Sep 17 00:00:00 2001 From: Chunsheng Luo Date: Sun, 4 Feb 2024 03:34:14 -0500 Subject: [PATCH 1177/1406] meminfo: provide estimated per-node available memory The system offers an estimate of each node's available memory, in addition to the system-wide available memory provided by /proc/meminfo. Like commit 34e431b0ae39 ("/proc/meminfo: provide estimated available memory"), it is more convenient to provide such an estimate in /sys/bus/node/devices/nodex/meminfo. If things change in the future, we only have to change it in one place. Shown below: /sys/bus/node/devices/node1/meminfo: Node 1 MemTotal: 4084480 kB Node 1 MemFree: 3348820 kB Node 1 MemAvailable: 3647972 kB Node 1 MemUsed: 735660 kB .... Link: https://github.com/numactl/numactl/issues/210 Link: https://lkml.kernel.org/r/20240204083414.107799-1-luochunsheng@ustc.edu Signed-off-by: Chunsheng Luo Cc: Greg Kroah-Hartman Cc: "Rafael J.
Wysocki" Signed-off-by: Andrew Morton --- drivers/base/node.c | 4 ++++ include/linux/mm.h | 1 + mm/show_mem.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+) diff --git a/drivers/base/node.c b/drivers/base/node.c index 1c05640461dd16..ba27f25d2b8121 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -372,11 +372,13 @@ static ssize_t node_read_meminfo(struct device *dev, int len = 0; int nid = dev->id; struct pglist_data *pgdat = NODE_DATA(nid); + long available; struct sysinfo i; unsigned long sreclaimable, sunreclaimable; unsigned long swapcached = 0; si_meminfo_node(&i, nid); + available = si_mem_node_available(nid); sreclaimable = node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B); sunreclaimable = node_page_state_pages(pgdat, NR_SLAB_UNRECLAIMABLE_B); #ifdef CONFIG_SWAP @@ -385,6 +387,7 @@ static ssize_t node_read_meminfo(struct device *dev, len = sysfs_emit_at(buf, len, "Node %d MemTotal: %8lu kB\n" "Node %d MemFree: %8lu kB\n" + "Node %d MemAvailable: %8lu kB\n" "Node %d MemUsed: %8lu kB\n" "Node %d SwapCached: %8lu kB\n" "Node %d Active: %8lu kB\n" @@ -397,6 +400,7 @@ static ssize_t node_read_meminfo(struct device *dev, "Node %d Mlocked: %8lu kB\n", nid, K(i.totalram), nid, K(i.freeram), + nid, K(available), nid, K(i.totalram - i.freeram), nid, K(swapcached), nid, K(node_page_state(pgdat, NR_ACTIVE_ANON) + diff --git a/include/linux/mm.h b/include/linux/mm.h index ac6b71cbdffbfa..3c85634b186cf3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3202,6 +3202,7 @@ static inline void show_mem(void) extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); +extern long si_mem_node_available(int nid); #ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES extern unsigned long arch_reserved_kernel_pages(void); #endif diff --git a/mm/show_mem.c b/mm/show_mem.c index 8dcfafbd283c12..37d4c7212b064d 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -86,6 +86,49 @@ void si_meminfo(struct sysinfo *val) EXPORT_SYMBOL(si_meminfo); #ifdef CONFIG_NUMA +long si_mem_node_available(int nid) +{ + int zone_type; + long available; + unsigned long pagecache; + unsigned long wmark_low = 0; + unsigned long reclaimable; + pg_data_t *pgdat = NODE_DATA(nid); + + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) + wmark_low += low_wmark_pages((&pgdat->node_zones[zone_type])); + + /* + * Estimate the amount of memory available for userspace allocations, + * without causing swapping for mbind process. + */ + available = sum_zone_node_page_state(nid, NR_FREE_PAGES) - pgdat->totalreserve_pages; + + /* + * Not all the page cache can be freed, otherwise the system will + * start swapping or thrashing. Assume at least half of the page + * cache, or the low watermark worth of cache, needs to stay. + */ + pagecache = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE); + pagecache -= min(pagecache / 2, wmark_low); + available += pagecache; + + /* + * Part of the reclaimable slab and other kernel memory consists of + * items that are in use, and cannot be freed. Cap this estimate at the + * low watermark. 
+ */ + reclaimable = node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) + + node_page_state(pgdat, NR_KERNEL_MISC_RECLAIMABLE); + reclaimable -= min(reclaimable / 2, wmark_low); + available += reclaimable; + + if (available < 0) + available = 0; + return available; +} + void si_meminfo_node(struct sysinfo *val, int nid) { int zone_type; /* needs to be signed */ From 42913aed2671b8614001aab45aa232ace9787dd6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 7 Feb 2024 12:31:27 -0800 Subject: [PATCH 1178/1406] selftests/damon/_damon_sysfs: support DAMOS quota Patch series "selftests/damon: add more tests for core functionalities and corner cases". Continue DAMON selftests' test coverage improvement work with a trivial improvement of the test code itself. The sequence of the patches in this patchset is as follows. The first five patches add tests for two DAMON core functionalities. They begin with three patches (patches 1-3) that update the test-purpose DAMON sysfs interface wrapper to support DAMOS quota, stats, and apply interval features, respectively. The fourth patch implements and adds a selftest for the DAMOS quota feature, using the DAMON sysfs interface wrapper's newly added support of the quota and the stats features. The fifth patch further implements and adds a selftest for DAMOS apply intervals using the DAMON sysfs interface wrapper's newly added support of the apply interval and the stats features. Two patches (patches 6 and 7) implementing and adding two corner case handling selftests follow. Those aim to keep two previously fixed bugs from recurring. Finally, a patch making the DAMON debugfs selftests' dependency checker use /proc/mounts instead of a hard-coded mount point assumption follows. This patch (of 8): Update the test-purpose DAMON sysfs control Python module to support DAMOS quota.
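For illustration, with this change a test can attach a quota to a scheme through the wrapper as in the minimal sketch below; the pid and the quota values are placeholders, and the complete real usage is the damos_quota.py selftest added later in this series:

import _damon_sysfs

# Limit the scheme's action to 1 MiB worth of regions per 100 ms.
quota = _damon_sysfs.DamosQuota(sz=1024 * 1024, ms=0, reset_interval_ms=100)
kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond(
        contexts=[_damon_sysfs.DamonCtx(
            ops='vaddr',
            targets=[_damon_sysfs.DamonTarget(pid=1234)],  # placeholder pid
            schemes=[_damon_sysfs.Damos(action='stat', quota=quota)])])])
err = kdamonds.start()  # stage() writes the quota sysfs files, then starts
if err != None:
    print('kdamond start failed: %s' % err)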
Link: https://lkml.kernel.org/r/20240207203134.69976-1-sj@kernel.org Link: https://lkml.kernel.org/r/20240207203134.69976-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 42 +++++++++++++++---- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index e98cf4b6a4b76f..b4f6e385c564fb 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -70,18 +70,48 @@ def stage(self): if err != None: return err +class DamosQuota: + sz = None # size quota, in bytes + ms = None # time quota + reset_interval_ms = None # quota reset interval + scheme = None # owner scheme + + def __init__(self, sz=0, ms=0, reset_interval_ms=0): + self.sz = sz + self.ms = ms + self.reset_interval_ms = reset_interval_ms + + def sysfs_dir(self): + return os.path.join(self.scheme.sysfs_dir(), 'quotas') + + def stage(self): + err = write_file(os.path.join(self.sysfs_dir(), 'bytes'), self.sz) + if err != None: + return err + err = write_file(os.path.join(self.sysfs_dir(), 'ms'), self.ms) + if err != None: + return err + err = write_file(os.path.join(self.sysfs_dir(), 'reset_interval_ms'), + self.reset_interval_ms) + if err != None: + return err + class Damos: action = None access_pattern = None - # todo: Support quotas, watermarks, stats, tried_regions + quota = None + # todo: Support watermarks, stats, tried_regions idx = None context = None tried_bytes = None - def __init__(self, action='stat', access_pattern=DamosAccessPattern()): + def __init__(self, action='stat', access_pattern=DamosAccessPattern(), + quota=DamosQuota()): self.action = action self.access_pattern = access_pattern self.access_pattern.scheme = self + self.quota = quota + self.quota.scheme = self def sysfs_dir(self): return os.path.join( @@ -94,13 +124,7 @@ def stage(self): err = self.access_pattern.stage() if err != None: return err - - # disable quotas - err = write_file(os.path.join(self.sysfs_dir(), 'quotas', 'ms'), '0') - if err != None: - return err - err = write_file( - os.path.join(self.sysfs_dir(), 'quotas', 'bytes'), '0') + err = self.quota.stage() if err != None: return err From a67cebb10064cdc05fdc28857d0e2c6e8623196b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 7 Feb 2024 12:31:28 -0800 Subject: [PATCH 1179/1406] selftests/damon/_damon_sysfs: support DAMOS stats Update the test-purpose DAMON sysfs control Python module to support DAMOS stats. 
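For illustration, reading the stats back through the updated wrapper is expected to look like the minimal sketch below, assuming a 'kdamonds' object built and started as in the previous patch:

# Ask the kernel to sync the stats sysfs files, then read the cached copies.
err = kdamonds.kdamonds[0].update_schemes_stats()
if err != None:
    print('stats update failed: %s' % err)
else:
    stats = kdamonds.kdamonds[0].contexts[0].schemes[0].stats
    print('tried %d times (%d bytes), applied %d times (%d bytes)' %
          (stats.nr_tried, stats.sz_tried, stats.nr_applied,
           stats.sz_applied))
    print('quota limit exceeded %d times' % stats.qt_exceeds)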
Link: https://lkml.kernel.org/r/20240207203134.69976-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index b4f6e385c564fb..a75244451684b0 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -96,6 +96,20 @@ def stage(self): if err != None: return err +class DamosStats: + nr_tried = None + sz_tried = None + nr_applied = None + sz_applied = None + qt_exceeds = None + + def __init__(self, nr_tried, sz_tried, nr_applied, sz_applied, qt_exceeds): + self.nr_tried = nr_tried + self.sz_tried = sz_tried + self.nr_applied = nr_applied + self.sz_applied = sz_applied + self.qt_exceeds = qt_exceeds + class Damos: action = None access_pattern = None @@ -104,6 +118,7 @@ class Damos: idx = None context = None tried_bytes = None + stats = None def __init__(self, action='stat', access_pattern=DamosAccessPattern(), quota=DamosQuota()): @@ -322,6 +337,23 @@ def update_schemes_tried_bytes(self): return err scheme.tried_bytes = int(content) + def update_schemes_stats(self): + err = write_file(os.path.join(self.sysfs_dir(), 'state'), + 'update_schemes_stats') + if err != None: + return err + for context in self.contexts: + for scheme in context.schemes: + stat_values = [] + for stat in ['nr_tried', 'sz_tried', 'nr_applied', + 'sz_applied', 'qt_exceeds']: + content, err = read_file( + os.path.join(scheme.sysfs_dir(), 'stats', stat)) + if err != None: + return err + stat_values.append(int(content)) + scheme.stats = DamosStats(*stat_values) + class Kdamonds: kdamonds = [] From cf4eebbb754861be889786fd7eb3e6f4c097c2d9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 7 Feb 2024 12:31:29 -0800 Subject: [PATCH 1180/1406] selftests/damon/_damon_sysfs: support DAMOS apply interval Update the test-purpose DAMON sysfs control Python module to support DAMOS apply interval. 
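For illustration, the new knob is a constructor argument, as in the minimal sketch below (placeholder values); per the kernel's semantics, the value 0 leaves the scheme applied at each aggregation interval:

import _damon_sysfs

# One scheme applied every aggregation interval, one every 10 ms.
default_scheme = _damon_sysfs.Damos(action='stat', apply_interval_us=0)
fast_scheme = _damon_sysfs.Damos(action='stat',
                                 apply_interval_us=10 * 1000)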
Link: https://lkml.kernel.org/r/20240207203134.69976-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index a75244451684b0..d23d7398a27a8d 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -114,6 +114,7 @@ class Damos: action = None access_pattern = None quota = None + apply_interval_us = None # todo: Support watermarks, stats, tried_regions idx = None context = None @@ -121,12 +122,13 @@ class Damos: stats = None def __init__(self, action='stat', access_pattern=DamosAccessPattern(), - quota=DamosQuota()): + quota=DamosQuota(), apply_interval_us=0): self.action = action self.access_pattern = access_pattern self.access_pattern.scheme = self self.quota = quota self.quota.scheme = self + self.apply_interval_us = apply_interval_us def sysfs_dir(self): return os.path.join( @@ -139,6 +141,11 @@ def stage(self): err = self.access_pattern.stage() if err != None: return err + err = write_file(os.path.join(self.sysfs_dir(), 'apply_interval_us'), + '%d' % self.apply_interval_us) + if err != None: + return err + err = self.quota.stage() if err != None: return err From 28dd34e7af9aa44e41cde3acc8a92934c5547725 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 7 Feb 2024 12:31:30 -0800 Subject: [PATCH 1181/1406] selftests/damon: add a test for DAMOS quota Add a selftest for verifying the DAMOS quota feature. The test is very similar to sysfs_update_schemes_tried_regions_wss_estimation.py. It starts an artificial workload of 20 MiB working set, runs DAMON to find the working set size, but with a 1 MiB/100 ms size quota. Then, it collects the DAMON-found working set size every 100 ms and checks whether the quota was always applied as expected. For confirmation, the test checks the stat-applied region size and the qt_exceeds stat.
Link: https://lkml.kernel.org/r/20240207203134.69976-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 1 + tools/testing/selftests/damon/damos_quota.py | 67 ++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 tools/testing/selftests/damon/damos_quota.py diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 8a1cc2bf1864a8..9c3783f1a39d07 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -12,6 +12,7 @@ TEST_PROGS += debugfs_rm_non_contexts.sh TEST_PROGS += sysfs.sh sysfs_update_removed_scheme_dir.sh TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py +TEST_PROGS += damos_quota.py TEST_PROGS += reclaim.sh lru_sort.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/damos_quota.py b/tools/testing/selftests/damon/damos_quota.py new file mode 100644 index 00000000000000..7d4c6bb2e3cd27 --- /dev/null +++ b/tools/testing/selftests/damon/damos_quota.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +import time + +import _damon_sysfs + +def main(): + # access two 10 MiB memory regions, 2 second per each + sz_region = 10 * 1024 * 1024 + proc = subprocess.Popen(['./access_memory', '2', '%d' % sz_region, '2000']) + + # Set quota up to 1 MiB per 100 ms + sz_quota = 1024 * 1024 # 1 MiB + quota_reset_interval = 100 # 100 ms + kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond( + contexts=[_damon_sysfs.DamonCtx( + ops='vaddr', + targets=[_damon_sysfs.DamonTarget(pid=proc.pid)], + schemes=[_damon_sysfs.Damos( + access_pattern=_damon_sysfs.DamosAccessPattern( + # >= 25% access rate, >= 200ms age + nr_accesses=[5, 20], age=[2, 2**64 - 1]), + quota=_damon_sysfs.DamosQuota( + sz=sz_quota, reset_interval_ms=quota_reset_interval) + )] # schemes + )] # contexts + )]) # kdamonds + + err = kdamonds.start() + if err != None: + print('kdamond start failed: %s' % err) + exit(1) + + wss_collected = [] + nr_quota_exceeds = 0 + while proc.poll() == None: + time.sleep(0.1) + err = kdamonds.kdamonds[0].update_schemes_tried_bytes() + if err != None: + print('tried bytes update failed: %s' % err) + exit(1) + err = kdamonds.kdamonds[0].update_schemes_stats() + if err != None: + print('stats update failed: %s' % err) + exit(1) + + scheme = kdamonds.kdamonds[0].contexts[0].schemes[0] + wss_collected.append(scheme.tried_bytes) + nr_quota_exceeds = scheme.stats.qt_exceeds + + wss_collected.sort() + for wss in wss_collected: + if wss > sz_quota: + print('quota is not kept: %s > %s' % (wss, sz_quota)) + print('collected samples are as below') + print('\n'.join(['%d' % wss for wss in wss_collected])) + exit(1) + + if nr_quota_exceeds < len(wss_collected): + print('quota is not always exceeded: %d > %d' % + (len(wss_collected), nr_quota_exceeds)) + exit(1) + +if __name__ == '__main__': + main() From ba0bc64c40d4bb846e75233a770db23cea70df85 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 7 Feb 2024 12:31:31 -0800 Subject: [PATCH 1182/1406] selftests/damon: add a test for DAMOS apply intervals Add a selftest for DAMOS apply intervals. It runs two schemes having different apply intervals against an artificial memory access workload, and checks whether the scheme with the smaller apply interval was applied more frequently.
Link: https://lkml.kernel.org/r/20240207203134.69976-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 2 +- .../selftests/damon/damos_apply_interval.py | 67 +++++++++++++++++++ 2 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/damon/damos_apply_interval.py diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 9c3783f1a39d07..b545fedafb3b9f 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -12,7 +12,7 @@ TEST_PROGS += debugfs_rm_non_contexts.sh TEST_PROGS += sysfs.sh sysfs_update_removed_scheme_dir.sh TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py -TEST_PROGS += damos_quota.py +TEST_PROGS += damos_quota.py damos_apply_interval.py TEST_PROGS += reclaim.sh lru_sort.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/damos_apply_interval.py b/tools/testing/selftests/damon/damos_apply_interval.py new file mode 100644 index 00000000000000..f04d43702481c5 --- /dev/null +++ b/tools/testing/selftests/damon/damos_apply_interval.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +import time + +import _damon_sysfs + +def main(): + # access two 10 MiB memory regions, 2 second per each + sz_region = 10 * 1024 * 1024 + proc = subprocess.Popen(['./access_memory', '2', '%d' % sz_region, '2000']) + + # Run two schemes, one with the default apply interval, one with 10 ms + kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond( + contexts=[_damon_sysfs.DamonCtx( + ops='vaddr', + targets=[_damon_sysfs.DamonTarget(pid=proc.pid)], + schemes=[ + _damon_sysfs.Damos( + access_pattern=_damon_sysfs.DamosAccessPattern( + # >= 25% access rate, >= 200ms age + nr_accesses=[5, 20], age=[2, 2**64 - 1]), + # aggregation interval (100 ms) is used + apply_interval_us=0), + # use 10ms apply interval + _damon_sysfs.Damos( + access_pattern=_damon_sysfs.DamosAccessPattern( + # >= 25% access rate, >= 200ms age + nr_accesses=[5, 20], age=[2, 2**64 - 1]), + # explicitly set 10 ms apply interval + apply_interval_us=10 * 1000) + ] # schemes + )] # contexts + )]) # kdamonds + + err = kdamonds.start() + if err != None: + print('kdamond start failed: %s' % err) + exit(1) + + wss_collected = [] + nr_quota_exceeds = 0 + while proc.poll() == None: + time.sleep(0.1) + err = kdamonds.kdamonds[0].update_schemes_stats() + if err != None: + print('stats update failed: %s' % err) + exit(1) + schemes = kdamonds.kdamonds[0].contexts[0].schemes + nr_tried_stats = [s.stats.nr_tried for s in schemes] + if nr_tried_stats[0] == 0 or nr_tried_stats[1] == 0: + print('scheme(s) are not tried') + exit(1) + + # Because the second scheme has an apply interval that is ten times + # lower than that of the first scheme, the second scheme should be tried + # about ten times more frequently than the first scheme. For possible + # timing errors, check if it was at least nine times more frequently tried.
+ ratio = nr_tried_stats[1] / nr_tried_stats[0] + if ratio < 9: + print('%d / %d = %f (< 9)' % + (nr_tried_stats[1], nr_tried_stats[0], ratio)) + exit(1) + +if __name__ == '__main__': + main() From 8b9cbfaa43ab944bfc364e193506c057d42f5ce8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 7 Feb 2024 12:31:32 -0800 Subject: [PATCH 1183/1406] selftests/damon: add a test for a race between target_ids_read() and dbgfs_before_terminate() commit 34796417964b ("mm/damon/dbgfs: protect targets destructions with kdamond_lock") fixed a race of DAMON debugfs interface. Specifically, the race was happening between target_ids_read() and dbgfs_before_terminate(). Add a test for the issue to prevent the problem from accidentally recurring. Link: https://lkml.kernel.org/r/20240207203134.69976-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/.gitignore | 1 + tools/testing/selftests/damon/Makefile | 2 + ...fs_target_ids_read_before_terminate_race.c | 80 +++++++++++++++++++ ...s_target_ids_read_before_terminate_race.sh | 14 ++++ 4 files changed, 97 insertions(+) create mode 100644 tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c create mode 100644 tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh diff --git a/tools/testing/selftests/damon/.gitignore b/tools/testing/selftests/damon/.gitignore index c6c2965a660754..7d6c6e062be7f7 100644 --- a/tools/testing/selftests/damon/.gitignore +++ b/tools/testing/selftests/damon/.gitignore @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only huge_count_read_write +debugfs_target_ids_read_before_terminate_race diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index b545fedafb3b9f..8a3a8df003dbb8 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -2,6 +2,7 @@ # Makefile for damon selftests TEST_GEN_FILES += huge_count_read_write +TEST_GEN_FILES += debugfs_target_ids_read_before_terminate_race TEST_GEN_FILES += access_memory TEST_FILES = _chk_dependency.sh _debugfs_common.sh @@ -9,6 +10,7 @@ TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += debugfs_rm_non_contexts.sh +TEST_PROGS += debugfs_target_ids_read_before_terminate_race.sh TEST_PROGS += sysfs.sh sysfs_update_removed_scheme_dir.sh TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py diff --git a/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c new file mode 100644 index 00000000000000..b06f52a8ce2d35 --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Author: SeongJae Park + */ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DBGFS_MONITOR_ON "/sys/kernel/debug/damon/monitor_on_DEPRECATED" +#define DBGFS_TARGET_IDS "/sys/kernel/debug/damon/target_ids" + +static void turn_damon_on_exit(void) +{ + int target_ids_fd = open(DBGFS_TARGET_IDS, O_RDWR); + int monitor_on_fd = open(DBGFS_MONITOR_ON, O_RDWR); + char pid_str[128]; + + snprintf(pid_str, sizeof(pid_str), "%d", getpid()); + 
write(target_ids_fd, pid_str, sizeof(pid_str)); + write(monitor_on_fd, "on\n", 3); + close(target_ids_fd); + close(monitor_on_fd); + usleep(1000); + exit(0); +} + +static void try_race(void) +{ + int target_ids_fd = open(DBGFS_TARGET_IDS, O_RDWR); + int pid = fork(); + int buf[256]; + + if (pid < 0) { + fprintf(stderr, "fork() failed\n"); + exit(1); + } + if (pid == 0) + turn_damon_on_exit(); + while (true) { + int status; + + read(target_ids_fd, buf, sizeof(buf)); + if (waitpid(-1, &status, WNOHANG) == pid) + break; + } + close(target_ids_fd); +} + +static inline uint64_t ts_to_ms(struct timespec *ts) +{ + return (uint64_t)ts->tv_sec * 1000 + (uint64_t)ts->tv_nsec / 1000000; +} + +int main(int argc, char *argv[]) +{ + struct timespec start_time, now; + int runtime_ms; + + if (argc != 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + exit(1); + } + runtime_ms = atoi(argv[1]); + clock_gettime(CLOCK_MONOTONIC, &start_time); + while (true) { + try_race(); + clock_gettime(CLOCK_MONOTONIC, &now); + if (ts_to_ms(&now) - ts_to_ms(&start_time) > runtime_ms) + break; + } + return 0; +} diff --git a/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh new file mode 100644 index 00000000000000..fc793c4c9aea9f --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +dmesg -C + +./debugfs_target_ids_read_before_terminate_race 5000 + +if dmesg | grep -q dbgfs_target_ids_read +then + dmesg + exit 1 +else + exit 0 +fi From be523cd885b7372a4786c5da07265c49612f98a0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 7 Feb 2024 12:31:33 -0800 Subject: [PATCH 1184/1406] selftests/damon: add a test for the pid leak of dbgfs_target_ids_write() Commit ebb3f994dd92 ("mm/damon/dbgfs: fix 'struct pid' leaks in 'dbgfs_target_ids_write()'") fixed a 'struct pid' leak in the DAMON debugfs interface, namely the dbgfs_target_ids_write() function. Add a selftest for the issue to prevent the problem from recurring.
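The detection idea is sketched below in Python for clarity (hypothetical; the selftest added below implements the same check in shell): compare the number of active 'struct pid' slab objects before and after hammering target_ids with short-lived writer processes:

import subprocess

def nr_active_pid_slabs():
    # The second field of the 'pid' row in /proc/slabinfo is the count of
    # active 'struct pid' objects.
    with open('/proc/slabinfo') as f:
        for line in f:
            if line.startswith('pid '):
                return int(line.split()[1])

before = nr_active_pid_slabs()
# The helper forks writers for 1000 ms and prints how many it tried.
nr_tries = int(subprocess.check_output(
        ['./debugfs_target_ids_pid_leak', '1000']))
after = nr_active_pid_slabs()
# With the fix applied, 'after' should stay well below 'before + nr_tries'.
print('%d tries, active pid slabs: %d -> %d' % (nr_tries, before, after))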
Link: https://lkml.kernel.org/r/20240207203134.69976-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/.gitignore | 1 + tools/testing/selftests/damon/Makefile | 2 + .../damon/debugfs_target_ids_pid_leak.c | 68 +++++++++++++++++++ .../damon/debugfs_target_ids_pid_leak.sh | 22 ++++++ 4 files changed, 93 insertions(+) create mode 100644 tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c create mode 100644 tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh diff --git a/tools/testing/selftests/damon/.gitignore b/tools/testing/selftests/damon/.gitignore index 7d6c6e062be7f7..d861701f032712 100644 --- a/tools/testing/selftests/damon/.gitignore +++ b/tools/testing/selftests/damon/.gitignore @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only huge_count_read_write debugfs_target_ids_read_before_terminate_race +debugfs_target_ids_pid_leak diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 8a3a8df003dbb8..789d6949c24715 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -3,6 +3,7 @@ TEST_GEN_FILES += huge_count_read_write TEST_GEN_FILES += debugfs_target_ids_read_before_terminate_race +TEST_GEN_FILES += debugfs_target_ids_pid_leak TEST_GEN_FILES += access_memory TEST_FILES = _chk_dependency.sh _debugfs_common.sh @@ -11,6 +12,7 @@ TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += debugfs_rm_non_contexts.sh TEST_PROGS += debugfs_target_ids_read_before_terminate_race.sh +TEST_PROGS += debugfs_target_ids_pid_leak.sh TEST_PROGS += sysfs.sh sysfs_update_removed_scheme_dir.sh TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py diff --git a/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c new file mode 100644 index 00000000000000..0cc2eef7d1425c --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Author: SeongJae Park + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DBGFS_TARGET_IDS "/sys/kernel/debug/damon/target_ids" + +static void write_targetid_exit(void) +{ + int target_ids_fd = open(DBGFS_TARGET_IDS, O_RDWR); + char pid_str[128]; + + snprintf(pid_str, sizeof(pid_str), "%d", getpid()); + write(target_ids_fd, pid_str, sizeof(pid_str)); + close(target_ids_fd); + exit(0); +} + +unsigned long msec_timestamp(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000UL + tv.tv_usec / 1000; +} + +int main(int argc, char *argv[]) +{ + unsigned long start_ms; + int time_to_run, nr_forks = 0; + + if (argc != 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + exit(1); + } + time_to_run = atoi(argv[1]); + + start_ms = msec_timestamp(); + while (true) { + int pid = fork(); + + if (pid < 0) { + fprintf(stderr, "fork() failed\n"); + exit(1); + } + if (pid == 0) + write_targetid_exit(); + wait(NULL); + nr_forks++; + + if (msec_timestamp() - start_ms > time_to_run) + break; + } + printf("%d\n", nr_forks); + return 0; +} diff --git a/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh new file mode 100644 index 
00000000000000..31fe33c2b03256 --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +before=$(grep "^pid " /proc/slabinfo | awk '{print $2}') + +nr_leaks=$(./debugfs_target_ids_pid_leak 1000) +expected_after_max=$((before + nr_leaks / 2)) + +after=$(grep "^pid " /proc/slabinfo | awk '{print $2}') + +echo > /sys/kernel/debug/damon/target_ids + +echo "tried $nr_leaks pid leak" +echo "number of active pid slabs: $before -> $after" +echo "(up to $expected_after_max expected)" +if [ $after -gt $expected_after_max ] +then + echo "maybe pids are leaking" + exit 1 +else + exit 0 +fi From 3b09b0166011888b8b1b8e9c3aad4e55c510e67f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 7 Feb 2024 12:31:34 -0800 Subject: [PATCH 1185/1406] selftests/damon/_chk_dependency: get debugfs mount point from /proc/mounts The DAMON debugfs selftests' dependency checker assumes debugfs is mounted at /sys/kernel/debug. That is OK in many cases, but some systems might mount the file system somewhere else. Parse the real mount point from the /proc/mounts file. Link: https://lkml.kernel.org/r/20240207203134.69976-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_chk_dependency.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/damon/_chk_dependency.sh b/tools/testing/selftests/damon/_chk_dependency.sh index 350f8c2b071dbc..dda3a87dc00a26 100644 --- a/tools/testing/selftests/damon/_chk_dependency.sh +++ b/tools/testing/selftests/damon/_chk_dependency.sh @@ -4,7 +4,14 @@ # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 -DBGFS=/sys/kernel/debug/damon +DBGFS=$(grep debugfs /proc/mounts --max-count 1 | awk '{print $2}') +if [ "$DBGFS" = "" ] +then + echo "debugfs not mounted" + exit $ksft_skip +fi + +DBGFS+="/damon" if [ $EUID -ne 0 ]; then From 21862e9236186711d943cac83ce654a24582bda0 Mon Sep 17 00:00:00 2001 From: Zhongkun He Date: Sun, 4 Feb 2024 20:54:04 +0800 Subject: [PATCH 1186/1406] mm/z3fold: remove unneeded spinlock There is no need to take the spinlock in this section, so remove it. Link: https://lkml.kernel.org/r/20240204125404.2112384-1-hezhongkun.hzk@bytedance.com Signed-off-by: Zhongkun He Cc: Domenico Cerasuolo Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/z3fold.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index dfae0fa58f576a..58946cacbfbb86 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -442,8 +442,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) WARN_ON(!list_empty(&zhdr->buddy)); set_bit(PAGE_STALE, &page->private); clear_bit(NEEDS_COMPACTING, &page->private); - spin_lock(&pool->lock); - spin_unlock(&pool->lock); if (locked) z3fold_page_unlock(zhdr); From beefe532e09c04a21aa3578a6fb5a5bae693c964 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 6 Feb 2024 10:27:31 +0530 Subject: [PATCH 1187/1406] mm/cma: add sysfs file 'release_pages_success' This adds the following new sysfs file tracking the number of successfully released pages from a given CMA heap area. This file will be available via CONFIG_CMA_SYSFS and helps in determining the active CMA pages available in the CMA heap area. This adds a new 'nr_pages_released' counter (CONFIG_CMA_SYSFS) into 'struct cma', which gets updated during cma_release().
/sys/kernel/mm/cma/<cma-heap-name>/release_pages_success After this change, a user will be able to find the active CMA pages available in a given CMA heap area via the following method. Active pages = alloc_pages_success - release_pages_success That's valuable information for both software designers and system admins as it allows them to tune the number of CMA pages available in the system. This increases user visibility for allocated CMA area and its utilization. Link: https://lkml.kernel.org/r/20240206045731.472759-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-cma | 6 ++++++ mm/cma.c | 1 + mm/cma.h | 5 +++++ mm/cma_sysfs.c | 15 +++++++++++++++ 4 files changed, 27 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-cma b/Documentation/ABI/testing/sysfs-kernel-mm-cma index 02b2bb60c2969c..dfd755201142f1 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-cma +++ b/Documentation/ABI/testing/sysfs-kernel-mm-cma @@ -23,3 +23,9 @@ Date: Feb 2021 Contact: Minchan Kim Description: the number of pages CMA API failed to allocate + +What: /sys/kernel/mm/cma/<cma-heap-name>/release_pages_success +Date: Feb 2024 +Contact: Anshuman Khandual +Description: + the number of pages CMA API succeeded to release diff --git a/mm/cma.c b/mm/cma.c index 4902bbfe24f121..01f5a8f71ddfa7 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -562,6 +562,7 @@ bool cma_release(struct cma *cma, const struct page *pages, free_contig_range(pfn, count); cma_clear_bitmap(cma, pfn, count); + cma_sysfs_account_release_pages(cma, count); trace_cma_release(cma->name, pfn, pages, count); return true; diff --git a/mm/cma.h b/mm/cma.h index 88a0595670b766..ad61cc6dd4396f 100644 --- a/mm/cma.h +++ b/mm/cma.h @@ -27,6 +27,8 @@ struct cma { atomic64_t nr_pages_succeeded; /* the number of CMA page allocation failures */ atomic64_t nr_pages_failed; + /* the number of CMA pages released */ + atomic64_t nr_pages_released; /* kobject requires dynamic object */ struct cma_kobject *cma_kobj; #endif @@ -44,10 +46,13 @@ static inline unsigned long cma_bitmap_maxno(struct cma *cma) #ifdef CONFIG_CMA_SYSFS void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages); void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages); +void cma_sysfs_account_release_pages(struct cma *cma, unsigned long nr_pages); #else static inline void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages) {}; static inline void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages) {}; +static inline void cma_sysfs_account_release_pages(struct cma *cma, + unsigned long nr_pages) {}; #endif #endif diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c index 56347d15b7e8b5..f50db397317182 100644 --- a/mm/cma_sysfs.c +++ b/mm/cma_sysfs.c @@ -24,6 +24,11 @@ void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages) atomic64_add(nr_pages, &cma->nr_pages_failed); } +void cma_sysfs_account_release_pages(struct cma *cma, unsigned long nr_pages) +{ + atomic64_add(nr_pages, &cma->nr_pages_released); +} + static inline struct cma *cma_from_kobj(struct kobject *kobj) { return container_of(kobj, struct cma_kobject, kobj)->cma; @@ -48,6 +53,15 @@ static ssize_t alloc_pages_fail_show(struct kobject *kobj, } CMA_ATTR_RO(alloc_pages_fail); +static ssize_t release_pages_success_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct cma *cma = cma_from_kobj(kobj); + + return sysfs_emit(buf, "%llu\n",
atomic64_read(&cma->nr_pages_released)); +} +CMA_ATTR_RO(release_pages_success); + static void cma_kobj_release(struct kobject *kobj) { struct cma *cma = cma_from_kobj(kobj); @@ -60,6 +74,7 @@ static void cma_kobj_release(struct kobject *kobj) static struct attribute *cma_attrs[] = { &alloc_pages_success_attr.attr, &alloc_pages_fail_attr.attr, + &release_pages_success_attr.attr, NULL, }; ATTRIBUTE_GROUPS(cma); From 0b6318cd6e0adfda7e7317469caa8255e7970714 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 8 Feb 2024 09:36:07 +0800 Subject: [PATCH 1188/1406] mm: compaction: refactor compact_node() Refactor compact_node() to handle both proactive and synchronous memory compaction, which cleans up the code a bit. Link: https://lkml.kernel.org/r/20240208013607.1731817-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Baolin Wang Signed-off-by: Andrew Morton --- mm/compaction.c | 65 ++++++++++++++++--------------------------------- 1 file changed, 21 insertions(+), 44 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 218089b29f1369..f146478b01bc74 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2798,25 +2798,27 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, } /* - * Compact all zones within a node till each zone's fragmentation score - * reaches within proactive compaction thresholds (as determined by the - * proactiveness tunable). + * compact_node() - compact all zones within a node + * @pgdat: The node page data + * @proactive: Whether the compaction is proactive * - * It is possible that the function returns before reaching score targets - * due to various back-off conditions, such as, contention on per-node or - * per-zone locks. + * For proactive compaction, compact till each zone's fragmentation score + * reaches within proactive compaction thresholds (as determined by the + * proactiveness tunable), it is possible that the function returns before + * reaching score targets due to various back-off conditions, such as, + * contention on per-node or per-zone locks. */ -static void proactive_compact_node(pg_data_t *pgdat) +static void compact_node(pg_data_t *pgdat, bool proactive) { int zoneid; struct zone *zone; struct compact_control cc = { .order = -1, - .mode = MIGRATE_SYNC_LIGHT, + .mode = proactive ?
MIGRATE_SYNC_LIGHT : MIGRATE_SYNC, .ignore_skip_hint = true, .whole_zone = true, .gfp_mask = GFP_KERNEL, - .proactive_compaction = true, + .proactive_compaction = proactive, }; for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { @@ -2828,41 +2830,16 @@ static void proactive_compact_node(pg_data_t *pgdat) compact_zone(&cc, NULL); - count_compact_events(KCOMPACTD_MIGRATE_SCANNED, - cc.total_migrate_scanned); - count_compact_events(KCOMPACTD_FREE_SCANNED, - cc.total_free_scanned); - } -} - -/* Compact all zones within a node */ -static void compact_node(int nid) -{ - pg_data_t *pgdat = NODE_DATA(nid); - int zoneid; - struct zone *zone; - struct compact_control cc = { - .order = -1, - .mode = MIGRATE_SYNC, - .ignore_skip_hint = true, - .whole_zone = true, - .gfp_mask = GFP_KERNEL, - }; - - - for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { - - zone = &pgdat->node_zones[zoneid]; - if (!populated_zone(zone)) - continue; - - cc.zone = zone; - - compact_zone(&cc, NULL); + if (proactive) { + count_compact_events(KCOMPACTD_MIGRATE_SCANNED, + cc.total_migrate_scanned); + count_compact_events(KCOMPACTD_FREE_SCANNED, + cc.total_free_scanned); + } } } -/* Compact all nodes in the system */ +/* Compact all zones of all nodes in the system */ static void compact_nodes(void) { int nid; @@ -2871,7 +2848,7 @@ static void compact_nodes(void) lru_add_drain_all(); for_each_online_node(nid) - compact_node(nid); + compact_node(NODE_DATA(nid), false); } static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, @@ -2933,7 +2910,7 @@ static ssize_t compact_store(struct device *dev, /* Flush pending updates to the LRU lists */ lru_add_drain_all(); - compact_node(nid); + compact_node(NODE_DATA(nid), false); } return count; @@ -3142,7 +3119,7 @@ static int kcompactd(void *p) unsigned int prev_score, score; prev_score = fragmentation_score_node(pgdat); - proactive_compact_node(pgdat); + compact_node(pgdat, true); score = fragmentation_score_node(pgdat); /* * Defer proactive compaction if the fragmentation From 091f4571faa6c3f71519bf5bbbf2425b4f46312a Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Thu, 8 Feb 2024 15:18:25 +0900 Subject: [PATCH 1189/1406] mm, vmscan: don't turn on cache_trim_mode at the highest scan priority With cache_trim_mode on, reclaim logic doesn't bother reclaiming anon pages. However, we should be more careful about turning the mode on because it prevents anon pages from being reclaimed even if there is a huge number of very cold anon pages that should be reclaimed. Even worse, that can cause kswapd_failures to reach MAX_RECLAIM_RETRIES, stopping kswapd until direct reclaim eventually succeeds and kswapd is resumed. Link: https://lkml.kernel.org/r/20240208061825.36640-1-byungchul@sk.com Signed-off-by: Byungchul Park Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 327bf904fdcdee..0b9f6981db7c29 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2269,7 +2269,8 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) * anonymous pages.
*/ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); - if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) + if (sc->priority != 1 && file >> sc->priority && + !(sc->may_deactivate & DEACTIVATE_FILE)) sc->cache_trim_mode = 1; else sc->cache_trim_mode = 0; From c65b07b85b3008cf70197e135ccf6cb5519b5d60 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Thu, 8 Feb 2024 07:57:27 +0200 Subject: [PATCH 1190/1406] MAINTAINERS: update mm and memcg entries Add F: lines for memory management and memory cgroup include files. Link: https://lkml.kernel.org/r/20240208055727.142387-1-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- MAINTAINERS | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index f3f5981ced2961..129a237b788044 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5383,6 +5383,7 @@ R: Muchun Song L: cgroups@vger.kernel.org L: linux-mm@kvack.org S: Maintained +F: include/linux/memcontrol.h F: mm/memcontrol.c F: mm/swap_cgroup.c F: samples/cgroup/* @@ -14101,15 +14102,24 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm T: quilt git://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new F: include/linux/gfp.h F: include/linux/gfp_types.h +F: include/linux/memfd.h +F: include/linux/memory.h F: include/linux/memory_hotplug.h +F: include/linux/memory-tiers.h +F: include/linux/mempolicy.h +F: include/linux/mempool.h +F: include/linux/memremap.h F: include/linux/mm.h +F: include/linux/mm_*.h F: include/linux/mmzone.h +F: include/linux/mmu_notifier.h F: include/linux/pagewalk.h F: include/linux/rmap.h F: include/trace/events/ksm.h F: mm/ F: tools/mm/ F: tools/testing/selftests/mm/ +N: include/linux/page[-_]* MEMORY MAPPING M: Andrew Morton From b1530e928d61b01d0a920acd3ed32652fc52723b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 8 Feb 2024 10:25:08 +0800 Subject: [PATCH 1191/1406] mm: compaction: early termination in compact_nodes() There is no need to keep trying to compact memory if a fatal signal is pending, so allow earlier loop termination in compact_nodes(). Link: https://lkml.kernel.org/r/20240208022508.1771534-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Signed-off-by: Andrew Morton --- mm/compaction.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index f146478b01bc74..52ff6b9344c707 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2808,7 +2808,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, * reaching score targets due to various back-off conditions, such as, * contention on per-node or per-zone locks.
*/ -static void compact_node(pg_data_t *pgdat, bool proactive) +static int compact_node(pg_data_t *pgdat, bool proactive) { int zoneid; struct zone *zone; @@ -2826,6 +2826,9 @@ static void compact_node(pg_data_t *pgdat, bool proactive) if (!populated_zone(zone)) continue; + if (fatal_signal_pending(current)) + return -EINTR; + cc.zone = zone; compact_zone(&cc, NULL); @@ -2837,18 +2840,25 @@ static void compact_node(pg_data_t *pgdat, bool proactive) cc.total_free_scanned); } } + + return 0; } /* Compact all zones of all nodes in the system */ -static void compact_nodes(void) +static int compact_nodes(void) { - int nid; + int ret, nid; /* Flush pending updates to the LRU lists */ lru_add_drain_all(); - for_each_online_node(nid) - compact_node(NODE_DATA(nid), false); + for_each_online_node(nid) { + ret = compact_node(NODE_DATA(nid), false); + if (ret) + return ret; + } + + return 0; } static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, @@ -2894,9 +2904,9 @@ static int sysctl_compaction_handler(struct ctl_table *table, int write, return -EINVAL; if (write) - compact_nodes(); + ret = compact_nodes(); - return 0; + return ret; } #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) From a0011f0f31f825642bc5dcd1f070b2dba2d6b3eb Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 9 Feb 2024 14:30:03 +0000 Subject: [PATCH 1192/1406] selftests/mm: log skipped compaction test as a skip Patch series "selftests/mm: Output cleanups for the compaction test". A couple of small updates for the check_compaction selftest which make it play more nicely with test automation systems. This patch (of 2): When the compaction test is run, it checks that the prerequisites it requires are available and skips the tests if not. When this happens we log the test as a pass rather than a skip; log it as a skip instead so that the distinction is clear and automation can see unexpected skips. Link: https://lkml.kernel.org/r/20240209-kselftest-mm-cleanup-v1-0-a3c0386496b5@kernel.org Link: https://lkml.kernel.org/r/20240209-kselftest-mm-cleanup-v1-1-a3c0386496b5@kernel.org Signed-off-by: Mark Brown Cc: Muhammad Usama Anjum Cc: Ryan Roberts Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/compaction_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c index 656afba02dbcc0..30150929c8c5dc 100644 --- a/tools/testing/selftests/mm/compaction_test.c +++ b/tools/testing/selftests/mm/compaction_test.c @@ -174,7 +174,7 @@ int main(int argc, char **argv) ksft_print_header(); if (prereq() || geteuid()) - return ksft_exit_pass(); + return ksft_exit_skip("Prerequisites unsatisfied\n"); ksft_set_plan(1); From dc462f9700b5cd72ab5f226eb524cfcc9faf9c7b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 9 Feb 2024 14:30:04 +0000 Subject: [PATCH 1193/1406] selftests/mm: log a consistent test name for check_compaction Every test result report in the compaction test prints a distinct log message, and some of the reports print a name that varies at runtime. This causes problems for automation since a lot of automation software uses the printed string as the name of the test; if the name varies from run to run and from pass to fail then the automation software can't identify that a test changed result or that the same tests are being run.
Refactor the logging to use a consistent name when printing the result of the test, printing the existing messages as diagnostic information instead so they are still available for people trying to interpret the results. Link: https://lkml.kernel.org/r/20240209-kselftest-mm-cleanup-v1-2-a3c0386496b5@kernel.org Signed-off-by: Mark Brown Cc: Muhammad Usama Anjum Cc: Ryan Roberts Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/compaction_test.c | 35 +++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c index 30150929c8c5dc..533999b6c28444 100644 --- a/tools/testing/selftests/mm/compaction_test.c +++ b/tools/testing/selftests/mm/compaction_test.c @@ -95,21 +95,22 @@ int check_compaction(unsigned long mem_free, unsigned int hugepage_size) fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK); if (fd < 0) { - ksft_test_result_fail("Failed to open /proc/sys/vm/nr_hugepages: %s\n", - strerror(errno)); - return -1; + ksft_print_msg("Failed to open /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); + ret = -1; + goto out; } if (read(fd, initial_nr_hugepages, sizeof(initial_nr_hugepages)) <= 0) { - ksft_test_result_fail("Failed to read from /proc/sys/vm/nr_hugepages: %s\n", - strerror(errno)); + ksft_print_msg("Failed to read from /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); goto close_fd; } /* Start with the initial condition of 0 huge pages*/ if (write(fd, "0", sizeof(char)) != sizeof(char)) { - ksft_test_result_fail("Failed to write 0 to /proc/sys/vm/nr_hugepages: %s\n", - strerror(errno)); + ksft_print_msg("Failed to write 0 to /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); goto close_fd; } @@ -118,16 +119,16 @@ int check_compaction(unsigned long mem_free, unsigned int hugepage_size) /* Request a large number of huge pages. The Kernel will allocate as much as it can */ if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) { - ksft_test_result_fail("Failed to write 100000 to /proc/sys/vm/nr_hugepages: %s\n", - strerror(errno)); + ksft_print_msg("Failed to write 100000 to /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); goto close_fd; } lseek(fd, 0, SEEK_SET); if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) { - ksft_test_result_fail("Failed to re-read from /proc/sys/vm/nr_hugepages: %s\n", - strerror(errno)); + ksft_print_msg("Failed to re-read from /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); goto close_fd; } @@ -139,24 +140,26 @@ int check_compaction(unsigned long mem_free, unsigned int hugepage_size) if (write(fd, initial_nr_hugepages, strlen(initial_nr_hugepages)) != strlen(initial_nr_hugepages)) { - ksft_test_result_fail("Failed to write value to /proc/sys/vm/nr_hugepages: %s\n", - strerror(errno)); + ksft_print_msg("Failed to write value to /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); goto close_fd; } + ksft_print_msg("Number of huge pages allocated = %d\n", + atoi(nr_hugepages)); + if (compaction_index > 3) { ksft_print_msg("ERROR: Less that 1/%d of memory is available\n" "as huge pages\n", compaction_index); - ksft_test_result_fail("No of huge pages allocated = %d\n", (atoi(nr_hugepages))); goto close_fd; } - ksft_test_result_pass("Memory compaction succeeded. 
No of huge pages allocated = %d\n", - (atoi(nr_hugepages))); ret = 0; close_fd: close(fd); + out: + ksft_test_result(ret == 0, "check_compaction\n"); return ret; } From 8820a05a149a07070fcbb7cf087b6771248e4ac9 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 9 Feb 2024 04:41:12 +0000 Subject: [PATCH 1194/1406] mm/zswap: optimize and cleanup the invalidation of duplicate entry We may encounter a duplicate entry in zswap_store(): 1. A swap slot freed to the per-cpu swap cache doesn't invalidate the zswap entry, then gets reused. This has been fixed. 2. In !exclusive load mode, a swapped-in folio leaves its zswap entry on the tree, then is swapped out again. This mode has been removed. 3. A folio can be dirtied again after zswap_store(), so it needs to be zswap_store()ed again. This should be handled correctly. So we must invalidate the old duplicate entry before inserting the new one, which actually doesn't have to be done at the beginning of zswap_store(). The benefit is that we don't need to lock the tree twice in the normal store success path. Also clean up the loop while we are here. Note we still need to invalidate the old duplicate entry when the store fails or zswap is disabled; otherwise, the new data in the swapfile could be overwritten by the old data in the zswap pool on LRU writeback. Link: https://lkml.kernel.org/r/20240209044112.3883835-1-chengming.zhou@linux.dev Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Acked-by: Yosry Ahmed Acked-by: Chris Li Acked-by: Nhat Pham Signed-off-by: Andrew Morton --- mm/zswap.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 96664cdee20782..62fe307521c937 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1517,19 +1517,8 @@ bool zswap_store(struct folio *folio) if (folio_test_large(folio)) return false; - /* - * If this is a duplicate, it must be removed before attempting to store - * it, otherwise, if the store fails the old page won't be removed from - * the tree, and it might be written back overriding the new data. - */ - spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); - if (entry) - zswap_invalidate_entry(tree, entry); - spin_unlock(&tree->lock); - if (!zswap_enabled) - return false; + goto check_old; objcg = get_obj_cgroup_from_folio(folio); if (objcg && !obj_cgroup_may_zswap(objcg)) { @@ -1609,14 +1598,12 @@ bool zswap_store(struct folio *folio) /* map */ spin_lock(&tree->lock); /* - * A duplicate entry should have been removed at the beginning of this - * function. Since the swap entry should be pinned, if a duplicate is - * found again here it means that something went wrong in the swap - * cache. + * The folio may have been dirtied again, invalidate the + * possibly stale entry before inserting the new entry. */ - while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { - WARN_ON(1); + if (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { zswap_invalidate_entry(tree, dupentry); + WARN_ON(zswap_rb_insert(&tree->rbroot, entry, &dupentry)); } if (entry->length) { INIT_LIST_HEAD(&entry->lru); @@ -1639,6 +1626,17 @@ bool zswap_store(struct folio *folio) reject: if (objcg) obj_cgroup_put(objcg); +check_old: + /* + * If the zswap store fails or zswap is disabled, we must invalidate the + * possibly stale entry which was previously stored at this offset. + * Otherwise, writeback could overwrite the new data in the swapfile.
+ */ + spin_lock(&tree->lock); + entry = zswap_rb_search(&tree->rbroot, offset); + if (entry) + zswap_invalidate_entry(tree, entry); + spin_unlock(&tree->lock); return false; shrink: From 1b265191f734bd10f5ea87a9fa33b1f5ed78f482 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 12 Feb 2024 19:29:51 +0100 Subject: [PATCH 1195/1406] mm: document memalloc_noreclaim_save() and memalloc_pin_save() The memalloc_noreclaim_save() function currently has no documentation comment, so the implications of its usage are not obvious. Namely that it not only prevents entering reclaim (as the name suggests), but also allows using all memory reserves and thus should be only used in contexts that are allocating memory to free memory. This may lead to new improper usages being added. Thus add a documenting comment, based on the description of __GFP_MEMALLOC. While at it, also document memalloc_pin_save() so that all the memalloc_ scopes are documented. For those already documented, add missing Return: descriptions, and mark Context: description per kernel-docs style guide. In the comments describing the relevant PF_MEMALLOC flags, refer to their scope setting functions. Link: https://lkml.kernel.org/r/20240212182950.32730-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Mike Rapoport (IBM) Acked-by: Michal Hocko Cc: Kent Overstreet Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- include/linux/sched.h | 9 ++++---- include/linux/sched/mm.h | 45 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b9ce285d8c9c81..998861865b844f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1624,15 +1624,15 @@ extern struct pid *cad_pid; #define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* Dumped core */ #define PF_SIGNALED 0x00000400 /* Killed by a signal */ -#define PF_MEMALLOC 0x00000800 /* Allocating memory */ +#define PF_MEMALLOC 0x00000800 /* Allocating memory to free memory. See memalloc_noreclaim_save() */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF__HOLE__00010000 0x00010000 #define PF_KSWAPD 0x00020000 /* I am kswapd */ -#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ -#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ +#define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nfs_save() */ +#define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */ #define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ @@ -1642,7 +1642,8 @@ extern struct pid *cad_pid; #define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ -#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. 
*/ +#define PF_MEMALLOC_PIN 0x10000000 /* Allocations constrained to zones which allow long term pinning. + * See memalloc_pin_save() */ #define PF__HOLE__20000000 0x20000000 #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 9a19f1b42f6412..eef8fa5ba5de29 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -368,6 +368,27 @@ static inline void memalloc_nofs_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; } +/** + * memalloc_noreclaim_save - Marks implicit __GFP_MEMALLOC scope. + * + * This functions marks the beginning of the __GFP_MEMALLOC allocation scope. + * All further allocations will implicitly add the __GFP_MEMALLOC flag, which + * prevents entering reclaim and allows access to all memory reserves. This + * should only be used when the caller guarantees the allocation will allow more + * memory to be freed very shortly, i.e. it needs to allocate some memory in + * the process of freeing memory, and cannot reclaim due to potential recursion. + * + * Users of this scope have to be extremely careful to not deplete the reserves + * completely and implement a throttling mechanism which controls the + * consumption of the reserve based on the amount of freed memory. Usage of a + * pre-allocated pool (e.g. mempool) should be always considered before using + * this scope. + * + * Individual allocations under the scope can opt out using __GFP_NOMEMALLOC + * + * This function should not be used in an interrupt context as that one does not + * give PF_MEMALLOC access to reserves, see __gfp_pfmemalloc_flags(). + */ static inline unsigned int memalloc_noreclaim_save(void) { unsigned int flags = current->flags & PF_MEMALLOC; @@ -375,11 +396,27 @@ static inline unsigned int memalloc_noreclaim_save(void) return flags; } +/** + * memalloc_noreclaim_restore - Ends the implicit __GFP_MEMALLOC scope. + * @flags: Flags to restore. + * + * Ends the implicit __GFP_MEMALLOC scope started by memalloc_noreclaim_save + * function. Always make sure that the given flags is the return value from the + * pairing memalloc_noreclaim_save call. + */ static inline void memalloc_noreclaim_restore(unsigned int flags) { current->flags = (current->flags & ~PF_MEMALLOC) | flags; } +/** + * memalloc_pin_save - Marks implicit ~__GFP_MOVABLE scope. + * + * This functions marks the beginning of the ~__GFP_MOVABLE allocation scope. + * All further allocations will implicitly remove the __GFP_MOVABLE flag, which + * will constraint the allocations to zones that allow long term pinning, i.e. + * not ZONE_MOVABLE zones. + */ static inline unsigned int memalloc_pin_save(void) { unsigned int flags = current->flags & PF_MEMALLOC_PIN; @@ -388,6 +425,14 @@ static inline unsigned int memalloc_pin_save(void) return flags; } +/** + * memalloc_pin_restore - Ends the implicit ~__GFP_MOVABLE scope. + * @flags: Flags to restore. + * + * Ends the implicit ~__GFP_MOVABLE scope started by memalloc_pin_save function. + * Always make sure that the given flags is the return value from the pairing + * memalloc_pin_save call. 
+ */ static inline void memalloc_pin_restore(unsigned int flags) { current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags; From f49b9b3dab8b97f4e9d17eb780a2f58637b28eb6 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 15 Feb 2024 10:58:28 +0100 Subject: [PATCH 1196/1406] mm-document-memalloc_noreclaim_save-and-memalloc_pin_save-v2 fix issues that Mike pointed out Link: https://lkml.kernel.org/r/20240215095827.13756-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Reviewed-by: Mike Rapoport (IBM) Cc: Kent Overstreet Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- include/linux/sched/mm.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index eef8fa5ba5de29..7a4066d228832a 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -315,7 +315,8 @@ static inline void might_alloc(gfp_t gfp_mask) * point of view. Use memalloc_noio_restore to end the scope with flags * returned by this function. * - * This function is safe to be used from any context. + * Context: This function is safe to be used from any context. + * Return: The saved flags to be passed to memalloc_noio_restore. */ static inline unsigned int memalloc_noio_save(void) { @@ -346,7 +347,8 @@ static inline void memalloc_noio_restore(unsigned int flags) * point of view. Use memalloc_nofs_restore to end the scope with flags * returned by this function. * - * This function is safe to be used from any context. + * Context: This function is safe to be used from any context. + * Return: The saved flags to be passed to memalloc_nofs_restore. */ static inline unsigned int memalloc_nofs_save(void) { @@ -371,7 +373,7 @@ static inline void memalloc_nofs_restore(unsigned int flags) /** * memalloc_noreclaim_save - Marks implicit __GFP_MEMALLOC scope. * - * This functions marks the beginning of the __GFP_MEMALLOC allocation scope. + * This function marks the beginning of the __GFP_MEMALLOC allocation scope. * All further allocations will implicitly add the __GFP_MEMALLOC flag, which * prevents entering reclaim and allows access to all memory reserves. This * should only be used when the caller guarantees the allocation will allow more @@ -386,8 +388,10 @@ static inline void memalloc_nofs_restore(unsigned int flags) * * Individual allocations under the scope can opt out using __GFP_NOMEMALLOC * - * This function should not be used in an interrupt context as that one does not - * give PF_MEMALLOC access to reserves, see __gfp_pfmemalloc_flags(). + * Context: This function should not be used in an interrupt context as that one + * does not give PF_MEMALLOC access to reserves. + * See __gfp_pfmemalloc_flags(). + * Return: The saved flags to be passed to memalloc_noreclaim_restore. */ static inline unsigned int memalloc_noreclaim_save(void) { @@ -412,10 +416,12 @@ static inline void memalloc_noreclaim_restore(unsigned int flags) /** * memalloc_pin_save - Marks implicit ~__GFP_MOVABLE scope. * - * This functions marks the beginning of the ~__GFP_MOVABLE allocation scope. + * This function marks the beginning of the ~__GFP_MOVABLE allocation scope. * All further allocations will implicitly remove the __GFP_MOVABLE flag, which * will constraint the allocations to zones that allow long term pinning, i.e. * not ZONE_MOVABLE zones. + * + * Return: The saved flags to be passed to memalloc_pin_restore. 
*/ static inline unsigned int memalloc_pin_save(void) { From f644c077ebfddb26c1768e2b576cb9d85281a23c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 12 Feb 2024 12:15:52 +0100 Subject: [PATCH 1197/1406] kasan/test: avoid gcc warning for intentional overflow The out-of-bounds test allocates an object that is three bytes too short in order to validate the bounds checking. Starting with gcc-14, this causes a compile-time warning as gcc has grown smart enough to understand the sizeof() logic: mm/kasan/kasan_test.c: In function 'kmalloc_oob_16': mm/kasan/kasan_test.c:443:14: error: allocation of insufficient size '13' for type 'struct ' with size '16' [-Werror=alloc-size] 443 | ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL); | ^ Hide the actual computation behind a RELOC_HIDE() that ensures the compiler misses the intentional bug. Link: https://lkml.kernel.org/r/20240212111609.869266-1-arnd@kernel.org Fixes: 3f15801cdc23 ("lib: add kasan test module") Signed-off-by: Arnd Bergmann Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Arnd Bergmann Cc: Dmitry Vyukov Cc: Marco Elver Cc: Vincenzo Frascino Cc: Signed-off-by: Andrew Morton --- mm/kasan/kasan_test.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index 318d9cec111aad..2d8ae4fbe63bb0 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -440,7 +440,8 @@ static void kmalloc_oob_16(struct kunit *test) /* This test is specifically crafted for the generic mode. */ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); - ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL); + /* RELOC_HIDE to prevent gcc from warning about short alloc */ + ptr1 = RELOC_HIDE(kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL), 0); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL); From d20faa780b21808c9cb6aa78d103719bf67c644f Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Wed, 14 Feb 2024 06:05:34 +0000 Subject: [PATCH 1198/1406] mm/mglru: drop unused parameter Patch series "mm/mglru: code cleanup and refactoring" This provides MGLRU code cleanup and refactoring for better readability. This patch (of 5): struct scan_control *sc is currently passed into try_to_inc_max_seq() and run_aging(). This parameter is not used. Drop the unused parameter struct scan_control *sc. No functional change. Link: https://lkml.kernel.org/r/20240214060538.3524462-1-kinseyho@google.com Link: https://lkml.kernel.org/r/20240214060538.3524462-2-kinseyho@google.com Signed-off-by: Kinsey Ho Cc: Aneesh Kumar K.V Cc: Donet Tom Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 0b9f6981db7c29..c85f5243ee41a6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3817,7 +3817,7 @@ static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, } static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, - struct scan_control *sc, bool can_swap, bool force_scan) + bool can_swap, bool force_scan) { bool success; struct lru_gen_mm_walk *walk; @@ -4673,7 +4673,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool return nr_to_scan; /* skip this lruvec as it's low on cold folios */ - return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0; + return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? 
-1 : 0; } static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) @@ -5333,7 +5333,7 @@ static const struct seq_operations lru_gen_seq_ops = { .show = lru_gen_seq_show, }; -static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, +static int run_aging(struct lruvec *lruvec, unsigned long seq, bool can_swap, bool force_scan) { DEFINE_MAX_SEQ(lruvec); @@ -5348,7 +5348,7 @@ static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_contr if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) return -ERANGE; - try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan); + try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan); return 0; } @@ -5416,7 +5416,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, switch (cmd) { case '+': - err = run_aging(lruvec, seq, sc, swappiness, opt); + err = run_aging(lruvec, seq, swappiness, opt); break; case '-': err = run_eviction(lruvec, seq, sc, swappiness, opt); From baf65eb11f418679662768d730a1aded8b1bf850 Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Wed, 14 Feb 2024 06:05:35 +0000 Subject: [PATCH 1199/1406] mm/mglru: improve should_run_aging() scan_control *sc does not need to be passed into should_run_aging(), as it provides only the reclaim priority. This can be moved to get_nr_to_scan(). Refactor should_run_aging() and get_nr_to_scan() to improve code readability. No functional changes. Link: https://lkml.kernel.org/r/20240214060538.3524462-3-kinseyho@google.com Signed-off-by: Kinsey Ho Cc: Aneesh Kumar K.V Cc: Donet Tom Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index c85f5243ee41a6..9d18caddfe1620 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4585,14 +4585,13 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap } static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, - struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) + bool can_swap, unsigned long *nr_to_scan) { int gen, type, zone; unsigned long old = 0; unsigned long young = 0; unsigned long total = 0; struct lru_gen_folio *lrugen = &lruvec->lrugen; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MIN_SEQ(lruvec); /* whether this lruvec is completely out of cold folios */ @@ -4620,13 +4619,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, } } - /* try to scrape all its memory if this memcg was deleted */ - if (!mem_cgroup_online(memcg)) { - *nr_to_scan = total; - return false; - } - - *nr_to_scan = total >> sc->priority; + *nr_to_scan = total; /* * The aging tries to be lazy to reduce the overhead, while the eviction @@ -4658,6 +4651,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, */ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) { + bool success; unsigned long nr_to_scan; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); @@ -4665,14 +4659,17 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) return -1; - if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan)) - return nr_to_scan; + success = should_run_aging(lruvec, max_seq, can_swap, &nr_to_scan); - /* skip the aging path at the default priority */ - if (sc->priority == DEF_PRIORITY) + /* try to scrape 
all its memory if this memcg was deleted */ + if (nr_to_scan && !mem_cgroup_online(memcg)) return nr_to_scan; - /* skip this lruvec as it's low on cold folios */ + /* try to get away with not aging at the default priority */ + if (!success || sc->priority == DEF_PRIORITY) + return nr_to_scan >> sc->priority; + + /* stop scanning this lruvec as it's low on cold folios */ return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? -1 : 0; } From 2be505fbaabb0ccc1bdb99bc680bb85de02cbe75 Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Wed, 14 Feb 2024 06:05:36 +0000 Subject: [PATCH 1200/1406] mm/mglru: improve reset_mm_stats() struct lruvec* is already a field of struct lru_gen_mm_walk. Remove the parameter struct lruvec* into functions that already have access to struct lru_gen_mm_walk*. Also, we do not need to handle reset histogram stats when !should_walk_mmu(). Remove the call to reset_mm_stats() in iterate_mm_list_nowalk(). Link: https://lkml.kernel.org/r/20240214060538.3524462-4-kinseyho@google.com Signed-off-by: Kinsey Ho Cc: Aneesh Kumar K.V Cc: Donet Tom Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 9d18caddfe1620..e2b5af83838a93 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2880,38 +2880,37 @@ static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) #endif -static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) +static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last) { int i; int hist; + struct lruvec *lruvec = walk->lruvec; struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); - if (walk) { - hist = lru_hist_from_seq(walk->max_seq); + hist = lru_hist_from_seq(walk->max_seq); - for (i = 0; i < NR_MM_STATS; i++) { - WRITE_ONCE(mm_state->stats[hist][i], - mm_state->stats[hist][i] + walk->mm_stats[i]); - walk->mm_stats[i] = 0; - } + for (i = 0; i < NR_MM_STATS; i++) { + WRITE_ONCE(mm_state->stats[hist][i], + mm_state->stats[hist][i] + walk->mm_stats[i]); + walk->mm_stats[i] = 0; } if (NR_HIST_GENS > 1 && last) { - hist = lru_hist_from_seq(mm_state->seq + 1); + hist = lru_hist_from_seq(walk->max_seq + 1); for (i = 0; i < NR_MM_STATS; i++) WRITE_ONCE(mm_state->stats[hist][i], 0); } } -static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, - struct mm_struct **iter) +static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **iter) { bool first = false; bool last = false; struct mm_struct *mm = NULL; + struct lruvec *lruvec = walk->lruvec; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct lru_gen_mm_list *mm_list = get_mm_list(memcg); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); @@ -2955,7 +2954,7 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, } while (!(mm = get_next_mm(walk))); done: if (*iter || last) - reset_mm_stats(lruvec, walk, last); + reset_mm_stats(walk, last); spin_unlock(&mm_list->lock); @@ -2985,7 +2984,6 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) mm_state->head = NULL; mm_state->tail = NULL; WRITE_ONCE(mm_state->seq, mm_state->seq + 1); - reset_mm_stats(lruvec, NULL, true); success = true; } @@ -3160,9 +3158,10 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, walk->nr_pages[new_gen][type][zone] += delta; } -static void 
reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) +static void reset_batch_size(struct lru_gen_mm_walk *walk) { int gen, type, zone; + struct lruvec *lruvec = walk->lruvec; struct lru_gen_folio *lrugen = &lruvec->lrugen; walk->batched = 0; @@ -3592,7 +3591,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, return -EAGAIN; } -static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk) +static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) { static const struct mm_walk_ops mm_walk_ops = { .test_walk = should_skip_vma, @@ -3601,6 +3600,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_ }; int err; + struct lruvec *lruvec = walk->lruvec; struct mem_cgroup *memcg = lruvec_memcg(lruvec); walk->next_addr = FIRST_USER_ADDRESS; @@ -3629,7 +3629,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_ if (walk->batched) { spin_lock_irq(&lruvec->lru_lock); - reset_batch_size(lruvec, walk); + reset_batch_size(walk); spin_unlock_irq(&lruvec->lru_lock); } @@ -3857,9 +3857,9 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, walk->force_scan = force_scan; do { - success = iterate_mm_list(lruvec, walk, &mm); + success = iterate_mm_list(walk, &mm); if (mm) - walk_mm(lruvec, mm, walk); + walk_mm(mm, walk); } while (mm); done: if (success) { @@ -4559,8 +4559,10 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap move_folios_to_lru(lruvec, &list); walk = current->reclaim_state->mm_walk; - if (walk && walk->batched) - reset_batch_size(lruvec, walk); + if (walk && walk->batched) { + walk->lruvec = lruvec; + reset_batch_size(walk); + } item = PGSTEAL_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) From 0a425ec79b5218e00a58a6164737903f1c8caa02 Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Wed, 14 Feb 2024 06:05:37 +0000 Subject: [PATCH 1201/1406] mm/mglru: improve struct lru_gen_mm_walk Rename max_seq to seq in struct lru_gen_mm_walk to keep consistent with struct lru_gen_mm_state. Note that seq is not always up to date with max_seq from lru_gen_folio. No functional changes. 
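As a rough userspace analogy of the snapshot semantics (a hedged sketch, not the kernel code; the real check is the walk->seq != max_seq test in walk_mm() below):

#include <stdatomic.h>
#include <stdbool.h>

/* Stand-in for lrugen->max_seq, which other threads may increment. */
static _Atomic unsigned long max_seq;

/* The walk caches the generation it started with. */
struct walk {
        unsigned long seq;
};

static bool walk_is_current(const struct walk *walk)
{
        /* Another thread may have aged the lruvec in the meantime; if so,
         * this walk's cached seq is out of date and the walk bails out. */
        return walk->seq == atomic_load(&max_seq);
}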
Link: https://lkml.kernel.org/r/20240214060538.3524462-5-kinseyho@google.com Signed-off-by: Kinsey Ho Cc: Aneesh Kumar K.V Cc: Donet Tom Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 6 ++--- mm/vmscan.c | 50 ++++++++++++++++++++++-------------------- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a497f189d98818..633812a1d22024 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -464,7 +464,7 @@ enum { #define NR_BLOOM_FILTERS 2 struct lru_gen_mm_state { - /* set to max_seq after each iteration */ + /* synced with max_seq after each iteration */ unsigned long seq; /* where the current iteration continues after */ struct list_head *head; @@ -479,8 +479,8 @@ struct lru_gen_mm_state { struct lru_gen_mm_walk { /* the lruvec under reclaim */ struct lruvec *lruvec; - /* unstable max_seq from lru_gen_folio */ - unsigned long max_seq; + /* max_seq from lru_gen_folio: can be out of date */ + unsigned long seq; /* the next address within an mm to scan */ unsigned long next_addr; /* to batch promoted pages */ diff --git a/mm/vmscan.c b/mm/vmscan.c index e2b5af83838a93..20f9d6873f3989 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2889,7 +2889,7 @@ static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last) lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); - hist = lru_hist_from_seq(walk->max_seq); + hist = lru_hist_from_seq(walk->seq); for (i = 0; i < NR_MM_STATS; i++) { WRITE_ONCE(mm_state->stats[hist][i], @@ -2898,7 +2898,7 @@ static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last) } if (NR_HIST_GENS > 1 && last) { - hist = lru_hist_from_seq(walk->max_seq + 1); + hist = lru_hist_from_seq(walk->seq + 1); for (i = 0; i < NR_MM_STATS; i++) WRITE_ONCE(mm_state->stats[hist][i], 0); @@ -2927,9 +2927,9 @@ static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **ite */ spin_lock(&mm_list->lock); - VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq); + VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->seq); - if (walk->max_seq <= mm_state->seq) + if (walk->seq <= mm_state->seq) goto done; if (!mm_state->head) @@ -2959,7 +2959,7 @@ static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **ite spin_unlock(&mm_list->lock); if (mm && first) - reset_bloom_filter(mm_state, walk->max_seq + 1); + reset_bloom_filter(mm_state, walk->seq + 1); if (*iter) mmput_async(*iter); @@ -2969,7 +2969,7 @@ static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **ite return last; } -static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) +static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long seq) { bool success = false; struct mem_cgroup *memcg = lruvec_memcg(lruvec); @@ -2978,9 +2978,9 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) spin_lock(&mm_list->lock); - VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); + VM_WARN_ON_ONCE(mm_state->seq + 1 < seq); - if (max_seq > mm_state->seq) { + if (seq > mm_state->seq) { mm_state->head = NULL; mm_state->tail = NULL; WRITE_ONCE(mm_state->seq, mm_state->seq + 1); @@ -3331,7 +3331,8 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); - int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); + DEFINE_MAX_SEQ(walk->lruvec); + int old_gen, 
new_gen = lru_gen_from_seq(max_seq); pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl); if (!pte) @@ -3398,7 +3399,8 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); - int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); + DEFINE_MAX_SEQ(walk->lruvec); + int old_gen, new_gen = lru_gen_from_seq(max_seq); VM_WARN_ON_ONCE(pud_leaf(*pud)); @@ -3529,7 +3531,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); } - if (!walk->force_scan && !test_bloom_filter(mm_state, walk->max_seq, pmd + i)) + if (!walk->force_scan && !test_bloom_filter(mm_state, walk->seq, pmd + i)) continue; walk->mm_stats[MM_NONLEAF_FOUND]++; @@ -3540,7 +3542,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, walk->mm_stats[MM_NONLEAF_ADDED]++; /* carry over to the next generation */ - update_bloom_filter(mm_state, walk->max_seq + 1, pmd + i); + update_bloom_filter(mm_state, walk->seq + 1, pmd + i); } walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); @@ -3611,7 +3613,7 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) err = -EBUSY; /* another thread might have called inc_max_seq() */ - if (walk->max_seq != max_seq) + if (walk->seq != max_seq) break; /* folio_update_gen() requires stable folio_memcg() */ @@ -3748,7 +3750,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) return success; } -static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, +static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, bool can_swap, bool force_scan) { bool success; @@ -3756,14 +3758,14 @@ static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, int type, zone; struct lru_gen_folio *lrugen = &lruvec->lrugen; restart: - if (max_seq < READ_ONCE(lrugen->max_seq)) + if (seq < READ_ONCE(lrugen->max_seq)) return false; spin_lock_irq(&lruvec->lru_lock); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); - success = max_seq == lrugen->max_seq; + success = seq == lrugen->max_seq; if (!success) goto unlock; @@ -3816,7 +3818,7 @@ static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, return success; } -static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, +static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, bool can_swap, bool force_scan) { bool success; @@ -3825,13 +3827,13 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, struct lru_gen_folio *lrugen = &lruvec->lrugen; struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); - VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); + VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq)); if (!mm_state) - return inc_max_seq(lruvec, max_seq, can_swap, force_scan); + return inc_max_seq(lruvec, seq, can_swap, force_scan); /* see the comment in iterate_mm_list() */ - if (max_seq <= READ_ONCE(mm_state->seq)) + if (seq <= READ_ONCE(mm_state->seq)) return false; /* @@ -3841,18 +3843,18 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, * is less efficient, but it avoids bursty page faults. 
*/ if (!should_walk_mmu()) { - success = iterate_mm_list_nowalk(lruvec, max_seq); + success = iterate_mm_list_nowalk(lruvec, seq); goto done; } walk = set_mm_walk(NULL, true); if (!walk) { - success = iterate_mm_list_nowalk(lruvec, max_seq); + success = iterate_mm_list_nowalk(lruvec, seq); goto done; } walk->lruvec = lruvec; - walk->max_seq = max_seq; + walk->seq = seq; walk->can_swap = can_swap; walk->force_scan = force_scan; @@ -3863,7 +3865,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, } while (mm); done: if (success) { - success = inc_max_seq(lruvec, max_seq, can_swap, force_scan); + success = inc_max_seq(lruvec, seq, can_swap, force_scan); WARN_ON_ONCE(!success); } From 3b5ce5e912a8aa0561db2732f9232803c2609725 Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Wed, 14 Feb 2024 06:05:38 +0000 Subject: [PATCH 1202/1406] mm/mglru: improve swappiness handling The reclaimable number of anon pages used to set initial reclaim priority is only based on get_swappiness(). Use can_reclaim_anon_pages() to include NUMA node demotion. Also move the swappiness handling of when !__GFP_IO in try_to_shrink_lruvec() into isolate_folios(). Link: https://lkml.kernel.org/r/20240214060538.3524462-6-kinseyho@google.com Signed-off-by: Kinsey Ho Cc: Aneesh Kumar K.V Cc: Donet Tom Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 20f9d6873f3989..e738a210cafc6d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4290,7 +4290,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca { bool success; - /* swapping inhibited */ + /* swap constrained */ if (!(sc->gfp_mask & __GFP_IO) && (folio_test_dirty(folio) || (folio_test_anon(folio) && !folio_test_swapcache(folio)))) @@ -4459,9 +4459,12 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw DEFINE_MIN_SEQ(lruvec); /* - * Try to make the obvious choice first. When anon and file are both - * available from the same generation, interpret swappiness 1 as file - * first and 200 as anon first. + * Try to make the obvious choice first, and if anon and file are both + * available from the same generation, + * 1. Interpret swappiness 1 as file first and MAX_SWAPPINESS as anon + * first. + * 2. If !__GFP_IO, file first since clean pagecache is more likely to + * exist than clean swapcache. */ if (!swappiness) type = LRU_GEN_FILE; @@ -4471,6 +4474,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw type = LRU_GEN_FILE; else if (swappiness == 200) type = LRU_GEN_ANON; + else if (!(sc->gfp_mask & __GFP_IO)) + type = LRU_GEN_FILE; else type = get_type_to_scan(lruvec, swappiness, &tier); @@ -4714,10 +4719,6 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) unsigned long scanned = 0; int swappiness = get_swappiness(lruvec, sc); - /* clean file folios are more likely to exist */ - if (swappiness && !(sc->gfp_mask & __GFP_IO)) - swappiness = 1; - while (true) { int delta; @@ -4880,7 +4881,6 @@ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control { int priority; unsigned long reclaimable; - struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) return; @@ -4890,7 +4890,7 @@ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control * where reclaimed_to_scanned_ratio = inactive / total. 
*/ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); - if (get_swappiness(lruvec, sc)) + if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); /* round down reclaimable and round up sc->nr_to_reclaim */ From 8771dd3829a5c946292ee7d84be0dbe5a24765ad Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 9 Feb 2024 11:12:21 +0530 Subject: [PATCH 1203/1406] mm/hugetlb: move page order check inside hugetlb_cma_reserve() All platforms could benefit from a page order check against MAX_PAGE_ORDER before allocating a CMA area for gigantic hugetlb pages. Let's move this check from individual platforms to generic hugetlb. Link: https://lkml.kernel.org/r/20240209054221.1403364-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Jane Chu Reviewed-by: David Hildenbrand Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- arch/arm64/mm/hugetlbpage.c | 7 ------- arch/powerpc/mm/hugetlbpage.c | 4 +--- mm/hugetlb.c | 7 +++++++ 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 8116ac599f801d..6720ec8d50e76c 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -45,13 +45,6 @@ void __init arm64_hugetlb_cma_reserve(void) else order = CONT_PMD_SHIFT - PAGE_SHIFT; - /* - * HugeTLB CMA reservation is required for gigantic - * huge pages which could not be allocated via the - * page allocator. Just warn if there is any change - * breaking this assumption. - */ - WARN_ON(order <= MAX_PAGE_ORDER); hugetlb_cma_reserve(order); } #endif /* CONFIG_CMA */ diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a1651d54718626..594a4b7b2ca246 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -614,8 +614,6 @@ void __init gigantic_hugetlb_cma_reserve(void) */ order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT; - if (order) { - VM_WARN_ON(order <= MAX_PAGE_ORDER); + if (order) hugetlb_cma_reserve(order); - } } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 25069ca6ec2486..68283e54c899e7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7800,6 +7800,13 @@ void __init hugetlb_cma_reserve(int order) bool node_specific_cma_alloc = false; int nid; + /* + * HugeTLB CMA reservation is required for gigantic + * huge pages which could not be allocated via the + * page allocator. Just warn if there is any change + * breaking this assumption. + */ + VM_WARN_ON(order <= MAX_PAGE_ORDER); cma_reserve_called = true; if (!hugetlb_cma_size) From 11c6070d077ca2716771a4e4a965daf93d481c11 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Tue, 13 Feb 2024 19:54:00 +1300 Subject: [PATCH 1204/1406] zram: do not allocate physically contiguous strm buffers Currently zram allocates 2 physically contiguous pages for each per-CPU compression stream (and there may be up to 4 streams per CPU). Since those buffers are per-CPU, we allocate them from the CPU hotplug path, which carries a higher risk of failed allocations on devices with fragmented memory. Switch to virtually contiguous allocations - crypto comp does not seem to impose requirements on compression working buffers to be physically contiguous.
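To make the change concrete, a minimal before/after sketch of the per-stream buffer lifecycle (condensed from the diff below; error handling elided):

/* Before: order-1 allocation, i.e. two physically contiguous pages;
 * increasingly likely to fail once memory is fragmented. */
zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
/* ... */
free_pages((unsigned long)zstrm->buffer, 1);

/* After: two pages that are only virtually contiguous; any two free
 * order-0 pages can back the mapping, so fragmentation barely matters. */
zstrm->buffer = vzalloc(2 * PAGE_SIZE);
/* ... */
vfree(zstrm->buffer);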
Link: https://lkml.kernel.org/r/20240213065400.6561-1-21cnbao@gmail.com Signed-off-by: Barry Song Reviewed-by: Sergey Senozhatsky Cc: Jens Axboe Cc: Minchan Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton --- drivers/block/zram/zcomp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 55af4efd798356..8237b08c49d861 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "zcomp.h" @@ -37,7 +38,7 @@ static void zcomp_strm_free(struct zcomp_strm *zstrm) { if (!IS_ERR_OR_NULL(zstrm->tfm)) crypto_free_comp(zstrm->tfm); - free_pages((unsigned long)zstrm->buffer, 1); + vfree(zstrm->buffer); zstrm->tfm = NULL; zstrm->buffer = NULL; } @@ -53,7 +54,7 @@ static int zcomp_strm_init(struct zcomp_strm *zstrm, struct zcomp *comp) * allocate 2 pages. 1 for compressed data, plus 1 extra for the * case when compressed size is larger than the original one */ - zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); + zstrm->buffer = vzalloc(2 * PAGE_SIZE); if (IS_ERR_OR_NULL(zstrm->tfm) || !zstrm->buffer) { zcomp_strm_free(zstrm); return -ENOMEM; From 6d0387edba89e6646f9721a838db03786c0c4d8b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:26 +0100 Subject: [PATCH 1205/1406] mm/memory: factor out zapping of present pte into zap_present_pte() Patch series "mm/memory: optimize unmap/zap with PTE-mapped THP", v3. This series is based on [1]. Similar to what we did with fork(), let's implement PTE batching during unmap/zap when processing PTE-mapped THPs. We collect consecutive PTEs that map consecutive pages of the same large folio, making sure that the other PTE bits are compatible, and (a) adjust the refcount only once per batch, (b) call rmap handling functions only once per batch, (c) perform batch PTE setting/updates and (d) perform TLB entry removal once per batch. Ryan was previously working on this in the context of cont-pte for arm64; in its latest iteration [2] that work focused on arm64 with cont-pte only. This series implements the optimization for all architectures, independent of such PTE bits, teaches the MMU gather/TLB code to be fully aware of such batches of pages belonging to the same large folio, and makes use of our new rmap batching function when removing the rmap. To achieve that, we have to enlighten MMU gather / page freeing code (i.e., everything that consumes encoded_page) to process unmapping of consecutive pages that all belong to the same large folio. I'm being very careful to not degrade order-0 performance, and it looks like I managed to achieve that. While this series should -- similar to [1] -- be beneficial for adding cont-pte support on arm64[2], it's one of the requirements for maintaining a total mapcount[3] for large folios with minimal added overhead and further changes[4] that build on top of the total mapcount. Independent of all that, this series results in a speedup during munmap() and similar unmapping (process teardown, MADV_DONTNEED on larger ranges) with PTE-mapped THP, which is the default with THPs that are smaller than a PMD (for example, 16KiB to 1024KiB mTHPs for anonymous memory[5]).
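Schematically, the batched zap looks like the following hedged sketch (not the final kernel code; the helper names are illustrative and the actual batching logic lands over the rest of this series):

/* Instead of one refcount/rmap/TLB operation per PTE, detect how many
 * consecutive PTEs map consecutive pages of one folio, then do the
 * per-folio work once for the whole run. */
nr = folio_pte_batch(folio, addr, pte, ptent, max_nr);  /* run length */
clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);        /* (c) batched PTE clear */
tlb_remove_tlb_entries(tlb, pte, nr, addr);             /* (d) batched TLB removal */
folio_remove_rmap_ptes(folio, page, nr, vma);           /* (b) one rmap call */
/* (a) the nr folio references are then dropped once, at TLB flush time */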
On an Intel Xeon Silver 4210R CPU, munmap'ing a 1GiB VMA backed by PTE-mapped folios of the same size (stddev < 1%) results in the following runtimes for munmap() in seconds (shorter is better):

Folio Size | mm-unstable |      New | Change
---------------------------------------------
      4KiB |    0.058110 | 0.057715 |   - 1%
     16KiB |    0.044198 | 0.035469 |   -20%
     32KiB |    0.034216 | 0.023522 |   -31%
     64KiB |    0.029207 | 0.018434 |   -37%
    128KiB |    0.026579 | 0.014026 |   -47%
    256KiB |    0.025130 | 0.011756 |   -53%
    512KiB |    0.024292 | 0.010703 |   -56%
   1024KiB |    0.023812 | 0.010294 |   -57%
   2048KiB |    0.023785 | 0.009910 |   -58%

[1] https://lkml.kernel.org/r/20240129124649.189745-1-david@redhat.com [2] https://lkml.kernel.org/r/20231218105100.172635-1-ryan.roberts@arm.com [3] https://lkml.kernel.org/r/20230809083256.699513-1-david@redhat.com [4] https://lkml.kernel.org/r/20231124132626.235350-1-david@redhat.com [5] https://lkml.kernel.org/r/20231207161211.2374093-1-ryan.roberts@arm.com This patch (of 10): Let's prepare for further changes by factoring out processing of present PTEs. Link: https://lkml.kernel.org/r/20240214204435.167852-1-david@redhat.com Link: https://lkml.kernel.org/r/20240214204435.167852-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: linuxppc-dev@lists.ozlabs.org Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/memory.c | 94 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 41 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index e5e5056cb53fe5..6712453888fa84 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1534,13 +1534,61 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); } +static inline void zap_present_pte(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, pte_t ptent, + unsigned long addr, struct zap_details *details, + int *rss, bool *force_flush, bool *force_break) +{ + struct mm_struct *mm = tlb->mm; + struct folio *folio = NULL; + bool delay_rmap = false; + struct page *page; + + page = vm_normal_page(vma, addr, ptent); + if (page) + folio = page_folio(page); + + if (unlikely(!should_zap_folio(details, folio))) + return; + ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + arch_check_zapped_pte(vma, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); + if (unlikely(!page)) { + ksm_might_unmap_zero_page(mm, ptent); + return; + } + + if (!folio_test_anon(folio)) { + if (pte_dirty(ptent)) { + folio_mark_dirty(folio); + if (tlb_delay_rmap(tlb)) { + delay_rmap = true; + *force_flush = true; + } + } + if (pte_young(ptent) && likely(vma_has_recency(vma))) + folio_mark_accessed(folio); + } + rss[mm_counter(folio)]--; + if (!delay_rmap) { + folio_remove_rmap_pte(folio, page, vma); + if (unlikely(page_mapcount(page) < 0)) + print_bad_pte(vma, addr, ptent, page); + } + if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) { + *force_flush = true; + *force_break = true; + } +} + static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, struct zap_details *details) { + bool
force_flush = false, force_break = false; struct mm_struct *mm = tlb->mm; - int force_flush = 0; int rss[NR_MM_COUNTERS]; spinlock_t *ptl; pte_t *start_pte; @@ -1557,7 +1605,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, arch_enter_lazy_mmu_mode(); do { pte_t ptent = ptep_get(pte); - struct folio *folio = NULL; + struct folio *folio; struct page *page; if (pte_none(ptent)) @@ -1567,45 +1615,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, break; if (pte_present(ptent)) { - unsigned int delay_rmap; - - page = vm_normal_page(vma, addr, ptent); - if (page) - folio = page_folio(page); - - if (unlikely(!should_zap_folio(details, folio))) - continue; - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); - arch_check_zapped_pte(vma, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); - zap_install_uffd_wp_if_needed(vma, addr, pte, details, - ptent); - if (unlikely(!page)) { - ksm_might_unmap_zero_page(mm, ptent); - continue; - } - - delay_rmap = 0; - if (!folio_test_anon(folio)) { - if (pte_dirty(ptent)) { - folio_mark_dirty(folio); - if (tlb_delay_rmap(tlb)) { - delay_rmap = 1; - force_flush = 1; - } - } - if (pte_young(ptent) && likely(vma_has_recency(vma))) - folio_mark_accessed(folio); - } - rss[mm_counter(folio)]--; - if (!delay_rmap) { - folio_remove_rmap_pte(folio, page, vma); - if (unlikely(page_mapcount(page) < 0)) - print_bad_pte(vma, addr, ptent, page); - } - if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) { - force_flush = 1; + zap_present_pte(tlb, vma, pte, ptent, addr, details, + rss, &force_flush, &force_break); + if (unlikely(force_break)) { addr += PAGE_SIZE; break; } From b1e1a09746946db687157975fa7fe62b8e2d72a1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:27 +0100 Subject: [PATCH 1206/1406] mm/memory: handle !page case in zap_present_pte() separately We don't need uptodate accessed/dirty bits, so in theory we could replace ptep_get_and_clear_full() by an optimized ptep_clear_full() function. Let's rely on the provided pte. Further, there is no scenario where we would have to insert uffd-wp markers when zapping something that is not a normal page (i.e., zeropage). Add a sanity check to make sure this remains true. should_zap_folio() no longer has to handle NULL pointers. This change replaces 2/3 "!page/!folio" checks by a single "!page" one. Note that arch_check_zapped_pte() on x86-64 checks the HW-dirty bit to detect shadow stack entries. But for shadow stack entries, the HW dirty bit (in combination with non-writable PTEs) is set by software. So for the arch_check_zapped_pte() check, we don't have to sync against HW setting the HW dirty bit concurrently, it is always set. Link: https://lkml.kernel.org/r/20240214204435.167852-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. 
Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/memory.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 6712453888fa84..2b1e9abbe1e093 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1499,10 +1499,6 @@ static inline bool should_zap_folio(struct zap_details *details, if (should_zap_cows(details)) return true; - /* E.g. the caller passes NULL for the case of a zero folio */ - if (!folio) - return true; - /* Otherwise we should only zap non-anon folios */ return !folio_test_anon(folio); } @@ -1540,24 +1536,28 @@ static inline void zap_present_pte(struct mmu_gather *tlb, int *rss, bool *force_flush, bool *force_break) { struct mm_struct *mm = tlb->mm; - struct folio *folio = NULL; bool delay_rmap = false; + struct folio *folio; struct page *page; page = vm_normal_page(vma, addr, ptent); - if (page) - folio = page_folio(page); + if (!page) { + /* We don't need up-to-date accessed/dirty bits. */ + ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + arch_check_zapped_pte(vma, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + VM_WARN_ON_ONCE(userfaultfd_wp(vma)); + ksm_might_unmap_zero_page(mm, ptent); + return; + } + folio = page_folio(page); if (unlikely(!should_zap_folio(details, folio))) return; ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entry(tlb, pte, addr); zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); - if (unlikely(!page)) { - ksm_might_unmap_zero_page(mm, ptent); - return; - } if (!folio_test_anon(folio)) { if (pte_dirty(ptent)) { From 8c384eb99826b5b2a39bb17582bc1c17728e12c2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:28 +0100 Subject: [PATCH 1207/1406] mm/memory: further separate anon and pagecache folio handling in zap_present_pte() We don't need up-to-date accessed-dirty information for anon folios and can simply work with the ptent we already have. Also, we know the RSS counter we want to update. We can safely move arch_check_zapped_pte() + tlb_remove_tlb_entry() + zap_install_uffd_wp_if_needed() after updating the folio and RSS. While at it, only call zap_install_uffd_wp_if_needed() if there is even any chance that pte_install_uffd_wp_if_needed() would do *something*. That is, just don't bother if uffd-wp does not apply. Link: https://lkml.kernel.org/r/20240214204435.167852-4-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. 
Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/memory.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 2b1e9abbe1e093..0b4d76dcf23274 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1554,12 +1554,9 @@ static inline void zap_present_pte(struct mmu_gather *tlb, folio = page_folio(page); if (unlikely(!should_zap_folio(details, folio))) return; - ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); - arch_check_zapped_pte(vma, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); - zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); if (!folio_test_anon(folio)) { + ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); if (pte_dirty(ptent)) { folio_mark_dirty(folio); if (tlb_delay_rmap(tlb)) { @@ -1569,8 +1566,17 @@ static inline void zap_present_pte(struct mmu_gather *tlb, } if (pte_young(ptent) && likely(vma_has_recency(vma))) folio_mark_accessed(folio); + rss[mm_counter(folio)]--; + } else { + /* We don't need up-to-date accessed/dirty bits. */ + ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + rss[MM_ANONPAGES]--; } - rss[mm_counter(folio)]--; + arch_check_zapped_pte(vma, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + if (unlikely(userfaultfd_pte_wp(vma, ptent))) + zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); + if (!delay_rmap) { folio_remove_rmap_pte(folio, page, vma); if (unlikely(page_mapcount(page) < 0)) From 6d644584124026cfc14f578e70f4d1b5bb4f83c4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:29 +0100 Subject: [PATCH 1208/1406] mm/memory: factor out zapping folio pte into zap_present_folio_pte() Let's prepare for further changes by factoring it out into a separate function. Link: https://lkml.kernel.org/r/20240214204435.167852-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/memory.c | 53 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 0b4d76dcf23274..168096f9360e7b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1530,30 +1530,14 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); } -static inline void zap_present_pte(struct mmu_gather *tlb, - struct vm_area_struct *vma, pte_t *pte, pte_t ptent, - unsigned long addr, struct zap_details *details, - int *rss, bool *force_flush, bool *force_break) +static inline void zap_present_folio_pte(struct mmu_gather *tlb, + struct vm_area_struct *vma, struct folio *folio, + struct page *page, pte_t *pte, pte_t ptent, unsigned long addr, + struct zap_details *details, int *rss, bool *force_flush, + bool *force_break) { struct mm_struct *mm = tlb->mm; bool delay_rmap = false; - struct folio *folio; - struct page *page; - - page = vm_normal_page(vma, addr, ptent); - if (!page) { - /* We don't need up-to-date accessed/dirty bits. 
*/ - ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); - arch_check_zapped_pte(vma, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); - VM_WARN_ON_ONCE(userfaultfd_wp(vma)); - ksm_might_unmap_zero_page(mm, ptent); - return; - } - - folio = page_folio(page); - if (unlikely(!should_zap_folio(details, folio))) - return; if (!folio_test_anon(folio)) { ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); @@ -1588,6 +1572,33 @@ static inline void zap_present_pte(struct mmu_gather *tlb, } } +static inline void zap_present_pte(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, pte_t ptent, + unsigned long addr, struct zap_details *details, + int *rss, bool *force_flush, bool *force_break) +{ + struct mm_struct *mm = tlb->mm; + struct folio *folio; + struct page *page; + + page = vm_normal_page(vma, addr, ptent); + if (!page) { + /* We don't need up-to-date accessed/dirty bits. */ + ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + arch_check_zapped_pte(vma, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + VM_WARN_ON_ONCE(userfaultfd_wp(vma)); + ksm_might_unmap_zero_page(mm, ptent); + return; + } + + folio = page_folio(page); + if (unlikely(!should_zap_folio(details, folio))) + return; + zap_present_folio_pte(tlb, vma, folio, page, pte, ptent, addr, details, + rss, force_flush, force_break); +} + static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, From 7fd96bfb7a76a906ca09d65aae72a318366535ac Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:30 +0100 Subject: [PATCH 1209/1406] mm/mmu_gather: pass "delay_rmap" instead of encoded page to __tlb_remove_page_size() We have two bits available in the encoded page pointer to store additional information. Currently, we use one bit to request delay of the rmap removal until after a TLB flush. We want to make use of the remaining bit internally for batching of multiple pages of the same folio, specifying that the next encoded page pointer in an array is actually "nr_pages". So pass page + delay_rmap flag instead of an encoded page, to handle the encoding internally. Link: https://lkml.kernel.org/r/20240214204435.167852-6-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- arch/s390/include/asm/tlb.h | 13 ++++++------- include/asm-generic/tlb.h | 12 ++++++------ mm/mmu_gather.c | 7 ++++--- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index d1455a601adcad..48df896d5b79d2 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -25,8 +25,7 @@ void __tlb_remove_table(void *_table); static inline void tlb_flush(struct mmu_gather *tlb); static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct encoded_page *page, - int page_size); + struct page *page, bool delay_rmap, int page_size); #define tlb_flush tlb_flush #define pte_free_tlb pte_free_tlb @@ -42,14 +41,14 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, * tlb_ptep_clear_flush. 
In both flush modes the tlb for a page cache page * has already been freed, so just do free_page_and_swap_cache. * - * s390 doesn't delay rmap removal, so there is nothing encoded in - * the page pointer. + * s390 doesn't delay rmap removal. */ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct encoded_page *page, - int page_size) + struct page *page, bool delay_rmap, int page_size) { - free_page_and_swap_cache(encoded_page_ptr(page)); + VM_WARN_ON_ONCE(delay_rmap); + + free_page_and_swap_cache(page); return false; } diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 129a3a75997659..2eb7b0d4f5d2b5 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -260,9 +260,8 @@ struct mmu_gather_batch { */ #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) -extern bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct encoded_page *page, - int page_size); +extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, + bool delay_rmap, int page_size); #ifdef CONFIG_SMP /* @@ -462,13 +461,14 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { - if (__tlb_remove_page_size(tlb, encode_page(page, 0), page_size)) + if (__tlb_remove_page_size(tlb, page, false, page_size)) tlb_flush_mmu(tlb); } -static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page, unsigned int flags) +static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb, + struct page *page, bool delay_rmap) { - return __tlb_remove_page_size(tlb, encode_page(page, flags), PAGE_SIZE); + return __tlb_remove_page_size(tlb, page, delay_rmap, PAGE_SIZE); } /* tlb_remove_page diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 604ddf08affed2..ac733d81b11211 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -116,7 +116,8 @@ static void tlb_batch_list_free(struct mmu_gather *tlb) tlb->local.next = NULL; } -bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, int page_size) +bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, + bool delay_rmap, int page_size) { struct mmu_gather_batch *batch; @@ -131,13 +132,13 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, i * Add the page and check if we are full. If so * force a flush. */ - batch->encoded_pages[batch->nr++] = page; + batch->encoded_pages[batch->nr++] = encode_page(page, delay_rmap); if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) return true; batch = tlb->active; } - VM_BUG_ON_PAGE(batch->nr > batch->max, encoded_page_ptr(page)); + VM_BUG_ON_PAGE(batch->nr > batch->max, page); return false; } From 1aa51ddb343705a71a4976e25dd9b16e4b4d16a1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:31 +0100 Subject: [PATCH 1210/1406] mm/mmu_gather: define ENCODED_PAGE_FLAG_DELAY_RMAP Nowadays, encoded pages are only used in mmu_gather handling. Let's update the documentation, and define ENCODED_PAGE_BIT_DELAY_RMAP. While at it, rename ENCODE_PAGE_BITS to ENCODED_PAGE_BITS. If encoded page pointers would ever be used in other context again, we'd likely want to change the defines to reflect their context (e.g., ENCODED_PAGE_FLAG_MMU_GATHER_DELAY_RMAP). For now, let's keep it simple. This is a preparation for using the remaining spare bit to indicate that the next item in an array of encoded pages is a "nr_pages" argument and not an encoded page. 
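For background, the encoding is ordinary pointer tagging: struct page pointers are aligned, so their low bits are always zero and can carry flags. A hedged, self-contained userspace sketch of the round trip (the names below are illustrative, not the kernel's):

#include <assert.h>
#include <stdint.h>

#define TAG_BITS        3ul     /* low bits freed up by pointer alignment */
#define TAG_DELAY_RMAP  1ul     /* example flag, mirroring the delay-rmap bit */

static inline void *tag_encode(void *ptr, unsigned long flags)
{
        assert(((uintptr_t)ptr & TAG_BITS) == 0 && flags <= TAG_BITS);
        return (void *)((uintptr_t)ptr | flags);
}

static inline unsigned long tag_flags(void *enc)
{
        return (uintptr_t)enc & TAG_BITS;
}

static inline void *tag_ptr(void *enc)
{
        return (void *)((uintptr_t)enc & ~TAG_BITS);
}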
Link: https://lkml.kernel.org/r/20240214204435.167852-7-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 17 +++++++++++------ mm/mmu_gather.c | 5 +++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8b611e13153e68..1b89eec0d6df88 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -210,8 +210,8 @@ struct page { * * An 'encoded_page' pointer is a pointer to a regular 'struct page', but * with the low bits of the pointer indicating extra context-dependent - * information. Not super-common, but happens in mmu_gather and mlock - * handling, and this acts as a type system check on that use. + * information. Only used in mmu_gather handling, and this acts as a type + * system check on that use. * * We only really have two guaranteed bits in general, although you could * play with 'struct page' alignment (see CONFIG_HAVE_ALIGNED_STRUCT_PAGE) @@ -220,21 +220,26 @@ struct page { * Use the supplied helper functions to endcode/decode the pointer and bits. */ struct encoded_page; -#define ENCODE_PAGE_BITS 3ul + +#define ENCODED_PAGE_BITS 3ul + +/* Perform rmap removal after we have flushed the TLB. */ +#define ENCODED_PAGE_BIT_DELAY_RMAP 1ul + static __always_inline struct encoded_page *encode_page(struct page *page, unsigned long flags) { - BUILD_BUG_ON(flags > ENCODE_PAGE_BITS); + BUILD_BUG_ON(flags > ENCODED_PAGE_BITS); return (struct encoded_page *)(flags | (unsigned long)page); } static inline unsigned long encoded_page_flags(struct encoded_page *page) { - return ENCODE_PAGE_BITS & (unsigned long)page; + return ENCODED_PAGE_BITS & (unsigned long)page; } static inline struct page *encoded_page_ptr(struct encoded_page *page) { - return (struct page *)(~ENCODE_PAGE_BITS & (unsigned long)page); + return (struct page *)(~ENCODED_PAGE_BITS & (unsigned long)page); } /* diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index ac733d81b11211..6540c99c675813 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -53,7 +53,7 @@ static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_ for (int i = 0; i < batch->nr; i++) { struct encoded_page *enc = batch->encoded_pages[i]; - if (encoded_page_flags(enc)) { + if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) { struct page *page = encoded_page_ptr(enc); folio_remove_rmap_pte(page_folio(page), page, vma); } @@ -119,6 +119,7 @@ static void tlb_batch_list_free(struct mmu_gather *tlb) bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, bool delay_rmap, int page_size) { + int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0; struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->end); @@ -132,7 +133,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, * Add the page and check if we are full. If so * force a flush. 
*/ - batch->encoded_pages[batch->nr++] = encode_page(page, delay_rmap); + batch->encoded_pages[batch->nr++] = encode_page(page, flags); if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) return true; From 1aa51ddb343705a71a4976e25dd9b16e4b4d16a1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:31 +0100 Subject: [PATCH 1211/1406] mm/mmu_gather: add tlb_remove_tlb_entries() Let's add a helper that lets us batch-process multiple consecutive PTEs. Note that the loop will get optimized out on all architectures except on powerpc. We have to add an early define of __tlb_remove_tlb_entry() on ppc to make the compiler happy (and avoid making tlb_remove_tlb_entries() a macro). Link: https://lkml.kernel.org/r/20240214204435.167852-8-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/tlb.h | 2 ++ include/asm-generic/tlb.h | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index b3de6102a90779..1ca7d4c4b90dbf 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -19,6 +19,8 @@ #include +static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, + unsigned long address); #define __tlb_remove_tlb_entry __tlb_remove_tlb_entry #define tlb_flush tlb_flush diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 2eb7b0d4f5d2b5..95d60a4f468a05 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -608,6 +608,26 @@ static inline void tlb_flush_p4d_range(struct mmu_gather *tlb, __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) +/** + * tlb_remove_tlb_entries - remember unmapping of multiple consecutive ptes for + * later tlb invalidation. + * + * Similar to tlb_remove_tlb_entry(), but remember unmapping of multiple + * consecutive ptes instead of only a single one. + */ +static inline void tlb_remove_tlb_entries(struct mmu_gather *tlb, + pte_t *ptep, unsigned int nr, unsigned long address) +{ + tlb_flush_pte_range(tlb, address, PAGE_SIZE * nr); + for (;;) { + __tlb_remove_tlb_entry(tlb, ptep, address); + if (--nr == 0) + break; + ptep++; + address += PAGE_SIZE; + } +} + #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ do { \ unsigned long _sz = huge_page_size(h); \ From 8ea5d31faab3894572cd42055ba1153fe8acd743 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:32 +0100 Subject: [PATCH 1212/1406] mm/mmu_gather: add __tlb_remove_folio_pages() Add __tlb_remove_folio_pages(), which will remove multiple consecutive pages that belong to the same large folio, instead of only a single page. We'll be using this function when optimizing unmapping/zapping of large folios that are mapped by PTEs. We're using the remaining spare bit in an encoded_page to indicate that the next encoded page in an array actually contains the shifted "nr_pages". Teach swap/freeing code about putting multiple folio references, and delayed rmap handling to remove page ranges of a folio.
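Before diving into the diffs, the array convention is easy to picture with a small, self-contained consumer sketch. The names, flag bit and shift below are illustrative only (they merely mirror the scheme the patch adopts); the point is that an entry with the flag set is followed by a shifted count instead of another pointer:

#include <stdint.h>

#define FLAG_MASK          3ul
#define BIT_NR_PAGES_NEXT  2ul  /* "the next array slot holds a count" */

static void process_range(void *first_page, unsigned long nr_pages)
{
        /* free/unmap pages first_page .. first_page + nr_pages - 1 */
}

static void walk(const uintptr_t *enc, int nr)
{
        for (int i = 0; i < nr; i++) {
                void *page = (void *)(enc[i] & ~FLAG_MASK);
                unsigned long nr_pages = 1;     /* implicit when the bit is clear */

                if (enc[i] & BIT_NR_PAGES_NEXT)
                        nr_pages = enc[++i] >> 2;  /* count stored shifted past the tag bits */

                process_range(page, nr_pages);
        }
}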
This extension still allows for gathering almost as many small folios as we used to (-1, because we have to prepare for a possibly bigger next entry), while also allowing us to gather consecutive pages that belong to the same large folio. Note that we don't pass the folio pointer, because it is not required for now. Further, we don't support page_size != PAGE_SIZE; it won't be required for simple PTE batching. We have to provide a separate s390 implementation, but it's fairly straightforward. Another, more invasive and likely more expensive, approach would be to use folio+range or a PFN range instead of page+nr_pages. But, we should do that consistently for the whole mmu_gather. For now, let's keep it simple and add "nr_pages" only. Note that it is now possible to gather significantly more pages: in the past, we were able to gather ~10000 pages; now we can also gather ~5000 folio fragments that span multiple pages. A folio fragment on x86-64 can span up to 512 pages (2 MiB THP) and on arm64 with 64k pages, in theory, 8192 pages (512 MiB THP). Gathering more memory is not considered something we should worry about, especially because these are already corner cases. While we can gather more total memory, we won't free more folio fragments. As long as page freeing time primarily only depends on the number of involved folios, there is no effective change for !preempt configurations. However, we'll adjust tlb_batch_pages_flush() separately to handle corner cases where page freeing time grows proportionally with the actual memory size. Link: https://lkml.kernel.org/r/20240214204435.167852-9-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N.
Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- arch/s390/include/asm/tlb.h | 17 +++++++++++ include/asm-generic/tlb.h | 8 +++++ include/linux/mm_types.h | 20 ++++++++++++ mm/mmu_gather.c | 61 +++++++++++++++++++++++++++++++------ mm/swap.c | 12 ++++++-- mm/swap_state.c | 15 +++++++-- 6 files changed, 119 insertions(+), 14 deletions(-) diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 48df896d5b79d2..e95b2c8081eb8e 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -26,6 +26,8 @@ void __tlb_remove_table(void *_table); static inline void tlb_flush(struct mmu_gather *tlb); static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, bool delay_rmap, int page_size); +static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb, + struct page *page, unsigned int nr_pages, bool delay_rmap); #define tlb_flush tlb_flush #define pte_free_tlb pte_free_tlb @@ -52,6 +54,21 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return false; } +static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb, + struct page *page, unsigned int nr_pages, bool delay_rmap) +{ + struct encoded_page *encoded_pages[] = { + encode_page(page, ENCODED_PAGE_BIT_NR_PAGES_NEXT), + encode_nr_pages(nr_pages), + }; + + VM_WARN_ON_ONCE(delay_rmap); + VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1)); + + free_pages_and_swap_cache(encoded_pages, ARRAY_SIZE(encoded_pages)); + return false; +} + static inline void tlb_flush(struct mmu_gather *tlb) { __tlb_flush_mm_lazy(tlb->mm); diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 95d60a4f468a05..bd00dd238b794c 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -69,6 +69,7 @@ * * - tlb_remove_page() / __tlb_remove_page() * - tlb_remove_page_size() / __tlb_remove_page_size() + * - __tlb_remove_folio_pages() * * __tlb_remove_page_size() is the basic primitive that queues a page for * freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a @@ -78,6 +79,11 @@ * tlb_remove_page() and tlb_remove_page_size() imply the call to * tlb_flush_mmu() when required and has no return value. * + * __tlb_remove_folio_pages() is similar to __tlb_remove_page(), however, + * instead of removing a single page, remove the given number of consecutive + * pages that are all part of the same (large) folio: just like calling + * __tlb_remove_page() on each page individually. + * * - tlb_change_page_size() * * call before __tlb_remove_page*() to set the current page-size; implies a @@ -262,6 +268,8 @@ struct mmu_gather_batch { extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, bool delay_rmap, int page_size); +bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, + unsigned int nr_pages, bool delay_rmap); #ifdef CONFIG_SMP /* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1b89eec0d6df88..a7223ba3ea1e4e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -226,6 +226,15 @@ struct encoded_page; /* Perform rmap removal after we have flushed the TLB. */ #define ENCODED_PAGE_BIT_DELAY_RMAP 1ul +/* + * The next item in an encoded_page array is the "nr_pages" argument, specifying + * the number of consecutive pages starting from this page, that all belong to + * the same folio. 
For example, "nr_pages" corresponds to the number of folio + * references that must be dropped. If this bit is not set, "nr_pages" is + * implicitly 1. + */ +#define ENCODED_PAGE_BIT_NR_PAGES_NEXT 2ul + static __always_inline struct encoded_page *encode_page(struct page *page, unsigned long flags) { BUILD_BUG_ON(flags > ENCODED_PAGE_BITS); @@ -242,6 +251,17 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) return (struct page *)(~ENCODED_PAGE_BITS & (unsigned long)page); } +static __always_inline struct encoded_page *encode_nr_pages(unsigned long nr) +{ + VM_WARN_ON_ONCE((nr << 2) >> 2 != nr); + return (struct encoded_page *)(nr << 2); +} + +static __always_inline unsigned long encoded_nr_pages(struct encoded_page *page) +{ + return ((unsigned long)page) >> 2; +} + /* * A swap entry has to fit into a "unsigned long", as the entry is hidden * in the "index" field of the swapper address space. diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 6540c99c675813..d175c0f1e2c8c2 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -50,12 +50,21 @@ static bool tlb_next_batch(struct mmu_gather *tlb) #ifdef CONFIG_SMP static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma) { + struct encoded_page **pages = batch->encoded_pages; + for (int i = 0; i < batch->nr; i++) { - struct encoded_page *enc = batch->encoded_pages[i]; + struct encoded_page *enc = pages[i]; if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) { struct page *page = encoded_page_ptr(enc); - folio_remove_rmap_pte(page_folio(page), page, vma); + unsigned int nr_pages = 1; + + if (unlikely(encoded_page_flags(enc) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + nr_pages = encoded_nr_pages(pages[++i]); + + folio_remove_rmap_ptes(page_folio(page), page, nr_pages, + vma); } } } @@ -89,18 +98,26 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb) for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { struct encoded_page **pages = batch->encoded_pages; - do { + while (batch->nr) { /* * limit free batch count when PAGE_SIZE > 4K */ unsigned int nr = min(512U, batch->nr); + /* + * Make sure we cover page + nr_pages, and don't leave + * nr_pages behind when capping the number of entries. + */ + if (unlikely(encoded_page_flags(pages[nr - 1]) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + nr++; + free_pages_and_swap_cache(pages, nr); pages += nr; batch->nr -= nr; cond_resched(); - } while (batch->nr); + } } tlb->active = &tlb->local; } @@ -116,8 +133,9 @@ static void tlb_batch_list_free(struct mmu_gather *tlb) tlb->local.next = NULL; } -bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, - bool delay_rmap, int page_size) +static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb, + struct page *page, unsigned int nr_pages, bool delay_rmap, + int page_size) { int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0; struct mmu_gather_batch *batch; @@ -126,6 +144,8 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, #ifdef CONFIG_MMU_GATHER_PAGE_SIZE VM_WARN_ON(tlb->page_size != page_size); + VM_WARN_ON_ONCE(nr_pages != 1 && page_size != PAGE_SIZE); + VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1)); #endif batch = tlb->active; @@ -133,17 +153,40 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, * Add the page and check if we are full. If so * force a flush. 
*/ - batch->encoded_pages[batch->nr++] = encode_page(page, flags); - if (batch->nr == batch->max) { + if (likely(nr_pages == 1)) { + batch->encoded_pages[batch->nr++] = encode_page(page, flags); + } else { + flags |= ENCODED_PAGE_BIT_NR_PAGES_NEXT; + batch->encoded_pages[batch->nr++] = encode_page(page, flags); + batch->encoded_pages[batch->nr++] = encode_nr_pages(nr_pages); + } + /* + * Make sure that we can always add another "page" + "nr_pages", + * requiring two entries instead of only a single one. + */ + if (batch->nr >= batch->max - 1) { if (!tlb_next_batch(tlb)) return true; batch = tlb->active; } - VM_BUG_ON_PAGE(batch->nr > batch->max, page); + VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page); return false; } +bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, + unsigned int nr_pages, bool delay_rmap) +{ + return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap, + PAGE_SIZE); +} + +bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, + bool delay_rmap, int page_size) +{ + return __tlb_remove_folio_pages_size(tlb, page, 1, delay_rmap, page_size); +} + #endif /* MMU_GATHER_NO_GATHER */ #ifdef CONFIG_MMU_GATHER_TABLE_FREE diff --git a/mm/swap.c b/mm/swap.c index cd8f0150ba3aa8..e5380d732c0dfb 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -967,11 +967,17 @@ void release_pages(release_pages_arg arg, int nr) unsigned int lock_batch; for (i = 0; i < nr; i++) { + unsigned int nr_refs = 1; struct folio *folio; /* Turn any of the argument types into a folio */ folio = page_folio(encoded_page_ptr(encoded[i])); + /* Is our next entry actually "nr_pages" -> "nr_refs" ? */ + if (unlikely(encoded_page_flags(encoded[i]) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + nr_refs = encoded_nr_pages(encoded[++i]); + /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the @@ -990,14 +996,14 @@ void release_pages(release_pages_arg arg, int nr) unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } - if (put_devmap_managed_page(&folio->page)) + if (put_devmap_managed_page_refs(&folio->page, nr_refs)) continue; - if (folio_put_testzero(folio)) + if (folio_ref_sub_and_test(folio, nr_refs)) free_zone_device_page(&folio->page); continue; } - if (!folio_put_testzero(folio)) + if (!folio_ref_sub_and_test(folio, nr_refs)) continue; if (folio_test_large(folio)) { diff --git a/mm/swap_state.c b/mm/swap_state.c index 7255c01a1e4e16..2f540748f7c0cc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -311,8 +311,19 @@ void free_page_and_swap_cache(struct page *page) void free_pages_and_swap_cache(struct encoded_page **pages, int nr) { lru_add_drain(); - for (int i = 0; i < nr; i++) - free_swap_cache(encoded_page_ptr(pages[i])); + for (int i = 0; i < nr; i++) { + struct page *page = encoded_page_ptr(pages[i]); + + /* + * Skip over the "nr_pages" entry. It's sufficient to call + * free_swap_cache() only once per folio. + */ + if (unlikely(encoded_page_flags(pages[i]) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + i++; + + free_swap_cache(page); + } release_pages(pages, nr); } From b34115055672727ddea631a32bc0744861534cce Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:34 +0100 Subject: [PATCH 1213/1406] mm/mmu_gather: improve cond_resched() handling with large folios and expensive page freeing In tlb_batch_pages_flush(), we can end up freeing up to 512 pages or now up to 256 folio fragments that span more than one page, before we conditionally reschedule. 
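The loop being restructured follows the common "free in bounded chunks, then yield" pattern. A rough generic sketch of that pattern, with an assumed bulk-free helper standing in for free_pages_and_swap_cache():

/* Kernel-style sketch; free_many() is an assumed stand-in helper. */
void free_many(void **items, unsigned int nr);
void cond_resched(void);        /* as declared in <linux/sched.h> */

static void free_all(void **items, unsigned int nr)
{
        while (nr) {
                unsigned int chunk = nr < 512 ? nr : 512;  /* bound per-iteration latency */

                free_many(items, chunk);
                items += chunk;
                nr -= chunk;

                cond_resched();  /* let other tasks run between chunks */
        }
}

The patch below keeps this shape but makes the chunk bound smarter: it counts folio fragments when freeing is cheap, and counts actual pages when poisoning/zeroing makes freeing cost proportional to memory size.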
It's a pain that we have to handle cond_resched() in tlb_batch_pages_flush() manually and cannot simply handle it in release_pages() -- release_pages() can be called from atomic context. Well, in a perfect world we wouldn't have to make our code more complicated at all. With page poisoning and init_on_free, we might now run into soft lockups when we free a lot of rather large folio fragments, because page freeing time then depends on the actual memory size we are freeing instead of on the number of folios that are involved. In the absolute (unlikely) worst case, on arm64 with 64k pages we will be able to free up to 256 folio fragments that each span 512 MiB: zeroing out 128 GiB does sound like it might take a while. But instead of ignoring this unlikely case, let's just handle it. So, let's teach tlb_batch_pages_flush() that there are some configurations where page freeing is horribly slow, and let's reschedule more frequently -- similar to what we did before we had large folio fragments in there. Avoid yet another loop over all encoded pages in the common case by handling that separately. Note that with page poisoning/zeroing, we might now end up freeing only a single folio fragment at a time that might exceed the old 512 pages limit: but if we cannot even free a single MAX_ORDER page on a system without running into soft lockups, something else is already completely bogus. Freeing a PMD-mapped THP would similarly cause trouble. In theory, we might even free 511 order-0 pages + a single MAX_ORDER page, effectively having to zero out 8703 pages on arm64 with 64k pages, translating to ~544 MiB of memory: however, if 512 MiB doesn't result in soft lockups, 544 MiB is unlikely to result in soft lockups, so we won't care about that for the time being. In the future, we might want to detect if handling cond_resched() is required at all, and just not do any of that with full preemption enabled. Link: https://lkml.kernel.org/r/20240214204435.167852-10-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/mmu_gather.c | 58 ++++++++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index d175c0f1e2c8c2..99b3e9408aa0fb 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -91,18 +91,21 @@ void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) } #endif -static void tlb_batch_pages_flush(struct mmu_gather *tlb) -{ - struct mmu_gather_batch *batch; +/* + * We might end up freeing a lot of pages. Reschedule on a regular + * basis to avoid soft lockups in configurations without full + * preemption enabled. The magic number of 512 folios seems to work.
+ */ +#define MAX_NR_FOLIOS_PER_FREE 512 - for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { - struct encoded_page **pages = batch->encoded_pages; +static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch) +{ + struct encoded_page **pages = batch->encoded_pages; + unsigned int nr, nr_pages; - while (batch->nr) { - /* - * limit free batch count when PAGE_SIZE > 4K - */ - unsigned int nr = min(512U, batch->nr); + while (batch->nr) { + if (!page_poisoning_enabled_static() && !want_init_on_free()) { + nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr); /* * Make sure we cover page + nr_pages, and don't leave @@ -111,14 +114,39 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb) if (unlikely(encoded_page_flags(pages[nr - 1]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) nr++; + } else { + /* + * With page poisoning and init_on_free, the time it + * takes to free memory grows proportionally with the + * actual memory size. Therefore, limit based on the + * actual memory size and not the number of involved + * folios. + */ + for (nr = 0, nr_pages = 0; + nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE; + nr++) { + if (unlikely(encoded_page_flags(pages[nr]) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + nr_pages += encoded_nr_pages(pages[++nr]); + else + nr_pages++; + } + } - free_pages_and_swap_cache(pages, nr); - pages += nr; - batch->nr -= nr; + free_pages_and_swap_cache(pages, nr); + pages += nr; + batch->nr -= nr; - cond_resched(); - } + cond_resched(); } +} + +static void tlb_batch_pages_flush(struct mmu_gather *tlb) +{ + struct mmu_gather_batch *batch; + + for (batch = &tlb->local; batch && batch->nr; batch = batch->next) + __tlb_batch_free_encoded_pages(batch); tlb->active = &tlb->local; } From c82fa87182c0a20a543ab64f9c201b368f3b90e6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:35 +0100 Subject: [PATCH 1214/1406] mm/memory: optimize unmap/zap with PTE-mapped THP Similar to how we optimized fork(), let's implement PTE batching when consecutive (present) PTEs map consecutive pages of the same large folio. Most infrastructure we need for batching (mmu gather, rmap) is already there. We only have to add get_and_clear_full_ptes() and clear_full_ptes(). Similarly, extend zap_install_uffd_wp_if_needed() to process a PTE range. We won't bother sanity-checking the mapcount of all subpages, but only check the mapcount of the first subpage we process. If there is a real problem hiding somewhere, we can trigger it simply by using small folios, or when we zap single pages of a large folio. Ideally, we would have that check in rmap code (including for delayed rmap), but then we cannot print the PTE. Let's keep it simple for now. If we ever have a cheap folio_mapcount(), we might just want to check for underflows there. To keep small folios as fast as possible, force inlining of a specialized variant using __always_inline with nr=1. Link: https://lkml.kernel.org/r/20240214204435.167852-11-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N.
Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 70 +++++++++++++++++++++++++++++++ mm/memory.c | 92 +++++++++++++++++++++++++++++------------ 2 files changed, 136 insertions(+), 26 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index aab227e12493fb..49ab1f73b5c2c0 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -580,6 +580,76 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, } #endif +#ifndef get_and_clear_full_ptes +/** + * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of + * the same folio, collecting dirty/accessed bits. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * @full: Whether we are clearing a full mm. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into the + * returned PTE. + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned int nr, int full) +{ + pte_t pte, tmp_pte; + + pte = ptep_get_and_clear_full(mm, addr, ptep, full); + while (--nr) { + ptep++; + addr += PAGE_SIZE; + tmp_pte = ptep_get_and_clear_full(mm, addr, ptep, full); + if (pte_dirty(tmp_pte)) + pte = pte_mkdirty(pte); + if (pte_young(tmp_pte)) + pte = pte_mkyoung(pte); + } + return pte; +} +#endif + +#ifndef clear_full_ptes +/** + * clear_full_ptes - Clear present PTEs that map consecutive pages of the same + * folio. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * @full: Whether we are clearing a full mm. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_get_and_clear_full(). + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. 
+ */ +static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + for (;;) { + ptep_get_and_clear_full(mm, addr, ptep, full); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} +#endif /* * If two threads concurrently fault at the same page, the thread that diff --git a/mm/memory.c b/mm/memory.c index 168096f9360e7b..465ada39c2b7d3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1517,7 +1517,7 @@ static inline bool zap_drop_file_uffd_wp(struct zap_details *details) */ static inline void zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, - unsigned long addr, pte_t *pte, + unsigned long addr, pte_t *pte, int nr, struct zap_details *details, pte_t pteval) { /* Zap on anonymous always means dropping everything */ @@ -1527,20 +1527,27 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, if (zap_drop_file_uffd_wp(details)) return; - pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); + for (;;) { + /* the PFN in the PTE is irrelevant. */ + pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); + if (--nr == 0) + break; + pte++; + addr += PAGE_SIZE; + } } -static inline void zap_present_folio_pte(struct mmu_gather *tlb, +static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, struct folio *folio, - struct page *page, pte_t *pte, pte_t ptent, unsigned long addr, - struct zap_details *details, int *rss, bool *force_flush, - bool *force_break) + struct page *page, pte_t *pte, pte_t ptent, unsigned int nr, + unsigned long addr, struct zap_details *details, int *rss, + bool *force_flush, bool *force_break) { struct mm_struct *mm = tlb->mm; bool delay_rmap = false; if (!folio_test_anon(folio)) { - ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); if (pte_dirty(ptent)) { folio_mark_dirty(folio); if (tlb_delay_rmap(tlb)) { @@ -1550,36 +1557,49 @@ static inline void zap_present_folio_pte(struct mmu_gather *tlb, } if (pte_young(ptent) && likely(vma_has_recency(vma))) folio_mark_accessed(folio); - rss[mm_counter(folio)]--; + rss[mm_counter(folio)] -= nr; } else { /* We don't need up-to-date accessed/dirty bits. */ - ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); - rss[MM_ANONPAGES]--; + clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); + rss[MM_ANONPAGES] -= nr; } + /* Checking a single PTE in a batch is sufficient. */ arch_check_zapped_pte(vma, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); + tlb_remove_tlb_entries(tlb, pte, nr, addr); if (unlikely(userfaultfd_pte_wp(vma, ptent))) - zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); + zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, + ptent); if (!delay_rmap) { - folio_remove_rmap_pte(folio, page, vma); + folio_remove_rmap_ptes(folio, page, nr, vma); + + /* Only sanity-check the first page in a batch. */ if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); } - if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) { + if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) { *force_flush = true; *force_break = true; } } -static inline void zap_present_pte(struct mmu_gather *tlb, +/* + * Zap or skip at least one present PTE, trying to batch-process subsequent + * PTEs that map consecutive pages of the same folio. + * + * Returns the number of processed (skipped or zapped) PTEs (at least 1). 
+ */ +static inline int zap_present_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, pte_t ptent, - unsigned long addr, struct zap_details *details, - int *rss, bool *force_flush, bool *force_break) + unsigned int max_nr, unsigned long addr, + struct zap_details *details, int *rss, bool *force_flush, + bool *force_break) { + const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct mm_struct *mm = tlb->mm; struct folio *folio; struct page *page; + int nr; page = vm_normal_page(vma, addr, ptent); if (!page) { @@ -1589,14 +1609,29 @@ static inline void zap_present_pte(struct mmu_gather *tlb, tlb_remove_tlb_entry(tlb, pte, addr); VM_WARN_ON_ONCE(userfaultfd_wp(vma)); ksm_might_unmap_zero_page(mm, ptent); - return; + return 1; } folio = page_folio(page); if (unlikely(!should_zap_folio(details, folio))) - return; - zap_present_folio_pte(tlb, vma, folio, page, pte, ptent, addr, details, - rss, force_flush, force_break); + return 1; + + /* + * Make sure that the common "small folio" case is as fast as possible + * by keeping the batching logic separate. + */ + if (unlikely(folio_test_large(folio) && max_nr != 1)) { + nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, + NULL); + + zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, + addr, details, rss, force_flush, + force_break); + return nr; + } + zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr, + details, rss, force_flush, force_break); + return 1; } static unsigned long zap_pte_range(struct mmu_gather *tlb, @@ -1611,6 +1646,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_t *start_pte; pte_t *pte; swp_entry_t entry; + int nr; tlb_change_page_size(tlb, PAGE_SIZE); init_rss_vec(rss); @@ -1624,7 +1660,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_t ptent = ptep_get(pte); struct folio *folio; struct page *page; + int max_nr; + nr = 1; if (pte_none(ptent)) continue; @@ -1632,10 +1670,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, break; if (pte_present(ptent)) { - zap_present_pte(tlb, vma, pte, ptent, addr, details, - rss, &force_flush, &force_break); + max_nr = (end - addr) / PAGE_SIZE; + nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr, + addr, details, rss, &force_flush, + &force_break); if (unlikely(force_break)) { - addr += PAGE_SIZE; + addr += nr * PAGE_SIZE; break; } continue; @@ -1689,8 +1729,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, WARN_ON_ONCE(1); } pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); - zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); - } while (pte++, addr += PAGE_SIZE, addr != end); + zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent); + } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); From c8707de7efde52a939892f9b168e67fefadb2755 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:48 +0000 Subject: [PATCH 1215/1406] mm: clarify the spec for set_ptes() Patch series "Transparent Contiguous PTEs for User Mappings", v6. This is a series to opportunistically and transparently use contpte mappings (set the contiguous bit in ptes) for user memory when those mappings meet the requirements. The change benefits arm64, but there is some (very) minor refactoring for x86 to enable its integration with core-mm. It is part of a wider effort to improve performance by allocating and mapping variable-sized blocks of memory (folios). 
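To make the alignment requirement concrete: with 4K pages and 16-entry contpte blocks, both the virtual address and the PFN must sit on a 64K boundary before the contiguous bit may be used, and all 16 PTEs must map consecutive PFNs with identical attributes. A simplified, illustrative check (not the series' actual code) might read:

#include <stdbool.h>

#define PAGE_SIZE_4K  4096ul
#define CONT_PTES     16ul      /* 16 x 4K = one 64K contpte block */

static bool contpte_block_aligned(unsigned long addr, unsigned long pfn)
{
        /*
         * VA and PA must both start on a 64K boundary. Whether all
         * CONT_PTES entries are present with identical attributes is
         * a separate check that this sketch omits.
         */
        return !(addr & (CONT_PTES * PAGE_SIZE_4K - 1)) &&
               !(pfn & (CONT_PTES - 1));
}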
One aim is for the 4K kernel to approach the performance of the 16K kernel, but without breaking compatibility and without the associated increase in memory. Another aim is to benefit the 16K and 64K kernels by enabling 2M THP, since this is the contpte size for those kernels. We have good performance data that demonstrates both aims are being met (see below). Of course this is only one half of the change. We require the mapped physical memory to be the correct size and alignment for this to actually be useful (i.e. 64K for 4K pages, or 2M for 16K/64K pages). Fortunately folios are solving this problem for us. Filesystems that support it (XFS, AFS, EROFS, tmpfs, ...) will allocate large folios up to the PMD size today, and more filesystems are coming. And for anonymous memory, "multi-size THP" is now upstream. Patch Layout ============ In this version, I've split the patches to better show each optimization: - 1-2: mm prep: misc code and docs cleanups - 3-6: mm,arm64,x86 prep: Add pte_advance_pfn() and make pte_next_pfn() a generic wrapper around it - 7-11: arm64 prep: Refactor ptep helpers into new layer - 12: functional contpte implementation - 13-18: various optimizations on top of the contpte implementation Testing ======= I've tested this series on both Ampere Altra (bare metal) and Apple M2 (VM): - mm selftests (inc new tests written for multi-size THP); no regressions - Speedometer JavaScript benchmark in Chromium web browser; no issues - Kernel compilation; no issues - Various tests under high memory pressure with swap enabled; no issues Performance =========== High Level Use Cases ~~~~~~~~~~~~~~~~~~~~ First some high level use cases (kernel compilation and speedometer JavaScript benchmarks). These are running on Ampere Altra (I've seen similar improvements on Android/Pixel 6). baseline: mm-unstable (mTHP switched off) mTHP: + enable 16K, 32K, 64K mTHP sizes "always" mTHP + contpte: + this series mTHP + contpte + exefolio: + patch at [6], which this series supports Kernel Compilation with -j8 (negative is faster): | kernel | real-time | kern-time | user-time | |---------------------------|-----------|-----------|-----------| | baseline | 0.0% | 0.0% | 0.0% | | mTHP | -5.0% | -39.1% | -0.7% | | mTHP + contpte | -6.0% | -41.4% | -1.5% | | mTHP + contpte + exefolio | -7.8% | -43.1% | -3.4% | Kernel Compilation with -j80 (negative is faster): | kernel | real-time | kern-time | user-time | |---------------------------|-----------|-----------|-----------| | baseline | 0.0% | 0.0% | 0.0% | | mTHP | -5.0% | -36.6% | -0.6% | | mTHP + contpte | -6.1% | -38.2% | -1.6% | | mTHP + contpte + exefolio | -7.4% | -39.2% | -3.2% | Speedometer (positive is faster): | kernel | runs_per_min | |:--------------------------|--------------| | baseline | 0.0% | | mTHP | 1.5% | | mTHP + contpte | 3.2% | | mTHP + contpte + exefolio | 4.5% | Micro Benchmarks ~~~~~~~~~~~~~~~~ The following microbenchmarks are intended to demonstrate that the performance of fork() and munmap() does not regress. I'm showing results for order-0 (4K) mappings, and for order-9 (2M) PTE-mapped THP. Thanks to David for sharing his benchmarks. baseline: mm-unstable + batch zap [7] series contpte-basic: + patches 0-19; functional contpte implementation contpte-batch: + patches 20-23; implement new batched APIs contpte-inline: + patch 24; __always_inline to help compiler contpte-fold: + patch 25; fold contpte mapping when sensible Primary platform is Ampere Altra bare metal.
I'm also showing results for M2 VM (on top of MacOS) for reference, although experience suggests this might not be the most reliable for performance numbers of this sort: | FORK | order-0 | order-9 | | Ampere Altra |------------------------|------------------------| | (pte-map) | mean | stdev | mean | stdev | |----------------|------------|-----------|------------|-----------| | baseline | 0.0% | 2.7% | 0.0% | 0.2% | | contpte-basic | 6.3% | 1.4% | 1948.7% | 0.2% | | contpte-batch | 7.6% | 2.0% | -1.9% | 0.4% | | contpte-inline | 3.6% | 1.5% | -1.0% | 0.2% | | contpte-fold | 4.6% | 2.1% | -1.8% | 0.2% | | MUNMAP | order-0 | order-9 | | Ampere Altra |------------------------|------------------------| | (pte-map) | mean | stdev | mean | stdev | |----------------|------------|-----------|------------|-----------| | baseline | 0.0% | 0.5% | 0.0% | 0.3% | | contpte-basic | 1.8% | 0.3% | 1104.8% | 0.1% | | contpte-batch | -0.3% | 0.4% | 2.7% | 0.1% | | contpte-inline | -0.1% | 0.6% | 0.9% | 0.1% | | contpte-fold | 0.1% | 0.6% | 0.8% | 0.1% | | FORK | order-0 | order-9 | | Apple M2 VM |------------------------|------------------------| | (pte-map) | mean | stdev | mean | stdev | |----------------|------------|-----------|------------|-----------| | baseline | 0.0% | 1.4% | 0.0% | 0.8% | | contpte-basic | 6.8% | 1.2% | 469.4% | 1.4% | | contpte-batch | -7.7% | 2.0% | -8.9% | 0.7% | | contpte-inline | -6.0% | 2.1% | -6.0% | 2.0% | | contpte-fold | 5.9% | 1.4% | -6.4% | 1.4% | | MUNMAP | order-0 | order-9 | | Apple M2 VM |------------------------|------------------------| | (pte-map) | mean | stdev | mean | stdev | |----------------|------------|-----------|------------|-----------| | baseline | 0.0% | 0.6% | 0.0% | 0.4% | | contpte-basic | 1.6% | 0.6% | 233.6% | 0.7% | | contpte-batch | 1.9% | 0.3% | -3.9% | 0.4% | | contpte-inline | 2.2% | 0.8% | -1.6% | 0.9% | | contpte-fold | 1.5% | 0.7% | -1.7% | 0.7% | Misc ~~~~ John Hubbard at Nvidia has indicated dramatic 10x performance improvements for some workloads at [8], when using 64K base page kernel. [1] https://lore.kernel.org/linux-arm-kernel/20230622144210.2623299-1-ryan.roberts@arm.com/ [2] https://lore.kernel.org/linux-arm-kernel/20231115163018.1303287-1-ryan.roberts@arm.com/ [3] https://lore.kernel.org/linux-arm-kernel/20231204105440.61448-1-ryan.roberts@arm.com/ [4] https://lore.kernel.org/lkml/20231218105100.172635-1-ryan.roberts@arm.com/ [5] https://lore.kernel.org/linux-mm/633af0a7-0823-424f-b6ef-374d99483f05@arm.com/ [6] https://lore.kernel.org/lkml/08c16f7d-f3b3-4f22-9acc-da943f647dc3@arm.com/ [7] https://lore.kernel.org/linux-mm/20240214204435.167852-1-david@redhat.com/ [8] https://lore.kernel.org/linux-mm/c507308d-bdd4-5f9e-d4ff-e96e4520be85@nvidia.com/ [9] https://gitlab.arm.com/linux-arm/linux-rr/-/tree/features/granule_perf/contpte-lkml_v6 This patch (of 18): set_ptes() spec implies that it can only be used to set a present pte because it interprets the PFN field to increment it. However, set_pte_at() has been implemented on top of set_ptes() since set_ptes() was introduced, and set_pte_at() allows setting a pte to a not-present state. So clarify the spec to state that when nr==1, new state of pte may be present or not present. When nr>1, new state of all ptes must be present. While we are at it, tighten the spec to set requirements around the initial state of ptes; when nr==1 it may be either present or not-present. But when nr>1 all ptes must initially be not-present. All set_ptes() callsites already conform to this requirement. 
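For reference, the generic set_ptes() that this spec documents is, in essence, the following loop (paraphrased from include/linux/pgtable.h, eliding the page-table-check hook; not a verbatim copy):

static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
		pte_t *ptep, pte_t pte, unsigned int nr)
{
	arch_enter_lazy_mmu_mode();
	for (;;) {
		set_pte(ptep, pte);
		if (--nr == 0)
			break;
		ptep++;
		pte = pte_next_pfn(pte);	/* advance the PFN, carrying other bits over */
	}
	arch_leave_lazy_mmu_mode();
}

The loop makes the asymmetry in the spec visible: pte_next_pfn() is only applied when nr > 1, and advancing a PFN is only meaningful for a present pte, which is why the nr > 1 case requires the new state to be present.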
Stating it explicitly is useful because it allows for a simplification to the upcoming arm64 contpte implementation. Link: https://lkml.kernel.org/r/20240215103205.2607016-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240215103205.2607016-2-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 49ab1f73b5c2c0..231370e1b80fc5 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -229,6 +229,10 @@ static inline pte_t pte_next_pfn(pte_t pte) * @pte: Page table entry for the first page. * @nr: Number of pages to map. * + * When nr==1, initial state of pte may be present or not present, and new state + * may be present or not present. When nr>1, initial state of all ptes must be + * not present, and new state must be present. + * * May be overridden by the architecture, or the architecture can define * set_pte() and PFN_PTE_SHIFT. * From 5881b11c4745873511b356b5a85e09f8d9d2f70e Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:49 +0000 Subject: [PATCH 1216/1406] mm: thp: batch-collapse PMD with set_ptes() Refactor __split_huge_pmd_locked() so that a present PMD can be collapsed to PTEs in a single batch using set_ptes(). This should improve performance a little bit, but the real motivation is to remove the need for the arm64 backend to have to fold the contpte entries. Instead, since the ptes are set as a batch, the contpte blocks can be initially set up pre-folded (once the arm64 contpte support is added in the next few patches). This leads to noticeable performance improvement during split. Link: https://lkml.kernel.org/r/20240215103205.2607016-3-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 58 +++++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 016e20bd813eaf..14888b15121e59 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2579,15 +2579,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte); - for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { - pte_t entry; - /* - * Note that NUMA hinting access restrictions are not - * transferred to avoid any possibility of altering - * permissions across VMAs. - */ - if (freeze || pmd_migration) { + + /* + * Note that NUMA hinting access restrictions are not transferred to + * avoid any possibility of altering permissions across VMAs. 
+ */ + if (freeze || pmd_migration) { + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { + pte_t entry; swp_entry_t swp_entry; + if (write) swp_entry = make_writable_migration_entry( page_to_pfn(page + i)); @@ -2606,25 +2607,32 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_swp_mksoft_dirty(entry); if (uffd_wp) entry = pte_swp_mkuffd_wp(entry); - } else { - entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); - if (write) - entry = pte_mkwrite(entry, vma); - if (!young) - entry = pte_mkold(entry); - /* NOTE: this may set soft-dirty too on some archs */ - if (dirty) - entry = pte_mkdirty(entry); - if (soft_dirty) - entry = pte_mksoft_dirty(entry); - if (uffd_wp) - entry = pte_mkuffd_wp(entry); + + VM_WARN_ON(!pte_none(ptep_get(pte + i))); + set_pte_at(mm, addr, pte + i, entry); } - VM_BUG_ON(!pte_none(ptep_get(pte))); - set_pte_at(mm, addr, pte, entry); - pte++; + } else { + pte_t entry; + + entry = mk_pte(page, READ_ONCE(vma->vm_page_prot)); + if (write) + entry = pte_mkwrite(entry, vma); + if (!young) + entry = pte_mkold(entry); + /* NOTE: this may set soft-dirty too on some archs */ + if (dirty) + entry = pte_mkdirty(entry); + if (soft_dirty) + entry = pte_mksoft_dirty(entry); + if (uffd_wp) + entry = pte_mkuffd_wp(entry); + + for (i = 0; i < HPAGE_PMD_NR; i++) + VM_WARN_ON(!pte_none(ptep_get(pte + i))); + + set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR); } - pte_unmap(pte - 1); + pte_unmap(pte); if (!pmd_migration) folio_remove_rmap_pmd(folio, page, vma); From 35a57343d3c9086315605e32f33b831a3fb4eb13 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:50 +0000 Subject: [PATCH 1217/1406] mm: introduce pte_advance_pfn() and use for pte_next_pfn() The goal is to be able to advance a PTE by an arbitrary number of PFNs. So introduce a new API that takes a nr param. Define the default implementation here and allow for architectures to override. pte_next_pfn() becomes a wrapper around pte_advance_pfn(). Follow up commits will convert each overriding architecture's pte_next_pfn() to pte_advance_pfn(). Link: https://lkml.kernel.org/r/20240215103205.2607016-4-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 231370e1b80fc5..b7ac8358f2aa5f 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,14 +212,17 @@ static inline int pmd_dirty(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif - #ifndef pte_next_pfn -static inline pte_t pte_next_pfn(pte_t pte) +#ifndef pte_advance_pfn +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { - return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); + return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); } #endif +#define pte_next_pfn(pte) pte_advance_pfn(pte, 1) +#endif + #ifndef set_ptes /** * set_ptes - Map consecutive pages to a contiguous range of addresses. 
From db7667549cd42fa4fed6a2f4a1340e0c71ea4957 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:51 +0000 Subject: [PATCH 1218/1406] arm64/mm: convert pte_next_pfn() to pte_advance_pfn() Core-mm needs to be able to advance the pfn by an arbitrary amount, so override the new pte_advance_pfn() API to do so. Link: https://lkml.kernel.org/r/20240215103205.2607016-5-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 52d0b0a763f164..b6d3e9e0a94624 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -351,10 +351,10 @@ static inline pgprot_t pte_pgprot(pte_t pte) return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); } -#define pte_next_pfn pte_next_pfn -static inline pte_t pte_next_pfn(pte_t pte) +#define pte_advance_pfn pte_advance_pfn +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { - return pfn_pte(pte_pfn(pte) + 1, pte_pgprot(pte)); + return pfn_pte(pte_pfn(pte) + nr, pte_pgprot(pte)); } static inline void set_ptes(struct mm_struct *mm, @@ -370,7 +370,7 @@ static inline void set_ptes(struct mm_struct *mm, if (--nr == 0) break; ptep++; - pte = pte_next_pfn(pte); + pte = pte_advance_pfn(pte, 1); } } #define set_ptes set_ptes From 71f90268db0236c0f3e54ba12c5a5cf67ee56eba Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:52 +0000 Subject: [PATCH 1219/1406] x86/mm: convert pte_next_pfn() to pte_advance_pfn() Core-mm needs to be able to advance the pfn by an arbitrary amount, so override the new pte_advance_pfn() API to do so. Link: https://lkml.kernel.org/r/20240215103205.2607016-6-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index b50b2ef63672f4..69ed0ea0641bd0 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -955,13 +955,13 @@ static inline int pte_same(pte_t a, pte_t b) return a.pte == b.pte; } -static inline pte_t pte_next_pfn(pte_t pte) +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { if (__pte_needs_invert(pte_val(pte))) - return __pte(pte_val(pte) - (1UL << PFN_PTE_SHIFT)); - return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); + return __pte(pte_val(pte) - (nr << PFN_PTE_SHIFT)); + return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); } -#define pte_next_pfn pte_next_pfn +#define pte_advance_pfn pte_advance_pfn static inline int pte_present(pte_t a) { From 7cddd544710a873fbec45289f7dffa44477ffe41 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:53 +0000 Subject: [PATCH 1220/1406] mm: tidy up pte_next_pfn() definition Now that all the architecture overrides of pte_next_pfn() have been replaced with pte_advance_pfn(), we can simplify the definition of the generic pte_next_pfn() macro so that it is unconditionally defined. Link: https://lkml.kernel.org/r/20240215103205.2607016-7-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index b7ac8358f2aa5f..bc005d84f764b7 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,7 +212,6 @@ static inline int pmd_dirty(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif -#ifndef pte_next_pfn #ifndef pte_advance_pfn static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { @@ -221,7 +220,6 @@ static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) #endif #define pte_next_pfn(pte) pte_advance_pfn(pte, 1) -#endif #ifndef set_ptes /** From 670ea63a330d9aabeebfbf18504dec93c54e9c7d Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:54 +0000 Subject: [PATCH 1221/1406] arm64/mm: convert READ_ONCE(*ptep) to ptep_get(ptep) There are a number of places in the arch code that read a pte by using the READ_ONCE() macro. Refactor these call sites to instead use the ptep_get() helper, which itself is a READ_ONCE(). Generated code should be the same. This will benefit us when we shortly introduce the transparent contpte support. In this case, ptep_get() will become more complex, so we now have all the code abstracted through it.
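The benefit of funnelling every read through one helper is easiest to see with a minimal analogue of READ_ONCE(); this user-space sketch uses the same cast-to-volatile trick as the kernel macro, while the bit layout in the example is purely illustrative:

#include <stdint.h>

/* Force exactly one load; the compiler may not re-read the location. */
#define read_once(x) (*(const volatile __typeof__(x) *)&(x))

static uint64_t snapshot_entry(uint64_t *slot)
{
        uint64_t e = read_once(*slot);  /* one snapshot... */

        /* ...so both the test and the use below see the same value,
         * even if the entry is updated concurrently. */
        return (e & 1) ? (e >> 12) : 0;
}

Once ptep_get() is the single entry point for reads, teaching it to do extra work for contpte later in the series requires no changes at any call site.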
Link: https://lkml.kernel.org/r/20240215103205.2607016-8-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 12 +++++++++--- arch/arm64/kernel/efi.c | 2 +- arch/arm64/mm/fault.c | 4 ++-- arch/arm64/mm/hugetlbpage.c | 6 +++--- arch/arm64/mm/kasan_init.c | 2 +- arch/arm64/mm/mmu.c | 12 ++++++------ arch/arm64/mm/pageattr.c | 4 ++-- arch/arm64/mm/trans_pgd.c | 2 +- 8 files changed, 25 insertions(+), 19 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index b6d3e9e0a94624..de034ca40bad1f 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -275,6 +275,12 @@ static inline void set_pte(pte_t *ptep, pte_t pte) } } +#define ptep_get ptep_get +static inline pte_t ptep_get(pte_t *ptep) +{ + return READ_ONCE(*ptep); +} + extern void __sync_icache_dcache(pte_t pteval); bool pgattr_change_is_safe(u64 old, u64 new); @@ -302,7 +308,7 @@ static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep, if (!IS_ENABLED(CONFIG_DEBUG_VM)) return; - old_pte = READ_ONCE(*ptep); + old_pte = ptep_get(ptep); if (!pte_valid(old_pte) || !pte_valid(pte)) return; @@ -904,7 +910,7 @@ static inline int __ptep_test_and_clear_young(pte_t *ptep) { pte_t old_pte, pte; - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); do { old_pte = pte; pte = pte_mkold(pte); @@ -986,7 +992,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres { pte_t old_pte, pte; - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); do { old_pte = pte; pte = pte_wrprotect(pte); diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 0228001347beaf..d0e08e93b24648 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -103,7 +103,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { struct set_perm_data *spd = data; const efi_memory_desc_t *md = spd->md; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); if (md->attribute & EFI_MEMORY_RO) pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 55f6455a828434..a254761fa1bd4b 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -191,7 +191,7 @@ static void show_pte(unsigned long addr) if (!ptep) break; - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); pr_cont(", pte=%016llx", pte_val(pte)); pte_unmap(ptep); } while(0); @@ -214,7 +214,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, pte_t entry, int dirty) { pteval_t old_pteval, pteval; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); if (pte_same(pte, entry)) return 0; diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 6720ec8d50e76c..2892f925ed66dd 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -485,7 +485,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, size_t pgsize; pte_t pte; - if (!pte_cont(READ_ONCE(*ptep))) { + if (!pte_cont(ptep_get(ptep))) { ptep_set_wrprotect(mm, addr, ptep); return; } @@ -510,7 +510,7 @@ pte_t huge_ptep_clear_flush(struct 
vm_area_struct *vma, size_t pgsize; int ncontig; - if (!pte_cont(READ_ONCE(*ptep))) + if (!pte_cont(ptep_get(ptep))) return ptep_clear_flush(vma, addr, ptep); ncontig = find_num_contig(mm, addr, ptep, &pgsize); @@ -543,7 +543,7 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr * when the permission changes from executable to non-executable * in cases where cpu is affected with errata #2645198. */ - if (pte_user_exec(READ_ONCE(*ptep))) + if (pte_user_exec(ptep_get(ptep))) return huge_ptep_clear_flush(vma, addr, ptep); } return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 4c7ad574b946bf..c2a9f4f6c7dd06 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -113,7 +113,7 @@ static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr, memset(__va(page_phys), KASAN_SHADOW_INIT, PAGE_SIZE); next = addr + PAGE_SIZE; set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); - } while (ptep++, addr = next, addr != end && pte_none(READ_ONCE(*ptep))); + } while (ptep++, addr = next, addr != end && pte_none(ptep_get(ptep))); } static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr, diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 3a27d887f7dd71..343629a1704237 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -173,7 +173,7 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, ptep = pte_set_fixmap_offset(pmdp, addr); do { - pte_t old_pte = READ_ONCE(*ptep); + pte_t old_pte = ptep_get(ptep); set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); @@ -182,7 +182,7 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, * only allow updates to the permission attributes. */ BUG_ON(!pgattr_change_is_safe(pte_val(old_pte), - READ_ONCE(pte_val(*ptep)))); + pte_val(ptep_get(ptep)))); phys += PAGE_SIZE; } while (ptep++, addr += PAGE_SIZE, addr != end); @@ -852,7 +852,7 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); if (pte_none(pte)) continue; @@ -985,7 +985,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); /* * This is just a sanity check here which verifies that @@ -1004,7 +1004,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, */ ptep = pte_offset_kernel(pmdp, 0UL); for (i = 0; i < PTRS_PER_PTE; i++) { - if (!pte_none(READ_ONCE(ptep[i]))) + if (!pte_none(ptep_get(&ptep[i]))) return; } @@ -1473,7 +1473,7 @@ pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte * when the permission changes from executable to non-executable * in cases where cpu is affected with errata #2645198. 
*/ - if (pte_user_exec(READ_ONCE(*ptep))) + if (pte_user_exec(ptep_get(ptep))) return ptep_clear_flush(vma, addr, ptep); } return ptep_get_and_clear(vma->vm_mm, addr, ptep); diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 924843f1f661bf..73a5e8f825867d 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -36,7 +36,7 @@ bool can_set_direct_map(void) static int change_page_range(pte_t *ptep, unsigned long addr, void *data) { struct page_change_data *cdata = data; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); pte = clear_pte_bit(pte, cdata->clear_mask); pte = set_pte_bit(pte, cdata->set_mask); @@ -245,5 +245,5 @@ bool kernel_page_present(struct page *page) return true; ptep = pte_offset_kernel(pmdp, addr); - return pte_valid(READ_ONCE(*ptep)); + return pte_valid(ptep_get(ptep)); } diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index 7b14df3c64776f..f71ab4704cce7c 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -33,7 +33,7 @@ static void *trans_alloc(struct trans_pgd_info *info) static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) { - pte_t pte = READ_ONCE(*src_ptep); + pte_t pte = ptep_get(src_ptep); if (pte_valid(pte)) { /* From c9dd0dd10c5d238b73f0472e3d4a03b1f6bded73 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:55 +0000 Subject: [PATCH 1222/1406] arm64/mm: convert set_pte_at() to set_ptes(..., 1) Since set_ptes() was introduced, set_pte_at() has been implemented as a generic macro around set_ptes(..., 1). So this change should continue to generate the same code. However, making this change prepares us for the transparent contpte support. It means we can reroute set_ptes() to __set_ptes(). Since set_pte_at() is a generic macro, there will be no equivalent __set_pte_at() to reroute to. Note that a couple of calls to set_pte_at() remain in the arch code. This is intentional, since those call sites are acting on behalf of core-mm and should continue to call into the public set_ptes() rather than the arch-private __set_ptes(). Link: https://lkml.kernel.org/r/20240215103205.2607016-9-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/arm64/kernel/mte.c | 2 +- arch/arm64/kvm/guest.c | 2 +- arch/arm64/mm/fault.c | 2 +- arch/arm64/mm/hugetlbpage.c | 10 +++++----- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index de034ca40bad1f..9a2df85eb493ef 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1084,7 +1084,7 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) #endif /* CONFIG_ARM64_MTE */ /* - * On AArch64, the cache coherency is handled via the set_pte_at() function. + * On AArch64, the cache coherency is handled via the set_ptes() function. 
*/ static inline void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index a41ef3213e1e95..59bfe2e96f8f31 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -67,7 +67,7 @@ int memcmp_pages(struct page *page1, struct page *page2) /* * If the page content is identical but at least one of the pages is * tagged, return non-zero to avoid KSM merging. If only one of the - * pages is tagged, set_pte_at() may zero or change the tags of the + * pages is tagged, set_ptes() may zero or change the tags of the * other page via mte_sync_tags(). */ if (page_mte_tagged(page1) || page_mte_tagged(page2)) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index aaf1d49397392b..6e0df623c8e9d6 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1072,7 +1072,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, } else { /* * Only locking to serialise with a concurrent - * set_pte_at() in the VMM but still overriding the + * set_ptes() in the VMM but still overriding the * tags, hence ignoring the return value. */ try_page_mte_tagging(page); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index a254761fa1bd4b..3235e23309ec90 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -205,7 +205,7 @@ static void show_pte(unsigned long addr) * * It needs to cope with hardware update of the accessed/dirty state by other * agents in the system and can safely skip the __sync_icache_dcache() call as, - * like set_pte_at(), the PTE is never changed from no-exec to exec here. + * like set_ptes(), the PTE is never changed from no-exec to exec here. * * Returns whether or not the PTE actually changed. 
*/ diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 2892f925ed66dd..27f6160890d1d9 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -247,12 +247,12 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, if (!pte_present(pte)) { for (i = 0; i < ncontig; i++, ptep++, addr += pgsize) - set_pte_at(mm, addr, ptep, pte); + set_ptes(mm, addr, ptep, pte, 1); return; } if (!pte_cont(pte)) { - set_pte_at(mm, addr, ptep, pte); + set_ptes(mm, addr, ptep, pte, 1); return; } @@ -263,7 +263,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, clear_flush(mm, addr, ptep, pgsize, ncontig); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); } pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, @@ -471,7 +471,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, hugeprot = pte_pgprot(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); return 1; } @@ -500,7 +500,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, pfn = pte_pfn(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); } pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, From f437cc36f260b70a34a4a714873d1076218e6e93 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:56 +0000 Subject: [PATCH 1223/1406] arm64/mm: convert ptep_clear() to ptep_get_and_clear() ptep_clear() is a generic wrapper around the arch-implemented ptep_get_and_clear(). We are about to convert ptep_get_and_clear() into a public version and private version (__ptep_get_and_clear()) to support the transparent contpte work. We won't have a private version of ptep_clear() so let's convert it to directly call ptep_get_and_clear(). Link: https://lkml.kernel.org/r/20240215103205.2607016-10-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/mm/hugetlbpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 27f6160890d1d9..48e8b429879dff 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -229,7 +229,7 @@ static void clear_flush(struct mm_struct *mm, unsigned long i, saddr = addr; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - ptep_clear(mm, addr, ptep); + ptep_get_and_clear(mm, addr, ptep); flush_tlb_range(&vma, saddr, addr); } From e1f5c935aa1aa352ab967eaf62e5bd76c001845f Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:57 +0000 Subject: [PATCH 1224/1406] arm64/mm: new ptep layer to manage contig bit Create a new layer for the in-table PTE manipulation APIs. 
For now, the existing API is prefixed with double underscore to become the arch-private API, and the public API is just a simple wrapper that calls the private API. The public API implementation will subsequently be used to transparently manipulate the contiguous bit where appropriate. But since there are already some contig-aware users (e.g. hugetlb, kernel mapper), we must first ensure those users use the private API directly so that the future contig-bit manipulations in the public API do not interfere with those existing uses. The following APIs are treated this way: - ptep_get - set_pte - set_ptes - pte_clear - ptep_get_and_clear - ptep_test_and_clear_young - ptep_clear_flush_young - ptep_set_wrprotect - ptep_set_access_flags Link: https://lkml.kernel.org/r/20240215103205.2607016-11-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 83 +++++++++++++++++--------------- arch/arm64/kernel/efi.c | 4 +- arch/arm64/kernel/mte.c | 2 +- arch/arm64/kvm/guest.c | 2 +- arch/arm64/mm/fault.c | 12 ++--- arch/arm64/mm/fixmap.c | 4 +- arch/arm64/mm/hugetlbpage.c | 40 +++++++-------- arch/arm64/mm/kasan_init.c | 6 +-- arch/arm64/mm/mmu.c | 14 +++--- arch/arm64/mm/pageattr.c | 6 +-- arch/arm64/mm/trans_pgd.c | 6 +-- 11 files changed, 93 insertions(+), 86 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 9a2df85eb493ef..7336d40a893a88 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -93,7 +93,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) __pte(__phys_to_pte_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define pte_none(pte) (!pte_val(pte)) -#define pte_clear(mm,addr,ptep) set_pte(ptep, __pte(0)) +#define __pte_clear(mm, addr, ptep) \ + __set_pte(ptep, __pte(0)) #define pte_page(pte) (pfn_to_page(pte_pfn(pte))) /* @@ -137,7 +138,7 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) * so that we don't erroneously return false for pages that have been * remapped as PROT_NONE but are yet to be flushed from the TLB. * Note that we can't make any assumptions based on the state of the access - flag, since __ptep_clear_flush_young() elides a DSB when invalidating the * TLB.
*/ #define pte_accessible(mm, pte) \ @@ -261,7 +262,7 @@ static inline pte_t pte_mkdevmap(pte_t pte) return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL)); } -static inline void set_pte(pte_t *ptep, pte_t pte) +static inline void __set_pte(pte_t *ptep, pte_t pte) { WRITE_ONCE(*ptep, pte); @@ -275,8 +276,7 @@ static inline void set_pte(pte_t *ptep, pte_t pte) } } -#define ptep_get ptep_get -static inline pte_t ptep_get(pte_t *ptep) +static inline pte_t __ptep_get(pte_t *ptep) { return READ_ONCE(*ptep); } @@ -308,7 +308,7 @@ static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep, if (!IS_ENABLED(CONFIG_DEBUG_VM)) return; - old_pte = ptep_get(ptep); + old_pte = __ptep_get(ptep); if (!pte_valid(old_pte) || !pte_valid(pte)) return; @@ -317,7 +317,7 @@ static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep, /* * Check for potential race with hardware updates of the pte - * (ptep_set_access_flags safely changes valid ptes without going + * (__ptep_set_access_flags safely changes valid ptes without going * through an invalid entry). */ VM_WARN_ONCE(!pte_young(pte), @@ -363,23 +363,22 @@ static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) return pfn_pte(pte_pfn(pte) + nr, pte_pgprot(pte)); } -static inline void set_ptes(struct mm_struct *mm, - unsigned long __always_unused addr, - pte_t *ptep, pte_t pte, unsigned int nr) +static inline void __set_ptes(struct mm_struct *mm, + unsigned long __always_unused addr, + pte_t *ptep, pte_t pte, unsigned int nr) { page_table_check_ptes_set(mm, ptep, pte, nr); __sync_cache_and_tags(pte, nr); for (;;) { __check_safe_pte_update(mm, ptep, pte); - set_pte(ptep, pte); + __set_pte(ptep, pte); if (--nr == 0) break; ptep++; pte = pte_advance_pfn(pte, 1); } } -#define set_ptes set_ptes /* * Huge pte definitions. @@ -546,7 +545,7 @@ static inline void __set_pte_at(struct mm_struct *mm, { __sync_cache_and_tags(pte, nr); __check_safe_pte_update(mm, ptep, pte); - set_pte(ptep, pte); + __set_pte(ptep, pte); } static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, @@ -860,8 +859,7 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) return pte_pmd(pte_modify(pmd_pte(pmd), newprot)); } -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -extern int ptep_set_access_flags(struct vm_area_struct *vma, +extern int __ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); @@ -871,7 +869,8 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { - return ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty); + return __ptep_set_access_flags(vma, address, (pte_t *)pmdp, + pmd_pte(entry), dirty); } static inline int pud_devmap(pud_t pud) @@ -905,12 +904,13 @@ static inline bool pud_user_accessible_page(pud_t pud) /* * Atomic pte/pmd modifications. 
*/ -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int __ptep_test_and_clear_young(pte_t *ptep) +static inline int __ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) { pte_t old_pte, pte; - pte = ptep_get(ptep); + pte = __ptep_get(ptep); do { old_pte = pte; pte = pte_mkold(pte); @@ -921,18 +921,10 @@ static inline int __ptep_test_and_clear_young(pte_t *ptep) return pte_young(pte); } -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pte_t *ptep) -{ - return __ptep_test_and_clear_young(ptep); -} - -#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -static inline int ptep_clear_flush_young(struct vm_area_struct *vma, +static inline int __ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - int young = ptep_test_and_clear_young(vma, address, ptep); + int young = __ptep_test_and_clear_young(vma, address, ptep); if (young) { /* @@ -955,12 +947,11 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - return ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); + return __ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, +static inline pte_t __ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t pte = __pte(xchg_relaxed(&pte_val(*ptep), 0)); @@ -984,15 +975,15 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * ptep_set_wrprotect - mark read-only while trasferring potential hardware + * __ptep_set_wrprotect - mark read-only while trasferring potential hardware * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. */ -#define __HAVE_ARCH_PTEP_SET_WRPROTECT -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) +static inline void __ptep_set_wrprotect(struct mm_struct *mm, + unsigned long address, pte_t *ptep) { pte_t old_pte, pte; - pte = ptep_get(ptep); + pte = __ptep_get(ptep); do { old_pte = pte; pte = pte_wrprotect(pte); @@ -1006,7 +997,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { - ptep_set_wrprotect(mm, address, (pte_t *)pmdp); + __ptep_set_wrprotect(mm, address, (pte_t *)pmdp); } #define pmdp_establish pmdp_establish @@ -1084,7 +1075,7 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) #endif /* CONFIG_ARM64_MTE */ /* - * On AArch64, the cache coherency is handled via the set_ptes() function. + * On AArch64, the cache coherency is handled via the __set_ptes() function. 
*/ static inline void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, @@ -1136,6 +1127,22 @@ extern pte_t ptep_modify_prot_start(struct vm_area_struct *vma, extern void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t new_pte); + +#define ptep_get __ptep_get +#define set_pte __set_pte +#define set_ptes __set_ptes +#define pte_clear __pte_clear +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +#define ptep_get_and_clear __ptep_get_and_clear +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +#define ptep_test_and_clear_young __ptep_test_and_clear_young +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +#define ptep_clear_flush_young __ptep_clear_flush_young +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +#define ptep_set_wrprotect __ptep_set_wrprotect +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +#define ptep_set_access_flags __ptep_set_access_flags + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_PGTABLE_H */ diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index d0e08e93b24648..9afcc690fe73c2 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -103,7 +103,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { struct set_perm_data *spd = data; const efi_memory_desc_t *md = spd->md; - pte_t pte = ptep_get(ptep); + pte_t pte = __ptep_get(ptep); if (md->attribute & EFI_MEMORY_RO) pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); @@ -111,7 +111,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) pte = set_pte_bit(pte, __pgprot(PTE_PXN)); else if (system_supports_bti_kernel() && spd->has_bti) pte = set_pte_bit(pte, __pgprot(PTE_GP)); - set_pte(ptep, pte); + __set_pte(ptep, pte); return 0; } diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 59bfe2e96f8f31..dcdcccd40891c6 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -67,7 +67,7 @@ int memcmp_pages(struct page *page1, struct page *page2) /* * If the page content is identical but at least one of the pages is * tagged, return non-zero to avoid KSM merging. If only one of the - * pages is tagged, set_ptes() may zero or change the tags of the + * pages is tagged, __set_ptes() may zero or change the tags of the * other page via mte_sync_tags(). */ if (page_mte_tagged(page1) || page_mte_tagged(page2)) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 6e0df623c8e9d6..629145fd3161d4 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1072,7 +1072,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, } else { /* * Only locking to serialise with a concurrent - * set_ptes() in the VMM but still overriding the + * __set_ptes() in the VMM but still overriding the * tags, hence ignoring the return value. */ try_page_mte_tagging(page); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 3235e23309ec90..9a1c66183d168d 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -191,7 +191,7 @@ static void show_pte(unsigned long addr) if (!ptep) break; - pte = ptep_get(ptep); + pte = __ptep_get(ptep); pr_cont(", pte=%016llx", pte_val(pte)); pte_unmap(ptep); } while(0); @@ -205,16 +205,16 @@ static void show_pte(unsigned long addr) * * It needs to cope with hardware update of the accessed/dirty state by other * agents in the system and can safely skip the __sync_icache_dcache() call as, - * like set_ptes(), the PTE is never changed from no-exec to exec here. 
+ * like __set_ptes(), the PTE is never changed from no-exec to exec here. * * Returns whether or not the PTE actually changed. */ -int ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, - pte_t entry, int dirty) +int __ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) { pteval_t old_pteval, pteval; - pte_t pte = ptep_get(ptep); + pte_t pte = __ptep_get(ptep); if (pte_same(pte, entry)) return 0; diff --git a/arch/arm64/mm/fixmap.c b/arch/arm64/mm/fixmap.c index c0a3301203bdf7..bfc02568805aea 100644 --- a/arch/arm64/mm/fixmap.c +++ b/arch/arm64/mm/fixmap.c @@ -121,9 +121,9 @@ void __set_fixmap(enum fixed_addresses idx, ptep = fixmap_pte(addr); if (pgprot_val(flags)) { - set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags)); + __set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags)); } else { - pte_clear(&init_mm, addr, ptep); + __pte_clear(&init_mm, addr, ptep); flush_tlb_kernel_range(addr, addr+PAGE_SIZE); } } diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 48e8b429879dff..0f0e10bb0a9540 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -145,14 +145,14 @@ pte_t huge_ptep_get(pte_t *ptep) { int ncontig, i; size_t pgsize; - pte_t orig_pte = ptep_get(ptep); + pte_t orig_pte = __ptep_get(ptep); if (!pte_present(orig_pte) || !pte_cont(orig_pte)) return orig_pte; ncontig = num_contig_ptes(page_size(pte_page(orig_pte)), &pgsize); for (i = 0; i < ncontig; i++, ptep++) { - pte_t pte = ptep_get(ptep); + pte_t pte = __ptep_get(ptep); if (pte_dirty(pte)) orig_pte = pte_mkdirty(orig_pte); @@ -177,11 +177,11 @@ static pte_t get_clear_contig(struct mm_struct *mm, unsigned long pgsize, unsigned long ncontig) { - pte_t orig_pte = ptep_get(ptep); + pte_t orig_pte = __ptep_get(ptep); unsigned long i; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) { - pte_t pte = ptep_get_and_clear(mm, addr, ptep); + pte_t pte = __ptep_get_and_clear(mm, addr, ptep); /* * If HW_AFDBM is enabled, then the HW could turn on @@ -229,7 +229,7 @@ static void clear_flush(struct mm_struct *mm, unsigned long i, saddr = addr; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - ptep_get_and_clear(mm, addr, ptep); + __ptep_get_and_clear(mm, addr, ptep); flush_tlb_range(&vma, saddr, addr); } @@ -247,12 +247,12 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, if (!pte_present(pte)) { for (i = 0; i < ncontig; i++, ptep++, addr += pgsize) - set_ptes(mm, addr, ptep, pte, 1); + __set_ptes(mm, addr, ptep, pte, 1); return; } if (!pte_cont(pte)) { - set_ptes(mm, addr, ptep, pte, 1); + __set_ptes(mm, addr, ptep, pte, 1); return; } @@ -263,7 +263,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, clear_flush(mm, addr, ptep, pgsize, ncontig); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); + __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); } pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, @@ -393,7 +393,7 @@ void huge_pte_clear(struct mm_struct *mm, unsigned long addr, ncontig = num_contig_ptes(sz, &pgsize); for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - pte_clear(mm, addr, ptep); + __pte_clear(mm, addr, ptep); } pte_t huge_ptep_get_and_clear(struct mm_struct *mm, @@ -401,10 +401,10 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, { int ncontig; size_t pgsize; - pte_t orig_pte = ptep_get(ptep); + pte_t orig_pte = __ptep_get(ptep); if 
(!pte_cont(orig_pte)) - return ptep_get_and_clear(mm, addr, ptep); + return __ptep_get_and_clear(mm, addr, ptep); ncontig = find_num_contig(mm, addr, ptep, &pgsize); @@ -424,11 +424,11 @@ static int __cont_access_flags_changed(pte_t *ptep, pte_t pte, int ncontig) { int i; - if (pte_write(pte) != pte_write(ptep_get(ptep))) + if (pte_write(pte) != pte_write(__ptep_get(ptep))) return 1; for (i = 0; i < ncontig; i++) { - pte_t orig_pte = ptep_get(ptep + i); + pte_t orig_pte = __ptep_get(ptep + i); if (pte_dirty(pte) != pte_dirty(orig_pte)) return 1; @@ -452,7 +452,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, pte_t orig_pte; if (!pte_cont(pte)) - return ptep_set_access_flags(vma, addr, ptep, pte, dirty); + return __ptep_set_access_flags(vma, addr, ptep, pte, dirty); ncontig = find_num_contig(mm, addr, ptep, &pgsize); dpfn = pgsize >> PAGE_SHIFT; @@ -471,7 +471,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, hugeprot = pte_pgprot(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); + __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); return 1; } @@ -485,8 +485,8 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, size_t pgsize; pte_t pte; - if (!pte_cont(ptep_get(ptep))) { - ptep_set_wrprotect(mm, addr, ptep); + if (!pte_cont(__ptep_get(ptep))) { + __ptep_set_wrprotect(mm, addr, ptep); return; } @@ -500,7 +500,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, pfn = pte_pfn(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); + __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); } pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, @@ -510,7 +510,7 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, size_t pgsize; int ncontig; - if (!pte_cont(ptep_get(ptep))) + if (!pte_cont(__ptep_get(ptep))) return ptep_clear_flush(vma, addr, ptep); ncontig = find_num_contig(mm, addr, ptep, &pgsize); @@ -543,7 +543,7 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr * when the permission changes from executable to non-executable * in cases where cpu is affected with errata #2645198. */ - if (pte_user_exec(ptep_get(ptep))) + if (pte_user_exec(__ptep_get(ptep))) return huge_ptep_clear_flush(vma, addr, ptep); } return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index c2a9f4f6c7dd06..9ee16cfce587f5 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -112,8 +112,8 @@ static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr, if (!early) memset(__va(page_phys), KASAN_SHADOW_INIT, PAGE_SIZE); next = addr + PAGE_SIZE; - set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); - } while (ptep++, addr = next, addr != end && pte_none(ptep_get(ptep))); + __set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); + } while (ptep++, addr = next, addr != end && pte_none(__ptep_get(ptep))); } static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr, @@ -271,7 +271,7 @@ static void __init kasan_init_shadow(void) * so we should make sure that it maps the zero page read-only. 
*/ for (i = 0; i < PTRS_PER_PTE; i++) - set_pte(&kasan_early_shadow_pte[i], + __set_pte(&kasan_early_shadow_pte[i], pfn_pte(sym_to_pfn(kasan_early_shadow_page), PAGE_KERNEL_RO)); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 343629a1704237..6208c7541f8732 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -173,16 +173,16 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, ptep = pte_set_fixmap_offset(pmdp, addr); do { - pte_t old_pte = ptep_get(ptep); + pte_t old_pte = __ptep_get(ptep); - set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); + __set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); /* * After the PTE entry has been populated once, we * only allow updates to the permission attributes. */ BUG_ON(!pgattr_change_is_safe(pte_val(old_pte), - pte_val(ptep_get(ptep)))); + pte_val(__ptep_get(ptep)))); phys += PAGE_SIZE; } while (ptep++, addr += PAGE_SIZE, addr != end); @@ -852,12 +852,12 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = ptep_get(ptep); + pte = __ptep_get(ptep); if (pte_none(pte)) continue; WARN_ON(!pte_present(pte)); - pte_clear(&init_mm, addr, ptep); + __pte_clear(&init_mm, addr, ptep); flush_tlb_kernel_range(addr, addr + PAGE_SIZE); if (free_mapped) free_hotplug_page_range(pte_page(pte), @@ -985,7 +985,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = ptep_get(ptep); + pte = __ptep_get(ptep); /* * This is just a sanity check here which verifies that @@ -1004,7 +1004,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, */ ptep = pte_offset_kernel(pmdp, 0UL); for (i = 0; i < PTRS_PER_PTE; i++) { - if (!pte_none(ptep_get(&ptep[i]))) + if (!pte_none(__ptep_get(&ptep[i]))) return; } diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 73a5e8f825867d..0c4e3ecf989d43 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -36,12 +36,12 @@ bool can_set_direct_map(void) static int change_page_range(pte_t *ptep, unsigned long addr, void *data) { struct page_change_data *cdata = data; - pte_t pte = ptep_get(ptep); + pte_t pte = __ptep_get(ptep); pte = clear_pte_bit(pte, cdata->clear_mask); pte = set_pte_bit(pte, cdata->set_mask); - set_pte(ptep, pte); + __set_pte(ptep, pte); return 0; } @@ -245,5 +245,5 @@ bool kernel_page_present(struct page *page) return true; ptep = pte_offset_kernel(pmdp, addr); - return pte_valid(ptep_get(ptep)); + return pte_valid(__ptep_get(ptep)); } diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index f71ab4704cce7c..5139a28130c088 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -33,7 +33,7 @@ static void *trans_alloc(struct trans_pgd_info *info) static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) { - pte_t pte = ptep_get(src_ptep); + pte_t pte = __ptep_get(src_ptep); if (pte_valid(pte)) { /* @@ -41,7 +41,7 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) * read only (code, rodata). Clear the RDONLY bit from * the temporary mappings we use during restore. 
*/ - set_pte(dst_ptep, pte_mkwrite_novma(pte)); + __set_pte(dst_ptep, pte_mkwrite_novma(pte)); } else if ((debug_pagealloc_enabled() || is_kfence_address((void *)addr)) && !pte_none(pte)) { /* @@ -55,7 +55,7 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) */ BUG_ON(!pfn_valid(pte_pfn(pte))); - set_pte(dst_ptep, pte_mkpresent(pte_mkwrite_novma(pte))); + __set_pte(dst_ptep, pte_mkpresent(pte_mkwrite_novma(pte))); } } From cb5fe29d8a58b69b931ab1d4ea6e0259631ec0bc Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:58 +0000 Subject: [PATCH 1225/1406] arm64/mm: split __flush_tlb_range() to elide trailing DSB Split __flush_tlb_range() into __flush_tlb_range_nosync() + __flush_tlb_range(), in the same way as the existing flush_tlb_page() arrangement. This allows calling __flush_tlb_range_nosync() to elide the trailing DSB. Forthcoming "contpte" code will take advantage of this when clearing the young bit from a contiguous range of ptes. Ordering between dsb and mmu_notifier_arch_invalidate_secondary_tlbs() has changed, but now aligns with the ordering of __flush_tlb_page(). It has been discussed that __flush_tlb_page() may be wrong though. Regardless, both will be resolved separately if needed. Link: https://lkml.kernel.org/r/20240215103205.2607016-12-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: David Hildenbrand Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/tlbflush.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 1deb5d789c2e23..3b0e8248e1a41a 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -422,7 +422,7 @@ do { \ #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \ __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled()); -static inline void __flush_tlb_range(struct vm_area_struct *vma, +static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long stride, bool last_level, int tlb_level) @@ -456,10 +456,19 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, __flush_tlb_range_op(vae1is, start, pages, stride, asid, tlb_level, true, lpa2_is_enabled()); - dsb(ish); mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); } +static inline void __flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long stride, bool last_level, + int tlb_level) +{ + __flush_tlb_range_nosync(vma, start, end, stride, + last_level, tlb_level); + dsb(ish); +} + static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { From f23579f6de2d3fdc6a0582581e248fb530f1df43 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:59 +0000 Subject: [PATCH 1226/1406] arm64/mm: wire up PTE_CONT for user mappings With the ptep API sufficiently refactored, we can now introduce a new "contpte" API layer, which transparently manages the PTE_CONT bit for user mappings.
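To make the transparency concrete, here is a hedged sketch of a core-mm caller (illustrative only; with 4K pages, CONT_PTES is 16, so a contpte block covers 64K):

    /* Core-mm code is unchanged; it keeps calling the public API: */
    set_ptes(mm, addr, ptep, pte, CONT_PTES); /* arch may apply PTE_CONT */
    pte = ptep_get(ptep);                     /* PTE_CONT handled internally */

If the batch is suitably aligned and physically contiguous, the arch repaints it with PTE_CONT so a single TLB entry can cover the whole block; otherwise the PTEs are mapped individually, exactly as before.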
In this initial implementation, only suitable batches of PTEs, set via set_ptes(), are mapped with the PTE_CONT bit. Any subsequent modification of individual PTEs will cause an "unfold" operation to repaint the contpte block as individual PTEs before performing the requested operation. While a modification of a single PTE could cause the block of PTEs to which it belongs to become eligible for "folding" into a contpte entry, "folding" is not performed in this initial implementation due to the cost of checking whether the requirements are met. Due to this, contpte mappings will degrade back to normal pte mappings over time if/when protections are changed. This will be solved in a future patch. Since a contpte block only has a single access and dirty bit, the semantic here changes slightly; when getting a pte (e.g. ptep_get()) that is part of a contpte mapping, the access and dirty information are pulled from the block (so all ptes in the block return the same access/dirty info). When changing the access/dirty info on a pte (e.g. ptep_set_access_flags()) that is part of a contpte mapping, this change will affect the whole contpte block. This works fine in practice since we guarantee that only a single folio is mapped by a contpte block, and the core-mm tracks access/dirty information per folio. In order for the public functions, which used to be pure inline, to continue to be callable by modules, export all the contpte_* symbols that are now called by those public inline functions. The feature is enabled/disabled with the ARM64_CONTPTE Kconfig parameter at build time. It defaults to enabled as long as its dependency, TRANSPARENT_HUGEPAGE, is also enabled. The core-mm depends upon TRANSPARENT_HUGEPAGE to be able to allocate large folios, so if it's not enabled, then there is no chance of meeting the physical contiguity requirement for contpte mappings. Link: https://lkml.kernel.org/r/20240215103205.2607016-13-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Ard Biesheuvel Tested-by: John Hubbard Acked-by: Mark Rutland Reviewed-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 9 + arch/arm64/include/asm/pgtable.h | 167 ++++++++++++++++++ arch/arm64/mm/Makefile | 1 + arch/arm64/mm/contpte.c | 285 +++++++++++++++++++++++++++++++ include/linux/efi.h | 5 + 5 files changed, 467 insertions(+) create mode 100644 arch/arm64/mm/contpte.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e8275a40afbd3b..5a7ac1f37bdc98 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2229,6 +2229,15 @@ config UNWIND_PATCH_PAC_INTO_SCS select UNWIND_TABLES select DYNAMIC_SCS +config ARM64_CONTPTE + bool "Contiguous PTE mappings for user memory" if EXPERT + depends on TRANSPARENT_HUGEPAGE + default y + help + When enabled, user mappings are configured using the PTE contiguous + bit, for any mappings that meet the size and alignment requirements. + This reduces TLB pressure and improves performance.
+ endmenu # "Kernel Features" menu "Boot options" diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 7336d40a893a88..831099cfc96bdd 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -133,6 +133,10 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) */ #define pte_valid_not_user(pte) \ ((pte_val(pte) & (PTE_VALID | PTE_USER | PTE_UXN)) == (PTE_VALID | PTE_UXN)) +/* + * Returns true if the pte is valid and has the contiguous bit set. + */ +#define pte_valid_cont(pte) (pte_valid(pte) && pte_cont(pte)) /* * Could the pte be present in the TLB? We must check mm_tlb_flush_pending * so that we don't erroneously return false for pages that have been @@ -1128,6 +1132,167 @@ extern void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t new_pte); +#ifdef CONFIG_ARM64_CONTPTE + +/* + * The contpte APIs are used to transparently manage the contiguous bit in ptes + * where it is possible and makes sense to do so. The PTE_CONT bit is considered + * a private implementation detail of the public ptep API (see below). + */ +extern void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte); +extern pte_t contpte_ptep_get_lockless(pte_t *orig_ptep); +extern void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr); +extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); +extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); +extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty); + +static inline void contpte_try_unfold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + if (unlikely(pte_valid_cont(pte))) + __contpte_try_unfold(mm, addr, ptep, pte); +} + +/* + * The below functions constitute the public API that arm64 presents to the + * core-mm to manipulate PTE entries within their page tables (or at least this + * is the subset of the API that arm64 needs to implement). These public + * versions will automatically and transparently apply the contiguous bit where + * it makes sense to do so. Therefore any users that are contig-aware (e.g. + * hugetlb, kernel mapper) should NOT use these APIs, but instead use the + * private versions, which are prefixed with double underscore. All of these + * APIs except for ptep_get_lockless() are expected to be called with the PTL + * held. Although the contiguous bit is considered private to the + * implementation, it is deliberately allowed to leak through the getters (e.g. + * ptep_get()), back to core code. This is required so that pte_leaf_size() can + * provide an accurate size for perf_get_pgtable_size(). But this leakage means + * its possible a pte will be passed to a setter with the contiguous bit set, so + * we explicitly clear the contiguous bit in those cases to prevent accidentally + * setting it in the pgtable. 
+ */ + +#define ptep_get ptep_get +static inline pte_t ptep_get(pte_t *ptep) +{ + pte_t pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(pte))) + return pte; + + return contpte_ptep_get(ptep, pte); +} + +#define ptep_get_lockless ptep_get_lockless +static inline pte_t ptep_get_lockless(pte_t *ptep) +{ + pte_t pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(pte))) + return pte; + + return contpte_ptep_get_lockless(ptep); +} + +static inline void set_pte(pte_t *ptep, pte_t pte) +{ + /* + * We don't have the mm or vaddr so cannot unfold contig entries (since + * it requires tlb maintenance). set_pte() is not used in core code, so + * this should never even be called. Regardless do our best to service + * any call and emit a warning if there is any attempt to set a pte on + * top of an existing contig range. + */ + pte_t orig_pte = __ptep_get(ptep); + + WARN_ON_ONCE(pte_valid_cont(orig_pte)); + __set_pte(ptep, pte_mknoncont(pte)); +} + +#define set_ptes set_ptes +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + pte = pte_mknoncont(pte); + + if (likely(nr == 1)) { + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __set_ptes(mm, addr, ptep, pte, 1); + } else { + contpte_set_ptes(mm, addr, ptep, pte, nr); + } +} + +static inline void pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __pte_clear(mm, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + return __ptep_get_and_clear(mm, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + pte_t orig_pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(orig_pte))) + return __ptep_test_and_clear_young(vma, addr, ptep); + + return contpte_ptep_test_and_clear_young(vma, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +static inline int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + pte_t orig_pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(orig_pte))) + return __ptep_clear_flush_young(vma, addr, ptep); + + return contpte_ptep_clear_flush_young(vma, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +static inline void ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __ptep_set_wrprotect(mm, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +static inline int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty) +{ + pte_t orig_pte = __ptep_get(ptep); + + entry = pte_mknoncont(entry); + + if (likely(!pte_valid_cont(orig_pte))) + return __ptep_set_access_flags(vma, addr, ptep, entry, dirty); + + return contpte_ptep_set_access_flags(vma, addr, ptep, entry, dirty); +} + +#else /* CONFIG_ARM64_CONTPTE */ + #define ptep_get __ptep_get #define set_pte __set_pte #define set_ptes __set_ptes @@ -1143,6 +1308,8 @@ extern void ptep_modify_prot_commit(struct vm_area_struct *vma, #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS #define ptep_set_access_flags __ptep_set_access_flags +#endif /* CONFIG_ARM64_CONTPTE */ + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_PGTABLE_H */ diff --git 
a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index dbd1bc95967d00..60454256945b8b 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -3,6 +3,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ cache.o copypage.o flush.o \ ioremap.o mmap.o pgd.o mmu.o \ context.o proc.o pageattr.o fixmap.o +obj-$(CONFIG_ARM64_CONTPTE) += contpte.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c new file mode 100644 index 00000000000000..6d7f40667fa235 --- /dev/null +++ b/arch/arm64/mm/contpte.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#include +#include +#include +#include + +static inline bool mm_is_user(struct mm_struct *mm) +{ + /* + * Don't attempt to apply the contig bit to kernel mappings, because + * dynamically adding/removing the contig bit can cause page faults. + * These racing faults are ok for user space, since they get serialized + * on the PTL. But kernel mappings can't tolerate faults. + */ + if (unlikely(mm_is_efi(mm))) + return false; + return mm != &init_mm; +} + +static inline pte_t *contpte_align_down(pte_t *ptep) +{ + return PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES); +} + +static void contpte_convert(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); + unsigned long start_addr; + pte_t *start_ptep; + int i; + + start_ptep = ptep = contpte_align_down(ptep); + start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + pte = pfn_pte(ALIGN_DOWN(pte_pfn(pte), CONT_PTES), pte_pgprot(pte)); + + for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) { + pte_t ptent = __ptep_get_and_clear(mm, addr, ptep); + + if (pte_dirty(ptent)) + pte = pte_mkdirty(pte); + + if (pte_young(ptent)) + pte = pte_mkyoung(pte); + } + + __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3); + + __set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES); +} + +void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + /* + * We have already checked that the ptes are contiguous in + * contpte_try_unfold(), so just check that the mm is user space. + */ + if (!mm_is_user(mm)) + return; + + pte = pte_mknoncont(pte); + contpte_convert(mm, addr, ptep, pte); +} +EXPORT_SYMBOL(__contpte_try_unfold); + +pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte) +{ + /* + * Gather access/dirty bits, which may be populated in any of the ptes + * of the contig range. We are guaranteed to be holding the PTL, so any + * contiguous range cannot be unfolded or otherwise modified under our + * feet. + */ + + pte_t pte; + int i; + + ptep = contpte_align_down(ptep); + + for (i = 0; i < CONT_PTES; i++, ptep++) { + pte = __ptep_get(ptep); + + if (pte_dirty(pte)) + orig_pte = pte_mkdirty(orig_pte); + + if (pte_young(pte)) + orig_pte = pte_mkyoung(orig_pte); + } + + return orig_pte; +} +EXPORT_SYMBOL(contpte_ptep_get); + +pte_t contpte_ptep_get_lockless(pte_t *orig_ptep) +{ + /* + * Gather access/dirty bits, which may be populated in any of the ptes + * of the contig range. We may not be holding the PTL, so any contiguous + * range may be unfolded/modified/refolded under our feet. 
Therefore we + * ensure we read a _consistent_ contpte range by checking that all ptes + * in the range are valid and have CONT_PTE set, that all pfns are + * contiguous and that all pgprots are the same (ignoring access/dirty). + * If we find a pte that is not consistent, then we must be racing with + * an update so start again. If the target pte does not have CONT_PTE + * set then that is considered consistent on its own because it is not + * part of a contpte range. + */ + + pgprot_t orig_prot; + unsigned long pfn; + pte_t orig_pte; + pgprot_t prot; + pte_t *ptep; + pte_t pte; + int i; + +retry: + orig_pte = __ptep_get(orig_ptep); + + if (!pte_valid_cont(orig_pte)) + return orig_pte; + + orig_prot = pte_pgprot(pte_mkold(pte_mkclean(orig_pte))); + ptep = contpte_align_down(orig_ptep); + pfn = pte_pfn(orig_pte) - (orig_ptep - ptep); + + for (i = 0; i < CONT_PTES; i++, ptep++, pfn++) { + pte = __ptep_get(ptep); + prot = pte_pgprot(pte_mkold(pte_mkclean(pte))); + + if (!pte_valid_cont(pte) || + pte_pfn(pte) != pfn || + pgprot_val(prot) != pgprot_val(orig_prot)) + goto retry; + + if (pte_dirty(pte)) + orig_pte = pte_mkdirty(orig_pte); + + if (pte_young(pte)) + orig_pte = pte_mkyoung(orig_pte); + } + + return orig_pte; +} +EXPORT_SYMBOL(contpte_ptep_get_lockless); + +void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + unsigned long next; + unsigned long end; + unsigned long pfn; + pgprot_t prot; + + /* + * The set_ptes() spec guarantees that when nr > 1, the initial state of + * all ptes is not-present. Therefore we never need to unfold or + * otherwise invalidate a range before we set the new ptes. + * contpte_set_ptes() should never be called for nr < 2. + */ + VM_WARN_ON(nr == 1); + + if (!mm_is_user(mm)) + return __set_ptes(mm, addr, ptep, pte, nr); + + end = addr + (nr << PAGE_SHIFT); + pfn = pte_pfn(pte); + prot = pte_pgprot(pte); + + do { + next = pte_cont_addr_end(addr, end); + nr = (next - addr) >> PAGE_SHIFT; + pte = pfn_pte(pfn, prot); + + if (((addr | next | (pfn << PAGE_SHIFT)) & ~CONT_PTE_MASK) == 0) + pte = pte_mkcont(pte); + else + pte = pte_mknoncont(pte); + + __set_ptes(mm, addr, ptep, pte, nr); + + addr = next; + ptep += nr; + pfn += nr; + + } while (addr != end); +} +EXPORT_SYMBOL(contpte_set_ptes); + +int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + /* + * ptep_clear_flush_young() technically requires us to clear the access + * flag for a _single_ pte. However, the core-mm code actually tracks + * access/dirty per folio, not per page. And since we only create a + * contig range when the range is covered by a single folio, we can get + * away with clearing young for the whole contig range here, so we avoid + * having to unfold. + */ + + int young = 0; + int i; + + ptep = contpte_align_down(ptep); + addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + + for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) + young |= __ptep_test_and_clear_young(vma, addr, ptep); + + return young; +} +EXPORT_SYMBOL(contpte_ptep_test_and_clear_young); + +int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + int young; + + young = contpte_ptep_test_and_clear_young(vma, addr, ptep); + + if (young) { + /* + * See comment in __ptep_clear_flush_young(); same rationale for + * eliding the trailing DSB applies here. 
+ */ + addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + __flush_tlb_range_nosync(vma, addr, addr + CONT_PTE_SIZE, + PAGE_SIZE, true, 3); + } + + return young; +} +EXPORT_SYMBOL(contpte_ptep_clear_flush_young); + +int contpte_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty) +{ + unsigned long start_addr; + pte_t orig_pte; + int i; + + /* + * Gather the access/dirty bits for the contiguous range. If nothing has + * changed, its a noop. + */ + orig_pte = pte_mknoncont(ptep_get(ptep)); + if (pte_val(orig_pte) == pte_val(entry)) + return 0; + + /* + * We can fix up access/dirty bits without having to unfold the contig + * range. But if the write bit is changing, we must unfold. + */ + if (pte_write(orig_pte) == pte_write(entry)) { + /* + * For HW access management, we technically only need to update + * the flag on a single pte in the range. But for SW access + * management, we need to update all the ptes to prevent extra + * faults. Avoid per-page tlb flush in __ptep_set_access_flags() + * and instead flush the whole range at the end. + */ + ptep = contpte_align_down(ptep); + start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + + for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) + __ptep_set_access_flags(vma, addr, ptep, entry, 0); + + if (dirty) + __flush_tlb_range(vma, start_addr, addr, + PAGE_SIZE, true, 3); + } else { + __contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte); + __ptep_set_access_flags(vma, addr, ptep, entry, dirty); + } + + return 1; +} +EXPORT_SYMBOL(contpte_ptep_set_access_flags); diff --git a/include/linux/efi.h b/include/linux/efi.h index c74f47711f0bdf..57da15e7429c97 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -692,6 +692,11 @@ extern struct efi { extern struct mm_struct efi_mm; +static inline bool mm_is_efi(struct mm_struct *mm) +{ + return IS_ENABLED(CONFIG_EFI) && mm == &efi_mm; +} + static inline int efi_guidcmp (efi_guid_t left, efi_guid_t right) { From b4f56a349f68a09b3664f0e7f20eeca2ad8a7268 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:00 +0000 Subject: [PATCH 1227/1406] arm64/mm: implement new wrprotect_ptes() batch API Optimize the contpte implementation to fix some of the fork performance regression introduced by the initial contpte commit. Subsequent patches will solve it entirely. During fork(), any private memory in the parent must be write-protected. Previously this was done 1 PTE at a time. But the core-mm supports batched wrprotect via the new wrprotect_ptes() API. So let's implement that API and for fully covered contpte mappings, we no longer need to unfold the contpte. This has 2 benefits: - reduced unfolding, reduces the number of tlbis that must be issued. - The memory remains contpte-mapped ("folded") in the parent, so it continues to benefit from the more efficient use of the TLB after the fork. The optimization to wrprotect a whole contpte block without unfolding is possible thanks to the tightening of the Arm ARM in respect to the definition and behaviour when 'Misprogramming the Contiguous bit'. See section D21194 at https://developer.arm.com/documentation/102105/ja-07/ Link: https://lkml.kernel.org/r/20240215103205.2607016-14-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 61 ++++++++++++++++++++++++++------ arch/arm64/mm/contpte.c | 38 ++++++++++++++++++++ 2 files changed, 89 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 831099cfc96bdd..8643227c318bfe 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -978,16 +978,12 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -/* - * __ptep_set_wrprotect - mark read-only while trasferring potential hardware - * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. - */ -static inline void __ptep_set_wrprotect(struct mm_struct *mm, - unsigned long address, pte_t *ptep) +static inline void ___ptep_set_wrprotect(struct mm_struct *mm, + unsigned long address, pte_t *ptep, + pte_t pte) { - pte_t old_pte, pte; + pte_t old_pte; - pte = __ptep_get(ptep); do { old_pte = pte; pte = pte_wrprotect(pte); @@ -996,6 +992,25 @@ static inline void __ptep_set_wrprotect(struct mm_struct *mm, } while (pte_val(pte) != pte_val(old_pte)); } +/* + * __ptep_set_wrprotect - mark read-only while trasferring potential hardware + * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. + */ +static inline void __ptep_set_wrprotect(struct mm_struct *mm, + unsigned long address, pte_t *ptep) +{ + ___ptep_set_wrprotect(mm, address, ptep, __ptep_get(ptep)); +} + +static inline void __wrprotect_ptes(struct mm_struct *mm, unsigned long address, + pte_t *ptep, unsigned int nr) +{ + unsigned int i; + + for (i = 0; i < nr; i++, address += PAGE_SIZE, ptep++) + __ptep_set_wrprotect(mm, address, ptep); +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PMDP_SET_WRPROTECT static inline void pmdp_set_wrprotect(struct mm_struct *mm, @@ -1149,6 +1164,8 @@ extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); +extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr); extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t entry, int dirty); @@ -1268,12 +1285,35 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, return contpte_ptep_clear_flush_young(vma, addr, ptep); } +#define wrprotect_ptes wrprotect_ptes +static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + if (likely(nr == 1)) { + /* + * Optimization: wrprotect_ptes() can only be called for present + * ptes so we only need to check contig bit as condition for + * unfold, and we can remove the contig bit from the pte we read + * to avoid re-reading. This speeds up fork() which is sensitive + * for order-0 folios. Equivalent to contpte_try_unfold(). 
+ */ + pte_t orig_pte = __ptep_get(ptep); + + if (unlikely(pte_cont(orig_pte))) { + __contpte_try_unfold(mm, addr, ptep, orig_pte); + orig_pte = pte_mknoncont(orig_pte); + } + ___ptep_set_wrprotect(mm, addr, ptep, orig_pte); + } else { + contpte_wrprotect_ptes(mm, addr, ptep, nr); + } +} + #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); - __ptep_set_wrprotect(mm, addr, ptep); + wrprotect_ptes(mm, addr, ptep, 1); } #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS @@ -1305,6 +1345,7 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, #define ptep_clear_flush_young __ptep_clear_flush_young #define __HAVE_ARCH_PTEP_SET_WRPROTECT #define ptep_set_wrprotect __ptep_set_wrprotect +#define wrprotect_ptes __wrprotect_ptes #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS #define ptep_set_access_flags __ptep_set_access_flags diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index 6d7f40667fa235..bedb5852453563 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -26,6 +26,26 @@ static inline pte_t *contpte_align_down(pte_t *ptep) return PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES); } +static void contpte_try_unfold_partial(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + /* + * Unfold any partially covered contpte block at the beginning and end + * of the range. + */ + + if (ptep != contpte_align_down(ptep) || nr < CONT_PTES) + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + + if (ptep + nr != contpte_align_down(ptep + nr)) { + unsigned long last_addr = addr + PAGE_SIZE * (nr - 1); + pte_t *last_ptep = ptep + nr - 1; + + contpte_try_unfold(mm, last_addr, last_ptep, + __ptep_get(last_ptep)); + } +} + static void contpte_convert(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { @@ -238,6 +258,24 @@ int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, } EXPORT_SYMBOL(contpte_ptep_clear_flush_young); +void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + /* + * If wrprotecting an entire contig range, we can avoid unfolding. Just + * set wrprotect and wait for the later mmu_gather flush to invalidate + * the tlb. Until the flush, the page may or may not be wrprotected. + * After the flush, it is guaranteed wrprotected. If it's a partial + * range though, we must unfold, because we can't have a case where + * CONT_PTE is set but wrprotect applies to a subset of the PTEs; this + * would cause it to continue to be unpredictable after the flush. + */ + + contpte_try_unfold_partial(mm, addr, ptep, nr); + __wrprotect_ptes(mm, addr, ptep, nr); +} +EXPORT_SYMBOL(contpte_wrprotect_ptes); + int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t entry, int dirty) From eaf02999967bed73cbe20532f6977ec394a4d590 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:01 +0000 Subject: [PATCH 1228/1406] arm64/mm: implement new [get_and_]clear_full_ptes() batch APIs Optimize the contpte implementation to fix some of the exit/munmap/dontneed performance regression introduced by the initial contpte commit. Subsequent patches will solve it entirely. During exit(), munmap() or madvise(MADV_DONTNEED), mappings must be cleared. Previously this was done 1 PTE at a time. But the core-mm supports batched clear via the new [get_and_]clear_full_ptes() APIs. 
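As a rough illustration of the caller side (a hedged sketch only; the exact core-mm call sites differ), an unmap path goes from one call per PTE to one call per batch:

	/*
	 * Sketch: tearing down nr contiguous entries of one folio.
	 * Names and surrounding context are illustrative.
	 */
	pte_t pte;
	unsigned int i;

	/* Before: nr individual clears, each potentially unfolding. */
	for (i = 0; i < nr; i++)
		pte = ptep_get_and_clear_full(mm, addr + i * PAGE_SIZE,
					      ptep + i, full);

	/* After: one batched call; a fully covered contpte block is
	 * cleared without ever being unfolded. */
	pte = get_and_clear_full_ptes(mm, addr, ptep, nr, full);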
So let's implement those APIs and for fully covered contpte mappings, we no longer need to unfold the contpte. This significantly reduces unfolding operations, reducing the number of tlbis that must be issued. Link: https://lkml.kernel.org/r/20240215103205.2607016-15-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 67 ++++++++++++++++++++++++++++++++ arch/arm64/mm/contpte.c | 17 ++++++++ 2 files changed, 84 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 8643227c318bfe..a8f1a35e308673 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -965,6 +965,37 @@ static inline pte_t __ptep_get_and_clear(struct mm_struct *mm, return pte; } +static inline void __clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + for (;;) { + __ptep_get_and_clear(mm, addr, ptep); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} + +static inline pte_t __get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full) +{ + pte_t pte, tmp_pte; + + pte = __ptep_get_and_clear(mm, addr, ptep); + while (--nr) { + ptep++; + addr += PAGE_SIZE; + tmp_pte = __ptep_get_and_clear(mm, addr, ptep); + if (pte_dirty(tmp_pte)) + pte = pte_mkdirty(pte); + if (pte_young(tmp_pte)) + pte = pte_mkyoung(pte); + } + return pte; +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, @@ -1160,6 +1191,11 @@ extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte); extern pte_t contpte_ptep_get_lockless(pte_t *orig_ptep); extern void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr); +extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full); +extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full); extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, @@ -1253,6 +1289,35 @@ static inline void pte_clear(struct mm_struct *mm, __pte_clear(mm, addr, ptep); } +#define clear_full_ptes clear_full_ptes +static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + if (likely(nr == 1)) { + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __clear_full_ptes(mm, addr, ptep, nr, full); + } else { + contpte_clear_full_ptes(mm, addr, ptep, nr, full); + } +} + +#define get_and_clear_full_ptes get_and_clear_full_ptes +static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full) +{ + pte_t pte; + + if (likely(nr == 1)) { + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + pte = __get_and_clear_full_ptes(mm, addr, ptep, nr, full); + } else { + pte = 
contpte_get_and_clear_full_ptes(mm, addr, ptep, nr, full); + } + + return pte; +} + #define __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -1337,6 +1402,8 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, #define set_pte __set_pte #define set_ptes __set_ptes #define pte_clear __pte_clear +#define clear_full_ptes __clear_full_ptes +#define get_and_clear_full_ptes __get_and_clear_full_ptes #define __HAVE_ARCH_PTEP_GET_AND_CLEAR #define ptep_get_and_clear __ptep_get_and_clear #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index bedb5852453563..50e0173dc5eee0 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -212,6 +212,23 @@ void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(contpte_set_ptes); +void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + contpte_try_unfold_partial(mm, addr, ptep, nr); + __clear_full_ptes(mm, addr, ptep, nr, full); +} +EXPORT_SYMBOL(contpte_clear_full_ptes); + +pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full) +{ + contpte_try_unfold_partial(mm, addr, ptep, nr); + return __get_and_clear_full_ptes(mm, addr, ptep, nr, full); +} +EXPORT_SYMBOL(contpte_get_and_clear_full_ptes); + int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { From 3ef2482b69216453539f672c75b981c87b52437a Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:02 +0000 Subject: [PATCH 1229/1406] mm: add pte_batch_hint() to reduce scanning in folio_pte_batch() Some architectures (e.g. arm64) can tell from looking at a pte, if some follow-on ptes also map contiguous physical memory with the same pgprot. (for arm64, these are contpte mappings). Take advantage of this knowledge to optimize folio_pte_batch() so that it can skip these ptes when scanning to create a batch. By default, if an arch does not opt-in, folio_pte_batch() returns a compile-time 1, so the changes are optimized out and the behaviour is as before. arm64 will opt-in to providing this hint in the next patch, which will greatly reduce the cost of ptep_get() when scanning a range of contptes. Link: https://lkml.kernel.org/r/20240215103205.2607016-16-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Tested-by: John Hubbard Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 21 +++++++++++++++++++++ mm/memory.c | 19 ++++++++++++------- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index bc005d84f764b7..a36cf4e124b0e4 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,6 +212,27 @@ static inline int pmd_dirty(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif +#ifndef pte_batch_hint +/** + * pte_batch_hint - Number of pages that can be added to batch without scanning. + * @ptep: Page table pointer for the entry. + * @pte: Page table entry. 
+ * + * Some architectures know that a set of contiguous ptes all map the same + * contiguous memory with the same permissions. In this case, it can provide a + * hint to aid pte batching without the core code needing to scan every pte. + * + * An architecture implementation may ignore the PTE accessed state. Further, + * the dirty state must apply atomically to all the PTEs described by the hint. + * + * May be overridden by the architecture, else pte_batch_hint is always 1. + */ +static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte) +{ + return 1; +} +#endif + #ifndef pte_advance_pfn static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { diff --git a/mm/memory.c b/mm/memory.c index 465ada39c2b7d3..642b4f2be5239e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -988,16 +988,20 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, { unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); const pte_t *end_ptep = start_ptep + max_nr; - pte_t expected_pte = __pte_batch_clear_ignored(pte_next_pfn(pte), flags); - pte_t *ptep = start_ptep + 1; + pte_t expected_pte, *ptep; bool writable; + int nr; if (any_writable) *any_writable = false; VM_WARN_ON_FOLIO(!pte_present(pte), folio); - while (ptep != end_ptep) { + nr = pte_batch_hint(start_ptep, pte); + expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags); + ptep = start_ptep + nr; + + while (ptep < end_ptep) { pte = ptep_get(ptep); if (any_writable) writable = !!pte_write(pte); @@ -1011,17 +1015,18 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, * corner cases the next PFN might fall into a different * folio. */ - if (pte_pfn(pte) == folio_end_pfn) + if (pte_pfn(pte) >= folio_end_pfn) break; if (any_writable) *any_writable |= writable; - expected_pte = pte_next_pfn(expected_pte); - ptep++; + nr = pte_batch_hint(ptep, pte); + expected_pte = pte_advance_pfn(expected_pte, nr); + ptep += nr; } - return ptep - start_ptep; + return min(ptep - start_ptep, max_nr); } /* From c4cbfacc9678197e81ff0456c972c238d21b464f Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:03 +0000 Subject: [PATCH 1230/1406] arm64/mm: implement pte_batch_hint() When core code iterates over a range of ptes and calls ptep_get() for each of them, if the range happens to cover contpte mappings, the number of pte reads becomes amplified by a factor of the number of PTEs in a contpte block. This is because for each call to ptep_get(), the implementation must read all of the ptes in the contpte block to which it belongs to gather the access and dirty bits. This causes a hotspot for fork(), as well as operations that unmap memory such as munmap(), exit and madvise(MADV_DONTNEED). Fortunately we can fix this by implementing pte_batch_hint() which allows their iterators to skip getting the contpte tail ptes when gathering the batch of ptes to operate on. This results in the number of PTE reads returning to 1 per pte. Link: https://lkml.kernel.org/r/20240215103205.2607016-17-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Mark Rutland Reviewed-by: David Hildenbrand Tested-by: John Hubbard Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index a8f1a35e308673..d759a20d2929ab 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1213,6 +1213,15 @@ static inline void contpte_try_unfold(struct mm_struct *mm, unsigned long addr, __contpte_try_unfold(mm, addr, ptep, pte); } +#define pte_batch_hint pte_batch_hint +static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte) +{ + if (!pte_valid_cont(pte)) + return 1; + + return CONT_PTES - (((unsigned long)ptep >> 3) & (CONT_PTES - 1)); +} + /* * The below functions constitute the public API that arm64 presents to the * core-mm to manipulate PTE entries within their page tables (or at least this From 2030dbb2b824c8359ed97564384d1da5d40f340c Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:04 +0000 Subject: [PATCH 1231/1406] arm64/mm: __always_inline to improve fork() perf As set_ptes() and wrprotect_ptes() become a bit more complex, the compiler may choose not to inline them. But this is critical for fork() performance. So mark the functions, along with contpte_try_unfold() which is called by them, as __always_inline. This is worth ~1% on the fork() microbenchmark with order-0 folios (the common case). Link: https://lkml.kernel.org/r/20240215103205.2607016-18-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d759a20d2929ab..8310875133ffc5 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1206,8 +1206,8 @@ extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t entry, int dirty); -static inline void contpte_try_unfold(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) +static __always_inline void contpte_try_unfold(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pte_t pte) { if (unlikely(pte_valid_cont(pte))) __contpte_try_unfold(mm, addr, ptep, pte); @@ -1278,7 +1278,7 @@ static inline void set_pte(pte_t *ptep, pte_t pte) } #define set_ptes set_ptes -static inline void set_ptes(struct mm_struct *mm, unsigned long addr, +static __always_inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { pte = pte_mknoncont(pte); @@ -1360,8 +1360,8 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, } #define wrprotect_ptes wrprotect_ptes -static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, unsigned int nr) +static __always_inline void wrprotect_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned int nr) { if (likely(nr == 1)) { /* From 7533cd01431b6004b4517dfd7bee1b1bb372e39c Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:05 +0000 Subject: [PATCH 1232/1406] arm64/mm: automatically fold contpte mappings There are situations where a change to a single PTE could cause the contpte block in which it resides to become foldable (i.e. could be repainted with the contiguous bit). Such situations arise, for example, when user space temporarily changes protections, via mprotect, for individual pages, such can be the case for certain garbage collectors. We would like to detect when such a PTE change occurs. However this can be expensive due to the amount of checking required. Therefore only perform the checks when an indiviual PTE is modified via mprotect (ptep_modify_prot_commit() -> set_pte_at() -> set_ptes(nr=1)) and only when we are setting the final PTE in a contpte-aligned block. Link: https://lkml.kernel.org/r/20240215103205.2607016-19-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 26 +++++++++++++ arch/arm64/mm/contpte.c | 64 ++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 8310875133ffc5..401087e8a43dc0 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1185,6 +1185,8 @@ extern void ptep_modify_prot_commit(struct vm_area_struct *vma, * where it is possible and makes sense to do so. The PTE_CONT bit is considered * a private implementation detail of the public ptep API (see below). */ +extern void __contpte_try_fold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); extern void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte); @@ -1206,6 +1208,29 @@ extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t entry, int dirty); +static __always_inline void contpte_try_fold(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pte_t pte) +{ + /* + * Only bother trying if both the virtual and physical addresses are + * aligned and correspond to the last entry in a contig range. The core + * code mostly modifies ranges from low to high, so this is the likely + * the last modification in the contig range, so a good time to fold. + * We can't fold special mappings, because there is no associated folio. + */ + + const unsigned long contmask = CONT_PTES - 1; + bool valign = ((addr >> PAGE_SHIFT) & contmask) == contmask; + + if (unlikely(valign)) { + bool palign = (pte_pfn(pte) & contmask) == contmask; + + if (unlikely(palign && + pte_valid(pte) && !pte_cont(pte) && !pte_special(pte))) + __contpte_try_fold(mm, addr, ptep, pte); + } +} + static __always_inline void contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { @@ -1286,6 +1311,7 @@ static __always_inline void set_ptes(struct mm_struct *mm, unsigned long addr, if (likely(nr == 1)) { contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); __set_ptes(mm, addr, ptep, pte, 1); + contpte_try_fold(mm, addr, ptep, pte); } else { contpte_set_ptes(mm, addr, ptep, pte, nr); } diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index 50e0173dc5eee0..16788f07716d58 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -73,6 +73,70 @@ static void contpte_convert(struct mm_struct *mm, unsigned long addr, __set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES); } +void __contpte_try_fold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + /* + * We have already checked that the virtual and pysical addresses are + * correctly aligned for a contpte mapping in contpte_try_fold() so the + * remaining checks are to ensure that the contpte range is fully + * covered by a single folio, and ensure that all the ptes are valid + * with contiguous PFNs and matching prots. We ignore the state of the + * access and dirty bits for the purpose of deciding if its a contiguous + * range; the folding process will generate a single contpte entry which + * has a single access and dirty bit. Those 2 bits are the logical OR of + * their respective bits in the constituent pte entries. 
In order to + * ensure the contpte range is covered by a single folio, we must + * recover the folio from the pfn, but special mappings don't have a + * folio backing them. Fortunately contpte_try_fold() already checked + * that the pte is not special - we never try to fold special mappings. + * Note we can't use vm_normal_page() for this since we don't have the + * vma. + */ + + unsigned long folio_start, folio_end; + unsigned long cont_start, cont_end; + pte_t expected_pte, subpte; + struct folio *folio; + struct page *page; + unsigned long pfn; + pte_t *orig_ptep; + pgprot_t prot; + + int i; + + if (!mm_is_user(mm)) + return; + + page = pte_page(pte); + folio = page_folio(page); + folio_start = addr - (page - &folio->page) * PAGE_SIZE; + folio_end = folio_start + folio_nr_pages(folio) * PAGE_SIZE; + cont_start = ALIGN_DOWN(addr, CONT_PTE_SIZE); + cont_end = cont_start + CONT_PTE_SIZE; + + if (folio_start > cont_start || folio_end < cont_end) + return; + + pfn = ALIGN_DOWN(pte_pfn(pte), CONT_PTES); + prot = pte_pgprot(pte_mkold(pte_mkclean(pte))); + expected_pte = pfn_pte(pfn, prot); + orig_ptep = ptep; + ptep = contpte_align_down(ptep); + + for (i = 0; i < CONT_PTES; i++) { + subpte = pte_mkold(pte_mkclean(__ptep_get(ptep))); + if (!pte_same(subpte, expected_pte)) + return; + expected_pte = pte_advance_pfn(expected_pte, 1); + ptep++; + } + + pte = pte_mkcont(pte); + contpte_convert(mm, addr, orig_ptep, pte); +} +EXPORT_SYMBOL(__contpte_try_fold); + void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { From d6884fa29c4d28d68a9fbfe41a0e9cff2cc28c65 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 8 Feb 2024 13:49:02 -0500 Subject: [PATCH 1233/1406] nvdimm/pmem: fix leak on dax_add_host() failure Fix a leak on dax_add_host() error, where "goto out_cleanup_dax" is done before setting pmem->dax_dev, which therefore issues the two following calls on NULL pointers: out_cleanup_dax: kill_dax(pmem->dax_dev); put_dax(pmem->dax_dev); Link: https://lkml.kernel.org/r/20240208184913.484340-1-mathieu.desnoyers@efficios.com Link: https://lkml.kernel.org/r/20240208184913.484340-2-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Reviewed-by: Dan Williams Reviewed-by: Fan Ni Reviewed-by: Dave Jiang Cc: Alasdair Kergon Cc: Mike Snitzer Cc: Mikulas Patocka Cc: Dan Williams Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Dave Chinner Cc: Linus Torvalds Signed-off-by: Andrew Morton --- drivers/nvdimm/pmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 4e8fdcb3f1c827..9fe3580907207b 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -566,12 +566,11 @@ static int pmem_attach_disk(struct device *dev, set_dax_nomc(dax_dev); if (is_nvdimm_sync(nd_region)) set_dax_synchronous(dax_dev); + pmem->dax_dev = dax_dev; rc = dax_add_host(dax_dev, disk); if (rc) goto out_cleanup_dax; dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); - pmem->dax_dev = dax_dev; - rc = device_add_disk(dev, disk, pmem_attribute_groups); if (rc) goto out_remove_host; From 2e1e2beed9b251c9a8cd2948a9bb1c7c87c4b49a Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Feb 2024 09:46:25 -0500 Subject: [PATCH 1234/1406] dax: add empty static inline for CONFIG_DAX=n Patch series "Introduce cpu_dcache_is_aliasing() to fix DAX regression", v6. 
This commit introduced in v4.0 prevents building FS_DAX on 32-bit ARM, even on ARMv7 which does not have virtually aliased data caches: commit d92576f1167c ("dax: does not work correctly with virtual aliasing caches") Even though it used to work fine before. The root of the issue here is the fact that DAX was never designed to handle virtually aliasing data caches (VIVT and VIPT with aliasing data cache). It touches the pages through their linear mapping, which is not consistent with the userspace mappings with virtually aliasing data caches. This patch series introduces cpu_dcache_is_aliasing() with the new Kconfig option ARCH_HAS_CPU_CACHE_ALIASING and implements it for all architectures. The implementation of cpu_dcache_is_aliasing() is either evaluated to a constant at compile-time or a runtime check, which is what is needed on ARM. With this we can basically narrow down the list of architectures which are unsupported by DAX to those which are really affected. This patch (of 9): When building a kernel with CONFIG_DAX=n, all uses of set_dax_nocache() and set_dax_nomc() need to be either within regions of code or compile units which are explicitly not compiled, or they need to rely on compiler optimizations to eliminate calls to those undefined symbols. It appears that at least the openrisc and loongarch architectures don't end up eliminating those undefined symbols even if they are provably within code which is eliminated due to conditional branches depending on constants. Implement empty static inline functions for set_dax_nocache() and set_dax_nomc() in CONFIG_DAX=n to ensure those undefined references are removed. Link: https://lkml.kernel.org/r/20240215144633.96437-1-mathieu.desnoyers@efficios.com Link: https://lkml.kernel.org/r/20240215144633.96437-2-mathieu.desnoyers@efficios.com Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202402140037.wGfA1kqX-lkp@intel.com/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202402131351.a0FZOgEG-lkp@intel.com/ Fixes: 7ac5360cd4d0 ("dax: remove the copy_from_iter and copy_to_iter methods") Signed-off-by: Mathieu Desnoyers Cc: Christoph Hellwig Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Dave Chinner Cc: Michael Sclafani Cc: Alasdair Kergon Cc: Heiko Carstens Cc: Mike Snitzer Cc: Mikulas Patocka Signed-off-by: Andrew Morton --- include/linux/dax.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/include/linux/dax.h b/include/linux/dax.h index b463502b16e17f..e3ffe7c7f01dac 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -63,6 +63,8 @@ void kill_dax(struct dax_device *dax_dev); void dax_write_cache(struct dax_device *dax_dev, bool wc); bool dax_write_cache_enabled(struct dax_device *dax_dev); bool dax_synchronous(struct dax_device *dax_dev); +void set_dax_nocache(struct dax_device *dax_dev); +void set_dax_nomc(struct dax_device *dax_dev); void set_dax_synchronous(struct dax_device *dax_dev); size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); @@ -109,6 +111,12 @@ static inline bool dax_synchronous(struct dax_device *dax_dev) { return true; } +static inline void set_dax_nocache(struct dax_device *dax_dev) +{ +} +static inline void set_dax_nomc(struct dax_device *dax_dev) +{ +} static inline void set_dax_synchronous(struct dax_device *dax_dev) { } @@ -124,9 +132,6 @@ static inline size_t dax_recovery_write(struct dax_device 
*dax_dev, } #endif -void set_dax_nocache(struct dax_device *dax_dev); -void set_dax_nomc(struct dax_device *dax_dev); - struct writeback_control; #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX) int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); From 91876c1b80a555c57dc3918d7c24a3fbb23c57c8 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Feb 2024 09:46:26 -0500 Subject: [PATCH 1235/1406] dax: alloc_dax() return ERR_PTR(-EOPNOTSUPP) for CONFIG_DAX=n Change the return value from NULL to ERR_PTR(-EOPNOTSUPP) for CONFIG_DAX=n to be consistent with the fact that CONFIG_DAX=y never returns NULL. This is done in preparation for using cpu_dcache_is_aliasing() in a following change which will properly support architectures which detect data cache aliasing at runtime. Link: https://lkml.kernel.org/r/20240215144633.96437-3-mathieu.desnoyers@efficios.com Fixes: 4e4ced93794a ("dax: Move mandatory ->zero_page_range() check in alloc_dax()") Signed-off-by: Mathieu Desnoyers Reviewed-by: Dan Williams Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Alasdair Kergon Cc: Christoph Hellwig Cc: Dave Chinner Cc: Heiko Carstens Cc: kernel test robot Cc: Michael Sclafani Cc: Mike Snitzer Cc: Mikulas Patocka Signed-off-by: Andrew Morton --- drivers/dax/super.c | 5 +++++ include/linux/dax.h | 6 +----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index f4b635526345ad..97ef2a9d878d12 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -319,6 +319,11 @@ EXPORT_SYMBOL_GPL(dax_alive); * that any fault handlers or operations that might have seen * dax_alive(), have completed. Any operations that start after * synchronize_srcu() has run will abort upon seeing !dax_alive(). + * + * Note, because alloc_dax() returns an ERR_PTR() on error, callers + * typically store its result into a local variable in order to check + * the result. Therefore, care must be taken to populate the struct + * device dax_dev field to make sure the dax_dev is not leaked. */ void kill_dax(struct dax_device *dax_dev) { diff --git a/include/linux/dax.h b/include/linux/dax.h index e3ffe7c7f01dac..9d3e3327af4c05 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -88,11 +88,7 @@ static inline void *dax_holder(struct dax_device *dax_dev) static inline struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) { - /* - * Callers should check IS_ENABLED(CONFIG_DAX) to know if this - * NULL is an error or expected. - */ - return NULL; + return ERR_PTR(-EOPNOTSUPP); } static inline void put_dax(struct dax_device *dax_dev) { From 40cd83047f0626f15887b44660cde5bfc91cc477 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Feb 2024 09:46:27 -0500 Subject: [PATCH 1236/1406] nvdimm/pmem: Treat alloc_dax() -EOPNOTSUPP failure as non-fatal In preparation for checking whether the architecture has data cache aliasing within alloc_dax(), modify the error handling of nvdimm/pmem pmem_attach_disk() to treat alloc_dax() -EOPNOTSUPP failure as non-fatal. [ Based on commit "nvdimm/pmem: Fix leak on dax_add_host() failure".
] Link: https://lkml.kernel.org/r/20240215144633.96437-4-mathieu.desnoyers@efficios.com Fixes: d92576f1167c ("dax: does not work correctly with virtual aliasing caches") Signed-off-by: Mathieu Desnoyers Reviewed-by: Dan Williams Cc: Alasdair Kergon Cc: Mike Snitzer Cc: Mikulas Patocka Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Christoph Hellwig Cc: Dave Chinner Cc: Heiko Carstens Cc: kernel test robot Cc: Michael Sclafani Signed-off-by: Andrew Morton --- drivers/nvdimm/pmem.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 9fe3580907207b..e9898457a7bd85 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -560,17 +560,19 @@ static int pmem_attach_disk(struct device *dev, dax_dev = alloc_dax(pmem, &pmem_dax_ops); if (IS_ERR(dax_dev)) { rc = PTR_ERR(dax_dev); - goto out; + if (rc != -EOPNOTSUPP) + goto out; + } else { + set_dax_nocache(dax_dev); + set_dax_nomc(dax_dev); + if (is_nvdimm_sync(nd_region)) + set_dax_synchronous(dax_dev); + pmem->dax_dev = dax_dev; + rc = dax_add_host(dax_dev, disk); + if (rc) + goto out_cleanup_dax; + dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); } - set_dax_nocache(dax_dev); - set_dax_nomc(dax_dev); - if (is_nvdimm_sync(nd_region)) - set_dax_synchronous(dax_dev); - pmem->dax_dev = dax_dev; - rc = dax_add_host(dax_dev, disk); - if (rc) - goto out_cleanup_dax; - dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); rc = device_add_disk(dev, disk, pmem_attribute_groups); if (rc) goto out_remove_host; From 95eceb55dddeddb31e45505f5231be9a433d3c48 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Feb 2024 09:46:28 -0500 Subject: [PATCH 1237/1406] dm: Treat alloc_dax() -EOPNOTSUPP failure as non-fatal In preparation for checking whether the architecture has data cache aliasing within alloc_dax(), modify the error handling of dm alloc_dev() to treat alloc_dax() -EOPNOTSUPP failure as non-fatal. 
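The shape of the handling is the same as in the pmem change; a condensed, illustrative sketch (names follow the dm diff below): -EOPNOTSUPP now means "continue without DAX", while any other error stays fatal:

	dax_dev = alloc_dax(md, &dm_dax_ops);
	if (IS_ERR(dax_dev)) {
		if (PTR_ERR(dax_dev) != -EOPNOTSUPP)
			goto bad;	/* a real failure: abort */
		/* -EOPNOTSUPP: no DAX on this config; carry on without it */
	} else {
		set_dax_nocache(dax_dev);
		set_dax_nomc(dax_dev);
		md->dax_dev = dax_dev;	/* publish before dax_add_host() */
		if (dax_add_host(dax_dev, md->disk))
			goto bad;
	}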
Link: https://lkml.kernel.org/r/20240215144633.96437-5-mathieu.desnoyers@efficios.com Fixes: d92576f1167c ("dax: does not work correctly with virtual aliasing caches") Suggested-by: Dan Williams Signed-off-by: Mathieu Desnoyers Reviewed-by: Dan Williams Cc: Alasdair Kergon Cc: Mike Snitzer Cc: Mikulas Patocka Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Christoph Hellwig Cc: Dave Chinner Cc: Heiko Carstens Cc: kernel test robot Cc: Michael Sclafani Signed-off-by: Andrew Morton --- drivers/md/dm.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8dcabf84d866e6..10c73af93d00cc 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2054,6 +2054,7 @@ static void cleanup_mapped_device(struct mapped_device *md) static struct mapped_device *alloc_dev(int minor) { int r, numa_node_id = dm_get_numa_node(); + struct dax_device *dax_dev; struct mapped_device *md; void *old_md; @@ -2122,15 +2123,15 @@ static struct mapped_device *alloc_dev(int minor) md->disk->private_data = md; sprintf(md->disk->disk_name, "dm-%d", minor); - if (IS_ENABLED(CONFIG_FS_DAX)) { - md->dax_dev = alloc_dax(md, &dm_dax_ops); - if (IS_ERR(md->dax_dev)) { - md->dax_dev = NULL; + dax_dev = alloc_dax(md, &dm_dax_ops); + if (IS_ERR(dax_dev)) { + if (PTR_ERR(dax_dev) != -EOPNOTSUPP) goto bad; - } - set_dax_nocache(md->dax_dev); - set_dax_nomc(md->dax_dev); - if (dax_add_host(md->dax_dev, md->disk)) + } else { + set_dax_nocache(dax_dev); + set_dax_nomc(dax_dev); + md->dax_dev = dax_dev; + if (dax_add_host(dax_dev, md->disk)) goto bad; } From 387689b83c79eadc3c32faf3e4a0816f36e12ca4 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Feb 2024 09:46:29 -0500 Subject: [PATCH 1238/1406] dcssblk: Handle alloc_dax() -EOPNOTSUPP failure In preparation for checking whether the architecture has data cache aliasing within alloc_dax(), modify the error handling of dcssblk dcssblk_add_store() to handle alloc_dax() -EOPNOTSUPP failures. Considering that s390 is not a data cache aliasing architecture, and considering that DCSSBLK selects DAX, a return value of -EOPNOTSUPP from alloc_dax() should make dcssblk_add_store() fail. 
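Condensed, the handling therefore stays fatal here, in contrast to the pmem and dm conversions (illustrative sketch matching the diff below):

	dax_dev = alloc_dax(dev_info, &dcssblk_dax_ops);
	if (IS_ERR(dax_dev)) {
		rc = PTR_ERR(dax_dev);	/* including -EOPNOTSUPP */
		goto put_dev;
	}
	set_dax_synchronous(dax_dev);
	dev_info->dax_dev = dax_dev;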
Link: https://lkml.kernel.org/r/20240215144633.96437-6-mathieu.desnoyers@efficios.com Fixes: d92576f1167c ("dax: does not work correctly with virtual aliasing caches") Signed-off-by: Mathieu Desnoyers Reviewed-by: Dan Williams Acked-by: Heiko Carstens Cc: Alasdair Kergon Cc: Mike Snitzer Cc: Mikulas Patocka Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Christoph Hellwig Cc: Dave Chinner Cc: kernel test robot Cc: Michael Sclafani Signed-off-by: Andrew Morton --- drivers/s390/block/dcssblk.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 4b7ecd4fd4319c..f363c1d51d9a84 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -549,6 +549,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char int rc, i, j, num_of_segments; struct dcssblk_dev_info *dev_info; struct segment_info *seg_info, *temp; + struct dax_device *dax_dev; char *local_buf; unsigned long seg_byte_size; @@ -677,13 +678,13 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char if (rc) goto put_dev; - dev_info->dax_dev = alloc_dax(dev_info, &dcssblk_dax_ops); - if (IS_ERR(dev_info->dax_dev)) { - rc = PTR_ERR(dev_info->dax_dev); - dev_info->dax_dev = NULL; + dax_dev = alloc_dax(dev_info, &dcssblk_dax_ops); + if (IS_ERR(dax_dev)) { + rc = PTR_ERR(dax_dev); goto put_dev; } - set_dax_synchronous(dev_info->dax_dev); + set_dax_synchronous(dax_dev); + dev_info->dax_dev = dax_dev; rc = dax_add_host(dev_info->dax_dev, dev_info->gd); if (rc) goto out_dax; From 6933d66cc84726ae1dc4c793f465e7e7f1187cb1 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Feb 2024 09:46:30 -0500 Subject: [PATCH 1239/1406] virtio: Treat alloc_dax() -EOPNOTSUPP failure as non-fatal In preparation for checking whether the architecture has data cache aliasing within alloc_dax(), modify the error handling of virtio virtio_fs_setup_dax() to treat alloc_dax() -EOPNOTSUPP failure as non-fatal. 
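The conversion leans on the scope-based cleanup helpers from linux/cleanup.h; roughly (a hedged sketch of the idiom, not the full function):

	struct dax_device *dax_dev __free(cleanup_dax) = NULL;

	dax_dev = alloc_dax(fs, &virtio_fs_dax_ops);
	if (IS_ERR(dax_dev)) {
		int rc = PTR_ERR(dax_dev);
		return rc == -EOPNOTSUPP ? 0 : rc;	/* guard sees an ERR_PTR: no-op */
	}

	/* Any early error return from here on runs the cleanup_dax guard,
	 * i.e. virtio_fs_cleanup_dax(dax_dev), automatically. */

	fs->dax_dev = no_free_ptr(dax_dev);	/* success: disarm the guard */

Allocating up front and disarming with no_free_ptr() on the success path keeps every intermediate failure path leak-free without goto labels.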
Link: https://lkml.kernel.org/r/20240215144633.96437-7-mathieu.desnoyers@efficios.com Co-developed-by: Dan Williams Signed-off-by: Dan Williams Fixes: d92576f1167c ("dax: does not work correctly with virtual aliasing caches") Signed-off-by: Mathieu Desnoyers Cc: Alasdair Kergon Cc: Mike Snitzer Cc: Mikulas Patocka Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Christoph Hellwig Cc: Dave Chinner Cc: Heiko Carstens Cc: kernel test robot Cc: Michael Sclafani Signed-off-by: Andrew Morton --- fs/fuse/virtio_fs.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 5f1be1da92ce94..a28466c2da7143 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "fuse_i.h" @@ -795,8 +796,11 @@ static void virtio_fs_cleanup_dax(void *data) put_dax(dax_dev); } +DEFINE_FREE(cleanup_dax, struct dax_device *, if (!IS_ERR_OR_NULL(_T)) virtio_fs_cleanup_dax(_T)) + static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) { + struct dax_device *dax_dev __free(cleanup_dax) = NULL; struct virtio_shm_region cache_reg; struct dev_pagemap *pgmap; bool have_cache; @@ -804,6 +808,12 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) if (!IS_ENABLED(CONFIG_FUSE_DAX)) return 0; + dax_dev = alloc_dax(fs, &virtio_fs_dax_ops); + if (IS_ERR(dax_dev)) { + int rc = PTR_ERR(dax_dev); + return rc == -EOPNOTSUPP ? 0 : rc; + } + /* Get cache region */ have_cache = virtio_get_shm_region(vdev, &cache_reg, (u8)VIRTIO_FS_SHMCAP_ID_CACHE); @@ -849,10 +859,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n", __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len); - fs->dax_dev = alloc_dax(fs, &virtio_fs_dax_ops); - if (IS_ERR(fs->dax_dev)) - return PTR_ERR(fs->dax_dev); - + fs->dax_dev = no_free_ptr(dax_dev); return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, fs->dax_dev); } From 087f0a1fe57628dc93f270cab4a6d8b8c5c0f206 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Feb 2024 09:46:31 -0500 Subject: [PATCH 1240/1406] dax: Check for data cache aliasing at runtime Replace the following fs/Kconfig:FS_DAX dependency: depends on !(ARM || MIPS || SPARC) by a runtime check within alloc_dax(). This runtime check returns ERR_PTR(-EOPNOTSUPP) if the @ops parameter is non-NULL (which means the kernel is using an aliased mapping) on an architecture which has data cache aliasing. Change the return value from NULL to ERR_PTR(-EOPNOTSUPP) for CONFIG_DAX=n for consistency. This is done in preparation for using cpu_dcache_is_aliasing() in a following change which will properly support architectures which detect data cache aliasing at runtime.
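(The IS_ENABLED() architecture list below is an interim step; a later patch in this series swaps it for the runtime query, so the check ultimately reads roughly:)

	if (ops && cpu_dcache_is_aliasing())
		return ERR_PTR(-EOPNOTSUPP);

Device-dax passes ops == NULL and is deliberately left allowed, since it does not access the memory through aliased kernel mappings.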
Link: https://lkml.kernel.org/r/20240215144633.96437-8-mathieu.desnoyers@efficios.com Fixes: d92576f1167c ("dax: does not work correctly with virtual aliasing caches") Signed-off-by: Mathieu Desnoyers Reviewed-by: Dan Williams Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Alasdair Kergon Cc: Christoph Hellwig Cc: Dave Chinner Cc: Heiko Carstens Cc: kernel test robot Cc: Michael Sclafani Cc: Mike Snitzer Cc: Mikulas Patocka Signed-off-by: Andrew Morton --- drivers/dax/super.c | 10 ++++++++++ fs/Kconfig | 1 - 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 97ef2a9d878d12..7643b1a078d9f1 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -451,6 +451,16 @@ struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) dev_t devt; int minor; + /* + * Unavailable on architectures with virtually aliased data caches, + * except for device-dax (NULL operations pointer), which does + * not use aliased mappings from the kernel. + */ + if (ops && (IS_ENABLED(CONFIG_ARM) || + IS_ENABLED(CONFIG_MIPS) || + IS_ENABLED(CONFIG_SPARC))) + return ERR_PTR(-EOPNOTSUPP); + if (WARN_ON_ONCE(ops && !ops->zero_page_range)) return ERR_PTR(-EINVAL); diff --git a/fs/Kconfig b/fs/Kconfig index a57d6e6c41e6f1..4a51331f172e57 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -60,7 +60,6 @@ endif # BLOCK config FS_DAX bool "File system based Direct Access (DAX) support" depends on MMU - depends on !(ARM || MIPS || SPARC) depends on ZONE_DEVICE || FS_DAX_LIMITED select FS_IOMAP select DAX From b719a4bdb3b19bf48ba948c6f940837df230b86c Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Feb 2024 09:46:32 -0500 Subject: [PATCH 1241/1406] Introduce cpu_dcache_is_aliasing() across all architectures Introduce a generic way to query whether the data cache is virtually aliased on all architectures. Its purpose is to ensure that subsystems which are incompatible with virtually aliased data caches (e.g. FS_DAX) can reliably query this. For data cache aliasing, there are three scenarios depending on the architecture. Here is a breakdown based on my understanding: A) The data cache is always aliasing: * arc * csky * m68k (note: shared memory mappings are incoherent ? SHMLBA is missing there.) * sh * parisc B) The data cache aliasing is statically known or depends on querying CPU state at runtime: * arm (cache_is_vivt() || cache_is_vipt_aliasing()) * mips (cpu_has_dc_aliases) * nios2 (NIOS2_DCACHE_SIZE > PAGE_SIZE) * sparc32 (vac_cache_size > PAGE_SIZE) * sparc64 (L1DCACHE_SIZE > PAGE_SIZE) * xtensa (DCACHE_WAY_SIZE > PAGE_SIZE) C) The data cache is never aliasing: * alpha * arm64 (aarch64) * hexagon * loongarch (but with incoherent write buffers, which are disabled since commit d23b7795 ("LoongArch: Change SHMLBA from SZ_64K to PAGE_SIZE")) * microblaze * openrisc * powerpc * riscv * s390 * um * x86 Require architectures in A) and B) to select ARCH_HAS_CPU_CACHE_ALIASING and implement "cpu_dcache_is_aliasing()". Architectures in C) don't select ARCH_HAS_CPU_CACHE_ALIASING, and thus cpu_dcache_is_aliasing() simply evaluates to "false". Note that this leaves "cpu_icache_is_aliasing()" to be implemented as future work. This would be useful to gate features like XIP on architectures which have aliasing CPU dcache-icache but not CPU dcache-dcache.
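Concretely, the generic header falls back to a constant false unless the architecture opts in, in which case its asm/cachetype.h supplies the real test. A sketch assembled from the hunks below (the generic side presumably pulls in <asm/cachetype.h>, given the headers this patch creates):

	/* include/linux/cacheinfo.h */
	#ifndef CONFIG_ARCH_HAS_CPU_CACHE_ALIASING
	#define cpu_dcache_is_aliasing()	false
	#else
	#include <asm/cachetype.h>
	#endif

	/* e.g. arch/sparc/include/asm/cachetype.h */
	#ifdef CONFIG_SPARC32
	extern int vac_cache_size;
	#define cpu_dcache_is_aliasing()	(vac_cache_size > PAGE_SIZE)
	#else
	#define cpu_dcache_is_aliasing()	(L1DCACHE_SIZE > PAGE_SIZE)
	#endif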
Use "cpu_dcache" and "cpu_cache" rather than just "dcache" and "cache" to clarify that we really mean "CPU data cache" and "CPU cache" to eliminate any possible confusion with VFS "dentry cache" and "page cache". Link: https://lore.kernel.org/lkml/20030910210416.GA24258@mail.jlokier.co.uk/ Link: https://lkml.kernel.org/r/20240215144633.96437-9-mathieu.desnoyers@efficios.com Fixes: d92576f1167c ("dax: does not work correctly with virtual aliasing caches") Signed-off-by: Mathieu Desnoyers Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Alasdair Kergon Cc: Christoph Hellwig Cc: Dave Chinner Cc: Heiko Carstens Cc: kernel test robot Cc: Michael Sclafani Cc: Mike Snitzer Cc: Mikulas Patocka Signed-off-by: Andrew Morton --- arch/arc/Kconfig | 1 + arch/arc/include/asm/cachetype.h | 9 +++++++++ arch/arm/Kconfig | 1 + arch/arm/include/asm/cachetype.h | 2 ++ arch/csky/Kconfig | 1 + arch/csky/include/asm/cachetype.h | 9 +++++++++ arch/m68k/Kconfig | 1 + arch/m68k/include/asm/cachetype.h | 9 +++++++++ arch/mips/Kconfig | 1 + arch/mips/include/asm/cachetype.h | 9 +++++++++ arch/nios2/Kconfig | 1 + arch/nios2/include/asm/cachetype.h | 10 ++++++++++ arch/parisc/Kconfig | 1 + arch/parisc/include/asm/cachetype.h | 9 +++++++++ arch/sh/Kconfig | 1 + arch/sh/include/asm/cachetype.h | 9 +++++++++ arch/sparc/Kconfig | 1 + arch/sparc/include/asm/cachetype.h | 14 ++++++++++++++ arch/xtensa/Kconfig | 1 + arch/xtensa/include/asm/cachetype.h | 10 ++++++++++ include/linux/cacheinfo.h | 6 ++++++ mm/Kconfig | 6 ++++++ 22 files changed, 112 insertions(+) create mode 100644 arch/arc/include/asm/cachetype.h create mode 100644 arch/csky/include/asm/cachetype.h create mode 100644 arch/m68k/include/asm/cachetype.h create mode 100644 arch/mips/include/asm/cachetype.h create mode 100644 arch/nios2/include/asm/cachetype.h create mode 100644 arch/parisc/include/asm/cachetype.h create mode 100644 arch/sh/include/asm/cachetype.h create mode 100644 arch/sparc/include/asm/cachetype.h create mode 100644 arch/xtensa/include/asm/cachetype.h diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 1b0483c51cc169..7d294a3242a47a 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -6,6 +6,7 @@ config ARC def_bool y select ARC_TIMERS + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DMA_PREP_COHERENT diff --git a/arch/arc/include/asm/cachetype.h b/arch/arc/include/asm/cachetype.h new file mode 100644 index 00000000000000..05fc7ed5971258 --- /dev/null +++ b/arch/arc/include/asm/cachetype.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_ARC_CACHETYPE_H +#define __ASM_ARC_CACHETYPE_H + +#include + +#define cpu_dcache_is_aliasing() true + +#endif diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 0af6709570d147..66a8e64b226e29 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -5,6 +5,7 @@ config ARM select ARCH_32BIT_OFF_T select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE if HAVE_KRETPROBES && FRAME_POINTER && !ARM_UNWIND select ARCH_HAS_BINFMT_FLAT + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_CPU_FINALIZE_INIT if MMU select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL if MMU diff --git a/arch/arm/include/asm/cachetype.h b/arch/arm/include/asm/cachetype.h index e8c30430be33f5..b9dbe1d4c8fe08 100644 --- a/arch/arm/include/asm/cachetype.h +++ b/arch/arm/include/asm/cachetype.h @@ -20,6 +20,8 @@ extern unsigned int cacheid; #define icache_is_vipt_aliasing() 
cacheid_is(CACHEID_VIPT_I_ALIASING) #define icache_is_pipt() cacheid_is(CACHEID_PIPT) +#define cpu_dcache_is_aliasing() (cache_is_vivt() || cache_is_vipt_aliasing()) + /* * __LINUX_ARM_ARCH__ is the minimum supported CPU architecture * Mask out support which will never be present on newer CPUs. diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index cf2a6fd7dff871..8a91eccf76dcee 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -2,6 +2,7 @@ config CSKY def_bool y select ARCH_32BIT_OFF_T + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SYNC_DMA_FOR_CPU diff --git a/arch/csky/include/asm/cachetype.h b/arch/csky/include/asm/cachetype.h new file mode 100644 index 00000000000000..98cbe3af662ffa --- /dev/null +++ b/arch/csky/include/asm/cachetype.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_CSKY_CACHETYPE_H +#define __ASM_CSKY_CACHETYPE_H + +#include + +#define cpu_dcache_is_aliasing() true + +#endif diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 4b3e93cac72320..a9c3e3de0c6d06 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -3,6 +3,7 @@ config M68K bool default y select ARCH_32BIT_OFF_T + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_BINFMT_FLAT select ARCH_HAS_CPU_FINALIZE_INIT if MMU select ARCH_HAS_CURRENT_STACK_POINTER diff --git a/arch/m68k/include/asm/cachetype.h b/arch/m68k/include/asm/cachetype.h new file mode 100644 index 00000000000000..7fad5d9ab8fe46 --- /dev/null +++ b/arch/m68k/include/asm/cachetype.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_M68K_CACHETYPE_H +#define __ASM_M68K_CACHETYPE_H + +#include + +#define cpu_dcache_is_aliasing() true + +#endif diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 797ae590ebdba5..ab1c8bd966664e 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -4,6 +4,7 @@ config MIPS default y select ARCH_32BIT_OFF_T if !64BIT select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_CPU_FINALIZE_INIT select ARCH_HAS_CURRENT_STACK_POINTER if !CC_IS_CLANG || CLANG_VERSION >= 140000 select ARCH_HAS_DEBUG_VIRTUAL if !64BIT diff --git a/arch/mips/include/asm/cachetype.h b/arch/mips/include/asm/cachetype.h new file mode 100644 index 00000000000000..9f4ba2fe115517 --- /dev/null +++ b/arch/mips/include/asm/cachetype.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_MIPS_CACHETYPE_H +#define __ASM_MIPS_CACHETYPE_H + +#include + +#define cpu_dcache_is_aliasing() cpu_has_dc_aliases + +#endif diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 58d9565dc2c770..6b3a14633d2f27 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -2,6 +2,7 @@ config NIOS2 def_bool y select ARCH_32BIT_OFF_T + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE diff --git a/arch/nios2/include/asm/cachetype.h b/arch/nios2/include/asm/cachetype.h new file mode 100644 index 00000000000000..eb9c416b8a1c18 --- /dev/null +++ b/arch/nios2/include/asm/cachetype.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_NIOS2_CACHETYPE_H +#define __ASM_NIOS2_CACHETYPE_H + +#include +#include + +#define cpu_dcache_is_aliasing() (NIOS2_DCACHE_SIZE > PAGE_SIZE) + +#endif diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 5c845e8d59d92f..da6e97ba46a619 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -8,6 +8,7 @@ config PARISC 
select HAVE_FUNCTION_GRAPH_TRACER select HAVE_SYSCALL_TRACEPOINTS select ARCH_WANT_FRAME_POINTERS + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_DMA_ALLOC if PA11 select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/parisc/include/asm/cachetype.h b/arch/parisc/include/asm/cachetype.h new file mode 100644 index 00000000000000..e0868a1d3c474d --- /dev/null +++ b/arch/parisc/include/asm/cachetype.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_PARISC_CACHETYPE_H +#define __ASM_PARISC_CACHETYPE_H + +#include + +#define cpu_dcache_is_aliasing() true + +#endif diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 7500521b2b984a..2ad3e29f0ebec4 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -2,6 +2,7 @@ config SUPERH def_bool y select ARCH_32BIT_OFF_T + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM && MMU select ARCH_ENABLE_MEMORY_HOTREMOVE if SPARSEMEM && MMU select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) diff --git a/arch/sh/include/asm/cachetype.h b/arch/sh/include/asm/cachetype.h new file mode 100644 index 00000000000000..a5fffe5360684d --- /dev/null +++ b/arch/sh/include/asm/cachetype.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_SH_CACHETYPE_H +#define __ASM_SH_CACHETYPE_H + +#include + +#define cpu_dcache_is_aliasing() true + +#endif diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 204c43cb3d4356..cbec48219d9e15 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -13,6 +13,7 @@ config 64BIT config SPARC bool default y + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_MIGHT_HAVE_PC_PARPORT if SPARC64 && PCI select ARCH_MIGHT_HAVE_PC_SERIO select DMA_OPS diff --git a/arch/sparc/include/asm/cachetype.h b/arch/sparc/include/asm/cachetype.h new file mode 100644 index 00000000000000..caf1c004589247 --- /dev/null +++ b/arch/sparc/include/asm/cachetype.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_SPARC_CACHETYPE_H +#define __ASM_SPARC_CACHETYPE_H + +#include + +#ifdef CONFIG_SPARC32 +extern int vac_cache_size; +#define cpu_dcache_is_aliasing() (vac_cache_size > PAGE_SIZE) +#else +#define cpu_dcache_is_aliasing() (L1DCACHE_SIZE > PAGE_SIZE) +#endif + +#endif diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 6f248d87e496aa..6689a85473463c 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -2,6 +2,7 @@ config XTENSA def_bool y select ARCH_32BIT_OFF_T + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_BINFMT_FLAT if !MMU select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VM_PGTABLE diff --git a/arch/xtensa/include/asm/cachetype.h b/arch/xtensa/include/asm/cachetype.h new file mode 100644 index 00000000000000..51bd49e2a1c54f --- /dev/null +++ b/arch/xtensa/include/asm/cachetype.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_XTENSA_CACHETYPE_H +#define __ASM_XTENSA_CACHETYPE_H + +#include +#include + +#define cpu_dcache_is_aliasing() (DCACHE_WAY_SIZE > PAGE_SIZE) + +#endif diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index d504eb4b49abec..2cb15fe4fe1291 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -138,4 +138,10 @@ static inline int get_cpu_cacheinfo_id(int cpu, int level) #define use_arch_cache_info() (false) #endif +#ifndef CONFIG_ARCH_HAS_CPU_CACHE_ALIASING +#define cpu_dcache_is_aliasing() false +#else +#include +#endif + #endif /* _LINUX_CACHEINFO_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 
2b267553f793de..b924f4a5a3ef8a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -973,6 +973,12 @@ config IDLE_PAGE_TRACKING See Documentation/admin-guide/mm/idle_page_tracking.rst for more details. +# Architectures which implement cpu_dcache_is_aliasing() to query +# whether the data caches are aliased (VIVT or VIPT with dcache +# aliasing) need to select this. +config ARCH_HAS_CPU_CACHE_ALIASING + bool + config ARCH_HAS_CACHE_LINE_SIZE bool From 940a7fcc4ea8cc850fece81ad2174bc9e8844160 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Feb 2024 09:46:33 -0500 Subject: [PATCH 1242/1406] dax: Fix incorrect list of data cache aliasing architectures commit d92576f1167c ("dax: does not work correctly with virtual aliasing caches") prevents DAX from building on architectures with virtually aliased dcache with: depends on !(ARM || MIPS || SPARC) This check is too broad (e.g. recent ARMv7 don't have virtually aliased dcaches), and also misses many other architectures with virtually aliased data cache. This is a regression introduced in the v4.0 Linux kernel where the dax mount option is removed for 32-bit ARMv7 boards which have no data cache aliasing, and therefore should work fine with FS_DAX. This was turned into the following check in alloc_dax() by a preparatory change: if (ops && (IS_ENABLED(CONFIG_ARM) || IS_ENABLED(CONFIG_MIPS) || IS_ENABLED(CONFIG_SPARC))) return NULL; Use cpu_dcache_is_aliasing() instead to figure out whether the environment has aliasing data caches. Link: https://lkml.kernel.org/r/20240215144633.96437-10-mathieu.desnoyers@efficios.com Fixes: d92576f1167c ("dax: does not work correctly with virtual aliasing caches") Signed-off-by: Mathieu Desnoyers Reviewed-by: Dan Williams Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Alasdair Kergon Cc: Christoph Hellwig Cc: Dave Chinner Cc: Heiko Carstens Cc: kernel test robot Cc: Michael Sclafani Cc: Mike Snitzer Cc: Mikulas Patocka Signed-off-by: Andrew Morton --- drivers/dax/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 7643b1a078d9f1..54e52877987743 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -13,6 +13,7 @@ #include #include #include +#include <linux/cacheinfo.h> #include "dax-private.h" /** @@ -456,9 +457,7 @@ struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) * except for device-dax (NULL operations pointer), which does * not use aliased mappings from the kernel. */ - if (ops && (IS_ENABLED(CONFIG_ARM) || - IS_ENABLED(CONFIG_MIPS) || - IS_ENABLED(CONFIG_SPARC))) + if (ops && cpu_dcache_is_aliasing()) return ERR_PTR(-EOPNOTSUPP); if (WARN_ON_ONCE(ops && !ops->zero_page_range)) From c8b5458a1b0f08a68e7dcd390ec556691c52b319 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Feb 2024 20:53:05 +0000 Subject: [PATCH 1243/1406] rmap: replace two calls to compound_order with folio_order Removes two unnecessary conversions from folio to page. Should be no difference in behaviour.
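For reference, the equivalence this change relies on can be seen from simplified sketches of the two helpers, abbreviated from the mm headers of this era (these are illustrative sketches, not the verbatim kernel definitions):

	/* Simplified sketches, not the verbatim kernel definitions. */
	static inline unsigned int folio_order(struct folio *folio)
	{
		if (!folio_test_large(folio))
			return 0;
		return folio->_flags_1 & 0xff;	/* order kept in the first tail page */
	}

	static inline unsigned int compound_order(struct page *page)
	{
		struct folio *folio = (struct folio *)page;	/* cast back to the folio */

		if (!test_bit(PG_head, &folio->flags))
			return 0;
		return folio->_flags_1 & 0xff;
	}

Both helpers read the same field, so compound_order(&folio->page) merely round-trips folio -> page -> folio; calling folio_order(folio) on an already-known folio skips the detour.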
Link: https://lkml.kernel.org/r/20240215205307.674707-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/rmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 1cf2bffa48ed87..3746a553101832 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2169,7 +2169,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); trace_set_migration_pte(pvmw.address, pte_val(swp_pte), - compound_order(&folio->page)); + folio_order(folio)); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. @@ -2261,7 +2261,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, else set_pte_at(mm, address, pvmw.pte, swp_pte); trace_set_migration_pte(address, pte_val(swp_pte), - compound_order(&folio->page)); + folio_order(folio)); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. From 137bea2ac3f955b6ec9ee99afa2c318e65e73ad5 Mon Sep 17 00:00:00 2001 From: Juntong Deng Date: Thu, 15 Feb 2024 18:39:55 +0000 Subject: [PATCH 1244/1406] kasan: increase the number of bits to shift when recording extra timestamps In 5d4c6ac94694 ("kasan: record and report more information") I thought that printk only displays a maximum of 99999 seconds, but actually printk can display a larger number of seconds. So increase the number of bits to shift when recording the extra timestamp (44 bits), without affecting the precision, shift it right by 9 bits, discarding all bits that do not affect the microsecond part (nanoseconds will not be shown). Currently the maximum time that can be displayed is 9007199.254740s, because 11111111111111111111111111111111111111111111 (44 bits) << 9 = 11111111111111111111111111111111111111111111000000000 = 9007199.254740 Link: https://lkml.kernel.org/r/AM6PR03MB58481629F2F28CE007412139994D2@AM6PR03MB5848.eurprd03.prod.outlook.com Fixes: 5d4c6ac94694 ("kasan: record and report more information") Signed-off-by: Juntong Deng Acked-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/common.c | 2 +- mm/kasan/report.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 6ca63e8dda741b..e7c9a4dc89f826 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -55,7 +55,7 @@ void kasan_set_track(struct kasan_track *track, depot_stack_handle_t stack) u64 ts_nsec = local_clock(); track->cpu = cpu; - track->timestamp = ts_nsec >> 3; + track->timestamp = ts_nsec >> 9; #endif /* CONFIG_KASAN_EXTRA_INFO */ track->pid = current->pid; track->stack = stack; diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 7afa4feb03e18f..b48c768acc84d2 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -267,7 +267,7 @@ static void print_track(struct kasan_track *track, const char *prefix) u64 ts_nsec = track->timestamp; unsigned long rem_usec; - ts_nsec <<= 3; + ts_nsec <<= 9; rem_usec = do_div(ts_nsec, NSEC_PER_SEC) / 1000; pr_err("%s by task %u on cpu %d at %lu.%06lus:\n", From bf91196c1df45e335a3a51faff8ae0d61dd771e5 Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Thu, 15 Feb 2024 10:27:53 -0800 Subject: [PATCH 1245/1406] userfaultfd: move userfaultfd_ctx struct to header file Patch series "per-vma locks in userfaultfd", v7. 
Performing userfaultfd operations (like copy/move etc.) in critical section of mmap_lock (read-mode) causes significant contention on the lock when operations requiring the lock in write-mode are taking place concurrently. We can use per-vma locks instead to significantly reduce the contention issue. Android runtime's Garbage Collector uses userfaultfd for concurrent compaction. mmap-lock contention during compaction potentially causes jittery experience for the user. During one such reproducible scenario, we observed the following improvements with this patch-set: - Wall clock time of compaction phase came down from ~3s to <500ms - Uninterruptible sleep time (across all threads in the process) was ~10ms (none in mmap_lock) during compaction, instead of >20s This patch (of 4): Move the struct to userfaultfd_k.h to be accessible from mm/userfaultfd.c. There are no other changes in the struct. This is required to prepare for using per-vma locks in userfaultfd operations. Link: https://lkml.kernel.org/r/20240215182756.3448972-1-lokeshgidra@google.com Link: https://lkml.kernel.org/r/20240215182756.3448972-2-lokeshgidra@google.com Signed-off-by: Lokesh Gidra Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Liam R. Howlett Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Brian Geffon Cc: David Hildenbrand Cc: Jann Horn Cc: Kalesh Singh Cc: Matthew Wilcox (Oracle) Cc: Nicolas Geoffray Cc: Peter Xu Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Tim Murray Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 39 ----------------------------------- include/linux/userfaultfd_k.h | 39 +++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 05c8e8a054272f..58331b83d648da 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -50,45 +50,6 @@ static struct ctl_table vm_userfaultfd_table[] = { static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init; -/* - * Start with fault_pending_wqh and fault_wqh so they're more likely - * to be in the same cacheline. - * - * Locking order: - * fd_wqh.lock - * fault_pending_wqh.lock - * fault_wqh.lock - * event_wqh.lock - * - * To avoid deadlocks, IRQs must be disabled when taking any of the above locks, - * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's - * also taken in IRQ context. - */ -struct userfaultfd_ctx { - /* waitqueue head for the pending (i.e. 
not read) userfaults */ - wait_queue_head_t fault_pending_wqh; - /* waitqueue head for the userfaults */ - wait_queue_head_t fault_wqh; - /* waitqueue head for the pseudo fd to wakeup poll/read */ - wait_queue_head_t fd_wqh; - /* waitqueue head for events */ - wait_queue_head_t event_wqh; - /* a refile sequence protected by fault_pending_wqh lock */ - seqcount_spinlock_t refile_seq; - /* pseudo fd refcounting */ - refcount_t refcount; - /* userfaultfd syscall flags */ - unsigned int flags; - /* features requested from the userspace */ - unsigned int features; - /* released */ - bool released; - /* memory mappings are changing because of non-cooperative event */ - atomic_t mmap_changing; - /* mm with one ore more vmas attached to this userfaultfd_ctx */ - struct mm_struct *mm; -}; - struct userfaultfd_fork_ctx { struct userfaultfd_ctx *orig; struct userfaultfd_ctx *new; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index e4056547fbe615..691d928ee8640c 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -36,6 +36,45 @@ #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) +/* + * Start with fault_pending_wqh and fault_wqh so they're more likely + * to be in the same cacheline. + * + * Locking order: + * fd_wqh.lock + * fault_pending_wqh.lock + * fault_wqh.lock + * event_wqh.lock + * + * To avoid deadlocks, IRQs must be disabled when taking any of the above locks, + * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's + * also taken in IRQ context. + */ +struct userfaultfd_ctx { + /* waitqueue head for the pending (i.e. not read) userfaults */ + wait_queue_head_t fault_pending_wqh; + /* waitqueue head for the userfaults */ + wait_queue_head_t fault_wqh; + /* waitqueue head for the pseudo fd to wakeup poll/read */ + wait_queue_head_t fd_wqh; + /* waitqueue head for events */ + wait_queue_head_t event_wqh; + /* a refile sequence protected by fault_pending_wqh lock */ + seqcount_spinlock_t refile_seq; + /* pseudo fd refcounting */ + refcount_t refcount; + /* userfaultfd syscall flags */ + unsigned int flags; + /* features requested from the userspace */ + unsigned int features; + /* released */ + bool released; + /* memory mappings are changing because of non-cooperative event */ + atomic_t mmap_changing; + /* mm with one ore more vmas attached to this userfaultfd_ctx */ + struct mm_struct *mm; +}; + extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); /* A combined operation mode + behavior flags. */ From a69749d61080c36faa89f8e072ae5c531054c7ae Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Thu, 15 Feb 2024 10:27:54 -0800 Subject: [PATCH 1246/1406] userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx Increments and loads to mmap_changing are always in mmap_lock critical section. This ensures that if userspace requests event notification for non-cooperative operations (e.g. mremap), userfaultfd operations don't occur concurrently. This can be achieved by using a separate read-write semaphore in userfaultfd_ctx such that increments are done in write-mode and loads in read-mode, thereby eliminating the dependency on mmap_lock for this purpose. This is a preparatory step before we replace mmap_lock usage with per-vma locks in fill/move ioctls. Link: https://lkml.kernel.org/r/20240215182756.3448972-3-lokeshgidra@google.com Signed-off-by: Lokesh Gidra Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Liam R. 
Howlett Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Brian Geffon Cc: David Hildenbrand Cc: Jann Horn Cc: Kalesh Singh Cc: Matthew Wilcox (Oracle) Cc: Nicolas Geoffray Cc: Peter Xu Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Tim Murray Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 40 ++++++++++++---------- include/linux/userfaultfd_k.h | 31 ++++++++++-------- mm/userfaultfd.c | 62 ++++++++++++++++++++--------------- 3 files changed, 75 insertions(+), 58 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 58331b83d648da..c00a021bcce466 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -685,12 +685,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) ctx->flags = octx->flags; ctx->features = octx->features; ctx->released = false; + init_rwsem(&ctx->map_changing_lock); atomic_set(&ctx->mmap_changing, 0); ctx->mm = vma->vm_mm; mmgrab(ctx->mm); userfaultfd_ctx_get(octx); + down_write(&octx->map_changing_lock); atomic_inc(&octx->mmap_changing); + up_write(&octx->map_changing_lock); fctx->orig = octx; fctx->new = ctx; list_add_tail(&fctx->list, fcs); @@ -737,7 +740,9 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, if (ctx->features & UFFD_FEATURE_EVENT_REMAP) { vm_ctx->ctx = ctx; userfaultfd_ctx_get(ctx); + down_write(&ctx->map_changing_lock); atomic_inc(&ctx->mmap_changing); + up_write(&ctx->map_changing_lock); } else { /* Drop uffd context if remap feature not enabled */ vma_start_write(vma); @@ -783,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma, return true; userfaultfd_ctx_get(ctx); + down_write(&ctx->map_changing_lock); atomic_inc(&ctx->mmap_changing); + up_write(&ctx->map_changing_lock); mmap_read_unlock(mm); msg_init(&ewq.msg); @@ -825,7 +832,9 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, return -ENOMEM; userfaultfd_ctx_get(ctx); + down_write(&ctx->map_changing_lock); atomic_inc(&ctx->mmap_changing); + up_write(&ctx->map_changing_lock); unmap_ctx->ctx = ctx; unmap_ctx->start = start; unmap_ctx->end = end; @@ -1709,9 +1718,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP) flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { - ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len, &ctx->mmap_changing, - flags); + ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src, + uffdio_copy.len, flags); mmput(ctx->mm); } else { return -ESRCH; @@ -1761,9 +1769,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, goto out; if (mmget_not_zero(ctx->mm)) { - ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start, - uffdio_zeropage.range.len, - &ctx->mmap_changing); + ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start, + uffdio_zeropage.range.len); mmput(ctx->mm); } else { return -ESRCH; @@ -1818,9 +1825,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, return -EINVAL; if (mmget_not_zero(ctx->mm)) { - ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start, - uffdio_wp.range.len, mode_wp, - &ctx->mmap_changing); + ret = mwriteprotect_range(ctx, uffdio_wp.range.start, + uffdio_wp.range.len, mode_wp); mmput(ctx->mm); } else { return -ESRCH; @@ -1870,9 +1876,8 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { - ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start, - uffdio_continue.range.len, - &ctx->mmap_changing, flags); + ret = 
mfill_atomic_continue(ctx, uffdio_continue.range.start, + uffdio_continue.range.len, flags); mmput(ctx->mm); } else { return -ESRCH; @@ -1925,9 +1930,8 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long goto out; if (mmget_not_zero(ctx->mm)) { - ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start, - uffdio_poison.range.len, - &ctx->mmap_changing, 0); + ret = mfill_atomic_poison(ctx, uffdio_poison.range.start, + uffdio_poison.range.len, 0); mmput(ctx->mm); } else { return -ESRCH; @@ -2003,13 +2007,14 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx, if (mmget_not_zero(mm)) { mmap_read_lock(mm); - /* Re-check after taking mmap_lock */ + /* Re-check after taking map_changing_lock */ + down_read(&ctx->map_changing_lock); if (likely(!atomic_read(&ctx->mmap_changing))) ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src, uffdio_move.len, uffdio_move.mode); else ret = -EAGAIN; - + up_read(&ctx->map_changing_lock); mmap_read_unlock(mm); mmput(mm); } else { @@ -2216,6 +2221,7 @@ static int new_userfaultfd(int flags) ctx->flags = flags; ctx->features = 0; ctx->released = false; + init_rwsem(&ctx->map_changing_lock); atomic_set(&ctx->mmap_changing, 0); ctx->mm = current->mm; /* prevent the mm struct to be freed */ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 691d928ee8640c..3210c355297629 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -69,6 +69,13 @@ struct userfaultfd_ctx { unsigned int features; /* released */ bool released; + /* + * Prevents userfaultfd operations (fill/move/wp) from happening while + * some non-cooperative event(s) is taking place. Increments are done + * in write-mode. Whereas, userfaultfd operations, which includes + * reading mmap_changing, is done under read-mode. 
+ */ + struct rw_semaphore map_changing_lock; /* memory mappings are changing because of non-cooperative event */ atomic_t mmap_changing; /* mm with one ore more vmas attached to this userfaultfd_ctx */ @@ -113,22 +120,18 @@ extern int mfill_atomic_install_pte(pmd_t *dst_pmd, unsigned long dst_addr, struct page *page, bool newly_allocated, uffd_flags_t flags); -extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start, +extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, uffd_flags_t flags); -extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, + uffd_flags_t flags); +extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, unsigned long dst_start, - unsigned long len, - atomic_t *mmap_changing); -extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len, atomic_t *mmap_changing, - uffd_flags_t flags); -extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing, - uffd_flags_t flags); -extern int mwriteprotect_range(struct mm_struct *dst_mm, - unsigned long start, unsigned long len, - bool enable_wp, atomic_t *mmap_changing); + unsigned long len); +extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start, + unsigned long len, uffd_flags_t flags); +extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, uffd_flags_t flags); +extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, bool enable_wp); extern long uffd_wp_range(struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 9cc93cc1330b1c..74aad0831e4010 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -353,11 +353,11 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) * called with mmap_lock held, it will release mmap_lock before returning. */ static __always_inline ssize_t mfill_atomic_hugetlb( + struct userfaultfd_ctx *ctx, struct vm_area_struct *dst_vma, unsigned long dst_start, unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, uffd_flags_t flags) { struct mm_struct *dst_mm = dst_vma->vm_mm; @@ -379,6 +379,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb( * feature is not supported. */ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { + up_read(&ctx->map_changing_lock); mmap_read_unlock(dst_mm); return -EINVAL; } @@ -463,6 +464,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb( cond_resched(); if (unlikely(err == -ENOENT)) { + up_read(&ctx->map_changing_lock); mmap_read_unlock(dst_mm); BUG_ON(!folio); @@ -473,12 +475,13 @@ static __always_inline ssize_t mfill_atomic_hugetlb( goto out; } mmap_read_lock(dst_mm); + down_read(&ctx->map_changing_lock); /* * If memory mappings are changing because of non-cooperative * operation (e.g. 
mremap) running in parallel, bail out and * request the user to retry later */ - if (mmap_changing && atomic_read(mmap_changing)) { + if (atomic_read(&ctx->mmap_changing)) { err = -EAGAIN; break; } @@ -501,6 +504,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb( } out_unlock: + up_read(&ctx->map_changing_lock); mmap_read_unlock(dst_mm); out: if (folio) @@ -512,11 +516,11 @@ static __always_inline ssize_t mfill_atomic_hugetlb( } #else /* !CONFIG_HUGETLB_PAGE */ /* fail at build time if gcc attempts to use this */ -extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma, +extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx, + struct vm_area_struct *dst_vma, unsigned long dst_start, unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, uffd_flags_t flags); #endif /* CONFIG_HUGETLB_PAGE */ @@ -564,13 +568,13 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd, return err; } -static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, +static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, uffd_flags_t flags) { + struct mm_struct *dst_mm = ctx->mm; struct vm_area_struct *dst_vma; ssize_t err; pmd_t *dst_pmd; @@ -600,8 +604,9 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, * operation (e.g. mremap) running in parallel, bail out and * request the user to retry later */ + down_read(&ctx->map_changing_lock); err = -EAGAIN; - if (mmap_changing && atomic_read(mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) goto out_unlock; /* @@ -633,8 +638,8 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, * If this is a HUGETLB vma, pass off to appropriate routine */ if (is_vm_hugetlb_page(dst_vma)) - return mfill_atomic_hugetlb(dst_vma, dst_start, src_start, - len, mmap_changing, flags); + return mfill_atomic_hugetlb(ctx, dst_vma, dst_start, + src_start, len, flags); if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; @@ -693,6 +698,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, if (unlikely(err == -ENOENT)) { void *kaddr; + up_read(&ctx->map_changing_lock); mmap_read_unlock(dst_mm); BUG_ON(!folio); @@ -723,6 +729,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, } out_unlock: + up_read(&ctx->map_changing_lock); mmap_read_unlock(dst_mm); out: if (folio) @@ -733,34 +740,33 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, return copied ? 
copied : err; } -ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start, +ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, uffd_flags_t flags) + uffd_flags_t flags) { - return mfill_atomic(dst_mm, dst_start, src_start, len, mmap_changing, + return mfill_atomic(ctx, dst_start, src_start, len, uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY)); } -ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing) +ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, + unsigned long start, + unsigned long len) { - return mfill_atomic(dst_mm, start, 0, len, mmap_changing, + return mfill_atomic(ctx, start, 0, len, uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE)); } -ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing, - uffd_flags_t flags) +ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, uffd_flags_t flags) { - return mfill_atomic(dst_mm, start, 0, len, mmap_changing, + return mfill_atomic(ctx, start, 0, len, uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE)); } -ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing, - uffd_flags_t flags) +ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, uffd_flags_t flags) { - return mfill_atomic(dst_mm, start, 0, len, mmap_changing, + return mfill_atomic(ctx, start, 0, len, uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON)); } @@ -793,10 +799,10 @@ long uffd_wp_range(struct vm_area_struct *dst_vma, return ret; } -int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, bool enable_wp, - atomic_t *mmap_changing) +int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, bool enable_wp) { + struct mm_struct *dst_mm = ctx->mm; unsigned long end = start + len; unsigned long _start, _end; struct vm_area_struct *dst_vma; @@ -820,8 +826,9 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, * operation (e.g. mremap) running in parallel, bail out and * request the user to retry later */ + down_read(&ctx->map_changing_lock); err = -EAGAIN; - if (mmap_changing && atomic_read(mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) goto out_unlock; err = -ENOENT; @@ -850,6 +857,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, err = 0; } out_unlock: + up_read(&ctx->map_changing_lock); mmap_read_unlock(dst_mm); return err; } From b511c2a86bec5f674d89171e4d332ac496fafbe8 Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Thu, 15 Feb 2024 10:27:55 -0800 Subject: [PATCH 1247/1406] mm: add vma_assert_locked() for !CONFIG_PER_VMA_LOCK vma_assert_locked() is needed to replace mmap_assert_locked() once we start using per-vma locks in userfaultfd operations. In !CONFIG_PER_VMA_LOCK case when mm is locked, it implies that the given VMA is locked. Link: https://lkml.kernel.org/r/20240215182756.3448972-4-lokeshgidra@google.com Signed-off-by: Lokesh Gidra Reviewed-by: Suren Baghdasaryan Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Brian Geffon Cc: David Hildenbrand Cc: Jann Horn Cc: Kalesh Singh Cc: Liam R. 
Howlett Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Nicolas Geoffray Cc: Peter Xu Cc: Ryan Roberts Cc: Tim Murray Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3c85634b186cf3..5ece3ad34ef813 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -781,6 +781,11 @@ static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, return NULL; } +static inline void vma_assert_locked(struct vm_area_struct *vma) +{ + mmap_assert_locked(vma->vm_mm); +} + static inline void release_fault_lock(struct vm_fault *vmf) { mmap_read_unlock(vmf->vma->vm_mm); From c94198bbac0a3f8c65c5430be3405e88d7d907a7 Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Thu, 15 Feb 2024 10:27:56 -0800 Subject: [PATCH 1248/1406] userfaultfd: use per-vma locks in userfaultfd operations All userfaultfd operations, except write-protect, opportunistically use per-vma locks to lock vmas. On failure, attempt again inside mmap_lock critical section. Write-protect operation requires mmap_lock as it iterates over multiple vmas. Link: https://lkml.kernel.org/r/20240215182756.3448972-5-lokeshgidra@google.com Signed-off-by: Lokesh Gidra Reviewed-by: Liam R. Howlett Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Brian Geffon Cc: David Hildenbrand Cc: Jann Horn Cc: Kalesh Singh Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Nicolas Geoffray Cc: Peter Xu Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Tim Murray Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 13 +- include/linux/userfaultfd_k.h | 5 +- mm/huge_memory.c | 5 +- mm/userfaultfd.c | 380 ++++++++++++++++++++++++++-------- 4 files changed, 299 insertions(+), 104 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index c00a021bcce466..60dcfafdc11a84 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -2005,17 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx, return -EINVAL; if (mmget_not_zero(mm)) { - mmap_read_lock(mm); - - /* Re-check after taking map_changing_lock */ - down_read(&ctx->map_changing_lock); - if (likely(!atomic_read(&ctx->mmap_changing))) - ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src, - uffdio_move.len, uffdio_move.mode); - else - ret = -EAGAIN; - up_read(&ctx->map_changing_lock); - mmap_read_unlock(mm); + ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src, + uffdio_move.len, uffdio_move.mode); mmput(mm); } else { return -ESRCH; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 3210c355297629..05d59f74fc887f 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -138,9 +138,8 @@ extern long uffd_wp_range(struct vm_area_struct *vma, /* move_pages */ void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2); void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2); -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, - unsigned long dst_start, unsigned long src_start, - unsigned long len, __u64 flags); +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, + unsigned long src_start, unsigned long len, __u64 flags); int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 14888b15121e59..28341a5067fbad 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2158,7 +2158,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct 
vm_area_struct *vma, #ifdef CONFIG_USERFAULTFD /* - * The PT lock for src_pmd and the mmap_lock for reading are held by + * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by * the caller, but it must return after releasing the page_table_lock. * Just move the page from src_pmd to dst_pmd if possible. * Return zero if succeeded in moving the page, -EAGAIN if it needs to be @@ -2181,7 +2181,8 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm src_ptl = pmd_lockptr(mm, src_pmd); lockdep_assert_held(src_ptl); - mmap_assert_locked(mm); + vma_assert_locked(src_vma); + vma_assert_locked(dst_vma); /* Sanity checks before the operation */ if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) || diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 74aad0831e4010..4744d6a96f9673 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -20,19 +20,11 @@ #include "internal.h" static __always_inline -struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, - unsigned long dst_start, - unsigned long len) +bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end) { - /* - * Make sure that the dst range is both valid and fully within a - * single existing vma. - */ - struct vm_area_struct *dst_vma; - - dst_vma = find_vma(dst_mm, dst_start); - if (!range_in_vma(dst_vma, dst_start, dst_start + len)) - return NULL; + /* Make sure that the dst range is fully within dst_vma. */ + if (dst_end > dst_vma->vm_end) + return false; /* * Check the vma is registered in uffd, this is required to @@ -40,11 +32,122 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, * time. */ if (!dst_vma->vm_userfaultfd_ctx.ctx) - return NULL; + return false; + + return true; +} + +static __always_inline +struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm, + unsigned long addr) +{ + struct vm_area_struct *vma; + + mmap_assert_locked(mm); + vma = vma_lookup(mm, addr); + if (!vma) + vma = ERR_PTR(-ENOENT); + else if (!(vma->vm_flags & VM_SHARED) && + unlikely(anon_vma_prepare(vma))) + vma = ERR_PTR(-ENOMEM); + + return vma; +} + +#ifdef CONFIG_PER_VMA_LOCK +/* + * lock_vma() - Lookup and lock vma corresponding to @address. + * @mm: mm to search vma in. + * @address: address that the vma should contain. + * + * Should be called without holding mmap_lock. vma should be unlocked after use + * with unlock_vma(). + * + * Return: A locked vma containing @address, -ENOENT if no vma is found, or + * -ENOMEM if anon_vma couldn't be allocated. + */ +static struct vm_area_struct *lock_vma(struct mm_struct *mm, + unsigned long address) +{ + struct vm_area_struct *vma; + + vma = lock_vma_under_rcu(mm, address); + if (vma) { + /* + * lock_vma_under_rcu() only checks anon_vma for private + * anonymous mappings. But we need to ensure it is assigned in + * private file-backed vmas as well. + */ + if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma)) + vma_end_read(vma); + else + return vma; + } + + mmap_read_lock(mm); + vma = find_vma_and_prepare_anon(mm, address); + if (!IS_ERR(vma)) { + /* + * We cannot use vma_start_read() as it may fail due to + * false locked (see comment in vma_start_read()). We + * can avoid that by directly locking vm_lock under + * mmap_lock, which guarantees that nobody can lock the + * vma for write (vma_start_write()) under us. 
+ */ + down_read(&vma->vm_lock->lock); + } + + mmap_read_unlock(mm); + return vma; +} + +static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm, + unsigned long dst_start, + unsigned long len) +{ + struct vm_area_struct *dst_vma; + dst_vma = lock_vma(dst_mm, dst_start); + if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len)) + return dst_vma; + + vma_end_read(dst_vma); + return ERR_PTR(-ENOENT); +} + +static void uffd_mfill_unlock(struct vm_area_struct *vma) +{ + vma_end_read(vma); +} + +#else + +static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm, + unsigned long dst_start, + unsigned long len) +{ + struct vm_area_struct *dst_vma; + + mmap_read_lock(dst_mm); + dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start); + if (IS_ERR(dst_vma)) + goto out_unlock; + + if (validate_dst_vma(dst_vma, dst_start + len)) + return dst_vma; + + dst_vma = ERR_PTR(-ENOENT); +out_unlock: + mmap_read_unlock(dst_mm); return dst_vma; } +static void uffd_mfill_unlock(struct vm_area_struct *vma) +{ + mmap_read_unlock(vma->vm_mm); +} +#endif + /* Check if dst_addr is outside of file's size. Must be called with ptl held. */ static bool mfill_file_over_size(struct vm_area_struct *dst_vma, unsigned long dst_addr) @@ -350,7 +453,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) #ifdef CONFIG_HUGETLB_PAGE /* * mfill_atomic processing for HUGETLB vmas. Note that this routine is - * called with mmap_lock held, it will release mmap_lock before returning. + * called with either vma-lock or mmap_lock held, it will release the lock + * before returning. */ static __always_inline ssize_t mfill_atomic_hugetlb( struct userfaultfd_ctx *ctx, @@ -361,7 +465,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb( uffd_flags_t flags) { struct mm_struct *dst_mm = dst_vma->vm_mm; - int vm_shared = dst_vma->vm_flags & VM_SHARED; ssize_t err; pte_t *dst_pte; unsigned long src_addr, dst_addr; @@ -380,7 +483,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb( */ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { up_read(&ctx->map_changing_lock); - mmap_read_unlock(dst_mm); + uffd_mfill_unlock(dst_vma); return -EINVAL; } @@ -403,24 +506,28 @@ static __always_inline ssize_t mfill_atomic_hugetlb( * retry, dst_vma will be set to NULL and we must lookup again. */ if (!dst_vma) { + dst_vma = uffd_mfill_lock(dst_mm, dst_start, len); + if (IS_ERR(dst_vma)) { + err = PTR_ERR(dst_vma); + goto out; + } + err = -ENOENT; - dst_vma = find_dst_vma(dst_mm, dst_start, len); - if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) - goto out_unlock; + if (!is_vm_hugetlb_page(dst_vma)) + goto out_unlock_vma; err = -EINVAL; if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) - goto out_unlock; - - vm_shared = dst_vma->vm_flags & VM_SHARED; - } + goto out_unlock_vma; - /* - * If not shared, ensure the dst_vma has a anon_vma. - */ - err = -ENOMEM; - if (!vm_shared) { - if (unlikely(anon_vma_prepare(dst_vma))) + /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. 
mremap) running in parallel, bail out and + * request the user to retry later + */ + down_read(&ctx->map_changing_lock); + err = -EAGAIN; + if (atomic_read(&ctx->mmap_changing)) goto out_unlock; } @@ -465,7 +572,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb( if (unlikely(err == -ENOENT)) { up_read(&ctx->map_changing_lock); - mmap_read_unlock(dst_mm); + uffd_mfill_unlock(dst_vma); BUG_ON(!folio); err = copy_folio_from_user(folio, @@ -474,17 +581,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb( err = -EFAULT; goto out; } - mmap_read_lock(dst_mm); - down_read(&ctx->map_changing_lock); - /* - * If memory mappings are changing because of non-cooperative - * operation (e.g. mremap) running in parallel, bail out and - * request the user to retry later - */ - if (atomic_read(&ctx->mmap_changing)) { - err = -EAGAIN; - break; - } dst_vma = NULL; goto retry; @@ -505,7 +601,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb( out_unlock: up_read(&ctx->map_changing_lock); - mmap_read_unlock(dst_mm); +out_unlock_vma: + uffd_mfill_unlock(dst_vma); out: if (folio) folio_put(folio); @@ -597,7 +694,15 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, copied = 0; folio = NULL; retry: - mmap_read_lock(dst_mm); + /* + * Make sure the vma is not shared, that the dst range is + * both valid and fully within a single existing vma. + */ + dst_vma = uffd_mfill_lock(dst_mm, dst_start, len); + if (IS_ERR(dst_vma)) { + err = PTR_ERR(dst_vma); + goto out; + } /* * If memory mappings are changing because of non-cooperative @@ -609,15 +714,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, if (atomic_read(&ctx->mmap_changing)) goto out_unlock; - /* - * Make sure the vma is not shared, that the dst range is - * both valid and fully within a single existing vma. - */ - err = -ENOENT; - dst_vma = find_dst_vma(dst_mm, dst_start, len); - if (!dst_vma) - goto out_unlock; - err = -EINVAL; /* * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but @@ -647,16 +743,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) goto out_unlock; - /* - * Ensure the dst_vma has a anon_vma or this page - * would get a NULL anon_vma when moved in the - * dst_vma. 
- */ - err = -ENOMEM; - if (!(dst_vma->vm_flags & VM_SHARED) && - unlikely(anon_vma_prepare(dst_vma))) - goto out_unlock; - while (src_addr < src_start + len) { pmd_t dst_pmdval; @@ -699,7 +785,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, void *kaddr; up_read(&ctx->map_changing_lock); - mmap_read_unlock(dst_mm); + uffd_mfill_unlock(dst_vma); BUG_ON(!folio); kaddr = kmap_local_folio(folio, 0); @@ -730,7 +816,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, out_unlock: up_read(&ctx->map_changing_lock); - mmap_read_unlock(dst_mm); + uffd_mfill_unlock(dst_vma); out: if (folio) folio_put(folio); @@ -1267,27 +1353,136 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx, if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma)) return -EINVAL; + return 0; +} + +static __always_inline +int find_vmas_mm_locked(struct mm_struct *mm, + unsigned long dst_start, + unsigned long src_start, + struct vm_area_struct **dst_vmap, + struct vm_area_struct **src_vmap) +{ + struct vm_area_struct *vma; + + mmap_assert_locked(mm); + vma = find_vma_and_prepare_anon(mm, dst_start); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + *dst_vmap = vma; + /* Skip finding src_vma if src_start is in dst_vma */ + if (src_start >= vma->vm_start && src_start < vma->vm_end) + goto out_success; + + vma = vma_lookup(mm, src_start); + if (!vma) + return -ENOENT; +out_success: + *src_vmap = vma; + return 0; +} + +#ifdef CONFIG_PER_VMA_LOCK +static int uffd_move_lock(struct mm_struct *mm, + unsigned long dst_start, + unsigned long src_start, + struct vm_area_struct **dst_vmap, + struct vm_area_struct **src_vmap) +{ + struct vm_area_struct *vma; + int err; + + vma = lock_vma(mm, dst_start); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + *dst_vmap = vma; /* - * Ensure the dst_vma has a anon_vma or this page - * would get a NULL anon_vma when moved in the - * dst_vma. + * Skip finding src_vma if src_start is in dst_vma. This also ensures + * that we don't lock the same vma twice. */ - if (unlikely(anon_vma_prepare(dst_vma))) - return -ENOMEM; + if (src_start >= vma->vm_start && src_start < vma->vm_end) { + *src_vmap = vma; + return 0; + } - return 0; + /* + * Using lock_vma() to get src_vma can lead to following deadlock: + * + * Thread1 Thread2 + * ------- ------- + * vma_start_read(dst_vma) + * mmap_write_lock(mm) + * vma_start_write(src_vma) + * vma_start_read(src_vma) + * mmap_read_lock(mm) + * vma_start_write(dst_vma) + */ + *src_vmap = lock_vma_under_rcu(mm, src_start); + if (likely(*src_vmap)) + return 0; + + /* Undo any locking and retry in mmap_lock critical section */ + vma_end_read(*dst_vmap); + + mmap_read_lock(mm); + err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); + if (!err) { + /* + * See comment in lock_vma() as to why not using + * vma_start_read() here. 
+ */ + down_read(&(*dst_vmap)->vm_lock->lock); + if (*dst_vmap != *src_vmap) + down_read(&(*src_vmap)->vm_lock->lock); + } + mmap_read_unlock(mm); + return err; +} + +static void uffd_move_unlock(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) +{ + vma_end_read(src_vma); + if (src_vma != dst_vma) + vma_end_read(dst_vma); } +#else + +static int uffd_move_lock(struct mm_struct *mm, + unsigned long dst_start, + unsigned long src_start, + struct vm_area_struct **dst_vmap, + struct vm_area_struct **src_vmap) +{ + int err; + + mmap_read_lock(mm); + err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); + if (err) + mmap_read_unlock(mm); + return err; +} + +static void uffd_move_unlock(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) +{ + mmap_assert_locked(src_vma->vm_mm); + mmap_read_unlock(dst_vma->vm_mm); +} +#endif + /** * move_pages - move arbitrary anonymous pages of an existing vma * @ctx: pointer to the userfaultfd context - * @mm: the address space to move pages * @dst_start: start of the destination virtual memory range * @src_start: start of the source virtual memory range * @len: length of the virtual memory range * @mode: flags from uffdio_move.mode * - * Must be called with mmap_lock held for read. + * It will either use the mmap_lock in read mode or per-vma locks * * move_pages() remaps arbitrary anonymous pages atomically in zero * copy. It only works on non shared anonymous pages because those can @@ -1355,10 +1550,10 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx, * could be obtained. This is the only additional complexity added to * the rmap code to provide this anonymous page remapping functionality. */ -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, - unsigned long dst_start, unsigned long src_start, - unsigned long len, __u64 mode) +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, + unsigned long src_start, unsigned long len, __u64 mode) { + struct mm_struct *mm = ctx->mm; struct vm_area_struct *src_vma, *dst_vma; unsigned long src_addr, dst_addr; pmd_t *src_pmd, *dst_pmd; @@ -1376,28 +1571,34 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, WARN_ON_ONCE(dst_start + len <= dst_start)) goto out; + err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma); + if (err) + goto out; + + /* Re-check after taking map_changing_lock */ + err = -EAGAIN; + down_read(&ctx->map_changing_lock); + if (likely(atomic_read(&ctx->mmap_changing))) + goto out_unlock; /* * Make sure the vma is not shared, that the src and dst remap * ranges are both valid and fully within a single existing * vma. 
*/ - src_vma = find_vma(mm, src_start); - if (!src_vma || (src_vma->vm_flags & VM_SHARED)) - goto out; - if (src_start < src_vma->vm_start || - src_start + len > src_vma->vm_end) - goto out; + err = -EINVAL; + if (src_vma->vm_flags & VM_SHARED) + goto out_unlock; + if (src_start + len > src_vma->vm_end) + goto out_unlock; - dst_vma = find_vma(mm, dst_start); - if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) - goto out; - if (dst_start < dst_vma->vm_start || - dst_start + len > dst_vma->vm_end) - goto out; + if (dst_vma->vm_flags & VM_SHARED) + goto out_unlock; + if (dst_start + len > dst_vma->vm_end) + goto out_unlock; err = validate_move_areas(ctx, src_vma, dst_vma); if (err) - goto out; + goto out_unlock; for (src_addr = src_start, dst_addr = dst_start; src_addr < src_start + len;) { @@ -1514,6 +1715,9 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, moved += step_size; } +out_unlock: + up_read(&ctx->map_changing_lock); + uffd_move_unlock(dst_vma, src_vma); out: VM_WARN_ON(moved < 0); VM_WARN_ON(err > 0); From 4b758d70257d5c7830c5411639b03541336c9bf7 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Wed, 14 Feb 2024 16:15:00 -0300 Subject: [PATCH 1249/1406] kunit: make kunit_bus_type const Since commit d492cc2573a0 ("driver core: device.h: make struct bus_type a const *"), the driver core can properly handle constant struct bus_type, move the kunit_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. Marliere Reviewed-by: Greg Kroah-Hartman Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/kunit/device.c b/lib/kunit/device.c index 644a38a1f5b1cf..e2064155931492 100644 --- a/lib/kunit/device.c +++ b/lib/kunit/device.c @@ -35,7 +35,7 @@ struct kunit_device { #define to_kunit_device(d) container_of_const(d, struct kunit_device, dev) -static struct bus_type kunit_bus_type = { +static const struct bus_type kunit_bus_type = { .name = "kunit", }; From 02f45c1a3cd388d60b87ecffd61d64777e897f01 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Feb 2024 22:59:01 +0100 Subject: [PATCH 1250/1406] lib/stackdepot: fix first entry having a 0-handle Patch series "page_owner: print stacks and their outstanding allocations", v10. page_owner is a great debug functionality tool that lets us know about all pages that have been allocated/freed and their specific stacktrace. This comes very handy when debugging memory leaks, since with some scripting we can see the outstanding allocations, which might point to a memory leak. In my experience, that is one of the most useful cases, but it can get really tedious to screen through all pages and try to reconstruct the stack <-> allocated/freed relationship, becoming most of the time a daunting and slow process when we have tons of allocation/free operations. This patchset aims to ease that by adding a new functionality into page_owner. This functionality creates a new directory called 'page_owner_stacks' under 'sys/kernel//debug' with a read-only file called 'show_stacks', which prints out all the stacks followed by their outstanding number of allocations (being that the times the stacktrace has allocated but not freed yet). This gives us a clear and a quick overview of stacks <-> allocated/free. 
We take advantage of the new refcount_t field that the stack_record struct gained, and increment/decrement the stack refcount on every __set_page_owner() (alloc operation) and __reset_page_owner() (free operation) call. Unfortunately, we cannot use the new stackdepot api STACK_DEPOT_FLAG_GET because it does not fulfill page_owner needs, meaning we would have to special case things, at which point it makes more sense for page_owner to do its own {dec,inc}rementing of the stacks. E.g.: using STACK_DEPOT_FLAG_PUT, once the refcount reaches 0, such a stack gets evicted, so page_owner would lose information. This patchset also creates a new file called 'set_threshold' within the 'page_owner_stacks' directory, and by writing a value to it, the stacks whose refcount is below such value will be filtered out. A PoC can be found below: # cat /sys/kernel/debug/page_owner_stacks/show_stacks > page_owner_full_stacks.txt # head -40 page_owner_full_stacks.txt prep_new_page+0xa9/0x120 get_page_from_freelist+0x801/0x2210 __alloc_pages+0x18b/0x350 alloc_pages_mpol+0x91/0x1f0 folio_alloc+0x14/0x50 filemap_alloc_folio+0xb2/0x100 page_cache_ra_unbounded+0x96/0x180 filemap_get_pages+0xfd/0x590 filemap_read+0xcc/0x330 blkdev_read_iter+0xb8/0x150 vfs_read+0x285/0x320 ksys_read+0xa5/0xe0 do_syscall_64+0x80/0x160 entry_SYSCALL_64_after_hwframe+0x6e/0x76 stack_count: 521 prep_new_page+0xa9/0x120 get_page_from_freelist+0x801/0x2210 __alloc_pages+0x18b/0x350 alloc_pages_mpol+0x91/0x1f0 folio_alloc+0x14/0x50 filemap_alloc_folio+0xb2/0x100 __filemap_get_folio+0x14a/0x490 ext4_write_begin+0xbd/0x4b0 [ext4] generic_perform_write+0xc1/0x1e0 ext4_buffered_write_iter+0x68/0xe0 [ext4] ext4_file_write_iter+0x70/0x740 [ext4] vfs_write+0x33d/0x420 ksys_write+0xa5/0xe0 do_syscall_64+0x80/0x160 entry_SYSCALL_64_after_hwframe+0x6e/0x76 stack_count: 4609 ... ... # echo 5000 > /sys/kernel/debug/page_owner_stacks/set_threshold # cat /sys/kernel/debug/page_owner_stacks/show_stacks > page_owner_full_stacks_5000.txt # head -40 page_owner_full_stacks_5000.txt prep_new_page+0xa9/0x120 get_page_from_freelist+0x801/0x2210 __alloc_pages+0x18b/0x350 alloc_pages_mpol+0x91/0x1f0 folio_alloc+0x14/0x50 filemap_alloc_folio+0xb2/0x100 __filemap_get_folio+0x14a/0x490 ext4_write_begin+0xbd/0x4b0 [ext4] generic_perform_write+0xc1/0x1e0 ext4_buffered_write_iter+0x68/0xe0 [ext4] ext4_file_write_iter+0x70/0x740 [ext4] vfs_write+0x33d/0x420 ksys_pwrite64+0x75/0x90 do_syscall_64+0x80/0x160 entry_SYSCALL_64_after_hwframe+0x6e/0x76 stack_count: 6781 prep_new_page+0xa9/0x120 get_page_from_freelist+0x801/0x2210 __alloc_pages+0x18b/0x350 pcpu_populate_chunk+0xec/0x350 pcpu_balance_workfn+0x2d1/0x4a0 process_scheduled_works+0x84/0x380 worker_thread+0x12a/0x2a0 kthread+0xe3/0x110 ret_from_fork+0x30/0x50 ret_from_fork_asm+0x1b/0x30 stack_count: 8641 This patch (of 7): The very first entry of stack_record gets a handle of 0, but this is wrong because stackdepot treats a 0-handle as a non-valid one. E.g.: see the check in stack_depot_fetch(). Fix this by adding an offset of 1. This bug has been lurking since the very beginning of stackdepot, but it seems no one really cared. Because of that I am not adding a Fixes tag.
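To see why the very first record used to collide with the invalid handle, the encoding can be reproduced in a small standalone program. The bitfield widths below assume 4K pages, giving DEPOT_OFFSET_BITS = 10, STACK_DEPOT_EXTRA_BITS = 5 and 17 remaining bits for pool_index, matching the constants in the diff that follows; bitfield layout is compiler-dependent, so this is only an illustration, not kernel code:

	#include <stdint.h>
	#include <stdio.h>

	union handle_parts {
		uint32_t handle;
		struct {
			uint32_t pool_index : 17;
			uint32_t offset     : 10;
			uint32_t extra      : 5;
		};
	};

	int main(void)
	{
		union handle_parts rec0 = { .handle = 0 };

		/* Before the fix: first record of pool 0 at offset 0. */
		rec0.pool_index = 0;
		rec0.offset = 0;
		printf("unbiased handle: %u\n", rec0.handle);	/* 0 == "no stack" */

		/* After the fix: pool_index is stored with a +1 bias. */
		rec0.pool_index = 0 + 1;
		printf("biased handle:   %u\n", rec0.handle);	/* non-zero, hence valid */
		return 0;
	}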
Link: https://lkml.kernel.org/r/20240215215907.20121-1-osalvador@suse.de Link: https://lkml.kernel.org/r/20240215215907.20121-2-osalvador@suse.de Co-developed-by: Marco Elver Signed-off-by: Marco Elver Signed-off-by: Oscar Salvador Acked-by: Vlastimil Babka Acked-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Michal Hocko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 4a7055a63d9f8a..c043a4186bc598 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -45,15 +45,16 @@ #define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \ STACK_DEPOT_EXTRA_BITS) #define DEPOT_POOLS_CAP 8192 +/* The pool_index is offset by 1 so the first record does not have a 0 handle. */ #define DEPOT_MAX_POOLS \ - (((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \ - (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP) + (((1LL << (DEPOT_POOL_INDEX_BITS)) - 1 < DEPOT_POOLS_CAP) ? \ + (1LL << (DEPOT_POOL_INDEX_BITS)) - 1 : DEPOT_POOLS_CAP) /* Compact structure that stores a reference to a stack. */ union handle_parts { depot_stack_handle_t handle; struct { - u32 pool_index : DEPOT_POOL_INDEX_BITS; + u32 pool_index : DEPOT_POOL_INDEX_BITS; /* pool_index is offset by 1 */ u32 offset : DEPOT_OFFSET_BITS; u32 extra : STACK_DEPOT_EXTRA_BITS; }; @@ -372,7 +373,7 @@ static struct stack_record *depot_pop_free_pool(void **prealloc, size_t size) stack = current_pool + pool_offset; /* Pre-initialize handle once. */ - stack->handle.pool_index = pool_index; + stack->handle.pool_index = pool_index + 1; stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN; stack->handle.extra = 0; INIT_LIST_HEAD(&stack->hash_list); @@ -483,18 +484,19 @@ static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle) const int pools_num_cached = READ_ONCE(pools_num); union handle_parts parts = { .handle = handle }; void *pool; + u32 pool_index = parts.pool_index - 1; size_t offset = parts.offset << DEPOT_STACK_ALIGN; struct stack_record *stack; lockdep_assert_not_held(&pool_lock); - if (parts.pool_index > pools_num_cached) { + if (pool_index > pools_num_cached) { WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n", - parts.pool_index, pools_num_cached, handle); + pool_index, pools_num_cached, handle); return NULL; } - pool = stack_pools[parts.pool_index]; + pool = stack_pools[pool_index]; if (WARN_ON(!pool)) return NULL; From 4869a9405aa20a16a5b83c104143292c3bdbd261 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Feb 2024 22:59:02 +0100 Subject: [PATCH 1251/1406] lib/stackdepot: move stack_record struct definition into the header In order to move the heavy lifting into page_owner code, this one needs to have access to the stack_record structure, which right now sits in lib/stackdepot.c. Move it to the stackdepot.h header so page_owner can access stack_record's struct fields. 
Link: https://lkml.kernel.org/r/20240215215907.20121-3-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Marco Elver Reviewed-by: Vlastimil Babka Acked-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 47 ++++++++++++++++++++++++++++++++++++++ lib/stackdepot.c | 43 ---------------------------------- 2 files changed, 47 insertions(+), 43 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index adcbb8f2360007..c4b5ad57c0660c 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -30,6 +30,53 @@ typedef u32 depot_stack_handle_t; */ #define STACK_DEPOT_EXTRA_BITS 5 +#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8) + +#define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */ +#define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER)) +#define DEPOT_STACK_ALIGN 4 +#define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN) +#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \ + STACK_DEPOT_EXTRA_BITS) + +#ifdef CONFIG_STACKDEPOT +/* Compact structure that stores a reference to a stack. */ +union handle_parts { + depot_stack_handle_t handle; + struct { + /* pool_index is offset by 1 */ + u32 pool_index : DEPOT_POOL_INDEX_BITS; + u32 offset : DEPOT_OFFSET_BITS; + u32 extra : STACK_DEPOT_EXTRA_BITS; + }; +}; + +struct stack_record { + struct list_head hash_list; /* Links in the hash table */ + u32 hash; /* Hash in hash table */ + u32 size; /* Number of stored frames */ + union handle_parts handle; /* Constant after initialization */ + refcount_t count; + union { + unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES]; /* Frames */ + struct { + /* + * An important invariant of the implementation is to + * only place a stack record onto the freelist iff its + * refcount is zero. Because stack records with a zero + * refcount are never considered as valid, it is safe to + * union @entries and freelist management state below. + * Conversely, as soon as an entry is off the freelist + * and its refcount becomes non-zero, the below must not + * be accessed until being placed back on the freelist. + */ + struct list_head free_list; /* Links in the freelist */ + unsigned long rcu_state; /* RCU cookie */ + }; + }; +}; +#endif + typedef u32 depot_flags_t; /* diff --git a/lib/stackdepot.c b/lib/stackdepot.c index c043a4186bc598..514b8d40ff57b4 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -36,55 +36,12 @@ #include #include -#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8) - -#define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */ -#define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER)) -#define DEPOT_STACK_ALIGN 4 -#define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN) -#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \ - STACK_DEPOT_EXTRA_BITS) #define DEPOT_POOLS_CAP 8192 /* The pool_index is offset by 1 so the first record does not have a 0 handle. */ #define DEPOT_MAX_POOLS \ (((1LL << (DEPOT_POOL_INDEX_BITS)) - 1 < DEPOT_POOLS_CAP) ? \ (1LL << (DEPOT_POOL_INDEX_BITS)) - 1 : DEPOT_POOLS_CAP) -/* Compact structure that stores a reference to a stack. 
*/ -union handle_parts { - depot_stack_handle_t handle; - struct { - u32 pool_index : DEPOT_POOL_INDEX_BITS; /* pool_index is offset by 1 */ - u32 offset : DEPOT_OFFSET_BITS; - u32 extra : STACK_DEPOT_EXTRA_BITS; - }; -}; - -struct stack_record { - struct list_head hash_list; /* Links in the hash table */ - u32 hash; /* Hash in hash table */ - u32 size; /* Number of stored frames */ - union handle_parts handle; /* Constant after initialization */ - refcount_t count; - union { - unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES]; /* Frames */ - struct { - /* - * An important invariant of the implementation is to - * only place a stack record onto the freelist iff its - * refcount is zero. Because stack records with a zero - * refcount are never considered as valid, it is safe to - * union @entries and freelist management state below. - * Conversely, as soon as an entry is off the freelist - * and its refcount becomes non-zero, the below must not - * be accessed until being placed back on the freelist. - */ - struct list_head free_list; /* Links in the freelist */ - unsigned long rcu_state; /* RCU cookie */ - }; - }; -}; - static bool stack_depot_disabled; static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); static bool __stack_depot_early_init_passed __initdata; From 5321a8be8ff1f50e0931e1ab5fc2a76f15ccdf5a Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Feb 2024 22:59:03 +0100 Subject: [PATCH 1252/1406] mm,page_owner: maintain own list of stack_records structs page_owner needs to increment a stack_record refcount when a new allocation occurs, and decrement it on a free operation. In order to do that, we need to have a way to get a stack_record from a handle. Implement __stack_depot_get_stack_record() which just does that, and make it public so page_owner can use it. Also, traversing all stackdepot buckets comes with its own complexity, plus we would have to implement a way to mark only those stack_records that originated from page_owner, as those are the ones we are interested in. For that reason, page_owner maintains its own list of stack_records, because traversing that list is faster than traversing all buckets while keeping the complexity low at the same time. For now, add to stack_list only the stack_records of dummy_handle and failure_handle, and set their refcount to 1. Further patches will add code to increment or decrement the stack_records count on allocation and free operations. Link: https://lkml.kernel.org/r/20240215215907.20121-4-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Vlastimil Babka Reviewed-by: Marco Elver Acked-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 11 +++++++++++ lib/stackdepot.c | 8 ++++++++ mm/page_owner.c | 15 +++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index c4b5ad57c0660c..3c6caa5abc7c42 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -178,6 +178,17 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags); +/** + * __stack_depot_get_stack_record - Get a pointer to a stack_record struct + * + * @handle: Stack depot handle + * + * This function is only for internal purposes.
+ * + * Return: Returns a pointer to a stack_record struct + */ +struct stack_record *__stack_depot_get_stack_record(depot_stack_handle_t handle); + /** * stack_depot_fetch - Fetch a stack trace from stack depot * diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 514b8d40ff57b4..8c795bb20afb22 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -687,6 +687,14 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, } EXPORT_SYMBOL_GPL(stack_depot_save); +struct stack_record *__stack_depot_get_stack_record(depot_stack_handle_t handle) +{ + if (!handle) + return NULL; + + return depot_fetch_stack(handle); +} + unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries) { diff --git a/mm/page_owner.c b/mm/page_owner.c index 5634e5d890f881..33e342b15d9b7f 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -36,6 +36,14 @@ struct page_owner { pid_t free_tgid; }; +struct stack { + struct stack_record *stack_record; + struct stack *next; +}; +static struct stack dummy_stack; +static struct stack failure_stack; +static struct stack *stack_list; + static bool page_owner_enabled __initdata; DEFINE_STATIC_KEY_FALSE(page_owner_inited); @@ -95,6 +103,13 @@ static __init void init_page_owner(void) register_early_stack(); static_branch_enable(&page_owner_inited); init_early_allocated_pages(); + /* Initialize dummy and failure stacks and link them to stack_list */ + dummy_stack.stack_record = __stack_depot_get_stack_record(dummy_handle); + failure_stack.stack_record = __stack_depot_get_stack_record(failure_handle); + refcount_set(&dummy_stack.stack_record->count, 1); + refcount_set(&failure_stack.stack_record->count, 1); + dummy_stack.next = &failure_stack; + stack_list = &dummy_stack; } struct page_ext_operations page_owner_ops = { From 9dda82b6179e6795047453f56f188b10162402d6 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Feb 2024 22:59:04 +0100 Subject: [PATCH 1253/1406] mm,page_owner: implement the tracking of the stacks count Implement {inc,dec}_stack_record_count() which increments or decrements on respective allocation and free operations, via __reset_page_owner() (free operation) and __set_page_owner() (alloc operation). Newly allocated stack_record structs will be added to the list stack_list via add_stack_record_to_list(). Modifications on the list are protected via a spinlock with irqs disabled, since this code can also be reached from IRQ context. 
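As a minimal sketch of the locking pattern described above (illustrative only; the example_* names are hypothetical, and struct stack is the page_owner wrapper introduced earlier in this series), a push onto the shared singly-linked list looks like:

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);
static struct stack *example_list;

static void example_push(struct stack *new)
{
	unsigned long flags;

	/* irqsave variant: the list can also be reached from IRQ context */
	spin_lock_irqsave(&example_lock, flags);
	new->next = example_list;
	example_list = new;
	spin_unlock_irqrestore(&example_lock, flags);
}
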
Link: https://lkml.kernel.org/r/20240215215907.20121-5-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Marco Elver Reviewed-by: Vlastimil Babka Acked-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/page_owner.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 33e342b15d9b7f..df6a923af5de6c 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -43,6 +43,7 @@ struct stack { static struct stack dummy_stack; static struct stack failure_stack; static struct stack *stack_list; +static DEFINE_SPINLOCK(stack_list_lock); static bool page_owner_enabled __initdata; DEFINE_STATIC_KEY_FALSE(page_owner_inited); @@ -150,11 +151,68 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags) return handle; } +static void add_stack_record_to_list(struct stack_record *stack_record, + gfp_t gfp_mask) +{ + unsigned long flags; + struct stack *stack; + + /* Filter gfp_mask the same way stackdepot does, for consistency */ + gfp_mask &= ~GFP_ZONEMASK; + gfp_mask &= (GFP_ATOMIC | GFP_KERNEL); + gfp_mask |= __GFP_NOWARN; + + stack = kmalloc(sizeof(*stack), gfp_mask); + if (!stack) + return; + + stack->stack_record = stack_record; + stack->next = NULL; + + spin_lock_irqsave(&stack_list_lock, flags); + stack->next = stack_list; + stack_list = stack; + spin_unlock_irqrestore(&stack_list_lock, flags); +} + +static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask) +{ + struct stack_record *stack_record = __stack_depot_get_stack_record(handle); + + if (!stack_record) + return; + + /* + * New stack_record's that do not use STACK_DEPOT_FLAG_GET start + * with REFCOUNT_SATURATED to catch spurious increments of their + * refcount. + * Since we do not use STACK_DEPOT_FLAG_GET API, let us + * set a refcount of 1 ourselves. + */ + if (refcount_read(&stack_record->count) == REFCOUNT_SATURATED) { + int old = REFCOUNT_SATURATED; + + if (atomic_try_cmpxchg_relaxed(&stack_record->count.refs, &old, 1)) + /* Add the new stack_record to our list */ + add_stack_record_to_list(stack_record, gfp_mask); + } + refcount_inc(&stack_record->count); +} + +static void dec_stack_record_count(depot_stack_handle_t handle) +{ + struct stack_record *stack_record = __stack_depot_get_stack_record(handle); + + if (stack_record) + refcount_dec(&stack_record->count); +} + void __reset_page_owner(struct page *page, unsigned short order) { int i; struct page_ext *page_ext; depot_stack_handle_t handle; + depot_stack_handle_t alloc_handle; struct page_owner *page_owner; u64 free_ts_nsec = local_clock(); @@ -162,17 +220,29 @@ void __reset_page_owner(struct page *page, unsigned short order) if (unlikely(!page_ext)) return; + page_owner = get_page_owner(page_ext); + alloc_handle = page_owner->handle; + handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); for (i = 0; i < (1 << order); i++) { __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); - page_owner = get_page_owner(page_ext); page_owner->free_handle = handle; page_owner->free_ts_nsec = free_ts_nsec; page_owner->free_pid = current->pid; page_owner->free_tgid = current->tgid; page_ext = page_ext_next(page_ext); + page_owner = get_page_owner(page_ext); } page_ext_put(page_ext); + if (alloc_handle != early_handle) + /* + * early_handle is being set as a handle for all those + * early allocated pages. See init_pages_in_zone(). 
+ * Since their refcount is not being incremented because + * the machinery is not ready yet, we cannot decrement + * their refcount either. + */ + dec_stack_record_count(alloc_handle); } static inline void __set_page_owner_handle(struct page_ext *page_ext, @@ -214,6 +284,7 @@ noinline void __set_page_owner(struct page *page, unsigned short order, return; __set_page_owner_handle(page_ext, handle, order, gfp_mask); page_ext_put(page_ext); + inc_stack_record_count(handle, gfp_mask); } void __set_page_owner_migrate_reason(struct page *page, int reason) From 7c1bad8e479f418e1894bb4f08bb092ac03952a8 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Feb 2024 22:59:05 +0100 Subject: [PATCH 1254/1406] mm,page_owner: display all stacks and their count This patch adds a new directory called 'page_owner_stacks' under /sys/kernel/debug/, with a file called 'show_stacks' in it. Reading from that file will show all stacks that were added by page_owner followed by their counting, giving us a clear overview of stack <-> count relationship. E.g: prep_new_page+0xa9/0x120 get_page_from_freelist+0x801/0x2210 __alloc_pages+0x18b/0x350 alloc_pages_mpol+0x91/0x1f0 folio_alloc+0x14/0x50 filemap_alloc_folio+0xb2/0x100 __filemap_get_folio+0x14a/0x490 ext4_write_begin+0xbd/0x4b0 [ext4] generic_perform_write+0xc1/0x1e0 ext4_buffered_write_iter+0x68/0xe0 [ext4] ext4_file_write_iter+0x70/0x740 [ext4] vfs_write+0x33d/0x420 ksys_write+0xa5/0xe0 do_syscall_64+0x80/0x160 entry_SYSCALL_64_after_hwframe+0x6e/0x76 stack_count: 4578 The seq stack_{start,next} functions will iterate through the list stack_list in order to print all stacks. Link: https://lkml.kernel.org/r/20240215215907.20121-6-osalvador@suse.de Signed-off-by: Oscar Salvador Acked-by: Marco Elver Reviewed-by: Vlastimil Babka Acked-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/page_owner.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index df6a923af5de6c..e99fbf822dd693 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -171,7 +171,13 @@ static void add_stack_record_to_list(struct stack_record *stack_record, spin_lock_irqsave(&stack_list_lock, flags); stack->next = stack_list; - stack_list = stack; + /* + * This pairs with smp_load_acquire() from function + * stack_start(). This guarantees that stack_start() + * will see an updated stack_list before starting to + * traverse the list. + */ + smp_store_release(&stack_list, stack); spin_unlock_irqrestore(&stack_list_lock, flags); } @@ -805,8 +811,90 @@ static const struct file_operations proc_page_owner_operations = { .llseek = lseek_page_owner, }; +static void *stack_start(struct seq_file *m, loff_t *ppos) +{ + struct stack *stack; + + if (*ppos == -1UL) + return NULL; + + if (!*ppos) { + /* + * This pairs with smp_store_release() from function + * add_stack_record_to_list(), so we get a consistent + * value of stack_list. + */ + stack = smp_load_acquire(&stack_list); + } else { + stack = m->private; + stack = stack->next; + } + + m->private = stack; + + return stack; +} + +static void *stack_next(struct seq_file *m, void *v, loff_t *ppos) +{ + struct stack *stack = v; + + stack = stack->next; + *ppos = stack ? 
*ppos + 1 : -1UL; + m->private = stack; + + return stack; +} + +static int stack_print(struct seq_file *m, void *v) +{ + int i, stack_count; + struct stack *stack = v; + unsigned long *entries; + unsigned long nr_entries; + struct stack_record *stack_record = stack->stack_record; + + nr_entries = stack_record->size; + entries = stack_record->entries; + stack_count = refcount_read(&stack_record->count) - 1; + + if (!nr_entries || nr_entries < 0 || stack_count < 1) + return 0; + + for (i = 0; i < nr_entries; i++) + seq_printf(m, " %pS\n", (void *)entries[i]); + seq_printf(m, "stack_count: %d\n\n", stack_count); + + return 0; +} + +static void stack_stop(struct seq_file *m, void *v) +{ +} + +static const struct seq_operations page_owner_stack_op = { + .start = stack_start, + .next = stack_next, + .stop = stack_stop, + .show = stack_print +}; + +static int page_owner_stack_open(struct inode *inode, struct file *file) +{ + return seq_open_private(file, &page_owner_stack_op, 0); +} + +static const struct file_operations page_owner_stack_operations = { + .open = page_owner_stack_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static int __init pageowner_init(void) { + struct dentry *dir; + if (!static_branch_unlikely(&page_owner_inited)) { pr_info("page_owner is disabled\n"); return 0; @@ -814,6 +902,9 @@ static int __init pageowner_init(void) debugfs_create_file("page_owner", 0400, NULL, NULL, &proc_page_owner_operations); + dir = debugfs_create_dir("page_owner_stacks", NULL); + debugfs_create_file("show_stacks", 0400, dir, NULL, + &page_owner_stack_operations); return 0; } From 2cfb2fcdace0a434719606a9ec6a6492dfdf5fdf Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Feb 2024 22:59:06 +0100 Subject: [PATCH 1255/1406] mm,page_owner: filter out stacks by a threshold We want to be able to filter out the stacks based on a threshold we can tune. By writing to the 'count_threshold' file, we can adjust the threshold value.
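Since the threshold is a single word-sized value, the implementation below can avoid a lock and rely on READ_ONCE()/WRITE_ONCE() to prevent torn or refetched accesses. A minimal sketch of that pattern (illustrative only, hypothetical example_* names):

static unsigned long example_threshold;

/* Reader: tolerates concurrent updates, but never sees a torn value. */
static bool example_above_threshold(unsigned long count)
{
	return count >= READ_ONCE(example_threshold);
}

/* Writer: a single aligned word store, so no lock is needed. */
static void example_set_threshold(unsigned long val)
{
	WRITE_ONCE(example_threshold, val);
}
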
Link: https://lkml.kernel.org/r/20240215215907.20121-7-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Vlastimil Babka Acked-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Marco Elver Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/page_owner.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index e99fbf822dd693..e56c1e92eccf5a 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -846,6 +846,8 @@ static void *stack_next(struct seq_file *m, void *v, loff_t *ppos) return stack; } +static unsigned long page_owner_stack_threshold; + static int stack_print(struct seq_file *m, void *v) { int i, stack_count; @@ -858,7 +860,8 @@ static int stack_print(struct seq_file *m, void *v) entries = stack_record->entries; stack_count = refcount_read(&stack_record->count) - 1; - if (!nr_entries || nr_entries < 0 || stack_count < 1) + if (!nr_entries || nr_entries < 0 || stack_count < 1 || + stack_count < page_owner_stack_threshold) return 0; for (i = 0; i < nr_entries; i++) @@ -891,6 +894,22 @@ static const struct file_operations page_owner_stack_operations = { .release = seq_release, }; +static int page_owner_threshold_get(void *data, u64 *val) +{ + *val = READ_ONCE(page_owner_stack_threshold); + return 0; +} + +static int page_owner_threshold_set(void *data, u64 val) +{ + WRITE_ONCE(page_owner_stack_threshold, val); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(proc_page_owner_threshold, &page_owner_threshold_get, + &page_owner_threshold_set, "%llu"); + + static int __init pageowner_init(void) { struct dentry *dir; @@ -905,6 +924,8 @@ static int __init pageowner_init(void) dir = debugfs_create_dir("page_owner_stacks", NULL); debugfs_create_file("show_stacks", 0400, dir, NULL, &page_owner_stack_operations); + debugfs_create_file("count_threshold", 0600, dir, NULL, + &proc_page_owner_threshold); return 0; } From 8e684f0a11127d06d296de9d455cf5bfc01a6f0d Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Feb 2024 22:59:07 +0100 Subject: [PATCH 1256/1406] mm,page_owner: update Documentation regarding page_owner_stacks Update page_owner documentation including the new page_owner_stacks feature to show how it can be used. Link: https://lkml.kernel.org/r/20240215215907.20121-8-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Vlastimil Babka Reviewed-by: Marco Elver Acked-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Michal Hocko Signed-off-by: Andrew Morton --- Documentation/mm/page_owner.rst | 45 +++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst index 62e3f7ab23cc18..0d0334cd51798b 100644 --- a/Documentation/mm/page_owner.rst +++ b/Documentation/mm/page_owner.rst @@ -24,6 +24,11 @@ fragmentation statistics can be obtained through gfp flag information of each page. It is already implemented and activated if page owner is enabled. Other usages are more than welcome. +It can also be used to show all the stacks and their outstanding +allocations, which gives us a quick overview of where the memory is going +without the need to screen through all the pages and match the allocation +and free operation. + page owner is disabled by default. So, if you'd like to use it, you need to add "page_owner=on" to your boot cmdline. 
If the kernel is built with page owner and page owner is disabled in runtime due to not enabling @@ -68,6 +73,46 @@ Usage 4) Analyze information from page owner:: + cat /sys/kernel/debug/page_owner_stacks/show_stacks > stacks.txt + cat stacks.txt + prep_new_page+0xa9/0x120 + get_page_from_freelist+0x7e6/0x2140 + __alloc_pages+0x18a/0x370 + new_slab+0xc8/0x580 + ___slab_alloc+0x1f2/0xaf0 + __slab_alloc.isra.86+0x22/0x40 + kmem_cache_alloc+0x31b/0x350 + __khugepaged_enter+0x39/0x100 + dup_mmap+0x1c7/0x5ce + copy_process+0x1afe/0x1c90 + kernel_clone+0x9a/0x3c0 + __do_sys_clone+0x66/0x90 + do_syscall_64+0x7f/0x160 + entry_SYSCALL_64_after_hwframe+0x6c/0x74 + stack_count: 234 + ... + ... + echo 7000 > /sys/kernel/debug/page_owner_stacks/count_threshold + cat /sys/kernel/debug/page_owner_stacks/show_stacks> stacks_7000.txt + cat stacks_7000.txt + prep_new_page+0xa9/0x120 + get_page_from_freelist+0x7e6/0x2140 + __alloc_pages+0x18a/0x370 + alloc_pages_mpol+0xdf/0x1e0 + folio_alloc+0x14/0x50 + filemap_alloc_folio+0xb0/0x100 + page_cache_ra_unbounded+0x97/0x180 + filemap_fault+0x4b4/0x1200 + __do_fault+0x2d/0x110 + do_pte_missing+0x4b0/0xa30 + __handle_mm_fault+0x7fa/0xb70 + handle_mm_fault+0x125/0x300 + do_user_addr_fault+0x3c9/0x840 + exc_page_fault+0x68/0x150 + asm_exc_page_fault+0x22/0x30 + stack_count: 8248 + ... + cat /sys/kernel/debug/page_owner > page_owner_full.txt ./page_owner_sort page_owner_full.txt sorted_page_owner.txt From cbfc257db768c0221b60001e0ec8b62b1fb76baf Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Feb 2024 16:58:38 -0800 Subject: [PATCH 1257/1406] Docs/mm/damon/maintainer-profile: fix reference links for mm-[un]stable tree Patch series "Docs/mm/damon: misc readability improvements". Fix trivial mistakes and improve layout of information on different documents for DAMON. This patch (of 5): A couple of sentences on maintainer-profile.rst are having reference links for mm-unstable and mm-stable trees with wrong rst markup. Fix those. Link: https://lkml.kernel.org/r/20240217005842.87348-1-sj@kernel.org Link: https://lkml.kernel.org/r/20240217005842.87348-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/mm/damon/maintainer-profile.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/mm/damon/maintainer-profile.rst b/Documentation/mm/damon/maintainer-profile.rst index a84c14e5905307..5a306e4de22e50 100644 --- a/Documentation/mm/damon/maintainer-profile.rst +++ b/Documentation/mm/damon/maintainer-profile.rst @@ -21,8 +21,8 @@ be queued in mm-stable [3]_ , and finally pull-requested to the mainline by the memory management subsystem maintainer. Note again the patches for review should be made against the mm-unstable -tree[1] whenever possible. damon/next is only for preview of others' works in -progress. +tree [1]_ whenever possible. damon/next is only for preview of others' works +in progress. Submit checklist addendum ------------------------- @@ -41,8 +41,8 @@ Further doing below and putting the results will be helpful. Key cycle dates --------------- -Patches can be sent anytime. Key cycle dates of the mm-unstable[1] and -mm-stable[3] trees depend on the memory management subsystem maintainer. +Patches can be sent anytime. Key cycle dates of the mm-unstable [1]_ and +mm-stable [3]_ trees depend on the memory management subsystem maintainer. 
Review cadence -------------- From 86d8550adb65fd948eb6c20b35a6063eddbe1eb4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Feb 2024 16:58:39 -0800 Subject: [PATCH 1258/1406] Docs/mm/damon: move the list of DAMOS actions to design doc DAMOS operation actions are explained nearly twice in the DAMON usage document, once for the sysfs interface, and then again for the debugfs interface. Duplication is bad. Also, it would be better to keep this kind of concept-level detail in the design document and keep the usage document small and focused on only the usage. Move the list to the design document and update the usage document to reference it. Link: https://lkml.kernel.org/r/20240217005842.87348-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 47 ++++++-------------- Documentation/mm/damon/design.rst | 26 +++++++++-- 2 files changed, 36 insertions(+), 37 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 58c34e66b31b2b..0335d584956b5e 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -302,27 +302,8 @@ In each scheme directory, five directories (``access_pattern``, ``quotas``, The ``action`` file is for setting and getting the scheme's :ref:`action `. The keywords that can be written to and read -from the file and their meaning are as below. - -Note that support of each action depends on the running DAMON operations set -:ref:`implementation `. - - - ``willneed``: Call ``madvise()`` for the region with ``MADV_WILLNEED``. - Supported by ``vaddr`` and ``fvaddr`` operations set. - - ``cold``: Call ``madvise()`` for the region with ``MADV_COLD``. - Supported by ``vaddr`` and ``fvaddr`` operations set. - - ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT``. - Supported by ``vaddr``, ``fvaddr`` and ``paddr`` operations set. - - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``. - Supported by ``vaddr`` and ``fvaddr`` operations set. - - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``. - Supported by ``vaddr`` and ``fvaddr`` operations set. - - ``lru_prio``: Prioritize the region on its LRU lists. - Supported by ``paddr`` operations set. - - ``lru_deprio``: Deprioritize the region on its LRU lists. - Supported by ``paddr`` operations set. - - ``stat``: Do nothing but count the statistics. - Supported by all operations sets. +from the file and their meaning are same to those of the list on +:ref:`design doc `. The ``apply_interval_us`` file is for setting and getting the scheme's :ref:`apply_interval ` in microseconds. @@ -763,19 +744,17 @@ Action ~~~~~~ The ```` is a predefined integer for memory management :ref:`actions -`. The supported numbers and their meanings are as -below. - - - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``. Ignored if - ``target`` is ``paddr``. - - 1: Call ``madvise()`` for the region with ``MADV_COLD``. Ignored if - ``target`` is ``paddr``. - - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT``. - - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``. Ignored if - ``target`` is ``paddr``. - - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``. Ignored if - ``target`` is ``paddr``. - - 5: Do nothing but count the statistics +`. The mapping between the ```` values and +the memory management actions is as below.
For the detailed meaning of the +action and DAMON operations set supporting each action, please refer to the +list on :ref:`design doc `. + + - 0: ``willneed`` + - 1: ``cold`` + - 2: ``pageout`` + - 3: ``hugepage`` + - 4: ``nohugepage`` + - 5: ``stat`` Quota ~~~~~ diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 1bb69524a62ea6..9f16c4e62e724f 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -294,9 +294,29 @@ not mandated to support all actions of the list. Hence, the availability of specific DAMOS action depends on what operations set is selected to be used together. -Applying an action to a region is considered as changing the region's -characteristics. Hence, DAMOS resets the age of regions when an action is -applied to those. +The list of the supported actions, their meaning, and DAMON operations sets +that supports each action are as below. + + - ``willneed``: Call ``madvise()`` for the region with ``MADV_WILLNEED``. + Supported by ``vaddr`` and ``fvaddr`` operations set. + - ``cold``: Call ``madvise()`` for the region with ``MADV_COLD``. + Supported by ``vaddr`` and ``fvaddr`` operations set. + - ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT``. + Supported by ``vaddr``, ``fvaddr`` and ``paddr`` operations set. + - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``. + Supported by ``vaddr`` and ``fvaddr`` operations set. + - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``. + Supported by ``vaddr`` and ``fvaddr`` operations set. + - ``lru_prio``: Prioritize the region on its LRU lists. + Supported by ``paddr`` operations set. + - ``lru_deprio``: Deprioritize the region on its LRU lists. + Supported by ``paddr`` operations set. + - ``stat``: Do nothing but count the statistics. + Supported by all operations sets. + +Applying the actions except ``stat`` to a region is considered as changing the +region's characteristics. Hence, DAMOS resets the age of regions when any such +actions are applied to those. .. _damon_design_damos_access_pattern: From afc858f0e6db69bf6b01457be6b90b7b92599b45 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Feb 2024 16:58:40 -0800 Subject: [PATCH 1259/1406] Docs/mm/damon: move DAMON operation sets list from the usage to the design document The list of DAMON operation sets and their explanation, which may better to be on design document, is written on the usage document. Move the detail to design document and make the usage document only reference the design document. Link: https://lkml.kernel.org/r/20240217005842.87348-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 19 +++++++------------ Documentation/mm/damon/design.rst | 12 ++++++++++-- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 0335d584956b5e..be0924f47a42cf 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -180,19 +180,14 @@ In each context directory, two files (``avail_operations`` and ``operations``) and three directories (``monitoring_attrs``, ``targets``, and ``schemes``) exist. -DAMON supports multiple types of monitoring operations, including those for -virtual address space and the physical address space. 
You can get the list of -available monitoring operations set on the currently running kernel by reading +DAMON supports multiple types of :ref:`monitoring operations +`, including those for virtual address +space and the physical address space. You can get the list of available +monitoring operations set on the currently running kernel by reading ``avail_operations`` file. Based on the kernel configuration, the file will -list some or all of below keywords. - - - vaddr: Monitor virtual address spaces of specific processes - - fvaddr: Monitor fixed virtual address ranges - - paddr: Monitor the physical address space of the system - -Please refer to :ref:`regions sysfs directory ` for detailed -differences between the operations sets in terms of the monitoring target -regions. +list different available operation sets. Please refer to the :ref:`design +` for the list of all available operation sets and their +brief explanations. You can set and get what type of monitoring operations DAMON will use for the context by writing one of the keywords listed in ``avail_operations`` file and diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 9f16c4e62e724f..6abf976dd71fdc 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -31,6 +31,8 @@ DAMON subsystem is configured with three layers including interfaces for the user space, on top of the core layer. +.. _damon_design_configurable_operations_set: + Configurable Operations Set --------------------------- @@ -63,6 +65,8 @@ modules that built on top of the core layer using the API, which can be easily used by the user space end users. +.. _damon_operations_set: + Operations Set Layer ==================== @@ -71,8 +75,12 @@ The monitoring operations are defined in two parts: 1. Identification of the monitoring target address range for the address space. 2. Access check of specific address range in the target space. -DAMON currently provides the implementations of the operations for the physical -and virtual address spaces. Below two subsections describe how those work. +DAMON currently provides below three operation sets. Below two subsections +describe how those work. + + - vaddr: Monitor virtual address spaces of specific processes + - fvaddr: Monitor fixed virtual address ranges + - paddr: Monitor the physical address space of the system VMA-based Target Address Range Construction From 55adb6aacb8d2d693e627300c2b0a938dadfbad4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Feb 2024 16:58:41 -0800 Subject: [PATCH 1260/1406] Docs/mm/damon: move monitoring target regions setup detail from the usage to the design document Design doc is aimed to have all concept level details, while the usage doc is focused on only how the features can be used. Some details about monitoring target regions construction is on the usage doc. Move the details about the monitoring target regions construction differences for DAMON operations set from the usage to the design doc. 
Link: https://lkml.kernel.org/r/20240217005842.87348-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 16 +++++----------- Documentation/mm/damon/design.rst | 12 +++++++++--- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index be0924f47a42cf..fefe62e0a46650 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -242,17 +242,11 @@ process to the ``pid_target`` file. targets//regions ------------------- -When ``vaddr`` monitoring operations set is being used (``vaddr`` is written to -the ``contexts//operations`` file), DAMON automatically sets and updates the -monitoring target regions so that entire memory mappings of target processes -can be covered. However, users could want to set the initial monitoring region -to specific address ranges. - -In contrast, DAMON do not automatically sets and updates the monitoring target -regions when ``fvaddr`` or ``paddr`` monitoring operations sets are being used -(``fvaddr`` or ``paddr`` have written to the ``contexts//operations``). -Therefore, users should set the monitoring target regions by themselves in the -cases. +In case of ``fvaddr`` or ``paddr`` monitoring operations sets, users are +required to set the monitoring target address ranges. In case of ``vaddr`` +operations set, it is not mandatory, but users can optionally set the initial +monitoring region to specific address ranges. Please refer to the :ref:`design +` for more details. For such cases, users can explicitly set the initial monitoring target regions as they want, by writing proper values to the files under this directory. diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 6abf976dd71fdc..2bd0c203dcfb78 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -83,12 +83,18 @@ describe how those work. - paddr: Monitor the physical address space of the system + .. _damon_design_vaddr_target_regions_construction: + VMA-based Target Address Range Construction ------------------------------------------- -This is only for the virtual address space monitoring operations -implementation. That for the physical address space simply asks users to -manually set the monitoring target address ranges. +A mechanism of ``vaddr`` DAMON operations set that automatically initializes +and updates the monitoring target address regions so that entire memory +mappings of the target processes can be covered. + +This mechanism is only for the ``vaddr`` operations set. In cases of +``fvaddr`` and ``paddr`` operation sets, users are asked to manually set the +monitoring target address ranges. Only small parts in the super-huge virtual address space of the processes are mapped to the physical memory and accessed. Thus, tracking the unmapped From 3b690bd4e83bd80d1e0843fa9b75b00e8c3a6b7e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Feb 2024 16:58:42 -0800 Subject: [PATCH 1261/1406] Docs/admin-guide/mm/damon/usage: fix wrong quotas diabling condition After the introduction of DAMOS quota goals, DAMOS quotas are no longer disabled when both size and time quotas are zero but a quota goal is set. The new rule is also applied to the DAMON sysfs interface, but the usage doc is not updated. Update it.
Link: https://lkml.kernel.org/r/20240217005842.87348-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index fefe62e0a46650..db6620b5bc0a40 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -329,7 +329,8 @@ respectively. Then, DAMON tries to use only up to ``time quota`` milliseconds for applying the ``action`` to memory regions of the ``access_pattern``, and to apply the action to only up to ``bytes`` bytes of memory regions within the ``reset_interval_ms``. Setting both ``ms`` and ``bytes`` zero disables the -quota limits. +quota limits unless at least one :ref:`goal ` is +set. Under ``weights`` directory, three files (``sz_permil``, ``nr_accesses_permil``, and ``age_permil``) exist. From ec5664d4d7c62b477336563fd0a24893bb31504b Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Sat, 17 Feb 2024 01:31:33 -0600 Subject: [PATCH 1262/1406] mm/mempolicy: use the already fetched local variable Avoid doing a per cpu read and use the local variable thisnid. IMHO this also makes the code more readable. Link: https://lkml.kernel.org/r/9c3f7b743477560d1c5b12b8c111a584a2cc92ee.1708097962.git.donettom@linux.ibm.com Signed-off-by: Aneesh Kumar K.V (IBM) Signed-off-by: Donet Tom Cc: Andrea Arcangeli Cc: Ben Widawsky Cc: Dan Williams Cc: Dave Hansen Cc: Feng Tang Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Johannes Weiner Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Peter Zijlstra (Intel) Cc: Rik van Riel Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 56f9a6ed939adf..77c98c32240874 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2780,7 +2780,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, if (node_isset(curnid, pol->nodes)) goto out; z = first_zones_zonelist( - node_zonelist(numa_node_id(), GFP_HIGHUSER), + node_zonelist(thisnid, GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->nodes); polnid = zone_to_nid(z->zone); From 07e33d7ad0e1b32190204d6a4546b4d3ac4309e0 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Sat, 17 Feb 2024 01:31:34 -0600 Subject: [PATCH 1263/1406] mm/mempolicy: avoid the fallthrough with MPOLD_BIND in mpol_misplaced. We will update MPOL_PREFERRED_MANY in the follow up patch. This change is required for that. In this patch there are no functional changes. 
Link: https://lkml.kernel.org/r/bf7e6779f842fb65cf7bb9b2c617feb2af271cb7.1708097962.git.donettom@linux.ibm.com Signed-off-by: Aneesh Kumar K.V (IBM) Signed-off-by: Donet Tom Cc: Andrea Arcangeli Cc: Ben Widawsky Cc: Dan Williams Cc: Dave Hansen Cc: Feng Tang Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Johannes Weiner Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Peter Zijlstra (Intel) Cc: Rik van Riel Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/mempolicy.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 77c98c32240874..81c540307a8bb3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2769,7 +2769,15 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, break; goto out; } - fallthrough; + + if (node_isset(curnid, pol->nodes)) + goto out; + z = first_zones_zonelist( + node_zonelist(thisnid, GFP_HIGHUSER), + gfp_zone(GFP_HIGHUSER), + &pol->nodes); + polnid = zone_to_nid(z->zone); + break; case MPOL_PREFERRED_MANY: /* From baa63c8de69fe6c03035a707df00c1c76206b74d Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Sat, 17 Feb 2024 01:31:35 -0600 Subject: [PATCH 1264/1406] mm/numa_balancing: allow migrate on protnone reference with MPOL_PREFERRED_MANY policy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit bda420b98505 ("numa balancing: migrate on fault among multiple bound nodes") added support for migrate on protnone reference with MPOL_BIND memory policy. This allowed numa fault migration when the executing node is part of the policy mask for MPOL_BIND. This patch extends migration support to MPOL_PREFERRED_MANY policy. Currently, we cannot specify MPOL_PREFERRED_MANY with the mempolicy flag MPOL_F_NUMA_BALANCING. This causes issues when we want to use NUMA_BALANCING_MEMORY_TIERING. To effectively use the slow memory tier, the kernel should not allocate pages from the slower memory tier via allocation control zonelist fallback. Instead, we should move cold pages from the faster memory node via memory demotion. For a page allocation, kswapd is only woken up after we try to allocate pages from all nodes in the allocation zone list. This implies that, without using memory policies, we will end up allocating hot pages in the slower memory tier. MPOL_PREFERRED_MANY was added by commit b27abaccf8e8 ("mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes") to allow better allocation control when we have memory tiers in the system. With MPOL_PREFERRED_MANY, the user can use a policy node mask consisting only of faster memory nodes. When we fail to allocate pages from the faster memory node, kswapd would be woken up, allowing demotion of cold pages to slower memory nodes. With the current kernel, such usage of memory policies implies we can't do page promotion from a slower memory tier to a faster memory tier using numa fault. This patch fixes this issue. For MPOL_PREFERRED_MANY, if the executing node is in the policy node mask, we allow numa migration to the executing nodes. If the executing node is not in the policy node mask but the folio is already allocated based on policy preference (the folio node is in the policy node mask), we don't allow numa migration. If both the executing node and folio node are outside the policy node mask, we allow numa migration to the executing nodes. 
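For completeness, below is a sketch of how a user-space task could opt in to this behavior once the patch is applied. It is illustrative only: the node numbers are assumptions about which nodes form the fast (DRAM) tier, and the fallback macro definitions are needed only where libc headers predate these UAPI flags.

#include <numaif.h>
#include <stdio.h>

#ifndef MPOL_PREFERRED_MANY
#define MPOL_PREFERRED_MANY 5
#endif
#ifndef MPOL_F_NUMA_BALANCING
#define MPOL_F_NUMA_BALANCING (1 << 13)
#endif

int main(void)
{
	/* assumption: nodes 0 and 1 are the fast (DRAM) tier */
	unsigned long nodemask = (1UL << 0) | (1UL << 1);

	if (set_mempolicy(MPOL_PREFERRED_MANY | MPOL_F_NUMA_BALANCING,
			  &nodemask, 8 * sizeof(nodemask))) {
		perror("set_mempolicy"); /* EINVAL on kernels without this patch */
		return 1;
	}
	/* ... allocate memory and keep accessing it here ... */
	return 0;
}
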
I have a test program which allocates memory on a specified node and triggers promotion or migration by keeping the pages accessed. Without this patch, promotion or migration did not happen when MPOL_PREFERRED_MANY was set; with this patch, I could see pages getting migrated or promoted. My system has 2 CPU+DRAM nodes (Tier 1) and 1 PMEM node (Tier 2). Below are my test results. In the tables below, N0 and N1 are Tier 1 nodes and N6 is the Tier 2 node. Exec_Node is the execution node, Policy is the set of nodes in the nodemask, and "Curr Location Pages" is the node where the pages were located before migration or promotion started.

Tests Results
-------------
Scenario 1: the executing node is in the policy node mask
================================================================================
Exec_Node    Policy        Curr Location Pages    Observations
================================================================================
N0           N0 N1 N6      N1                     Pages migrated from N1 to N0
N0           N0 N1 N6      N6                     Pages promoted from N6 to N0
N0           N0 N1         N1                     Pages migrated from N1 to N0
N0           N0 N1         N6                     Pages promoted from N6 to N0

Scenario 2: the folio node is in the policy node mask and the executing node is not
================================================================================
Exec_Node    Policy        Curr Location Pages    Observations
================================================================================
N0           N1 N6         N1                     Pages are not migrated to N0
N0           N1 N6         N6                     Pages are not promoted to N0
N0           N1            N1                     Pages are not migrated to N0

Scenario 3: both the folio node and the executing node are outside the policy node mask
================================================================================
Exec_Node    Policy        Curr Location Pages    Observations
================================================================================
N0           N1            N6                     Pages promoted from N6 to N0
N0           N6            N1                     Pages migrated from N1 to N0

Link: https://lkml.kernel.org/r/8d7737208bd24e754dc7a538a3f7f02de84f1f72.1708097962.git.donettom@linux.ibm.com Signed-off-by: Aneesh Kumar K.V (IBM) Signed-off-by: Donet Tom Cc: Andrea Arcangeli Cc: Ben Widawsky Cc: Dan Williams Cc: Dave Hansen Cc: Feng Tang Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Johannes Weiner Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Peter Zijlstra (Intel) Cc: Rik van Riel Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/mempolicy.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 81c540307a8bb3..f60b4c99f13027 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1503,9 +1503,10 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; if (*flags & MPOL_F_NUMA_BALANCING) { - if (*mode != MPOL_BIND) + if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY) + *flags |= (MPOL_F_MOF | MPOL_F_MORON); + else return -EINVAL; - *flags |= (MPOL_F_MOF | MPOL_F_MORON); } return 0; } @@ -2713,6 +2714,23 @@ static void sp_free(struct sp_node *n) kmem_cache_free(sn_cache, n); } +static inline bool mpol_preferred_should_numa_migrate(int exec_node, int folio_node, + struct mempolicy
*pol) +{ + /* if the executing node is in the policy node mask, migrate */ + if (node_isset(exec_node, pol->nodes)) + return true; + + /* If the folio node is in policy node mask, don't migrate */ + if (node_isset(folio_node, pol->nodes)) + return false; + /* + * both the folio node and executing node are outside the policy nodemask, + * migrate as normal numa fault migration. + */ + return true; +} + /** * mpol_misplaced - check whether current folio node is valid in policy * @@ -2780,6 +2798,12 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, break; case MPOL_PREFERRED_MANY: + if (pol->flags & MPOL_F_MORON) { + if (!mpol_preferred_should_numa_migrate(thisnid, curnid, pol)) + goto out; + break; + } + /* * use current page if in policy nodemask, * else select nearest allowed node, if any. From 802ba04397191f624e07bc8cd13e0202f6741880 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Tue, 20 Feb 2024 10:19:35 +1300 Subject: [PATCH 1265/1406] mm: zswap: increase reject_compress_poor but not reject_compress_fail if compression returns ENOSPC We used to rely on the -ENOSPC returned by zpool_malloc() to increase reject_compress_poor. But the code wouldn't get there after commit 744e1885922a ("crypto: scomp - fix req->dst buffer overflow"), as the new code will goto out immediately after the special compression case happens, so there might no longer be a chance to execute zpool_malloc(). We are incorrectly increasing zswap_reject_compress_fail instead. Thus, we need to fix the counters handling right after compression returns ENOSPC. This patch also centralizes the counters handling for all of compress_poor, compress_fail and alloc_fail. Link: https://lkml.kernel.org/r/20240219211935.72394-1-21cnbao@gmail.com Fixes: 744e1885922a ("crypto: scomp - fix req->dst buffer overflow") Signed-off-by: Barry Song Cc: Sergey Senozhatsky Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 62fe307521c937..51de79aa86593d 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1021,12 +1021,12 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) { struct crypto_acomp_ctx *acomp_ctx; struct scatterlist input, output; + int comp_ret = 0, alloc_ret = 0; unsigned int dlen = PAGE_SIZE; unsigned long handle; struct zpool *zpool; char *buf; gfp_t gfp; - int ret; u8 *dst; acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); @@ -1057,26 +1057,18 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) * but in different threads running on different cpu, we have different * acomp instance, so multiple threads can do (de)compression in parallel.
*/ - ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); + comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); dlen = acomp_ctx->req->dlen; - if (ret) { - zswap_reject_compress_fail++; + if (comp_ret) goto unlock; - } zpool = zswap_find_zpool(entry); gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; if (zpool_malloc_support_movable(zpool)) gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; - ret = zpool_malloc(zpool, dlen, gfp, &handle); - if (ret == -ENOSPC) { - zswap_reject_compress_poor++; - goto unlock; - } - if (ret) { - zswap_reject_alloc_fail++; + alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle); + if (alloc_ret) goto unlock; - } buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); memcpy(buf, dst, dlen); @@ -1086,8 +1078,15 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) entry->length = dlen; unlock: + if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC) + zswap_reject_compress_poor++; + else if (comp_ret) + zswap_reject_compress_fail++; + else if (alloc_ret) + zswap_reject_alloc_fail++; + mutex_unlock(&acomp_ctx->mutex); - return ret == 0; + return comp_ret == 0 && alloc_ret == 0; } static void zswap_decompress(struct zswap_entry *entry, struct page *page) From 3a96ea36134526a09c6fe64381f597e72e8796b1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:12 -0800 Subject: [PATCH 1266/1406] mm/damon/core: set damos_quota->esz as public field and document Patch series "mm/damon: let DAMOS feeds and tame/auto-tune itself". The Aim-oriented Feedback-driven DAMOS Aggressiveness Auto-tuning patchset[1] which has merged since commit 9294a037c015 ("mm/damon/core: implement goal-oriented feedback-driven quota auto-tuning") made the mechanism and the policy separated. That is, users can set a part of DAMOS control policies without a deep understanding of the mechanism but just their demands such as SLA. However, users are still required to do some additional work of manually collecting their target metric and feeding it to DAMOS. In the case of end-users who use DAMON sysfs interface, the context switches between user-space and kernel-space could also make it inefficient. The overhead is supposed to be only trivial in common cases, though. Meanwhile, in simple use cases, the target metric could be common system metrics that the kernel can efficiently self-retrieve, such as memory pressure stall time (PSI). Extend DAMOS quota auto-tuning to support multiple types of metrics including the DAMOS self-retrievable ones, and add support for memory pressure stall time metric. Different types of metrics can be supported in future. The auto-tuning capability is currently supported for only users of DAMOS kernel API and DAMON sysfs interface. Extend the support to DAMON_RECLAIM. Patches Sequence ================ First five patches are for helping debugging and fine-tuning existing quota control features. The first one (patch 1) exposes the effective quota that is made with given user inputs to DAMOS kernel API users and kernel-doc documents. Following four patches implement (patches 1, 2 and 3) and document (patches 4 and 5) a new DAMON sysfs file that exposes the value. Following six patches cleanup and simplify the existing DAMOS quota auto-tuning code by improving layout of comments and data structures (patches 6 and 7), supporting common use cases, namely multiple goals (patches 8, 9 and 10), and simplifying the interface (patch 11). 
Then six patches for the main purpose of this patchset follow. The first three changes extend the core logic for various target metrics (patch 12), implement memory pressure stall time-based target metric support (patch 13), and update DAMON sysfs interface to support the new target metric (patch 14). Then, documentation updates for the features on design (patch 15), ABI (patch 16), and usage (patch 17) follow. Last three patches add auto-tuning support on DAMON_RECLAIM. The patches implement DAMON_RECLAIM parameters for user-feedback driven quota auto-tuning (patch 18), memory pressure stall time-driven quota self-tuning (patch 19), and finally update the DAMON_RECLAIM usage document for the new parameters (patch 20). [1] https://lore.kernel.org/all/20231130023652.50284-1-sj@kernel.org/ This patch (of 20): DAMOS allow users to specify the quota as they want in multiple ways including time quota, size quota, and feedback-based auto-tuning. DAMOS makes one effective quota out of the inputs and use it at the end. Knowing the current effective quota helps understanding DAMOS' internal mechanism and fine-tuning quotas. DAMON kernel API users can get the information from ->esz field of damos_quota struct, but the field is marked as private purpose, and not kernel-doc documented. Make it public and document. Link: https://lkml.kernel.org/r/20240219194431.159606-1-sj@kernel.org Link: https://lkml.kernel.org/r/20240219194431.159606-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++-- mm/damon/core.c | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 5881e4ac30be6a..93ef45b87b9cd3 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -138,6 +138,7 @@ enum damos_action { * * @get_score: Feedback function for self-tuning quota. * @get_score_arg: Parameter for @get_score + * @esz: Effective size quota in bytes. * * To avoid consuming too much CPU time or IO resources for applying the * &struct damos->action to large memory, DAMON allows users to set time and/or @@ -167,6 +168,8 @@ enum damos_action { * tuning is getting the feedback screo value of 10,000. If @ms and/or @sz are * set together, those work as a hard limit quota. If neither @ms nor @sz are * set, the mechanism starts from the quota of one byte. + * + * The resulting effective size quota in bytes is set to @esz. 
*/ struct damos_quota { unsigned long ms; @@ -179,14 +182,13 @@ struct damos_quota { unsigned long (*get_score)(void *arg); void *get_score_arg; + unsigned long esz; /* private: */ /* For throughput estimation */ unsigned long total_charged_sz; unsigned long total_charged_ns; - unsigned long esz; /* Effective size quota in bytes */ - /* For charging the quota */ unsigned long charged_sz; unsigned long charged_from; diff --git a/mm/damon/core.c b/mm/damon/core.c index 5b325749fc1259..0656966a6fc435 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -299,12 +299,12 @@ void damos_destroy_filter(struct damos_filter *f) damos_free_filter(f); } -/* initialize private fields of damos_quota and return the pointer */ -static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota) +/* initialize fields of @quota that normally API users wouldn't set */ +static struct damos_quota *damos_quota_init(struct damos_quota *quota) { + quota->esz = 0; quota->total_charged_sz = 0; quota->total_charged_ns = 0; - quota->esz = 0; quota->charged_sz = 0; quota->charged_from = 0; quota->charge_target_from = NULL; @@ -336,7 +336,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); - scheme->quota = *(damos_quota_init_priv(quota)); + scheme->quota = *(damos_quota_init(quota)); scheme->wmarks = *wmarks; scheme->wmarks.activated = true; From 0cf2d06d9bf0b0d64c11add13e106fdcc5f7fdb6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:13 -0800 Subject: [PATCH 1267/1406] mm/damon/sysfs-schemes: implement quota effective_bytes file DAMON sysfs interface allows users to set two types of quotas, namely time quota and size quota. DAMOS converts time quota to a size quota and use smaller one among the resulting two size quotas. The resulting effective size quota can be helpful for debugging and analysis, but not exposed to the user. The recently added feedback-driven quota auto-tuning is making it even more mysterious. Implement a DAMON sysfs interface read-only empty file, namely 'effective_bytes', under the quota goal DAMON sysfs directory. It will be extended to expose the effective quota to the end user. 
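Conceptually, the effective quota is the smaller of the size quota and the time quota once the latter has been converted into bytes using the apply throughput that DAMOS estimates. A rough sketch of that reduction (illustrative only, not the in-tree code; example_* is a hypothetical name):

/* bytes_per_ms: throughput DAMOS estimated while applying the action */
static unsigned long example_effective_quota(unsigned long sz, unsigned long ms,
					     unsigned long bytes_per_ms)
{
	unsigned long esz_from_time = ms * bytes_per_ms;

	if (!ms)
		return sz;		/* only the size quota is set */
	if (!sz)
		return esz_from_time;	/* only the time quota is set */
	return min(esz_from_time, sz);	/* both act as hard limits */
}
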
Link: https://lkml.kernel.org/r/20240219194431.159606-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index f6c7f43f06cc07..dd46b2db5455af 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1139,6 +1139,7 @@ struct damon_sysfs_quotas { unsigned long ms; unsigned long sz; unsigned long reset_interval_ms; + unsigned long effective_sz; /* Effective size quota in bytes */ }; static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) @@ -1252,6 +1253,15 @@ static ssize_t reset_interval_ms_store(struct kobject *kobj, return count; } +static ssize_t effective_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->effective_sz); +} + static void damon_sysfs_quotas_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_quotas, kobj)); @@ -1266,10 +1276,14 @@ static struct kobj_attribute damon_sysfs_quotas_sz_attr = static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr = __ATTR_RW_MODE(reset_interval_ms, 0600); +static struct kobj_attribute damon_sysfs_quotas_effective_bytes_attr = + __ATTR_RO_MODE(effective_bytes, 0400); + static struct attribute *damon_sysfs_quotas_attrs[] = { &damon_sysfs_quotas_ms_attr.attr, &damon_sysfs_quotas_sz_attr.attr, &damon_sysfs_quotas_reset_interval_ms_attr.attr, + &damon_sysfs_quotas_effective_bytes_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_quotas); From a2a040702e74063789f2bc0620b53707713d44a3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:14 -0800 Subject: [PATCH 1268/1406] mm/damon/sysfs: implement a kdamond command for updating schemes' effective quotas Implement yet another kdamond 'state' file input command, namely 'update_schemes_effective_quotas'. If it is written, the 'effective_bytes' files of the kdamond will be updated to provide the current effective size quota of each scheme in bytes. 
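An illustrative user-space flow for this command (the kdamond, context and scheme indexes below are example values; the paths follow the ABI documented later in this series):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *cmd = "update_schemes_effective_quotas";
	char buf[64];
	ssize_t n;
	int fd;

	/* ask the first kdamond to refresh its effective_bytes files */
	fd = open("/sys/kernel/mm/damon/admin/kdamonds/0/state", O_WRONLY);
	if (fd < 0)
		return 1;
	if (write(fd, cmd, strlen(cmd)) < 0)
		perror("write state");
	close(fd);

	/* read the refreshed value for the first scheme of the first context */
	fd = open("/sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/quotas/effective_bytes",
		  O_RDONLY);
	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("effective quota: %s", buf);
	}
	close(fd);
	return 0;
}
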
Link: https://lkml.kernel.org/r/20240219194431.159606-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs-common.h | 4 ++++ mm/damon/sysfs-schemes.c | 20 ++++++++++++++++++++ mm/damon/sysfs.c | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index ec0703e1e90b69..5a1ac15fb2f8b7 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -61,3 +61,7 @@ int damon_sysfs_schemes_clear_regions( void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx); + +void damos_sysfs_update_effective_quotas( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index dd46b2db5455af..9d90e7b757b765 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1930,6 +1930,26 @@ void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, } } +void damos_sysfs_update_effective_quotas( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + struct damos *scheme; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_quotas *sysfs_quotas; + + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + break; + + sysfs_quotas = + sysfs_schemes->schemes_arr[schemes_idx++]->quotas; + sysfs_quotas->effective_sz = scheme->quota.esz; + } +} + static struct damos *damon_sysfs_mk_scheme( struct damon_sysfs_scheme *sysfs_scheme) { diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 678de97fcc888d..cc2d88a901f45f 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1019,6 +1019,11 @@ enum damon_sysfs_cmd { * regions */ DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS, + /* + * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS: Update the + * effective size quota of the scheme in bytes. + */ + DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS, /* * @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands. */ @@ -1035,6 +1040,7 @@ static const char * const damon_sysfs_cmd_strs[] = { "update_schemes_tried_bytes", "update_schemes_tried_regions", "clear_schemes_tried_regions", + "update_schemes_effective_quotas", }; /* @@ -1375,6 +1381,29 @@ static int damon_sysfs_commit_schemes_quota_goals( return 0; } +/* + * damon_sysfs_upd_schemes_effective_quotas() - Update schemes effective quotas + * sysfs files. + * @kdamond: The kobject wrapper that associated to the kdamond thread. + * + * This function reads the schemes' effective quotas of specific kdamond and + * update the related values for sysfs files. This function should be called + * from DAMON callbacks while holding ``damon_syfs_lock``, to safely access the + * DAMON contexts-internal data and DAMON sysfs variables. + */ +static int damon_sysfs_upd_schemes_effective_quotas( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + damos_sysfs_update_effective_quotas( + kdamond->contexts->contexts_arr[0]->schemes, ctx); + return 0; +} + + /* * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests. * @c: The DAMON context of the callback. 
@@ -1437,6 +1466,9 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active, case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: err = damon_sysfs_clear_schemes_regions(kdamond); break; + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS: + err = damon_sysfs_upd_schemes_effective_quotas(kdamond); + break; default: break; } From 955f0cdbce11d59e2a625be53203298fbe5fddf2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:15 -0800 Subject: [PATCH 1269/1406] Docs/ABI/damon: document effective_bytes sysfs file Update the DAMON ABI doc for the effective_bytes sysfs file and the kdamond state file input command for updating the content of the file. Link: https://lkml.kernel.org/r/20240219194431.159606-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-damon | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index bfa5b8288d8d11..a1e4fdb04f951e 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -34,7 +34,9 @@ Description: Writing 'on' or 'off' to this file makes the kdamond starts or kdamond. Writing 'update_schemes_tried_bytes' to the file updates only '.../tried_regions/total_bytes' files of this kdamond. Writing 'clear_schemes_tried_regions' to the file - removes contents of the 'tried_regions' directory. + removes contents of the 'tried_regions' directory. Writing + 'update_schemes_effective_quotas' to the file updates + '.../quotas/effective_bytes' files of this kdamond. What: /sys/kernel/mm/damon/admin/kdamonds//pid Date: Mar 2022 @@ -208,6 +210,12 @@ Contact: SeongJae Park Description: Writing to and reading from this file sets and gets the size quota of the scheme in bytes. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/effective_bytes +Date: Feb 2024 +Contact: SeongJae Park +Description: Reading from this file gets the effective size quota of the + scheme in bytes, which adjusted for the time quota and goals. + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/reset_interval_ms Date: Mar 2022 Contact: SeongJae Park From cc2e33b689b65c770bbf627cd29b7b38efd415f5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:16 -0800 Subject: [PATCH 1270/1406] Docs/admin-guide/mm/damon/usage: document effective_bytes file Update DAMON usage document for the effective quota file of the DAMON sysfs interface. Link: https://lkml.kernel.org/r/20240219194431.159606-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index db6620b5bc0a40..220ebbde7324a7 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -83,7 +83,7 @@ comma (","). 
 │ │ │ │ │ │ │ │ sz/min,max
 │ │ │ │ │ │ │ │ nr_accesses/min,max
 │ │ │ │ │ │ │ │ age/min,max
-│ │ │ │ │ │ │ :ref:`quotas <sysfs_quotas>`/ms,bytes,reset_interval_ms
+│ │ │ │ │ │ │ :ref:`quotas <sysfs_quotas>`/ms,bytes,reset_interval_ms,effective_bytes
 │ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil
 │ │ │ │ │ │ │ │ :ref:`goals <sysfs_schemes_quota_goals>`/nr_goals
 │ │ │ │ │ │ │ │ │ 0/target_value,current_value
@@ -153,6 +153,9 @@ Users can write below commands for the kdamond to the ``state`` file.
 - ``clear_schemes_tried_regions``: Clear the DAMON-based operating scheme
   action tried regions directory for each DAMON-based operation scheme of the
   kdamond.
+- ``update_schemes_effective_quotas``: Update the contents of
+  ``effective_bytes`` files for each DAMON-based operation scheme of the
+  kdamond.  For more details, refer to :ref:`quotas directory <sysfs_quotas>`.
 
 If the state is ``on``, reading ``pid`` shows the pid of the kdamond thread.
 
@@ -320,8 +323,9 @@ schemes/<N>/quotas/
 The directory for the :ref:`quotas <damon_design_damos_quotas>` of the given
 DAMON-based operation scheme.
 
-Under ``quotas`` directory, three files (``ms``, ``bytes``,
-``reset_interval_ms``) and two directores (``weights`` and ``goals``) exist.
+Under the ``quotas`` directory, four files (``ms``, ``bytes``,
+``reset_interval_ms``, ``effective_bytes``) and two directories (``weights``
+and ``goals``) exist.
 
 You can set the ``time quota`` in milliseconds, ``size quota`` in bytes, and
 ``reset interval`` in milliseconds by writing the values to the three files,
@@ -332,6 +336,15 @@ apply the action to only up to ``bytes`` bytes of memory regions within the
 quota limits unless at least one :ref:`goal <sysfs_schemes_quota_goals>` is
 set.
 
+The time quota is internally transformed to a size quota.  Between the
+transformed size quota and the user-specified size quota, the smaller one is
+applied.  Based on the user-specified :ref:`goal <sysfs_schemes_quota_goals>`,
+the effective size quota is further adjusted.  Reading ``effective_bytes``
+returns the current effective size quota.  The file is not updated in real
+time, so users should ask the DAMON sysfs interface to update its content by
+writing a special keyword, ``update_schemes_effective_quotas``, to the
+relevant ``kdamonds/<N>/state`` file.
+
 Under ``weights`` directory, three files (``sz_permil``,
 ``nr_accesses_permil``, and ``age_permil``) exist.
 You can set the :ref:`prioritization weights

From bec3590c573cb1cf58068f6f7b7a4d3e523881d1 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 19 Feb 2024 11:44:17 -0800
Subject: [PATCH 1271/1406] mm/damon: move comments and fields for
 damos-quota-prioritization to the end

The comments and definition of 'struct damos_quota' list a few fields for
effective quota generation first, then fields for regions prioritization
under the quota, and then the remaining fields for effective quota
generation.  Readers therefore have to switch their context unnecessarily
in the middle.  List all the fields for the effective quota first, and
then the fields for the prioritization, to make the struct easier to read.

Link: https://lkml.kernel.org/r/20240219194431.159606-7-sj@kernel.org
Signed-off-by: SeongJae Park
Signed-off-by: Andrew Morton
---
 include/linux/damon.h | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 93ef45b87b9cd3..bd17b14828bc27 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -128,18 +128,17 @@ enum damos_action {
 /**
  * struct damos_quota - Controls the aggressiveness of the given scheme.
+ * @reset_interval:	Charge reset interval in milliseconds.
* @ms: Maximum milliseconds that the scheme can use. * @sz: Maximum bytes of memory that the action can be applied. - * @reset_interval: Charge reset interval in milliseconds. + * @get_score: Feedback function for self-tuning quota. + * @get_score_arg: Parameter for @get_score + * @esz: Effective size quota in bytes. * * @weight_sz: Weight of the region's size for prioritization. * @weight_nr_accesses: Weight of the region's nr_accesses for prioritization. * @weight_age: Weight of the region's age for prioritization. * - * @get_score: Feedback function for self-tuning quota. - * @get_score_arg: Parameter for @get_score - * @esz: Effective size quota in bytes. - * * To avoid consuming too much CPU time or IO resources for applying the * &struct damos->action to large memory, DAMON allows users to set time and/or * size quotas. The quotas can be set by writing non-zero values to &ms and @@ -152,12 +151,6 @@ enum damos_action { * throughput of the scheme's action. DAMON then compares it against &sz and * uses smaller one as the effective quota. * - * For selecting regions within the quota, DAMON prioritizes current scheme's - * target memory regions using the &struct damon_operations->get_scheme_score. - * You could customize the prioritization logic by setting &weight_sz, - * &weight_nr_accesses, and &weight_age, because monitoring operations are - * encouraged to respect those. - * * If @get_score function pointer is set, DAMON calls it back with * @get_score_arg and get the return value of it for every @reset_interval. * Then, DAMON adjusts the effective quota using the return value as a feedback @@ -170,20 +163,25 @@ enum damos_action { * set, the mechanism starts from the quota of one byte. * * The resulting effective size quota in bytes is set to @esz. + * + * For selecting regions within the quota, DAMON prioritizes current scheme's + * target memory regions using the &struct damon_operations->get_scheme_score. + * You could customize the prioritization logic by setting &weight_sz, + * &weight_nr_accesses, and &weight_age, because monitoring operations are + * encouraged to respect those. */ struct damos_quota { + unsigned long reset_interval; unsigned long ms; unsigned long sz; - unsigned long reset_interval; + unsigned long (*get_score)(void *arg); + void *get_score_arg; + unsigned long esz; unsigned int weight_sz; unsigned int weight_nr_accesses; unsigned int weight_age; - unsigned long (*get_score)(void *arg); - void *get_score_arg; - unsigned long esz; - /* private: */ /* For throughput estimation */ unsigned long total_charged_sz; From 0902086c470915d12d83f6c2aaf38dc40f12d4ba Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:18 -0800 Subject: [PATCH 1272/1406] mm/damon/core: split out quota goal related fields to a struct 'struct damos_quota' is not small now. Split out fields for quota goal to a separate struct for easier reading. Link: https://lkml.kernel.org/r/20240219194431.159606-8-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 36 ++++++++++++++++++++++-------------- mm/damon/core.c | 13 +++++++------ mm/damon/sysfs-schemes.c | 10 +++++----- 3 files changed, 34 insertions(+), 25 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index bd17b14828bc27..2fe345adf6b2c5 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -126,13 +126,28 @@ enum damos_action { NR_DAMOS_ACTIONS, }; +/** + * struct damos_quota_goal - DAMOS scheme quota auto-tuning goal. 
+ * @get_score: Function for getting current score of the goal. + * @get_score_arg: Parameter for @get_score + * + * Data structure for getting the current score of the quota tuning goal. + * Calling @get_score with @get_score_arg as the parameter should return the + * current score. Then the score is entered to DAMON's internal feedback loop + * mechanism to get the auto-tuned quota. The goal of the tuning is getting + * the feedback score value of 10,000. + */ +struct damos_quota_goal { + unsigned long (*get_score)(void *arg); + void *get_score_arg; +}; + /** * struct damos_quota - Controls the aggressiveness of the given scheme. * @reset_interval: Charge reset interval in milliseconds. * @ms: Maximum milliseconds that the scheme can use. * @sz: Maximum bytes of memory that the action can be applied. - * @get_score: Feedback function for self-tuning quota. - * @get_score_arg: Parameter for @get_score + * @goal: Quota auto-tuning goal. * @esz: Effective size quota in bytes. * * @weight_sz: Weight of the region's size for prioritization. @@ -151,16 +166,10 @@ enum damos_action { * throughput of the scheme's action. DAMON then compares it against &sz and * uses smaller one as the effective quota. * - * If @get_score function pointer is set, DAMON calls it back with - * @get_score_arg and get the return value of it for every @reset_interval. - * Then, DAMON adjusts the effective quota using the return value as a feedback - * score to the current quota, using its internal feedback loop algorithm. - * - * The feedback loop algorithem assumes the quota input and the feedback score - * output are in a positive proportional relationship, and the goal of the - * tuning is getting the feedback screo value of 10,000. If @ms and/or @sz are - * set together, those work as a hard limit quota. If neither @ms nor @sz are - * set, the mechanism starts from the quota of one byte. + * If ->get_score field of @goal is set, DAMON calculates yet another size + * quota based on the goal using its internal feedback loop algorithm, for + * every @reset_interval. Then, if the new size quota is smaller than the + * effective quota, it uses the new size quota as the effective quota. * * The resulting effective size quota in bytes is set to @esz. 
 *
@@ -174,8 +183,7 @@ struct damos_quota {
 	unsigned long reset_interval;
 	unsigned long ms;
 	unsigned long sz;
-	unsigned long (*get_score)(void *arg);
-	void *get_score_arg;
+	struct damos_quota_goal goal;
 	unsigned long esz;
 
 	unsigned int weight_sz;
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 0656966a6fc435..fe420967212159 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1083,21 +1083,22 @@ static unsigned long damon_feed_loop_next_input(unsigned long last_input,
 	return min_input;
 }
 
-/* Shouldn't be called if quota->ms, quota->sz, and quota->get_score unset */
+/* Called only if quota->ms, quota->sz, or quota->goal.get_score are set */
 static void damos_set_effective_quota(struct damos_quota *quota)
 {
 	unsigned long throughput;
 	unsigned long esz;
 
-	if (!quota->ms && !quota->get_score) {
+	if (!quota->ms && !quota->goal.get_score) {
 		quota->esz = quota->sz;
 		return;
 	}
 
-	if (quota->get_score) {
+	if (quota->goal.get_score) {
 		quota->esz_bp = damon_feed_loop_next_input(
 				max(quota->esz_bp, 10000UL),
-				quota->get_score(quota->get_score_arg));
+				quota->goal.get_score(
+					quota->goal.get_score_arg));
 		esz = quota->esz_bp / 10000;
 	}
 
@@ -1107,7 +1108,7 @@ static void damos_set_effective_quota(struct damos_quota *quota)
 			quota->total_charged_ns;
 	else
 		throughput = PAGE_SIZE * 1024;
-	if (quota->get_score)
+	if (quota->goal.get_score)
 		esz = min(throughput * quota->ms, esz);
 	else
 		esz = throughput * quota->ms;
@@ -1127,7 +1128,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
 	unsigned long cumulated_sz;
 	unsigned int score, max_score = 0;
 
-	if (!quota->ms && !quota->sz && !quota->get_score)
+	if (!quota->ms && !quota->sz && !quota->goal.get_score)
 		return;
 
 	/* New charge window starts */
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 9d90e7b757b765..85ef58f98a87c2 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -1894,19 +1894,19 @@ static void damos_sysfs_set_quota_score(
 	struct damos_sysfs_quota_goal *sysfs_goal;
 	int i;
 
-	quota->get_score = NULL;
-	quota->get_score_arg = (void *)0;
+	quota->goal.get_score = NULL;
+	quota->goal.get_score_arg = (void *)0;
 	for (i = 0; i < sysfs_goals->nr; i++) {
 		sysfs_goal = sysfs_goals->goals_arr[i];
 		if (!sysfs_goal->target_value)
 			continue;
 
 		/* Higher score makes scheme less aggressive */
-		quota->get_score_arg = (void *)max(
-				(unsigned long)quota->get_score_arg,
+		quota->goal.get_score_arg = (void *)max(
+				(unsigned long)quota->goal.get_score_arg,
 				sysfs_goal->current_value * 10000 /
 				sysfs_goal->target_value);
-		quota->get_score = damos_sysfs_get_quota_score;
+		quota->goal.get_score = damos_sysfs_get_quota_score;
 	}
 }
 

From 896c47b94526fd2101ae4ee1c49cda7ed507116f Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 19 Feb 2024 11:44:19 -0800
Subject: [PATCH 1273/1406] mm/damon/core: add multiple goals per damos_quota
 and helpers for those

The feedback-driven DAMOS quota auto-tuning feature allows only a single
goal to the DAMON kernel API users.  The API users could implement
multiple goals for the end-users on their own level, and that's what the
DAMON sysfs interface is doing.  More DAMON kernel API users such as
DAMON_RECLAIM would need to do similar work.  To reduce unnecessary future
duplicated effort, support multiple goals from the DAMOS core layer.  To
keep the change minimal and non-destructive, keep the old single goal
setup interface, and add the multiple goals setup on top of it.  The
single goal will be treated as one of the multiple goals, so old API users
are not required to make any change.
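To show the intended use of the new helpers, here is a sketch of a kernel API
user attaching two goals to one scheme.  The score callbacks are hypothetical
stand-ins (a real user would return a value where 10,000 means the goal is
exactly met), and error unwinding is trimmed:

/* hypothetical score callbacks; not part of the patch */
static unsigned long my_latency_score(void *arg)
{
	return 10000;	/* placeholder: scale a measured latency vs. a target */
}

static unsigned long my_psi_score(void *arg)
{
	return 10000;	/* placeholder: scale a measured PSI vs. a target */
}

static int example_attach_goals(struct damos *scheme)
{
	struct damos_quota_goal *goal;

	goal = damos_new_quota_goal(my_latency_score, NULL);
	if (!goal)
		return -ENOMEM;
	damos_add_quota_goal(&scheme->quota, goal);

	goal = damos_new_quota_goal(my_psi_score, NULL);
	if (!goal)
		return -ENOMEM;
	damos_add_quota_goal(&scheme->quota, goal);
	return 0;
}

As damos_quota_score() in the diff below shows, DAMOS feeds the highest of the
goals' scores into the feedback loop, so the most conservative goal wins.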
Link: https://lkml.kernel.org/r/20240219194431.159606-9-sj@kernel.org
Signed-off-by: SeongJae Park
Signed-off-by: Andrew Morton
---
 include/linux/damon.h | 17 ++++++++++
 mm/damon/core.c       | 78 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 88 insertions(+), 7 deletions(-)

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 2fe345adf6b2c5..4bd898eaf80eb1 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -130,6 +130,7 @@ enum damos_action {
  * struct damos_quota_goal - DAMOS scheme quota auto-tuning goal.
  * @get_score:		Function for getting current score of the goal.
  * @get_score_arg:	Parameter for @get_score
+ * @list:		List head for siblings.
  *
  * Data structure for getting the current score of the quota tuning goal.
  * Calling @get_score with @get_score_arg as the parameter should return the
@@ -140,6 +141,7 @@ enum damos_action {
 struct damos_quota_goal {
 	unsigned long (*get_score)(void *arg);
 	void *get_score_arg;
+	struct list_head list;
 };
 
 /**
@@ -148,6 +150,7 @@ struct damos_quota_goal {
  * @ms:			Maximum milliseconds that the scheme can use.
  * @sz:			Maximum bytes of memory that the action can be applied.
  * @goal:		Quota auto-tuning goal.
+ * @goals:		Head of quota tuning goals (&damos_quota_goal) list.
  * @esz:		Effective size quota in bytes.
 *
 * @weight_sz:		Weight of the region's size for prioritization.
@@ -171,6 +174,8 @@ struct damos_quota_goal {
 * every @reset_interval.  Then, if the new size quota is smaller than the
 * effective quota, it uses the new size quota as the effective quota.
 *
+ * If @goals is not empty, same action is taken for each goal of the list.
+ *
 * The resulting effective size quota in bytes is set to @esz.
 *
 * For selecting regions within the quota, DAMON prioritizes current scheme's
@@ -184,6 +189,7 @@ struct damos_quota {
 	unsigned long ms;
 	unsigned long sz;
 	struct damos_quota_goal goal;
+	struct list_head goals;
 	unsigned long esz;
 
 	unsigned int weight_sz;
@@ -648,6 +654,12 @@ static inline unsigned long damon_sz_region(struct damon_region *r)
 #define damon_for_each_scheme_safe(s, next, ctx) \
 	list_for_each_entry_safe(s, next, &(ctx)->schemes, list)
 
+#define damos_for_each_quota_goal(goal, quota) \
+	list_for_each_entry(goal, &(quota)->goals, list)
+
+#define damos_for_each_quota_goal_safe(goal, next, quota) \
+	list_for_each_entry_safe(goal, next, &(quota)->goals, list)
+
 #define damos_for_each_filter(f, scheme) \
 	list_for_each_entry(f, &(scheme)->filters, list)
 
@@ -681,6 +693,11 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type,
 void damos_add_filter(struct damos *s, struct damos_filter *f);
 void damos_destroy_filter(struct damos_filter *f);
 
+struct damos_quota_goal *damos_new_quota_goal(
+		unsigned long (*get_score)(void *), void *get_score_arg);
+void damos_add_quota_goal(struct damos_quota *q, struct damos_quota_goal *g);
+void damos_destroy_quota_goal(struct damos_quota_goal *goal);
+
 struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
 			enum damos_action action,
 			unsigned long apply_interval_us,
diff --git a/mm/damon/core.c b/mm/damon/core.c
index fe420967212159..b6cd99b64e8512 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -299,6 +299,41 @@ void damos_destroy_filter(struct damos_filter *f)
 	damos_free_filter(f);
 }
 
+struct damos_quota_goal *damos_new_quota_goal(
+		unsigned long (*get_score)(void *), void *get_score_arg)
+{
+	struct damos_quota_goal *goal;
+
+	goal = kmalloc(sizeof(*goal), GFP_KERNEL);
+	if (!goal)
+		return NULL;
+	goal->get_score = get_score;
+	goal->get_score_arg = get_score_arg;
+	INIT_LIST_HEAD(&goal->list);
+	return goal;
+}
+
+void damos_add_quota_goal(struct damos_quota *q, struct damos_quota_goal *g)
+{
+	list_add_tail(&g->list, &q->goals);
+}
+
+static void damos_del_quota_goal(struct damos_quota_goal *g)
+{
+	list_del(&g->list);
+}
+
+static void damos_free_quota_goal(struct damos_quota_goal *g)
+{
+	kfree(g);
+}
+
+void damos_destroy_quota_goal(struct damos_quota_goal *g)
+{
+	damos_del_quota_goal(g);
+	damos_free_quota_goal(g);
+}
+
 /* initialize fields of @quota that normally API users wouldn't set */
 static struct damos_quota *damos_quota_init(struct damos_quota *quota)
 {
@@ -337,6 +372,8 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
 	INIT_LIST_HEAD(&scheme->list);
 
 	scheme->quota = *(damos_quota_init(quota));
+	/* quota.goals should be separately set by caller */
+	INIT_LIST_HEAD(&scheme->quota.goals);
 
 	scheme->wmarks = *wmarks;
 	scheme->wmarks.activated = true;
@@ -373,8 +410,12 @@ static void damon_free_scheme(struct damos *s)
 
 void damon_destroy_scheme(struct damos *s)
 {
+	struct damos_quota_goal *g, *g_next;
 	struct damos_filter *f, *next;
 
+	damos_for_each_quota_goal_safe(g, g_next, &s->quota)
+		damos_destroy_quota_goal(g);
+
 	damos_for_each_filter_safe(f, next, s)
 		damos_destroy_filter(f);
 	damon_del_scheme(s);
@@ -1083,22 +1124,44 @@ static unsigned long damon_feed_loop_next_input(unsigned long last_input,
 	return min_input;
 }
 
-/* Called only if quota->ms, quota->sz, or quota->goal.get_score are set */
+/* Return the highest score since it makes schemes least aggressive */
+static unsigned long damos_quota_score(struct damos_quota *quota)
+{
+	struct damos_quota_goal *goal;
+	unsigned long highest_score = 0;
+
+	if (quota->goal.get_score)
+		highest_score = quota->goal.get_score(
+				quota->goal.get_score_arg);
+
+	damos_for_each_quota_goal(goal, quota)
+		highest_score = max(highest_score,
+				goal->get_score(goal->get_score_arg));
+
+	return highest_score;
+}
+
+/*
+ * Called only if quota->ms, quota->sz, or quota->goal.get_score are set, or
+ * quota->goals is not empty
+ */
 static void damos_set_effective_quota(struct damos_quota *quota)
 {
 	unsigned long throughput;
 	unsigned long esz;
 
-	if (!quota->ms && !quota->goal.get_score) {
+	if (!quota->ms && !quota->goal.get_score &&
+			list_empty(&quota->goals)) {
 		quota->esz = quota->sz;
 		return;
 	}
 
-	if (quota->goal.get_score) {
+	if (quota->goal.get_score || !list_empty(&quota->goals)) {
+		unsigned long score = damos_quota_score(quota);
+
 		quota->esz_bp = damon_feed_loop_next_input(
 				max(quota->esz_bp, 10000UL),
-				quota->goal.get_score(
-					quota->goal.get_score_arg));
+				score);
 		esz = quota->esz_bp / 10000;
 	}
 
@@ -1108,7 +1171,7 @@ static void damos_set_effective_quota(struct damos_quota *quota)
 			quota->total_charged_ns;
 	else
 		throughput = PAGE_SIZE * 1024;
-	if (quota->goal.get_score)
+	if (quota->goal.get_score || !list_empty(&quota->goals))
 		esz = min(throughput * quota->ms, esz);
 	else
 		esz = throughput * quota->ms;
@@ -1128,7 +1191,8 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
 	unsigned long cumulated_sz;
 	unsigned int score, max_score = 0;
 
-	if (!quota->ms && !quota->sz && !quota->goal.get_score)
+	if (!quota->ms && !quota->sz && !quota->goal.get_score &&
+			list_empty(&quota->goals))
 		return;
 
 	/* New charge window starts */

From 25a7c6a2ead86c28987fcac1606f725760d0284e Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 19 Feb 2024 11:44:20 -0800
Subject: [PATCH 1274/1406] mm/damon/sysfs: use only quota->goals

DAMON sysfs interface implements multiple quota auto-tuning
goals on its own level, because the DAMOS core logic supported only a
single goal.  Now the core logic supports multiple goals on its level.
Update the DAMON sysfs interface to reuse the core logic and drop the
unnecessary duplicated multiple-goals implementation.

Link: https://lkml.kernel.org/r/20240219194431.159606-10-sj@kernel.org
Signed-off-by: SeongJae Park
Signed-off-by: Andrew Morton
---
 mm/damon/sysfs-common.h  |  2 +-
 mm/damon/sysfs-schemes.c | 49 +++++++++++++++++++++++++++-------------
 mm/damon/sysfs.c         |  3 +--
 3 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h
index 5a1ac15fb2f8b7..a63f51577cffdb 100644
--- a/mm/damon/sysfs-common.h
+++ b/mm/damon/sysfs-common.h
@@ -59,7 +59,7 @@ int damon_sysfs_schemes_clear_regions(
 		struct damon_sysfs_schemes *sysfs_schemes,
 		struct damon_ctx *ctx);
 
-void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes,
+int damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes,
 		struct damon_ctx *ctx);
 
 void damos_sysfs_update_effective_quotas(
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 85ef58f98a87c2..7bf94b1ed6f7d3 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -1887,30 +1887,34 @@ static unsigned long damos_sysfs_get_quota_score(void *arg)
 	return (unsigned long)arg;
 }
 
-static void damos_sysfs_set_quota_score(
+static int damos_sysfs_set_quota_score(
 		struct damos_sysfs_quota_goals *sysfs_goals,
 		struct damos_quota *quota)
 {
-	struct damos_sysfs_quota_goal *sysfs_goal;
+	struct damos_quota_goal *goal, *next;
 	int i;
 
-	quota->goal.get_score = NULL;
-	quota->goal.get_score_arg = (void *)0;
+	damos_for_each_quota_goal_safe(goal, next, quota)
+		damos_destroy_quota_goal(goal);
+
 	for (i = 0; i < sysfs_goals->nr; i++) {
-		sysfs_goal = sysfs_goals->goals_arr[i];
+		struct damos_sysfs_quota_goal *sysfs_goal =
+			sysfs_goals->goals_arr[i];
+
 		if (!sysfs_goal->target_value)
 			continue;
 
-		/* Higher score makes scheme less aggressive */
-		quota->goal.get_score_arg = (void *)max(
-				(unsigned long)quota->goal.get_score_arg,
-				sysfs_goal->current_value * 10000 /
-				sysfs_goal->target_value);
-		quota->goal.get_score = damos_sysfs_get_quota_score;
+		goal = damos_new_quota_goal(damos_sysfs_get_quota_score,
+				(void *)(sysfs_goal->current_value * 10000 /
+					sysfs_goal->target_value));
+		if (!goal)
+			return -ENOMEM;
+		damos_add_quota_goal(quota, goal);
 	}
+	return 0;
 }
 
-void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes,
+int damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes,
 		struct damon_ctx *ctx)
 {
 	struct damos *scheme;
@@ -1918,16 +1922,21 @@ void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes,
 
 	damon_for_each_scheme(scheme, ctx) {
 		struct damon_sysfs_scheme *sysfs_scheme;
+		int err;
 
 		/* user could have removed the scheme sysfs dir */
 		if (i >= sysfs_schemes->nr)
 			break;
 
 		sysfs_scheme = sysfs_schemes->schemes_arr[i];
-		damos_sysfs_set_quota_score(sysfs_scheme->quotas->goals,
+		err = damos_sysfs_set_quota_score(sysfs_scheme->quotas->goals,
 				&scheme->quota);
+		if (err)
+			/* kdamond will clean up schemes and terminate */
+			return err;
 		i++;
 	}
+	return 0;
 }
 
 void damos_sysfs_update_effective_quotas(
@@ -1987,13 +1996,17 @@ static struct damos *damon_sysfs_mk_scheme(
 		.low = sysfs_wmarks->low,
 	};
 
-	damos_sysfs_set_quota_score(sysfs_quotas->goals, &quota);
-
 	scheme = damon_new_scheme(&pattern, sysfs_scheme->action,
 			sysfs_scheme->apply_interval_us, &quota, &wmarks);
 	if (!scheme)
 		return NULL;
 
+	err = damos_sysfs_set_quota_score(sysfs_quotas->goals,
+			&scheme->quota);
+	if (err) {
+		damon_destroy_scheme(scheme);
+		return NULL;
+	}
+
 	err = damon_sysfs_set_scheme_filters(scheme, sysfs_filters);
 	if (err) {
 		damon_destroy_scheme(scheme);
@@ -2029,7 +2042,11 @@ static void damon_sysfs_update_scheme(struct damos *scheme,
 	scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses;
 	scheme->quota.weight_age = sysfs_weights->age;
 
-	damos_sysfs_set_quota_score(sysfs_quotas->goals, &scheme->quota);
+	err = damos_sysfs_set_quota_score(sysfs_quotas->goals, &scheme->quota);
+	if (err) {
+		damon_destroy_scheme(scheme);
+		return;
+	}
 
 	scheme->wmarks.metric = sysfs_wmarks->metric;
 	scheme->wmarks.interval = sysfs_wmarks->interval_us;
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index cc2d88a901f45f..6fee383bc0c54c 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1377,8 +1377,7 @@ static int damon_sysfs_commit_schemes_quota_goals(
 	ctx = sysfs_kdamond->damon_ctx;
 	sysfs_ctx = sysfs_kdamond->contexts->contexts_arr[0];
-	damos_sysfs_set_quota_scores(sysfs_ctx->schemes, ctx);
-	return 0;
+	return damos_sysfs_set_quota_scores(sysfs_ctx->schemes, ctx);
 }
 
 /*

From 4ccfcd64aabc009dca5165f16cc752ab10689682 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 19 Feb 2024 11:44:21 -0800
Subject: [PATCH 1275/1406] mm/damon/core: remove ->goal field of damos_quota

The DAMOS quota auto-tuning feature supports a static single goal and
dynamic multiple goals via the DAMON kernel API, specifically via the
->goal and ->goals fields of struct damos_quota, respectively.  All
in-tree DAMOS kernel API users are using only the dynamic multiple goals
now.  Remove the unused static single goal interface.

Link: https://lkml.kernel.org/r/20240219194431.159606-11-sj@kernel.org
Signed-off-by: SeongJae Park
Signed-off-by: Andrew Morton
---
 include/linux/damon.h | 12 ++++--------
 mm/damon/core.c       | 17 +++++------------
 2 files changed, 9 insertions(+), 20 deletions(-)

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 4bd898eaf80eb1..76c965c1eea375 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -149,7 +149,6 @@ struct damos_quota_goal {
  * @reset_interval:	Charge reset interval in milliseconds.
  * @ms:			Maximum milliseconds that the scheme can use.
  * @sz:			Maximum bytes of memory that the action can be applied.
- * @goal:		Quota auto-tuning goal.
  * @goals:		Head of quota tuning goals (&damos_quota_goal) list.
  * @esz:		Effective size quota in bytes.
 *
@@ -169,12 +168,10 @@ struct damos_quota_goal {
 * throughput of the scheme's action.  DAMON then compares it against &sz and
 * uses smaller one as the effective quota.
 *
- * If ->get_score field of @goal is set, DAMON calculates yet another size
- * quota based on the goal using its internal feedback loop algorithm, for
- * every @reset_interval.  Then, if the new size quota is smaller than the
- * effective quota, it uses the new size quota as the effective quota.
- *
- * If @goals is not empty, same action is taken for each goal of the list.
+ * If @goals is not empty, DAMON calculates yet another size quota based on the
+ * goals using its internal feedback loop algorithm, for every @reset_interval.
+ * Then, if the new size quota is smaller than the effective quota, it uses the
+ * new size quota as the effective quota.
 *
 * The resulting effective size quota in bytes is set to @esz.
 *
@@ -188,7 +185,6 @@ struct damos_quota {
 	unsigned long reset_interval;
 	unsigned long ms;
 	unsigned long sz;
-	struct damos_quota_goal goal;
 	struct list_head goals;
 	unsigned long esz;
 
diff --git a/mm/damon/core.c b/mm/damon/core.c
index b6cd99b64e8512..7b06d926c552ec 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1130,10 +1130,6 @@ static unsigned long damos_quota_score(struct damos_quota *quota)
 	struct damos_quota_goal *goal;
 	unsigned long highest_score = 0;
 
-	if (quota->goal.get_score)
-		highest_score = quota->goal.get_score(
-				quota->goal.get_score_arg);
-
 	damos_for_each_quota_goal(goal, quota)
 		highest_score = max(highest_score,
 				goal->get_score(goal->get_score_arg));
@@ -1142,21 +1138,19 @@ static unsigned long damos_quota_score(struct damos_quota *quota)
 }
 
 /*
- * Called only if quota->ms, quota->sz, or quota->goal.get_score are set, or
- * quota->goals is not empty
+ * Called only if quota->ms or quota->sz are set, or quota->goals is not empty
 */
 static void damos_set_effective_quota(struct damos_quota *quota)
 {
 	unsigned long throughput;
 	unsigned long esz;
 
-	if (!quota->ms && !quota->goal.get_score &&
-			list_empty(&quota->goals)) {
+	if (!quota->ms && list_empty(&quota->goals)) {
 		quota->esz = quota->sz;
 		return;
 	}
 
-	if (quota->goal.get_score || !list_empty(&quota->goals)) {
+	if (!list_empty(&quota->goals)) {
 		unsigned long score = damos_quota_score(quota);
 
 		quota->esz_bp = damon_feed_loop_next_input(
@@ -1171,7 +1165,7 @@ static void damos_set_effective_quota(struct damos_quota *quota)
 			quota->total_charged_ns;
 	else
 		throughput = PAGE_SIZE * 1024;
-	if (quota->goal.get_score || !list_empty(&quota->goals))
+	if (!list_empty(&quota->goals))
 		esz = min(throughput * quota->ms, esz);
 	else
 		esz = throughput * quota->ms;
@@ -1191,8 +1185,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
 	unsigned long cumulated_sz;
 	unsigned int score, max_score = 0;
 
-	if (!quota->ms && !quota->sz && !quota->goal.get_score &&
-			list_empty(&quota->goals))
+	if (!quota->ms && !quota->sz && list_empty(&quota->goals))
 		return;
 
 	/* New charge window starts */

From 921f306fddc4b21b4e4acf0d886251bcf8d490fb Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 19 Feb 2024 11:44:22 -0800
Subject: [PATCH 1276/1406] mm/damon/core: let goal specified with only target
 and current values

The DAMOS quota auto-tuning feature lets users set the goal by providing a
function for getting the current score of the tuned quota.  It allows
flexible goal setup, but only a simple user-set quota is currently being
used.  As a result, the only user of the DAMOS quota auto-tuning is using
a silly void pointer casting based score value passing function.  Simplify
the interface and the user code by letting users directly set the target
and the current value.

Link: https://lkml.kernel.org/r/20240219194431.159606-12-sj@kernel.org
Signed-off-by: SeongJae Park
Signed-off-by: Andrew Morton
---
 include/linux/damon.h    | 19 +++++++++----------
 mm/damon/core.c          |  9 +++++----
 mm/damon/sysfs-schemes.c | 10 ++--------
 3 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 76c965c1eea375..de0cdc7f96d2d9 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -128,19 +128,18 @@ enum damos_action {
 /**
  * struct damos_quota_goal - DAMOS scheme quota auto-tuning goal.
- * @get_score:		Function for getting current score of the goal.
- * @get_score_arg:	Parameter for @get_score
+ * @target_value:	Target value to achieve with the tuning.
+ * @current_value:	Current value that is being achieved with the tuning.
* @list: List head for siblings. * - * Data structure for getting the current score of the quota tuning goal. - * Calling @get_score with @get_score_arg as the parameter should return the - * current score. Then the score is entered to DAMON's internal feedback loop - * mechanism to get the auto-tuned quota. The goal of the tuning is getting - * the feedback score value of 10,000. + * Data structure for getting the current score of the quota tuning goal. The + * score is calculated by how close @current_value and @target_value are. Then + * the score is entered to DAMON's internal feedback loop mechanism to get the + * auto-tuned quota. */ struct damos_quota_goal { - unsigned long (*get_score)(void *arg); - void *get_score_arg; + unsigned long target_value; + unsigned long current_value; struct list_head list; }; @@ -690,7 +689,7 @@ void damos_add_filter(struct damos *s, struct damos_filter *f); void damos_destroy_filter(struct damos_filter *f); struct damos_quota_goal *damos_new_quota_goal( - unsigned long (*get_score)(void *), void *get_score_arg); + unsigned long target_value, unsigned long current_value); void damos_add_quota_goal(struct damos_quota *q, struct damos_quota_goal *g); void damos_destroy_quota_goal(struct damos_quota_goal *goal); diff --git a/mm/damon/core.c b/mm/damon/core.c index 7b06d926c552ec..907f467fc8c007 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -300,15 +300,15 @@ void damos_destroy_filter(struct damos_filter *f) } struct damos_quota_goal *damos_new_quota_goal( - unsigned long (*get_score)(void *), void *get_score_arg) + unsigned long target_value, unsigned long current_value) { struct damos_quota_goal *goal; goal = kmalloc(sizeof(*goal), GFP_KERNEL); if (!goal) return NULL; - goal->get_score = get_score; - goal->get_score_arg = get_score_arg; + goal->target_value = target_value; + goal->current_value = current_value; INIT_LIST_HEAD(&goal->list); return goal; } @@ -1132,7 +1132,8 @@ static unsigned long damos_quota_score(struct damos_quota *quota) damos_for_each_quota_goal(goal, quota) highest_score = max(highest_score, - goal->get_score(goal->get_score_arg)); + goal->current_value * 10000 / + goal->target_value); return highest_score; } diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 7bf94b1ed6f7d3..50218a7bfa0a68 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1882,11 +1882,6 @@ static int damon_sysfs_set_scheme_filters(struct damos *scheme, return 0; } -static unsigned long damos_sysfs_get_quota_score(void *arg) -{ - return (unsigned long)arg; -} - static int damos_sysfs_set_quota_score( struct damos_sysfs_quota_goals *sysfs_goals, struct damos_quota *quota) @@ -1904,9 +1899,8 @@ static int damos_sysfs_set_quota_score( if (!sysfs_goal->target_value) continue; - goal = damos_new_quota_goal(damos_sysfs_get_quota_score, - (void *)(sysfs_goal->current_value * 10000 / - sysfs_goal->target_value)); + goal = damos_new_quota_goal(sysfs_goal->target_value, + sysfs_goal->current_value); if (!goal) return -ENOMEM; damos_add_quota_goal(quota, goal); From 70cbf80267ab7a8f208ce33aa6ecd75da277ddbc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:23 -0800 Subject: [PATCH 1277/1406] mm/damon/core: support multiple metrics for quota goal DAMOS quota auto-tuning asks users to assess the current tuned quota and provide the feedback in a manual and repeated way. 
It lets users generate the feedback from a source that the kernel cannot
access, and writing a script or a function for doing the manual and
repeated feeding is not a big deal.  However, additional work is still
additional work, and it could be more efficient if DAMOS could do the
fetch on its own, especially in the DAMON sysfs interface use case, since
it can avoid the context switches between the user-space and the
kernel-space, though the overhead would be only trivial in most cases.
Also, in many cases the feedback could be generated from kernel-accessible
sources, such as PSI, CPU usage, etc.  Make the quota goal support
multiple types of metrics, including such kernel-accessible ones.

Link: https://lkml.kernel.org/r/20240219194431.159606-13-sj@kernel.org
Signed-off-by: SeongJae Park
Signed-off-by: Andrew Morton
---
 include/linux/damon.h    | 26 +++++++++++++++++++++++---
 mm/damon/core.c          | 22 +++++++++++++++++++---
 mm/damon/sysfs-schemes.c |  5 +++--
 3 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/include/linux/damon.h b/include/linux/damon.h
index de0cdc7f96d2d9..5a06993d847941 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -126,18 +126,37 @@ enum damos_action {
 	NR_DAMOS_ACTIONS,
 };
 
+/**
+ * enum damos_quota_goal_metric - Represents the metric to be used as the goal
+ *
+ * @DAMOS_QUOTA_USER_INPUT:	User-input value.
+ * @NR_DAMOS_QUOTA_GOAL_METRICS:	Number of DAMOS quota goal metrics.
+ *
+ * Metrics equal to or larger than @NR_DAMOS_QUOTA_GOAL_METRICS are
+ * unsupported.
+ */
+enum damos_quota_goal_metric {
+	DAMOS_QUOTA_USER_INPUT,
+	NR_DAMOS_QUOTA_GOAL_METRICS,
+};
+
 /**
  * struct damos_quota_goal - DAMOS scheme quota auto-tuning goal.
- * @target_value:	Target value to achieve with the tuning.
- * @current_value:	Current value that is being achieved with the tuning.
+ * @metric:		Metric to be used for representing the goal.
+ * @target_value:	Target value of @metric to achieve with the tuning.
+ * @current_value:	Current value of @metric.
  * @list:		List head for siblings.
 *
 * Data structure for getting the current score of the quota tuning goal.  The
 * score is calculated by how close @current_value and @target_value are.  Then
 * the score is entered to DAMON's internal feedback loop mechanism to get the
 * auto-tuned quota.
+ *
+ * If @metric is DAMOS_QUOTA_USER_INPUT, @current_value should be manually
+ * entered by the user, probably inside the kdamond callbacks.  Otherwise,
+ * DAMON sets @current_value with the self-measured value of @metric.
*/ struct damos_quota_goal { + enum damos_quota_goal_metric metric; unsigned long target_value; unsigned long current_value; struct list_head list; @@ -689,7 +708,8 @@ void damos_add_filter(struct damos *s, struct damos_filter *f); void damos_destroy_filter(struct damos_filter *f); struct damos_quota_goal *damos_new_quota_goal( - unsigned long target_value, unsigned long current_value); + enum damos_quota_goal_metric metric, + unsigned long target_value); void damos_add_quota_goal(struct damos_quota *q, struct damos_quota_goal *g); void damos_destroy_quota_goal(struct damos_quota_goal *goal); diff --git a/mm/damon/core.c b/mm/damon/core.c index 907f467fc8c007..973423166ee28f 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -300,15 +300,16 @@ void damos_destroy_filter(struct damos_filter *f) } struct damos_quota_goal *damos_new_quota_goal( - unsigned long target_value, unsigned long current_value) + enum damos_quota_goal_metric metric, + unsigned long target_value) { struct damos_quota_goal *goal; goal = kmalloc(sizeof(*goal), GFP_KERNEL); if (!goal) return NULL; + goal->metric = metric; goal->target_value = target_value; - goal->current_value = current_value; INIT_LIST_HEAD(&goal->list); return goal; } @@ -1124,16 +1125,31 @@ static unsigned long damon_feed_loop_next_input(unsigned long last_input, return min_input; } +static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) +{ + u64 now_psi_total; + + switch (goal->metric) { + case DAMOS_QUOTA_USER_INPUT: + /* User should already set goal->current_value */ + break; + default: + break; + } +} + /* Return the highest score since it makes schemes least aggressive */ static unsigned long damos_quota_score(struct damos_quota *quota) { struct damos_quota_goal *goal; unsigned long highest_score = 0; - damos_for_each_quota_goal(goal, quota) + damos_for_each_quota_goal(goal, quota) { + damos_set_quota_goal_current_value(goal); highest_score = max(highest_score, goal->current_value * 10000 / goal->target_value); + } return highest_score; } diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 50218a7bfa0a68..7a8a39f2679b73 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1899,10 +1899,11 @@ static int damos_sysfs_set_quota_score( if (!sysfs_goal->target_value) continue; - goal = damos_new_quota_goal(sysfs_goal->target_value, - sysfs_goal->current_value); + goal = damos_new_quota_goal(DAMOS_QUOTA_USER_INPUT, + sysfs_goal->target_value); if (!goal) return -ENOMEM; + goal->current_value = sysfs_goal->current_value; damos_add_quota_goal(quota, goal); } return 0; From 180ad231839435d0c85fac686380789fed8b940b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:24 -0800 Subject: [PATCH 1278/1406] mm/damon/core: implement PSI metric DAMOS quota goal Extend DAMOS quota goal metric with system wide memory pressure stall time. Specifically, the system level 'some' PSI for memory is used. The target value can be set in microseconds. DAMOS measures the increased amount of the PSI metric in last quota_reset_interval and use the ratio of it versus the user-specified target PSI value as the score for the auto-tuning feedback loop. 
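Reduced to plain arithmetic, the PSI-based scoring amounts to the following
sketch, which composes the increase-per-interval measurement added here with
the score formula from the earlier patch.  It is illustrative plain C, not the
kernel code, and it assumes a non-zero target (the sysfs side skips zero
targets):

/* 'total_now': monotonically growing system-wide "some" memory PSI, in us */
static unsigned long psi_goal_score(unsigned long total_now,
		unsigned long *last_total, unsigned long target_us)
{
	unsigned long stalled_us = total_now - *last_total;

	*last_total = total_now;
	/*
	 * 10000 means exactly on target.  Higher scores shrink the quota,
	 * under the feedback loop's assumption that the quota and the
	 * measured metric move in the same direction.
	 */
	return stalled_us * 10000 / target_us;
}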
Link: https://lkml.kernel.org/r/20240219194431.159606-14-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 7 +++++++ mm/damon/core.c | 25 +++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 5a06993d847941..886d07294f4e7c 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -130,12 +130,14 @@ enum damos_action { * enum damos_quota_goal_metric - Represents the metric to be used as the goal * * @DAMOS_QUOTA_USER_INPUT: User-input value. + * @DAMOS_QUOTA_SOME_MEM_PSI_US: System level some memory PSI in us. * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. */ enum damos_quota_goal_metric { DAMOS_QUOTA_USER_INPUT, + DAMOS_QUOTA_SOME_MEM_PSI_US, NR_DAMOS_QUOTA_GOAL_METRICS, }; @@ -144,6 +146,7 @@ enum damos_quota_goal_metric { * @metric: Metric to be used for representing the goal. * @target_value: Target value of @metric to achieve with the tuning. * @current_value: Current value of @metric. + * @last_psi_total: Last measured total PSI * @list: List head for siblings. * * Data structure for getting the current score of the quota tuning goal. The @@ -159,6 +162,10 @@ struct damos_quota_goal { enum damos_quota_goal_metric metric; unsigned long target_value; unsigned long current_value; + /* metric-dependent fields */ + union { + u64 last_psi_total; + }; struct list_head list; }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 973423166ee28f..6d503c1c125ef0 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -1125,6 +1126,25 @@ static unsigned long damon_feed_loop_next_input(unsigned long last_input, return min_input; } +#ifdef CONFIG_PSI + +static u64 damos_get_some_mem_psi_total(void) +{ + if (static_branch_likely(&psi_disabled)) + return 0; + return div_u64(psi_system.total[PSI_AVGS][PSI_MEM * 2], + NSEC_PER_USEC); +} + +#else /* CONFIG_PSI */ + +static inline u64 damos_get_some_mem_psi_total(void) +{ + return 0; +}; + +#endif /* CONFIG_PSI */ + static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) { u64 now_psi_total; @@ -1133,6 +1153,11 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) case DAMOS_QUOTA_USER_INPUT: /* User should already set goal->current_value */ break; + case DAMOS_QUOTA_SOME_MEM_PSI_US: + now_psi_total = damos_get_some_mem_psi_total(); + goal->current_value = now_psi_total - goal->last_psi_total; + goal->last_psi_total = now_psi_total; + break; default: break; } From a5d709eda037277224f3f56712da9f327f7b6692 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:25 -0800 Subject: [PATCH 1279/1406] mm/damon/sysfs-schemes: support PSI-based quota auto-tune Extend DAMON sysfs interface to support the PSI-based quota auto-tuning by adding a new file, 'target_metric' under the quota goal directory. Old users don't get any behavioral changes since the default value of the metric is 'user input'. 
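For illustration, selecting the new metric from user space is just two writes
into a goal directory.  The path indices in this sketch are hypothetical:

#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	const char *dir =
		"/sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/quotas/goals/0";
	char path[160];

	snprintf(path, sizeof(path), "%s/target_metric", dir);
	if (write_str(path, "some_mem_psi_us"))	/* PSI-based self-tuning */
		return 1;
	snprintf(path, sizeof(path), "%s/target_value", dir);
	return write_str(path, "100000") ? 1 : 0;	/* 0.1s stall per interval */
}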
Link: https://lkml.kernel.org/r/20240219194431.159606-15-sj@kernel.org
Signed-off-by: SeongJae Park
Signed-off-by: Andrew Morton
---
 mm/damon/sysfs-schemes.c | 42 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 7a8a39f2679b73..53a90ac678fb98 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -826,15 +826,48 @@ static const struct kobj_type damon_sysfs_watermarks_ktype = {
 
 struct damos_sysfs_quota_goal {
 	struct kobject kobj;
+	enum damos_quota_goal_metric metric;
 	unsigned long target_value;
 	unsigned long current_value;
 };
 
+/* This should match with enum damos_quota_goal_metric */
+static const char * const damos_sysfs_quota_goal_metric_strs[] = {
+	"user_input",
+	"some_mem_psi_us",
+};
+
 static struct damos_sysfs_quota_goal *damos_sysfs_quota_goal_alloc(void)
 {
 	return kzalloc(sizeof(struct damos_sysfs_quota_goal), GFP_KERNEL);
 }
 
+static ssize_t target_metric_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	struct damos_sysfs_quota_goal *goal = container_of(kobj,
+			struct damos_sysfs_quota_goal, kobj);
+
+	return sysfs_emit(buf, "%s\n",
+			damos_sysfs_quota_goal_metric_strs[goal->metric]);
+}
+
+static ssize_t target_metric_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	struct damos_sysfs_quota_goal *goal = container_of(kobj,
+			struct damos_sysfs_quota_goal, kobj);
+	enum damos_quota_goal_metric m;
+
+	for (m = 0; m < NR_DAMOS_QUOTA_GOAL_METRICS; m++) {
+		if (sysfs_streq(buf, damos_sysfs_quota_goal_metric_strs[m])) {
+			goal->metric = m;
+			return count;
+		}
+	}
+	return -EINVAL;
+}
+
 static ssize_t target_value_show(struct kobject *kobj,
 		struct kobj_attribute *attr, char *buf)
 {
@@ -880,6 +913,9 @@ static void damos_sysfs_quota_goal_release(struct kobject *kobj)
 	kfree(container_of(kobj, struct damos_sysfs_quota_goal, kobj));
 }
 
+static struct kobj_attribute damos_sysfs_quota_goal_target_metric_attr =
+		__ATTR_RW_MODE(target_metric, 0600);
+
 static struct kobj_attribute damos_sysfs_quota_goal_target_value_attr =
 		__ATTR_RW_MODE(target_value, 0600);
 
@@ -887,6 +923,7 @@ static struct kobj_attribute damos_sysfs_quota_goal_current_value_attr =
 		__ATTR_RW_MODE(current_value, 0600);
 
 static struct attribute *damos_sysfs_quota_goal_attrs[] = {
+	&damos_sysfs_quota_goal_target_metric_attr.attr,
 	&damos_sysfs_quota_goal_target_value_attr.attr,
 	&damos_sysfs_quota_goal_current_value_attr.attr,
 	NULL,
@@ -1899,11 +1936,12 @@ static int damos_sysfs_set_quota_score(
 		if (!sysfs_goal->target_value)
 			continue;
 
-		goal = damos_new_quota_goal(DAMOS_QUOTA_USER_INPUT,
+		goal = damos_new_quota_goal(sysfs_goal->metric,
 				sysfs_goal->target_value);
 		if (!goal)
 			return -ENOMEM;
-		goal->current_value = sysfs_goal->current_value;
+		if (sysfs_goal->metric == DAMOS_QUOTA_USER_INPUT)
+			goal->current_value = sysfs_goal->current_value;
 		damos_add_quota_goal(quota, goal);
 	}
 	return 0;

From a152a44adc877062180e9123fc829bb889fd41fb Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 19 Feb 2024 11:44:26 -0800
Subject: [PATCH 1280/1406] Docs/mm/damon/design: document quota goal
 self-tuning

Update the DAMON design doc to explain the quota goal self-tuning, which
can be used by setting the goal's metric to one of the metrics that the
kernel can self-retrieve.
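To put rough numbers on the self-tuning described here (all values invented
for illustration): with ``some_mem_psi_us`` as the metric and a
``target_value`` of 100000 (0.1 second of stall per quota reset interval), a
measured stall of 150000 microseconds yields a score of 150000 * 10000 /
100000 = 15000.  That is above the 10000 set point, so, under the feedback
loop's assumption that the quota and the metric are positively proportional,
DAMOS shrinks the effective quota for the next interval; a measured stall of
50000 microseconds would score 5000 and grow it.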
Link: https://lkml.kernel.org/r/20240219194431.159606-16-sj@kernel.org
Signed-off-by: SeongJae Park
Signed-off-by: Andrew Morton
---
 Documentation/mm/damon/design.rst | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst
index 2bd0c203dcfb78..8c89d26f0baa1f 100644
--- a/Documentation/mm/damon/design.rst
+++ b/Documentation/mm/damon/design.rst
@@ -398,12 +398,28 @@ Aim-oriented Feedback-driven Auto-tuning
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Automatic feedback-driven quota tuning.  Instead of setting the absolute quota
-value, users can repeatedly provide numbers representing how much of their goal
-for the scheme is achieved as feedback.  DAMOS then automatically tunes the
+value, users can specify the metric of their interest, and what target value
+they want the metric value to be.  DAMOS then automatically tunes the
 aggressiveness (the quota) of the corresponding scheme.  For example, if DAMOS
 is under achieving the goal, DAMOS automatically increases the quota.  If DAMOS
 is over achieving the goal, it decreases the quota.
 
+The goal can be specified with three parameters, namely ``target_metric``,
+``target_value``, and ``current_value``.  The auto-tuning mechanism tries to
+make the ``current_value`` of ``target_metric`` the same as ``target_value``.
+Currently, two ``target_metric`` values are provided.
+
+- ``user_input``: User-provided value.  Users could use any metric that they
+  have interest in for the value.  User space main workload's latency or
+  throughput, and system metrics like the free memory ratio or memory pressure
+  stall time (PSI) could be examples.  Note that users should explicitly set
+  ``current_value`` on their own in this case.  In other words, users should
+  repeatedly provide the feedback.
+- ``some_mem_psi_us``: System-wide ``some`` memory pressure stall information
+  in microseconds that is measured from the last quota reset to the next quota
+  reset.  DAMOS does the measurement on its own, so only ``target_value``
+  needs to be set by users at the initial time.  In other words, DAMOS does
+  self-feedback.
+
 .. _damon_design_damos_watermarks:

From dbb1a4705063907499170589fe332dfa1fe36301 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 19 Feb 2024 11:44:27 -0800
Subject: [PATCH 1281/1406] Docs/ABI/damon: document quota goal metric file

Update the DAMON ABI document for the quota goal target_metric file.

Link: https://lkml.kernel.org/r/20240219194431.159606-17-sj@kernel.org
Signed-off-by: SeongJae Park
Signed-off-by: Andrew Morton
---
 Documentation/ABI/testing/sysfs-kernel-mm-damon | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon
index a1e4fdb04f951e..dad4d5ffd78656 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-damon
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon
@@ -229,6 +229,12 @@ Description:	Writing a number 'N' to this file creates the number of
 		directories for setting automatic tuning of the scheme's
 		aggressiveness named '0' to 'N-1' under the goals/ directory.
 
+What:		/sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/goals/<G>/target_metric
+Date:		Feb 2024
+Contact:	SeongJae Park
+Description:	Writing to and reading from this file sets and gets the quota
+		auto-tuning goal metric.
+ What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/goals//target_value Date: Nov 2023 Contact: SeongJae Park From d50e871bd78b2802d8deb55c409e14f33ead0239 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:28 -0800 Subject: [PATCH 1282/1406] Docs/admin-guide/mm/damon/usage: document quota goal metric file Update DAMON usage document for the quota goal target_metric file. Link: https://lkml.kernel.org/r/20240219194431.159606-18-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 220ebbde7324a7..b33eecfd0e901c 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -86,7 +86,7 @@ comma (","). │ │ │ │ │ │ │ :ref:`quotas `/ms,bytes,reset_interval_ms,effective_bytes │ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil │ │ │ │ │ │ │ │ :ref:`goals `/nr_goals - │ │ │ │ │ │ │ │ │ 0/target_value,current_value + │ │ │ │ │ │ │ │ │ 0/target_metric,target_value,current_value │ │ │ │ │ │ │ :ref:`watermarks `/metric,interval_us,high,mid,low │ │ │ │ │ │ │ :ref:`filters `/nr_filters │ │ │ │ │ │ │ │ 0/type,matching,memcg_id @@ -366,11 +366,11 @@ number (``N``) to the file creates the number of child directories named ``0`` to ``N-1``. Each directory represents each goal and current achievement. Among the multiple feedback, the best one is used. -Each goal directory contains two files, namely ``target_value`` and -``current_value``. Users can set and get any number to those files to set the -feedback. User space main workload's latency or throughput, system metrics -like free memory ratio or memory pressure stall time (PSI) could be example -metrics for the values. Note that users should write +Each goal directory contains three files, namely ``target_metric``, +``target_value`` and ``current_value``. Users can set and get the three +parameters for the quota auto-tuning goals that specified on the :ref:`design +doc ` by writing to and reading from each +of the files. Note that users should further write ``commit_schemes_quota_goals`` to the ``state`` file of the :ref:`kdamond directory ` to pass the feedback to DAMON. From 2181bff13aaa2dc08458a02ed65fdab28d672129 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:29 -0800 Subject: [PATCH 1283/1406] mm/damon/reclaim: implement user-feedback driven quota auto-tuning DAMOS supports user-feedback driven quota auto-tuning, but only DAMON sysfs interface is using it. Add support of the feature on DAMON_RECLAIM by adding one more input parameter, namely 'quota_autotune_feedback', for providing the user feedback to DAMON_RECLAIM. It assumes the target value of the feedback is 10,000. Link: https://lkml.kernel.org/r/20240219194431.159606-19-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 66e190f0374ac8..9df6b8819998e0 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -62,6 +62,21 @@ static struct damos_quota damon_reclaim_quota = { }; DEFINE_DAMON_MODULES_DAMOS_QUOTAS(damon_reclaim_quota); +/* + * User-specifiable feedback for auto-tuning of the effective quota. 
+ * + * While keeping the caps that are set by other quotas, DAMON_RECLAIM automatically + * increases and decreases the effective level of the quota, aiming at receiving this + * feedback of value ``10,000`` from the user. DAMON_RECLAIM assumes the feedback + * value and the quota are positively proportional. Value zero means disabling + * this auto-tuning feature. + * + * Disabled by default. + * + */ +static unsigned long quota_autotune_feedback __read_mostly; +module_param(quota_autotune_feedback, ulong, 0600); + static struct damos_watermarks damon_reclaim_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = 5000000, /* 5 seconds */ @@ -159,11 +174,13 @@ static void damon_reclaim_copy_quota_status(struct damos_quota *dst, dst->charged_from = src->charged_from; dst->charge_target_from = src->charge_target_from; dst->charge_addr_from = src->charge_addr_from; + dst->esz_bp = src->esz_bp; } static int damon_reclaim_apply_parameters(void) { struct damos *scheme, *old_scheme; + struct damos_quota_goal *goal; struct damos_filter *filter; int err = 0; @@ -180,6 +197,17 @@ static int damon_reclaim_apply_parameters(void) damon_reclaim_copy_quota_status(&scheme->quota, &old_scheme->quota); } + + if (quota_autotune_feedback) { + goal = damos_new_quota_goal(DAMOS_QUOTA_USER_INPUT, 10000); + if (!goal) { + damon_destroy_scheme(scheme); + return -ENOMEM; + } + goal->current_value = quota_autotune_feedback; + damos_add_quota_goal(&scheme->quota, goal); + } + if (skip_anon) { filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); if (!filter) { From af5bbf3a427f00ff5a263b36eed0758c511c3ddb Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:30 -0800 Subject: [PATCH 1284/1406] mm/damon/reclaim: implement memory PSI-driven quota self-tuning Support the PSI-driven quota self-tuning in DAMON_RECLAIM by introducing yet another parameter, 'quota_mem_pressure_us'. Users can set the desired amount of memory pressure stall time per each quota reset interval using the parameter. Then DAMON_RECLAIM monitors the memory pressure stall time, specifically the system-wide memory 'some' PSI value that increased during the given time interval, and self-tunes the quota using the DAMOS core logic. Link: https://lkml.kernel.org/r/20240219194431.159606-20-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 9df6b8819998e0..9bd341d62b4c75 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -62,6 +62,21 @@ static struct damos_quota damon_reclaim_quota = { }; DEFINE_DAMON_MODULES_DAMOS_QUOTAS(damon_reclaim_quota); +/* + * Desired level of memory pressure-stall time in microseconds. + * + * While keeping the caps that are set by other quotas, DAMON_RECLAIM automatically + * increases and decreases the effective level of the quota, aiming for this level of + * memory pressure to be incurred. System-wide ``some`` memory PSI in microseconds + * per quota reset interval (``quota_reset_interval_ms``) is collected and + * compared to this value to see if the aim is satisfied. Value zero means + * disabling this auto-tuning feature. + * + * Disabled by default. + */ +static unsigned long quota_mem_pressure_us __read_mostly; +module_param(quota_mem_pressure_us, ulong, 0600); + /* * User-specifiable feedback for auto-tuning of the effective quota.
* @@ -198,6 +213,16 @@ static int damon_reclaim_apply_parameters(void) &old_scheme->quota); } + if (quota_mem_pressure_us) { + goal = damos_new_quota_goal(DAMOS_QUOTA_SOME_MEM_PSI_US, + quota_mem_pressure_us); + if (!goal) { + damon_destroy_scheme(scheme); + return -ENOMEM; + } + damos_add_quota_goal(&scheme->quota, goal); + } + if (quota_autotune_feedback) { goal = damos_new_quota_goal(DAMOS_QUOTA_USER_INPUT, 10000); if (!goal) { From d82aabaf69be54b9687d4a0867866fe7b8a71036 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:31 -0800 Subject: [PATCH 1285/1406] Docs/admin-guide/mm/damon/reclaim: document auto-tuning parameters Update DAMON_RECLAIM usage document for the user/self feedback based auto-tuning of the quota. Link: https://lkml.kernel.org/r/20240219194431.159606-21-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- .../admin-guide/mm/damon/reclaim.rst | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index 343e25b252f430..af05ae6170184f 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -117,6 +117,33 @@ milliseconds. 1 second by default. +quota_mem_pressure_us +--------------------- + +Desired level of memory pressure-stall time in microseconds. + +While keeping the caps that are set by other quotas, DAMON_RECLAIM automatically +increases and decreases the effective level of the quota, aiming for this level +of memory pressure to be incurred. System-wide ``some`` memory PSI in microseconds +per quota reset interval (``quota_reset_interval_ms``) is collected and +compared to this value to see if the aim is satisfied. Value zero means +disabling this auto-tuning feature. + +Disabled by default. + +quota_autotune_feedback +----------------------- + +User-specifiable feedback for auto-tuning of the effective quota. + +While keeping the caps that are set by other quotas, DAMON_RECLAIM automatically +increases and decreases the effective level of the quota, aiming at receiving +this feedback of value ``10,000`` from the user. DAMON_RECLAIM assumes the feedback +value and the quota are positively proportional. Value zero means disabling +this auto-tuning feature. + +Disabled by default. + wmarks_interval --------------- From c3daae75e1ee869d8834718ca5edc1e0a185497d Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 19 Feb 2024 13:33:51 +0000 Subject: [PATCH 1286/1406] mm/zsmalloc: fix migrate_write_lock() when !CONFIG_COMPACTION Patch series "mm/zsmalloc: fix and optimize objects/page migration". This series is to fix and optimize the zsmalloc objects/page migration. This patch (of 3): migrate_write_lock() is an empty function when !CONFIG_COMPACTION, in which case zs_compact() can be triggered from the shrinker reclaim context. (Maybe it's better to rename it to zs_shrink()?) And zspage map object users rely on migrate_read_lock() so that objects won't be migrated elsewhere. Fix it by always implementing the migrate_write_lock() related functions.
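To make the race concrete, a minimal sketch of the fixed pattern follows (names match mm/zsmalloc.c; the body is abbreviated for illustration, not copied verbatim from the driver):

    /*
     * Sketch: the write-lock side is now built regardless of
     * CONFIG_COMPACTION.  It used to compile to an empty stub
     * ("static void migrate_write_lock(struct zspage *zspage) {}")
     * in the !CONFIG_COMPACTION case, so zs_compact() excluded nobody
     * while zs_map_object() users holding migrate_read_lock() assumed
     * objects could not move under them.
     */
    static void migrate_write_lock(struct zspage *zspage)
    {
            write_lock(&zspage->lock);      /* pairs with map users' read lock */
    }
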
Link: https://lkml.kernel.org/r/20240219-b4-szmalloc-migrate-v1-0-34cd49c6545b@bytedance.com Link: https://lkml.kernel.org/r/20240219-b4-szmalloc-migrate-v1-1-34cd49c6545b@bytedance.com Signed-off-by: Chengming Zhou Cc: Johannes Weiner Cc: Minchan Kim Cc: Nhat Pham Cc: Sergey Senozhatsky Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c937635e0ad15e..64d5533fa5d8ed 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -278,18 +278,15 @@ static bool ZsHugePage(struct zspage *zspage) static void migrate_lock_init(struct zspage *zspage); static void migrate_read_lock(struct zspage *zspage); static void migrate_read_unlock(struct zspage *zspage); - -#ifdef CONFIG_COMPACTION static void migrate_write_lock(struct zspage *zspage); static void migrate_write_lock_nested(struct zspage *zspage); static void migrate_write_unlock(struct zspage *zspage); + +#ifdef CONFIG_COMPACTION static void kick_deferred_free(struct zs_pool *pool); static void init_deferred_free(struct zs_pool *pool); static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); #else -static void migrate_write_lock(struct zspage *zspage) {} -static void migrate_write_lock_nested(struct zspage *zspage) {} -static void migrate_write_unlock(struct zspage *zspage) {} static void kick_deferred_free(struct zs_pool *pool) {} static void init_deferred_free(struct zs_pool *pool) {} static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} @@ -1725,7 +1722,6 @@ static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock) read_unlock(&zspage->lock); } -#ifdef CONFIG_COMPACTION static void migrate_write_lock(struct zspage *zspage) { write_lock(&zspage->lock); @@ -1741,6 +1737,7 @@ static void migrate_write_unlock(struct zspage *zspage) write_unlock(&zspage->lock); } +#ifdef CONFIG_COMPACTION /* Number of isolated subpage for *page migration* in this zspage */ static void inc_zspage_isolation(struct zspage *zspage) { From 1d390e92929c09b4d8eec3e4b1c0920ecbd32cad Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 19 Feb 2024 13:33:52 +0000 Subject: [PATCH 1287/1406] mm/zsmalloc: remove migrate_write_lock_nested() The migrate write lock is to protect the race between zspage migration and zspage objects' map users. We only need to lock out the map users of src zspage, not dst zspage, which is safe to map by users concurrently, since we only need to do obj_malloc() from dst zspage. So we can remove the migrate_write_lock_nested() use case. As we are here, cleanup the __zs_compact() by moving putback_zspage() outside of migrate_write_unlock since we hold pool lock, no malloc or free users can come in. 
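The resulting locking order in __zs_compact() reduces to the sketch below (taken from the diff that follows; pool->lock is held around the whole sequence, which is what makes the late putback safe):

    migrate_write_lock(src_zspage);         /* only the source needs it */
    migrate_zspage(pool, src_zspage, dst_zspage);
    migrate_write_unlock(src_zspage);
    fg = putback_zspage(class, src_zspage); /* safe: pool->lock is held */
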
Link: https://lkml.kernel.org/r/20240219-b4-szmalloc-migrate-v1-2-34cd49c6545b@bytedance.com Signed-off-by: Chengming Zhou Cc: Johannes Weiner Cc: Minchan Kim Cc: Nhat Pham Cc: Sergey Senozhatsky Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 64d5533fa5d8ed..f2ae7d4c6f216f 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -279,7 +279,6 @@ static void migrate_lock_init(struct zspage *zspage); static void migrate_read_lock(struct zspage *zspage); static void migrate_read_unlock(struct zspage *zspage); static void migrate_write_lock(struct zspage *zspage); -static void migrate_write_lock_nested(struct zspage *zspage); static void migrate_write_unlock(struct zspage *zspage); #ifdef CONFIG_COMPACTION @@ -1727,11 +1726,6 @@ static void migrate_write_lock(struct zspage *zspage) write_lock(&zspage->lock); } -static void migrate_write_lock_nested(struct zspage *zspage) -{ - write_lock_nested(&zspage->lock, SINGLE_DEPTH_NESTING); -} - static void migrate_write_unlock(struct zspage *zspage) { write_unlock(&zspage->lock); @@ -2003,19 +1997,17 @@ static unsigned long __zs_compact(struct zs_pool *pool, dst_zspage = isolate_dst_zspage(class); if (!dst_zspage) break; - migrate_write_lock(dst_zspage); } src_zspage = isolate_src_zspage(class); if (!src_zspage) break; - migrate_write_lock_nested(src_zspage); - + migrate_write_lock(src_zspage); migrate_zspage(pool, src_zspage, dst_zspage); - fg = putback_zspage(class, src_zspage); migrate_write_unlock(src_zspage); + fg = putback_zspage(class, src_zspage); if (fg == ZS_INUSE_RATIO_0) { free_zspage(pool, class, src_zspage); pages_freed += class->pages_per_zspage; @@ -2025,7 +2017,6 @@ static unsigned long __zs_compact(struct zs_pool *pool, if (get_fullness_group(class, dst_zspage) == ZS_INUSE_RATIO_100 || spin_is_contended(&pool->lock)) { putback_zspage(class, dst_zspage); - migrate_write_unlock(dst_zspage); dst_zspage = NULL; spin_unlock(&pool->lock); @@ -2034,15 +2025,12 @@ static unsigned long __zs_compact(struct zs_pool *pool, } } - if (src_zspage) { + if (src_zspage) putback_zspage(class, src_zspage); - migrate_write_unlock(src_zspage); - } - if (dst_zspage) { + if (dst_zspage) putback_zspage(class, dst_zspage); - migrate_write_unlock(dst_zspage); - } + spin_unlock(&pool->lock); return pages_freed; From 9ef7a7d6ebf4d0c86c23e8e388f74baefcd68358 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 19 Feb 2024 13:33:53 +0000 Subject: [PATCH 1288/1406] mm/zsmalloc: remove unused zspage->isolated The zspage->isolated is not used anywhere, we don't need to maintain it, which needs to hold the heavy pool lock to update it, so just remove it. Link: https://lkml.kernel.org/r/20240219-b4-szmalloc-migrate-v1-3-34cd49c6545b@bytedance.com Signed-off-by: Chengming Zhou Cc: Johannes Weiner Cc: Minchan Kim Cc: Nhat Pham Cc: Sergey Senozhatsky Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index f2ae7d4c6f216f..a48f4651d143bc 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -116,7 +116,6 @@ #define HUGE_BITS 1 #define FULLNESS_BITS 4 #define CLASS_BITS 8 -#define ISOLATED_BITS 5 #define MAGIC_VAL_BITS 8 #define MAX(a, b) ((a) >= (b) ? 
(a) : (b)) @@ -246,7 +245,6 @@ struct zspage { unsigned int huge:HUGE_BITS; unsigned int fullness:FULLNESS_BITS; unsigned int class:CLASS_BITS + 1; - unsigned int isolated:ISOLATED_BITS; unsigned int magic:MAGIC_VAL_BITS; }; unsigned int inuse; @@ -1732,17 +1730,6 @@ static void migrate_write_unlock(struct zspage *zspage) } #ifdef CONFIG_COMPACTION -/* Number of isolated subpage for *page migration* in this zspage */ -static void inc_zspage_isolation(struct zspage *zspage) -{ - zspage->isolated++; -} - -static void dec_zspage_isolation(struct zspage *zspage) -{ - VM_BUG_ON(zspage->isolated == 0); - zspage->isolated--; -} static const struct movable_operations zsmalloc_mops; @@ -1771,21 +1758,12 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, static bool zs_page_isolate(struct page *page, isolate_mode_t mode) { - struct zs_pool *pool; - struct zspage *zspage; - /* * Page is locked so zspage couldn't be destroyed. For detail, look at * lock_zspage in free_zspage. */ VM_BUG_ON_PAGE(PageIsolated(page), page); - zspage = get_zspage(page); - pool = zspage->pool; - spin_lock(&pool->lock); - inc_zspage_isolation(zspage); - spin_unlock(&pool->lock); - return true; } @@ -1850,7 +1828,6 @@ static int zs_page_migrate(struct page *newpage, struct page *page, kunmap_atomic(s_addr); replace_sub_page(class, zspage, newpage, page); - dec_zspage_isolation(zspage); /* * Since we complete the data copy and set up new zspage structure, * it's okay to release the pool's lock. @@ -1872,16 +1849,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, static void zs_page_putback(struct page *page) { - struct zs_pool *pool; - struct zspage *zspage; - VM_BUG_ON_PAGE(!PageIsolated(page), page); - - zspage = get_zspage(page); - pool = zspage->pool; - spin_lock(&pool->lock); - dec_zspage_isolation(zspage); - spin_unlock(&pool->lock); } static const struct movable_operations zsmalloc_mops = { From 7b43362da84758c92bce2caded40131375e648a1 Mon Sep 17 00:00:00 2001 From: Zhongkun He Date: Mon, 19 Feb 2024 10:44:53 +0800 Subject: [PATCH 1289/1406] mm/z3fold: fix the comment for __encode_handle() The comment above __encode_handle(), saying "Pool lock should be held as this function accesses first_num", is confusing: first_num is an element of z3fold_header, which is protected by z3fold_header->page_lock. I found the same comment for encode_handle() in zbud.c by accident; there, "Pool lock should be held as this function accesses first|last_chunks" makes sense, since those are elements of zbud_header, which does not have a lock of its own, so the pool lock must be held. Z3fold is based on zbud, so the comment probably came from zbud, but it is wrong here; fix it. Link: https://lkml.kernel.org/r/20240219024453.2240147-1-hezhongkun.hzk@bytedance.com Signed-off-by: Zhongkun He Cc: Johannes Weiner Cc: Miaohe Lin Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index 58946cacbfbb86..446cd11e2cad2e 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -364,8 +364,9 @@ static inline int __idx(struct z3fold_header *zhdr, enum buddy bud) } /* - * Encodes the handle of a particular buddy within a z3fold page - * Pool lock should be held as this function accesses first_num + * Encodes the handle of a particular buddy within a z3fold page. + * Zhdr->page_lock should be held as this function accesses first_num + * if bud != HEADLESS.
*/ static unsigned long __encode_handle(struct z3fold_header *zhdr, struct z3fold_buddy_slots *slots, From 937503d484d6d452440cd963fb3629496cd3edb1 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 16 Feb 2024 08:55:04 +0000 Subject: [PATCH 1290/1406] mm/zswap: global lru and shrinker shared by all zswap_pools Patch series "mm/zswap: optimize for dynamic zswap_pools", v3. Dynamic pool creation has been supported for a long time, though it may not be used much in practice. But with the per-memcg lru merged, the current structure of zswap_pool's lru and shrinker becomes less optimal. In the current structure, each zswap_pool has its own lru, shrinker and shrink_work, but only the latest zswap_pool is the one currently used. 1. When memory is under pressure, the shrinkers of all zswap_pools will try to shrink their own lru lists; there is no order between them. 2. When the zswap limit is hit, only the last zswap_pool's shrink_work will try to shrink its own lru, which is inefficient. A more natural way is to have a global zswap lru shared between all zswap_pools, and so is the shrinker. The code becomes much simpler too. Another optimization is changing the zswap_pool kref to a percpu_ref, a reference to which is taken by every zswap entry. So the scalability is better. Testing kernel build (32 threads) in tmpfs with memory.max=2GB. (zswap shrinker and writeback enabled with one 50GB swapfile, on a 128 CPUs x86-64 machine, below is the average of 5 runs)

        mm-unstable   zswap-global-lru
real    63.20         63.12
user    1061.75       1062.95
sys     268.74        264.44

This patch (of 3): Dynamic zswap_pool creation may create/reuse multiple zswap_pools in a list, and only the first one is currently used. Each zswap_pool has its own lru and shrinker, which is not necessary and has its problems: 1. When memory is under pressure, the shrinker of every zswap_pool will try to shrink its own lru; there is no order between them. 2. When the zswap limit is hit, only the last zswap_pool's shrink_work will try to shrink its lru list. The rationale here was to try and empty the old pool first so that we can completely drop it. However, since we only support exclusive loads now, the LRU ordering should be entirely decided by the order of stores, so the oldest entries on the LRU will naturally be from the oldest pool. Anyway, having a global lru and shrinker shared by all zswap_pools is better and more efficient. Link: https://lkml.kernel.org/r/20240210-zswap-global-lru-v3-0-200495333595@bytedance.com Link: https://lkml.kernel.org/r/20240210-zswap-global-lru-v3-1-200495333595@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Yosry Ahmed Cc: Johannes Weiner Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/zswap.c | 171 +++++++++++++++++++++-------------------------------- 1 file changed, 66 insertions(+), 105 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 51de79aa86593d..0141c45a5a6f61 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -176,14 +176,19 @@ struct zswap_pool { struct kref kref; struct list_head list; struct work_struct release_work; - struct work_struct shrink_work; struct hlist_node node; char tfm_name[CRYPTO_MAX_ALG_NAME]; +}; + +static struct { struct list_lru list_lru; - struct mem_cgroup *next_shrink; - struct shrinker *shrinker; atomic_t nr_stored; -}; + struct shrinker *shrinker; + struct work_struct shrink_work; + struct mem_cgroup *next_shrink; + /* The lock protects next_shrink.
*/ + spinlock_t shrink_lock; +} zswap; /* * struct zswap_entry @@ -301,9 +306,6 @@ static void zswap_update_total_size(void) * pool functions **********************************/ -static void zswap_alloc_shrinker(struct zswap_pool *pool); -static void shrink_worker(struct work_struct *w); - static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { int i; @@ -353,30 +355,16 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) if (ret) goto error; - zswap_alloc_shrinker(pool); - if (!pool->shrinker) - goto error; - - pr_debug("using %s compressor\n", pool->tfm_name); - /* being the current pool takes 1 ref; this func expects the * caller to always add the new pool as the current pool */ kref_init(&pool->kref); INIT_LIST_HEAD(&pool->list); - if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) - goto lru_fail; - shrinker_register(pool->shrinker); - INIT_WORK(&pool->shrink_work, shrink_worker); - atomic_set(&pool->nr_stored, 0); zswap_pool_debug("created", pool); return pool; -lru_fail: - list_lru_destroy(&pool->list_lru); - shrinker_free(pool->shrinker); error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); @@ -434,15 +422,8 @@ static void zswap_pool_destroy(struct zswap_pool *pool) zswap_pool_debug("destroying", pool); - shrinker_free(pool->shrinker); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); - list_lru_destroy(&pool->list_lru); - - spin_lock(&zswap_pools_lock); - mem_cgroup_iter_break(NULL, pool->next_shrink); - pool->next_shrink = NULL; - spin_unlock(&zswap_pools_lock); for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) zpool_destroy_pool(pool->zpools[i]); @@ -529,24 +510,6 @@ static struct zswap_pool *zswap_pool_current_get(void) return pool; } -static struct zswap_pool *zswap_pool_last_get(void) -{ - struct zswap_pool *pool, *last = NULL; - - rcu_read_lock(); - - list_for_each_entry_rcu(pool, &zswap_pools, list) - last = pool; - WARN_ONCE(!last && zswap_has_pool, - "%s: no page storage pool!\n", __func__); - if (!zswap_pool_get(last)) - last = NULL; - - rcu_read_unlock(); - - return last; -} - /* type and compressor must be null-terminated */ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) { @@ -816,15 +779,11 @@ void zswap_folio_swapin(struct folio *folio) void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) { - struct zswap_pool *pool; - - /* lock out zswap pools list modification */ - spin_lock(&zswap_pools_lock); - list_for_each_entry(pool, &zswap_pools, list) { - if (pool->next_shrink == memcg) - pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); - } - spin_unlock(&zswap_pools_lock); + /* lock out zswap shrinker walking memcg tree */ + spin_lock(&zswap.shrink_lock); + if (zswap.next_shrink == memcg) + zswap.next_shrink = mem_cgroup_iter(NULL, zswap.next_shrink, NULL); + spin_unlock(&zswap.shrink_lock); } /********************************* @@ -923,9 +882,9 @@ static void zswap_entry_free(struct zswap_entry *entry) if (!entry->length) atomic_dec(&zswap_same_filled_pages); else { - zswap_lru_del(&entry->pool->list_lru, entry); + zswap_lru_del(&zswap.list_lru, entry); zpool_free(zswap_find_zpool(entry), entry->handle); - atomic_dec(&entry->pool->nr_stored); + atomic_dec(&zswap.nr_stored); zswap_pool_put(entry->pool); } if (entry->objcg) { @@ -1287,7 +1246,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, { struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); unsigned long shrink_ret, nr_protected, 
lru_size; - struct zswap_pool *pool = shrinker->private_data; bool encountered_page_in_swapcache = false; if (!zswap_shrinker_enabled || @@ -1298,7 +1256,7 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, nr_protected = atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - lru_size = list_lru_shrink_count(&pool->list_lru, sc); + lru_size = list_lru_shrink_count(&zswap.list_lru, sc); /* * Abort if we are shrinking into the protected region. @@ -1315,7 +1273,7 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, return SHRINK_STOP; } - shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb, + shrink_ret = list_lru_shrink_walk(&zswap.list_lru, sc, &shrink_memcg_cb, &encountered_page_in_swapcache); if (encountered_page_in_swapcache) @@ -1327,7 +1285,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, static unsigned long zswap_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) { - struct zswap_pool *pool = shrinker->private_data; struct mem_cgroup *memcg = sc->memcg; struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; @@ -1341,8 +1298,8 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); #else /* use pool stats instead of memcg stats */ - nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT; - nr_stored = atomic_read(&pool->nr_stored); + nr_backing = zswap_pool_total_size >> PAGE_SHIFT; + nr_stored = atomic_read(&zswap.nr_stored); #endif if (!nr_stored) @@ -1350,7 +1307,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, nr_protected = atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - nr_freeable = list_lru_shrink_count(&pool->list_lru, sc); + nr_freeable = list_lru_shrink_count(&zswap.list_lru, sc); /* * Subtract the lru size by an estimate of the number of pages * that should be protected. @@ -1366,23 +1323,24 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, return mult_frac(nr_freeable, nr_backing, nr_stored); } -static void zswap_alloc_shrinker(struct zswap_pool *pool) +static struct shrinker *zswap_alloc_shrinker(void) { - pool->shrinker = + struct shrinker *shrinker; + + shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap"); - if (!pool->shrinker) - return; + if (!shrinker) + return NULL; - pool->shrinker->private_data = pool; - pool->shrinker->scan_objects = zswap_shrinker_scan; - pool->shrinker->count_objects = zswap_shrinker_count; - pool->shrinker->batch = 0; - pool->shrinker->seeks = DEFAULT_SEEKS; + shrinker->scan_objects = zswap_shrinker_scan; + shrinker->count_objects = zswap_shrinker_count; + shrinker->batch = 0; + shrinker->seeks = DEFAULT_SEEKS; + return shrinker; } static int shrink_memcg(struct mem_cgroup *memcg) { - struct zswap_pool *pool; int nid, shrunk = 0; if (!mem_cgroup_zswap_writeback_enabled(memcg)) @@ -1395,32 +1353,25 @@ static int shrink_memcg(struct mem_cgroup *memcg) if (memcg && !mem_cgroup_online(memcg)) return -ENOENT; - pool = zswap_pool_current_get(); - if (!pool) - return -EINVAL; - for_each_node_state(nid, N_NORMAL_MEMORY) { unsigned long nr_to_walk = 1; - shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg, + shrunk += list_lru_walk_one(&zswap.list_lru, nid, memcg, &shrink_memcg_cb, NULL, &nr_to_walk); } - zswap_pool_put(pool); return shrunk ? 
0 : -EAGAIN; } static void shrink_worker(struct work_struct *w) { - struct zswap_pool *pool = container_of(w, typeof(*pool), - shrink_work); struct mem_cgroup *memcg; int ret, failures = 0; /* global reclaim will select cgroup in a round-robin fashion. */ do { - spin_lock(&zswap_pools_lock); - pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); - memcg = pool->next_shrink; + spin_lock(&zswap.shrink_lock); + zswap.next_shrink = mem_cgroup_iter(NULL, zswap.next_shrink, NULL); + memcg = zswap.next_shrink; /* * We need to retry if we have gone through a full round trip, or if we @@ -1434,7 +1385,7 @@ static void shrink_worker(struct work_struct *w) * memcg is not killed when we are reclaiming. */ if (!memcg) { - spin_unlock(&zswap_pools_lock); + spin_unlock(&zswap.shrink_lock); if (++failures == MAX_RECLAIM_RETRIES) break; @@ -1444,15 +1395,15 @@ static void shrink_worker(struct work_struct *w) if (!mem_cgroup_tryget_online(memcg)) { /* drop the reference from mem_cgroup_iter() */ mem_cgroup_iter_break(NULL, memcg); - pool->next_shrink = NULL; - spin_unlock(&zswap_pools_lock); + zswap.next_shrink = NULL; + spin_unlock(&zswap.shrink_lock); if (++failures == MAX_RECLAIM_RETRIES) break; goto resched; } - spin_unlock(&zswap_pools_lock); + spin_unlock(&zswap.shrink_lock); ret = shrink_memcg(memcg); /* drop the extra reference */ @@ -1466,7 +1417,6 @@ static void shrink_worker(struct work_struct *w) resched: cond_resched(); } while (!zswap_can_accept()); - zswap_pool_put(pool); } static int zswap_is_page_same_filled(void *ptr, unsigned long *value) @@ -1507,7 +1457,6 @@ bool zswap_store(struct folio *folio) struct zswap_entry *entry, *dupentry; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; - struct zswap_pool *shrink_pool; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1575,7 +1524,7 @@ bool zswap_store(struct folio *folio) if (objcg) { memcg = get_mem_cgroup_from_objcg(objcg); - if (memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL)) { + if (memcg_list_lru_alloc(memcg, &zswap.list_lru, GFP_KERNEL)) { mem_cgroup_put(memcg); goto put_pool; } @@ -1606,8 +1555,8 @@ bool zswap_store(struct folio *folio) } if (entry->length) { INIT_LIST_HEAD(&entry->lru); - zswap_lru_add(&entry->pool->list_lru, entry); - atomic_inc(&entry->pool->nr_stored); + zswap_lru_add(&zswap.list_lru, entry); + atomic_inc(&zswap.nr_stored); } spin_unlock(&tree->lock); @@ -1639,9 +1588,7 @@ bool zswap_store(struct folio *folio) return false; shrink: - shrink_pool = zswap_pool_last_get(); - if (shrink_pool && !queue_work(shrink_wq, &shrink_pool->shrink_work)) - zswap_pool_put(shrink_pool); + queue_work(shrink_wq, &zswap.shrink_work); goto reject; } @@ -1803,6 +1750,22 @@ static int zswap_setup(void) if (ret) goto hp_fail; + shrink_wq = alloc_workqueue("zswap-shrink", + WQ_UNBOUND|WQ_MEM_RECLAIM, 1); + if (!shrink_wq) + goto shrink_wq_fail; + + zswap.shrinker = zswap_alloc_shrinker(); + if (!zswap.shrinker) + goto shrinker_fail; + if (list_lru_init_memcg(&zswap.list_lru, zswap.shrinker)) + goto lru_fail; + shrinker_register(zswap.shrinker); + + INIT_WORK(&zswap.shrink_work, shrink_worker); + atomic_set(&zswap.nr_stored, 0); + spin_lock_init(&zswap.shrink_lock); + pool = __zswap_pool_create_fallback(); if (pool) { pr_info("loaded using pool %s/%s\n", pool->tfm_name, @@ -1814,19 +1777,17 @@ static int zswap_setup(void) zswap_enabled = false; } - shrink_wq = alloc_workqueue("zswap-shrink", - WQ_UNBOUND|WQ_MEM_RECLAIM, 1); - if (!shrink_wq) - 
goto fallback_fail; - if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); zswap_init_state = ZSWAP_INIT_SUCCEED; return 0; -fallback_fail: - if (pool) - zswap_pool_destroy(pool); +lru_fail: + shrinker_free(zswap.shrinker); +shrinker_fail: + destroy_workqueue(shrink_wq); +shrink_wq_fail: + cpuhp_remove_multi_state(CPUHP_MM_ZSWP_POOL_PREPARE); hp_fail: kmem_cache_destroy(zswap_entry_cache); cache_fail: From 767c01f6e0b9efa25c058cff01af87ef878335bc Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 16 Feb 2024 08:55:05 +0000 Subject: [PATCH 1291/1406] mm/zswap: change zswap_pool kref to percpu_ref All zswap entries will take a reference of zswap_pool when zswap_store(), and drop it when free. Change it to use the percpu_ref is better for scalability performance. Although percpu_ref use a bit more memory which should be ok for our use case, since we almost have only one zswap_pool to be using. The performance gain is for zswap_store/load hotpath. Testing kernel build (32 threads) in tmpfs with memory.max=2GB. (zswap shrinker and writeback enabled with one 50GB swapfile, on a 128 CPUs x86-64 machine, below is the average of 5 runs) mm-unstable zswap-global-lru real 63.20 63.12 user 1061.75 1062.95 sys 268.74 264.44 Link: https://lkml.kernel.org/r/20240210-zswap-global-lru-v3-2-200495333595@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Nhat Pham Cc: Johannes Weiner Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 0141c45a5a6f61..011e068eb355e6 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -173,7 +173,7 @@ struct crypto_acomp_ctx { struct zswap_pool { struct zpool *zpools[ZSWAP_NR_ZPOOLS]; struct crypto_acomp_ctx __percpu *acomp_ctx; - struct kref kref; + struct percpu_ref ref; struct list_head list; struct work_struct release_work; struct hlist_node node; @@ -305,6 +305,7 @@ static void zswap_update_total_size(void) /********************************* * pool functions **********************************/ +static void __zswap_pool_empty(struct percpu_ref *ref); static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { @@ -358,13 +359,18 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) /* being the current pool takes 1 ref; this func expects the * caller to always add the new pool as the current pool */ - kref_init(&pool->kref); + ret = percpu_ref_init(&pool->ref, __zswap_pool_empty, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); + if (ret) + goto ref_fail; INIT_LIST_HEAD(&pool->list); zswap_pool_debug("created", pool); return pool; +ref_fail: + cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); @@ -437,8 +443,9 @@ static void __zswap_pool_release(struct work_struct *work) synchronize_rcu(); - /* nobody should have been able to get a kref... */ - WARN_ON(kref_get_unless_zero(&pool->kref)); + /* nobody should have been able to get a ref... */ + WARN_ON(!percpu_ref_is_zero(&pool->ref)); + percpu_ref_exit(&pool->ref); /* pool is now off zswap_pools list and has no references. 
*/ zswap_pool_destroy(pool); @@ -446,11 +453,11 @@ static void __zswap_pool_release(struct work_struct *work) static struct zswap_pool *zswap_pool_current(void); -static void __zswap_pool_empty(struct kref *kref) +static void __zswap_pool_empty(struct percpu_ref *ref) { struct zswap_pool *pool; - pool = container_of(kref, typeof(*pool), kref); + pool = container_of(ref, typeof(*pool), ref); spin_lock(&zswap_pools_lock); @@ -469,12 +476,12 @@ static int __must_check zswap_pool_get(struct zswap_pool *pool) if (!pool) return 0; - return kref_get_unless_zero(&pool->kref); + return percpu_ref_tryget(&pool->ref); } static void zswap_pool_put(struct zswap_pool *pool) { - kref_put(&pool->kref, __zswap_pool_empty); + percpu_ref_put(&pool->ref); } static struct zswap_pool *__zswap_pool_current(void) @@ -604,6 +611,17 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, if (!pool) pool = zswap_pool_create(type, compressor); + else { + /* + * Restore the initial ref dropped by percpu_ref_kill() + * when the pool was decommissioned and switch it again + * to percpu mode. + */ + percpu_ref_resurrect(&pool->ref); + + /* Drop the ref from zswap_pool_find_get(). */ + zswap_pool_put(pool); + } if (pool) ret = param_set_charp(s, kp); @@ -642,7 +660,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, * or the new pool we failed to add */ if (put_pool) - zswap_pool_put(put_pool); + percpu_ref_kill(&put_pool->ref); return ret; } From 5cafb50c0c8df6f50a64ee4f98dc60ab72c3fd90 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 19 Feb 2024 13:10:47 +0900 Subject: [PATCH 1292/1406] sched/numa, mm: do not try to migrate memory to memoryless nodes Memoryless nodes do not have any memory to migrate to, so, as an optimization, stop trying it. Link: https://lkml.kernel.org/r/20240219041920.1183-1-byungchul@sk.com Link: https://lkml.kernel.org/r/20240216111502.79759-1-byungchul@sk.com Fixes: c574bbe917036 ("NUMA balancing: optimize page placement for memory tiering system") Signed-off-by: Byungchul Park Reviewed-by: Oscar Salvador Reviewed-by: "Huang, Ying" Reviewed-by: Phil Auld Reviewed-by: Davidlohr Bueso Acked-by: David Hildenbrand Cc: Benjamin Segall Cc: Daniel Bristot de Oliveira Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Mel Gorman Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Signed-off-by: Andrew Morton --- kernel/sched/fair.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 533547e3c90a75..d8d71ad7f9f8dc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1830,6 +1830,12 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, int dst_nid = cpu_to_node(dst_cpu); int last_cpupid, this_cpupid; + /* + * Cannot migrate to memoryless nodes. + */ + if (!node_state(dst_nid, N_MEMORY)) + return false; + /* * The pages in slow memory node should be migrated according * to hot/cold instead of private/shared. From d748102e8480f7f97b742560116fa1303ac25233 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Tue, 20 Feb 2024 06:53:00 +0000 Subject: [PATCH 1293/1406] mm/zsmalloc: remove set_zspage_mapping() Patch series "mm/zsmalloc: some cleanup for get/set_zspage_mapping()". The discussion[1] with Sergey shows there are some cleanup works to do in get/set_zspage_mapping(): - the fullness returned from get_zspage_mapping() is not stable outside pool->lock, this usage pattern is confusing, but should be ok in this free_zspage path. 
- we seldom use the class_idx returned from get_zspage_mapping(), only free_zspage path use to get its class. - set_zspage_mapping() always set the zspage->class, but it's never changed after zspage allocated. [1] https://lore.kernel.org/all/a6c22e30-cf10-4122-91bc-ceb9fb57a5d6@bytedance.com/ This patch (of 3): We only need to update zspage->fullness when insert_zspage(), since zspage->class is never changed after allocated. Link: https://lkml.kernel.org/r/20240220-b4-zsmalloc-cleanup-v1-0-5c5ee4ccdd87@bytedance.com Link: https://lkml.kernel.org/r/20240220-b4-zsmalloc-cleanup-v1-1-5c5ee4ccdd87@bytedance.com Signed-off-by: Chengming Zhou Cc: Johannes Weiner Cc: Minchan Kim Cc: Nhat Pham Cc: Sergey Senozhatsky Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index a48f4651d143bc..a6653915bf175e 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -486,14 +486,6 @@ static struct size_class *zspage_class(struct zs_pool *pool, return pool->size_class[zspage->class]; } -static void set_zspage_mapping(struct zspage *zspage, - unsigned int class_idx, - int fullness) -{ - zspage->class = class_idx; - zspage->fullness = fullness; -} - /* * zsmalloc divides the pool into various size classes where each * class maintains a list of zspages where each zspage is divided @@ -688,6 +680,7 @@ static void insert_zspage(struct size_class *class, { class_stat_inc(class, fullness, 1); list_add(&zspage->list, &class->fullness_list[fullness]); + zspage->fullness = fullness; } /* @@ -725,7 +718,6 @@ static int fix_fullness_group(struct size_class *class, struct zspage *zspage) remove_zspage(class, zspage, currfg); insert_zspage(class, zspage, newfg); - set_zspage_mapping(zspage, class_idx, newfg); out: return newfg; } @@ -1005,6 +997,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, create_page_chain(class, zspage, pages); init_zspage(class, zspage); zspage->pool = pool; + zspage->class = class->index; return zspage; } @@ -1397,7 +1390,6 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) obj = obj_malloc(pool, zspage, handle); newfg = get_fullness_group(class, zspage); insert_zspage(class, zspage, newfg); - set_zspage_mapping(zspage, class->index, newfg); record_obj(handle, obj); atomic_long_add(class->pages_per_zspage, &pool->pages_allocated); class_stat_inc(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage); @@ -1655,7 +1647,6 @@ static int putback_zspage(struct size_class *class, struct zspage *zspage) fullness = get_fullness_group(class, zspage); insert_zspage(class, zspage, fullness); - set_zspage_mapping(zspage, class->index, fullness); return fullness; } From 9034fc84303fc3ba288eaf6ff2be0083aaf47f7e Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Tue, 20 Feb 2024 06:53:01 +0000 Subject: [PATCH 1294/1406] mm/zsmalloc: remove_zspage() don't need fullness parameter We must remove_zspage() from its current fullness list, then use insert_zspage() to update its fullness and insert to new fullness list. Obviously, remove_zspage() doesn't need the fullness parameter. 
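For illustration, the update idiom that this and the next cleanup converge on looks roughly like the sketch below (based on fix_fullness_group() in the diffs; by the end of the series fullness is read from the zspage on removal and passed in only on insertion):

    /* move a zspage between fullness lists (sketch): */
    if (newfg != zspage->fullness) {
            remove_zspage(class, zspage);           /* reads zspage->fullness */
            insert_zspage(class, zspage, newfg);    /* sets zspage->fullness */
    }
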
Link: https://lkml.kernel.org/r/20240220-b4-zsmalloc-cleanup-v1-2-5c5ee4ccdd87@bytedance.com Signed-off-by: Chengming Zhou Cc: Johannes Weiner Cc: Minchan Kim Cc: Nhat Pham Cc: Sergey Senozhatsky Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index a6653915bf175e..c39fac9361d700 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -687,10 +687,10 @@ static void insert_zspage(struct size_class *class, * This function removes the given zspage from the freelist identified * by . */ -static void remove_zspage(struct size_class *class, - struct zspage *zspage, - int fullness) +static void remove_zspage(struct size_class *class, struct zspage *zspage) { + int fullness = zspage->fullness; + VM_BUG_ON(list_empty(&class->fullness_list[fullness])); list_del_init(&zspage->list); @@ -716,7 +716,7 @@ static int fix_fullness_group(struct size_class *class, struct zspage *zspage) if (newfg == currfg) goto out; - remove_zspage(class, zspage, currfg); + remove_zspage(class, zspage); insert_zspage(class, zspage, newfg); out: return newfg; @@ -878,7 +878,7 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class, return; } - remove_zspage(class, zspage, ZS_INUSE_RATIO_0); + remove_zspage(class, zspage); __free_zspage(pool, class, zspage); } @@ -1609,7 +1609,7 @@ static struct zspage *isolate_src_zspage(struct size_class *class) zspage = list_first_entry_or_null(&class->fullness_list[fg], struct zspage, list); if (zspage) { - remove_zspage(class, zspage, fg); + remove_zspage(class, zspage); return zspage; } } @@ -1626,7 +1626,7 @@ static struct zspage *isolate_dst_zspage(struct size_class *class) zspage = list_first_entry_or_null(&class->fullness_list[fg], struct zspage, list); if (zspage) { - remove_zspage(class, zspage, fg); + remove_zspage(class, zspage); return zspage; } } From b8c140be2b7dc548e1d9db5c7bfd1a1ddbf3bf28 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Tue, 20 Feb 2024 06:53:02 +0000 Subject: [PATCH 1295/1406] mm/zsmalloc: remove get_zspage_mapping() Actually we seldom use the class_idx returned from get_zspage_mapping(), only the zspage->fullness is useful, just use zspage->fullness to remove this helper. Note zspage->fullness is not stable outside pool->lock, remove redundant "VM_BUG_ON(fullness != ZS_INUSE_RATIO_0)" in async_free_zspage() since we already have the same VM_BUG_ON() in __free_zspage(), which is safe to access zspage->fullness with pool->lock held. 
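The replacement pattern is roughly as follows (a sketch; the helper's out-parameters become direct field reads, which the diff below keeps valid under pool->lock):

    /* before: out-parameter helper */
    get_zspage_mapping(zspage, &class_idx, &fullness);
    class = pool->size_class[class_idx];

    /* after: direct reads */
    class = zspage_class(pool, zspage);     /* pool->size_class[zspage->class] */
    fullness = zspage->fullness;
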
Link: https://lkml.kernel.org/r/20240220-b4-zsmalloc-cleanup-v1-3-5c5ee4ccdd87@bytedance.com Signed-off-by: Chengming Zhou Cc: Johannes Weiner Cc: Minchan Kim Cc: Nhat Pham Cc: Sergey Senozhatsky Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c39fac9361d700..63ec385cd670f3 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -470,16 +470,6 @@ static inline void set_freeobj(struct zspage *zspage, unsigned int obj) zspage->freeobj = obj; } -static void get_zspage_mapping(struct zspage *zspage, - unsigned int *class_idx, - int *fullness) -{ - BUG_ON(zspage->magic != ZSPAGE_MAGIC); - - *fullness = zspage->fullness; - *class_idx = zspage->class; -} - static struct size_class *zspage_class(struct zs_pool *pool, struct zspage *zspage) { @@ -708,12 +698,10 @@ static void remove_zspage(struct size_class *class, struct zspage *zspage) */ static int fix_fullness_group(struct size_class *class, struct zspage *zspage) { - int class_idx; - int currfg, newfg; + int newfg; - get_zspage_mapping(zspage, &class_idx, &currfg); newfg = get_fullness_group(class, zspage); - if (newfg == currfg) + if (newfg == zspage->fullness) goto out; remove_zspage(class, zspage); @@ -835,15 +823,11 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, struct zspage *zspage) { struct page *page, *next; - int fg; - unsigned int class_idx; - - get_zspage_mapping(zspage, &class_idx, &fg); assert_spin_locked(&pool->lock); VM_BUG_ON(get_zspage_inuse(zspage)); - VM_BUG_ON(fg != ZS_INUSE_RATIO_0); + VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0); next = page = get_first_page(zspage); do { @@ -1857,8 +1841,6 @@ static void async_free_zspage(struct work_struct *work) { int i; struct size_class *class; - unsigned int class_idx; - int fullness; struct zspage *zspage, *tmp; LIST_HEAD(free_pages); struct zs_pool *pool = container_of(work, struct zs_pool, @@ -1879,10 +1861,8 @@ static void async_free_zspage(struct work_struct *work) list_del(&zspage->list); lock_zspage(zspage); - get_zspage_mapping(zspage, &class_idx, &fullness); - VM_BUG_ON(fullness != ZS_INUSE_RATIO_0); - class = pool->size_class[class_idx]; spin_lock(&pool->lock); + class = zspage_class(pool, zspage); __free_zspage(pool, class, zspage); spin_unlock(&pool->lock); } From 25a036697a2e5b4c119c76ab147ab095611eb2d2 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Tue, 20 Feb 2024 07:38:51 +0000 Subject: [PATCH 1296/1406] MAINTAINERS: add Chengming Zhou as a zswap reviewer I have been actively contributing to zswap and reviewing zswap patches for a while, and I am already getting CC'd on most of them. So add myself as a reviewer, will continue to work on it and help with the review process. 
Link: https://lkml.kernel.org/r/20240220073851.865113-1-chengming.zhou@linux.dev Signed-off-by: Chengming Zhou Cc: Johannes Weiner Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 129a237b788044..f4ddbcdfb29ac4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24426,6 +24426,7 @@ ZSWAP COMPRESSED SWAP CACHING M: Johannes Weiner M: Yosry Ahmed M: Nhat Pham +R: Chengming Zhou L: linux-mm@kvack.org S: Maintained F: Documentation/admin-guide/mm/zswap.rst From 5be0361bce7d5ccfc82cd847444accc74497175b Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 20 Feb 2024 13:32:17 -0500 Subject: [PATCH 1297/1406] mm/page_alloc: remove unused fpi_flags in free_pages_prepare() Patch series "Enable >0 order folio memory compaction", v7. This patchset enables >0 order folio memory compaction, which is one of the prerequisites for large folio support[1]. I am aware that splitting free pages is necessary for folio migration in compaction, since if >0 order free pages are never split and no order-0 free page is scanned, compaction will end prematurely because migration returns -ENOMEM. Free page split becomes a must instead of an optimization. lkp ncompare results (on an 8-CPU (Intel Xeon E5-2650 v4 @2.20GHz) 16G VM) for default LRU (-no-mglru) and CONFIG_LRU_GEN are shown at the bottom, copied from V3[4]. In sum, most of the vm-scalability applications do not see a performance change, and the others see ~4% to ~26% performance boost under default LRU and ~2% to ~6% performance boost under CONFIG_LRU_GEN. Overview === To support >0 order folio compaction, the patchset changes how free pages used for migration are kept during compaction. Free pages used to be split into order-0 pages that are post allocation processed (i.e., PageBuddy flag cleared, page order stored in page->private is zeroed, and page reference is set to 1). Now all free pages are kept in a NR_PAGE_ORDERS array of page lists based on their order without the post allocation process. When migrate_pages() asks for a new page, one of the free pages, based on the requested page order, is then processed and given out. THPs smaller than 2MB would also need this feature. [1] https://lore.kernel.org/linux-mm/f8d47176-03a8-99bf-a813-b5942830fd73@arm.com/ [2] https://lore.kernel.org/linux-mm/20231113170157.280181-1-zi.yan@sent.com/ [3] https://lore.kernel.org/linux-mm/20240123034636.1095672-1-zi.yan@sent.com/ [4] https://lore.kernel.org/linux-mm/20240202161554.565023-1-zi.yan@sent.com/ [5] https://lore.kernel.org/linux-mm/20240212163510.859822-1-zi.yan@sent.com/ [6] https://lore.kernel.org/linux-mm/20240214220420.1229173-1-zi.yan@sent.com/ [7] https://lore.kernel.org/linux-mm/20240216170432.1268753-1-zi.yan@sent.com/ This patch (of 4): Commit 0a54864f8dfb ("kasan: remove PG_skip_kasan_poison flag") removes the use of fpi_flags in should_skip_kasan_poison() and fpi_flags is only passed to should_skip_kasan_poison() in free_pages_prepare(). Remove the unused parameter. Link: https://lkml.kernel.org/r/20240220183220.1451315-1-zi.yan@sent.com Link: https://lkml.kernel.org/r/20240220183220.1451315-2-zi.yan@sent.com Signed-off-by: Zi Yan Reviewed-by: Vlastimil Babka Reviewed-by: David Hildenbrand Cc: Adam Manzanares Cc: Baolin Wang Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Kemeng Shi Cc: Kirill A.
Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Cc: Yin Fengwei Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/page_alloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9faca05d124e60..dc59fb225cbf5c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1061,7 +1061,7 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page) * on-demand allocation and then freed again before the deferred pages * initialization is done, but this is not likely to happen. */ -static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +static inline bool should_skip_kasan_poison(struct page *page) { if (IS_ENABLED(CONFIG_KASAN_GENERIC)) return deferred_pages_enabled(); @@ -1081,10 +1081,10 @@ static void kernel_init_pages(struct page *page, int numpages) } static __always_inline bool free_pages_prepare(struct page *page, - unsigned int order, fpi_t fpi_flags) + unsigned int order) { int bad = 0; - bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); + bool skip_kasan_poison = should_skip_kasan_poison(page); bool init = want_init_on_free(); bool compound = PageCompound(page); @@ -1266,7 +1266,7 @@ static void __free_pages_ok(struct page *page, unsigned int order, unsigned long pfn = page_to_pfn(page); struct zone *zone = page_zone(page); - if (!free_pages_prepare(page, order, fpi_flags)) + if (!free_pages_prepare(page, order)) return; /* @@ -2343,7 +2343,7 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn, { int migratetype; - if (!free_pages_prepare(page, order, FPI_NONE)) + if (!free_pages_prepare(page, order)) return false; migratetype = get_pfnblock_migratetype(page, pfn); From 6f7d8aeedb585b653b63a453a1bf8cc20ebd816d Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 20 Feb 2024 13:32:18 -0500 Subject: [PATCH 1298/1406] mm/compaction: enable compacting >0 order folios. migrate_pages() supports >0 order folio migration and during compaction, even if compaction_alloc() cannot provide >0 order free pages, migrate_pages() can split the source page and try to migrate the base pages from the split. It can be a baseline and start point for adding support for compacting >0 order folios. Link: https://lkml.kernel.org/r/20240220183220.1451315-3-zi.yan@sent.com Signed-off-by: Zi Yan Suggested-by: Huang Ying Reviewed-by: Baolin Wang Reviewed-by: Vlastimil Babka Tested-by: Baolin Wang Tested-by: Yu Zhao Cc: Adam Manzanares Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kemeng Shi Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/compaction.c | 101 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 76 insertions(+), 25 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 52ff6b9344c707..2ba9ba49b0e9ff 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -40,9 +40,22 @@ static inline void count_compact_events(enum vm_event_item item, long delta) { count_vm_events(item, delta); } + +/* + * order == -1 is expected when compacting proactively via + * 1. /proc/sys/vm/compact_memory + * 2. /sys/devices/system/node/nodex/compact + * 3. 
/proc/sys/vm/compaction_proactiveness + */ +static inline bool is_via_compact_memory(int order) +{ + return order == -1; +} + #else #define count_compact_event(item) do { } while (0) #define count_compact_events(item, delta) do { } while (0) +static inline bool is_via_compact_memory(int order) { return false; } #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA @@ -816,6 +829,32 @@ static bool too_many_isolated(struct compact_control *cc) return too_many; } +/** + * skip_isolation_on_order() - determine when to skip folio isolation based on + * folio order and compaction target order + * @order: to-be-isolated folio order + * @target_order: compaction target order + * + * This avoids unnecessary folio isolations during compaction. + */ +static bool skip_isolation_on_order(int order, int target_order) +{ + /* + * Unless we are performing global compaction (i.e., + * is_via_compact_memory), skip any folios that are larger than the + * target order: we wouldn't be here if we'd have a free folio with + * the desired target_order, so migrating this folio would likely fail + * later. + */ + if (!is_via_compact_memory(target_order) && order >= target_order) + return true; + /* + * We limit memory compaction to pageblocks and won't try + * creating free blocks of memory that are larger than that. + */ + return order >= pageblock_order; +} + /** * isolate_migratepages_block() - isolate all migrate-able pages within * a single pageblock @@ -947,7 +986,22 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, valid_page = page; } - if (PageHuge(page) && cc->alloc_contig) { + if (PageHuge(page)) { + /* + * skip hugetlbfs if we are not compacting for pages + * bigger than its order. THPs and other compound pages + * are handled below. + */ + if (!cc->alloc_contig) { + const unsigned int order = compound_order(page); + + if (order <= MAX_PAGE_ORDER) { + low_pfn += (1UL << order) - 1; + nr_scanned += (1UL << order) - 1; + } + goto isolate_fail; + } + /* for alloc_contig case */ if (locked) { unlock_page_lruvec_irqrestore(locked, flags); locked = NULL; @@ -1008,21 +1062,24 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } /* - * Regardless of being on LRU, compound pages such as THP and - * hugetlbfs are not to be compacted unless we are attempting - * an allocation much larger than the huge page size (eg CMA). - * We can potentially save a lot of iterations if we skip them - * at once. The check is racy, but we can consider only valid - * values and the only danger is skipping too much. + * Regardless of being on LRU, compound pages such as THP + * (hugetlbfs is handled above) are not to be compacted unless + * we are attempting an allocation larger than the compound + * page size. We can potentially save a lot of iterations if we + * skip them at once. The check is racy, but we can consider + * only valid values and the only danger is skipping too much. */ if (PageCompound(page) && !cc->alloc_contig) { const unsigned int order = compound_order(page); - if (likely(order <= MAX_PAGE_ORDER)) { - low_pfn += (1UL << order) - 1; - nr_scanned += (1UL << order) - 1; + /* Skip based on page order and compaction target order. 
*/ if (skip_isolation_on_order(order, cc->order)) { if (order <= MAX_PAGE_ORDER) { low_pfn += (1UL << order) - 1; nr_scanned += (1UL << order) - 1; } + goto isolate_fail; } - goto isolate_fail; } /* @@ -1165,10 +1222,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } /* - * folio become large since the non-locked check, - * and it's on LRU. + * Check LRU folio order under the lock */ - if (unlikely(folio_test_large(folio) && !cc->alloc_contig)) { + if (unlikely(skip_isolation_on_order(folio_order(folio), + cc->order) && + !cc->alloc_contig)) { low_pfn += folio_nr_pages(folio) - 1; nr_scanned += folio_nr_pages(folio) - 1; folio_set_lru(folio); @@ -1788,6 +1846,10 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data) struct compact_control *cc = (struct compact_control *)data; struct folio *dst; + /* this makes migrate_pages() split the source page and retry */ + if (folio_test_large(src)) + return NULL; + if (list_empty(&cc->freepages)) { isolate_freepages(cc); @@ -2090,17 +2152,6 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; } -/* - * order == -1 is expected when compacting proactively via - * 1. /proc/sys/vm/compact_memory - * 2. /sys/devices/system/node/nodex/compact - * 3. /proc/sys/vm/compaction_proactiveness - */ -static inline bool is_via_compact_memory(int order) -{ - return order == -1; -} - /* * Determine whether kswapd is (or recently was!) running on this node. * From 3cd99feed10aba3cdabdfb764c1e8b0829fce2a4 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 20 Feb 2024 13:32:19 -0500 Subject: [PATCH 1299/1406] mm/compaction: add support for >0 order folio memory compaction. Before the last commit, memory compaction only migrated order-0 folios and skipped >0 order folios. The last commit splits all >0 order folios during compaction. This commit migrates >0 order folios during compaction by keeping isolated free pages at their original size, without splitting them into order-0 pages, and using them directly during the migration process. What is different from the prior implementation: 1. All isolated free pages are kept in a NR_PAGE_ORDERS array of page lists, where each page list stores free pages of the same order. 2. The free pages are not post_alloc_hook() processed and are not buddy pages, although their orders are stored in the first page's private field, like buddy pages. 3. During migration, at new page allocation time (i.e., in compaction_alloc()), free pages are then processed by post_alloc_hook(). When migration fails and a new page is returned (i.e., in compaction_free()), free pages are restored by reversing the post_alloc_hook() operations using the newly added free_pages_prepare_fpi_none(). Step 3 is done for a later optimization: it makes splitting and/or merging free pages during compaction easier. Note: without splitting free pages, compaction can end prematurely because migration will return -ENOMEM even if there are free pages. This happens when no order-0 free page exists and compaction_alloc() returns NULL. Link: https://lkml.kernel.org/r/20240220183220.1451315-4-zi.yan@sent.com Signed-off-by: Zi Yan Reviewed-by: Baolin Wang Reviewed-by: Vlastimil Babka Tested-by: Baolin Wang Tested-by: Yu Zhao Cc: Adam Manzanares Cc: David Hildenbrand Cc: Huang Ying Cc: Johannes Weiner Cc: Kemeng Shi Cc: Kirill A.
Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/compaction.c | 140 +++++++++++++++++++++++++++--------------------- mm/internal.h | 4 +- mm/page_alloc.c | 2 +- 3 files changed, 83 insertions(+), 63 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 2ba9ba49b0e9ff..61b2c731c9db94 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -79,45 +79,56 @@ static inline bool is_via_compact_memory(int order) { return false; } #define COMPACTION_HPAGE_ORDER (PMD_SHIFT - PAGE_SHIFT) #endif -static unsigned long release_freepages(struct list_head *freelist) +static void split_map_pages(struct list_head *freepages) { + unsigned int i, order; struct page *page, *next; - unsigned long high_pfn = 0; + LIST_HEAD(tmp_list); - list_for_each_entry_safe(page, next, freelist, lru) { - unsigned long pfn = page_to_pfn(page); - list_del(&page->lru); - __free_page(page); - if (pfn > high_pfn) - high_pfn = pfn; - } + for (order = 0; order < NR_PAGE_ORDERS; order++) { + list_for_each_entry_safe(page, next, &freepages[order], lru) { + unsigned int nr_pages; - return high_pfn; + list_del(&page->lru); + + nr_pages = 1 << order; + + post_alloc_hook(page, order, __GFP_MOVABLE); + if (order) + split_page(page, order); + + for (i = 0; i < nr_pages; i++) { + list_add(&page->lru, &tmp_list); + page++; + } + } + list_splice_init(&tmp_list, &freepages[0]); + } } -static void split_map_pages(struct list_head *list) +static unsigned long release_free_list(struct list_head *freepages) { - unsigned int i, order, nr_pages; - struct page *page, *next; - LIST_HEAD(tmp_list); - - list_for_each_entry_safe(page, next, list, lru) { - list_del(&page->lru); + int order; + unsigned long high_pfn = 0; - order = page_private(page); - nr_pages = 1 << order; + for (order = 0; order < NR_PAGE_ORDERS; order++) { + struct page *page, *next; - post_alloc_hook(page, order, __GFP_MOVABLE); - if (order) - split_page(page, order); + list_for_each_entry_safe(page, next, &freepages[order], lru) { + unsigned long pfn = page_to_pfn(page); - for (i = 0; i < nr_pages; i++) { - list_add(&page->lru, &tmp_list); - page++; + list_del(&page->lru); + /* + * Convert free pages into post allocation pages, so + * that we can free them via __free_page. 
+ */ + post_alloc_hook(page, order, __GFP_MOVABLE); + __free_pages(page, order); + if (pfn > high_pfn) + high_pfn = pfn; } } - - list_splice(&tmp_list, list); + return high_pfn; } #ifdef CONFIG_COMPACTION @@ -670,7 +681,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, nr_scanned += isolated - 1; total_isolated += isolated; cc->nr_freepages += isolated; - list_add_tail(&page->lru, freelist); + list_add_tail(&page->lru, &freelist[order]); if (!strict && cc->nr_migratepages <= cc->nr_freepages) { blockpfn += isolated; @@ -735,7 +746,11 @@ isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { unsigned long isolated, pfn, block_start_pfn, block_end_pfn; - LIST_HEAD(freelist); + int order; + struct list_head tmp_freepages[NR_PAGE_ORDERS]; + + for (order = 0; order < NR_PAGE_ORDERS; order++) + INIT_LIST_HEAD(&tmp_freepages[order]); pfn = start_pfn; block_start_pfn = pageblock_start_pfn(pfn); @@ -766,7 +781,7 @@ isolate_freepages_range(struct compact_control *cc, break; isolated = isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, &freelist, 0, true); + block_end_pfn, tmp_freepages, 0, true); /* * In strict mode, isolate_freepages_block() returns 0 if @@ -783,15 +798,15 @@ isolate_freepages_range(struct compact_control *cc, */ } - /* __isolate_free_page() does not map the pages */ - split_map_pages(&freelist); - if (pfn < end_pfn) { /* Loop terminated early, cleanup. */ - release_freepages(&freelist); + release_free_list(tmp_freepages); return 0; } + /* __isolate_free_page() does not map the pages */ + split_map_pages(tmp_freepages); + /* We don't use freelists for anything. */ return pfn; } @@ -1518,7 +1533,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn) if (!page) return; - isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); + isolate_freepages_block(cc, &start_pfn, end_pfn, cc->freepages, 1, false); /* Skip this pageblock in the future as it's full or nearly full */ if (start_pfn == end_pfn && !cc->no_set_skip_hint) @@ -1647,7 +1662,7 @@ static void fast_isolate_freepages(struct compact_control *cc) nr_scanned += nr_isolated - 1; total_isolated += nr_isolated; cc->nr_freepages += nr_isolated; - list_add_tail(&page->lru, &cc->freepages); + list_add_tail(&page->lru, &cc->freepages[order]); count_compact_events(COMPACTISOLATED, nr_isolated); } else { /* If isolation fails, abort the search */ @@ -1724,13 +1739,12 @@ static void isolate_freepages(struct compact_control *cc) unsigned long isolate_start_pfn; /* exact pfn we start at */ unsigned long block_end_pfn; /* end of current pageblock */ unsigned long low_pfn; /* lowest pfn scanner is able to scan */ - struct list_head *freelist = &cc->freepages; unsigned int stride; /* Try a small search of the free lists for a candidate */ fast_isolate_freepages(cc); if (cc->nr_freepages) - goto splitmap; + return; /* * Initialise the free scanner. The starting point is where we last @@ -1790,7 +1804,7 @@ static void isolate_freepages(struct compact_control *cc) /* Found a block suitable for isolating free pages from. 
*/ nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, freelist, stride, false); + block_end_pfn, cc->freepages, stride, false); /* Update the skip hint if the full pageblock was scanned */ if (isolate_start_pfn == block_end_pfn) @@ -1831,10 +1845,6 @@ static void isolate_freepages(struct compact_control *cc) * and the loop terminated due to isolate_start_pfn < low_pfn */ cc->free_pfn = isolate_start_pfn; - -splitmap: - /* __isolate_free_page() does not map the pages */ - split_map_pages(freelist); } /* @@ -1845,24 +1855,22 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data) { struct compact_control *cc = (struct compact_control *)data; struct folio *dst; + int order = folio_order(src); - /* this makes migrate_pages() split the source page and retry */ - if (folio_test_large(src)) - return NULL; - - if (list_empty(&cc->freepages)) { + if (list_empty(&cc->freepages[order])) { isolate_freepages(cc); - - if (list_empty(&cc->freepages)) + if (list_empty(&cc->freepages[order])) return NULL; } - dst = list_entry(cc->freepages.next, struct folio, lru); + dst = list_first_entry(&cc->freepages[order], struct folio, lru); list_del(&dst->lru); - cc->nr_freepages--; - cc->nr_migratepages--; - - return dst; + post_alloc_hook(&dst->page, order, __GFP_MOVABLE); + if (order) + prep_compound_page(&dst->page, order); + cc->nr_freepages -= 1 << order; + cc->nr_migratepages -= 1 << order; + return page_rmappable_folio(&dst->page); } /* @@ -1873,10 +1881,19 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data) static void compaction_free(struct folio *dst, unsigned long data) { struct compact_control *cc = (struct compact_control *)data; + int order = folio_order(dst); + struct page *page = &dst->page; - list_add(&dst->lru, &cc->freepages); - cc->nr_freepages++; - cc->nr_migratepages++; + if (folio_put_testzero(dst)) { + free_pages_prepare(page, order); + list_add(&dst->lru, &cc->freepages[order]); + cc->nr_freepages += 1 << order; + } + cc->nr_migratepages += 1 << order; + /* + * someone else has referenced the page, we cannot take it back to our + * free list. + */ } /* possible outcome of isolate_migratepages */ @@ -2489,6 +2506,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) const bool sync = cc->mode != MIGRATE_ASYNC; bool update_cached; unsigned int nr_succeeded = 0, nr_migratepages; + int order; /* * These counters track activities during zone compaction. Initialize @@ -2498,7 +2516,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) cc->total_free_scanned = 0; cc->nr_migratepages = 0; cc->nr_freepages = 0; - INIT_LIST_HEAD(&cc->freepages); + for (order = 0; order < NR_PAGE_ORDERS; order++) + INIT_LIST_HEAD(&cc->freepages[order]); INIT_LIST_HEAD(&cc->migratepages); cc->migratetype = gfp_migratetype(cc->gfp_mask); @@ -2690,7 +2709,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) * so we don't leave any returned pages behind in the next attempt. 
*/ if (cc->nr_freepages > 0) { - unsigned long free_pfn = release_freepages(&cc->freepages); + unsigned long free_pfn = release_free_list(cc->freepages); cc->nr_freepages = 0; VM_BUG_ON(free_pfn == 0); @@ -2709,7 +2728,6 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) trace_mm_compaction_end(cc, start_pfn, end_pfn, sync, ret); - VM_BUG_ON(!list_empty(&cc->freepages)); VM_BUG_ON(!list_empty(&cc->migratepages)); return ret; diff --git a/mm/internal.h b/mm/internal.h index 1e29c5821a1dde..93e2291120452d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -447,6 +447,8 @@ extern void prep_compound_page(struct page *page, unsigned int order); extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); +extern bool free_pages_prepare(struct page *page, unsigned int order); + extern int user_min_free_kbytes; extern void free_unref_page(struct page *page, unsigned int order); @@ -481,7 +483,7 @@ int split_free_page(struct page *free_page, * completes when free_pfn <= migrate_pfn */ struct compact_control { - struct list_head freepages; /* List of free pages to migrate to */ + struct list_head freepages[NR_PAGE_ORDERS]; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ unsigned int nr_freepages; /* Number of isolated free pages */ unsigned int nr_migratepages; /* Number of pages to migrate */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dc59fb225cbf5c..51e13aa605ecbd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1080,7 +1080,7 @@ static void kernel_init_pages(struct page *page, int numpages) kasan_enable_current(); } -static __always_inline bool free_pages_prepare(struct page *page, +__always_inline bool free_pages_prepare(struct page *page, unsigned int order) { int bad = 0; From 879f8bb57ca31128cf9370f62ae73646861b2a35 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 20 Feb 2024 13:32:20 -0500 Subject: [PATCH 1300/1406] mm/compaction: optimize >0 order folio compaction with free page split. During migration in memory compaction, free pages are placed in an array of page lists based on their order. But the desired free page order (i.e., the order of a source page) might not always be present, leading to migration failures and premature compaction termination. Split a high-order free page when the source page has a lower order, to increase the migration success rate. Note: merging free pages when a migration fails and a lower-order free page is returned via compaction_free() is possible, but it is too much work. Since the free pages are not buddy pages, it is hard to identify them using the existing PFN-based page merging algorithm. Link: https://lkml.kernel.org/r/20240220183220.1451315-5-zi.yan@sent.com Signed-off-by: Zi Yan Reviewed-by: Baolin Wang Reviewed-by: Vlastimil Babka Tested-by: Baolin Wang Tested-by: Yu Zhao Cc: Adam Manzanares Cc: David Hildenbrand Cc: Huang Ying Cc: Johannes Weiner Cc: Kemeng Shi Cc: Kirill A.
Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/compaction.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 61b2c731c9db94..1faeffb287201c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1856,15 +1856,40 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data) struct compact_control *cc = (struct compact_control *)data; struct folio *dst; int order = folio_order(src); + bool has_isolated_pages = false; + int start_order; + struct page *freepage; + unsigned long size; + +again: + for (start_order = order; start_order < NR_PAGE_ORDERS; start_order++) + if (!list_empty(&cc->freepages[start_order])) + break; - if (list_empty(&cc->freepages[order])) { - isolate_freepages(cc); - if (list_empty(&cc->freepages[order])) + /* no free pages in the list */ + if (start_order == NR_PAGE_ORDERS) { + if (has_isolated_pages) return NULL; + isolate_freepages(cc); + has_isolated_pages = true; + goto again; + } + + freepage = list_first_entry(&cc->freepages[start_order], struct page, + lru); + size = 1 << start_order; + + list_del(&freepage->lru); + + while (start_order > order) { + start_order--; + size >>= 1; + + list_add(&freepage[size].lru, &cc->freepages[start_order]); + set_page_private(&freepage[size], start_order); } + dst = (struct folio *)freepage; - dst = list_first_entry(&cc->freepages[order], struct folio, lru); - list_del(&dst->lru); post_alloc_hook(&dst->page, order, __GFP_MOVABLE); if (order) prep_compound_page(&dst->page, order); From 00144cb00eca7835bbc76166dab8424ddc09f93e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 29 Jan 2024 13:01:31 +0100 Subject: [PATCH 1301/1406] shmem: properly report quota mount options Report quota options among the set of mount options. This allows proper user visibility into whether quotas are enabled or not. 
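For instance, after this change the quota options appear in mount info again. An illustrative session (the output line is a sketch of what shmem_show_options() now emits, not captured from a real system):

  mount -t tmpfs -o usrquota,grpquota tmpfs /mnt
  grep /mnt /proc/mounts
  tmpfs /mnt tmpfs rw,relatime,usrquota,grpquota 0 0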
Link: https://lkml.kernel.org/r/20240129120131.21145-1-jack@suse.cz Fixes: e09764cff44b ("shmem: quota support") Signed-off-by: Jan Kara Reviewed-by: Carlos Maiolino Acked-by: Hugh Dickins Signed-off-by: Andrew Morton --- mm/shmem.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index d7c84ff621860b..30c9dc86250506 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4265,6 +4265,24 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) mpol_put(mpol); if (sbinfo->noswap) seq_printf(seq, ",noswap"); +#ifdef CONFIG_TMPFS_QUOTA + if (sb_has_quota_active(root->d_sb, USRQUOTA)) + seq_printf(seq, ",usrquota"); + if (sb_has_quota_active(root->d_sb, GRPQUOTA)) + seq_printf(seq, ",grpquota"); + if (sbinfo->qlimits.usrquota_bhardlimit) + seq_printf(seq, ",usrquota_block_hardlimit=%lld", + sbinfo->qlimits.usrquota_bhardlimit); + if (sbinfo->qlimits.grpquota_bhardlimit) + seq_printf(seq, ",grpquota_block_hardlimit=%lld", + sbinfo->qlimits.grpquota_bhardlimit); + if (sbinfo->qlimits.usrquota_ihardlimit) + seq_printf(seq, ",usrquota_inode_hardlimit=%lld", + sbinfo->qlimits.usrquota_ihardlimit); + if (sbinfo->qlimits.grpquota_ihardlimit) + seq_printf(seq, ",grpquota_inode_hardlimit=%lld", + sbinfo->qlimits.grpquota_ihardlimit); +#endif return 0; } From f0d9f63dbc72732a745acc2f0f3d69b38fa90714 Mon Sep 17 00:00:00 2001 From: Zhaoyang Huang Date: Thu, 11 May 2023 13:22:30 +0800 Subject: [PATCH 1302/1406] mm: optimization on page allocation when CMA enabled Under the current CMA utilization policy, an alloc_pages(GFP_USER) call can 'steal' UNMOVABLE & RECLAIMABLE page blocks with the help of CMA (it passes zone_watermark_ok by counting CMA pages in, but uses UNMOVABLE & RECLAIMABLE blocks in rmqueue), which can make a subsequent alloc_pages(GFP_KERNEL) fail. Solve this by introducing a second watermark check for GFP_MOVABLE, which lets the allocation use CMA when appropriate:

-- Free_pages(30MB)
|
|
-- WMARK_LOW(25MB)
|
-- Free_CMA(12MB)
|
|
--

Link: https://lkml.kernel.org/r/20231016071245.2865233-1-zhaoyang.huang@unisoc.com Link: https://lkml.kernel.org/r/1683782550-25799-1-git-send-email-zhaoyang.huang@unisoc.com Signed-off-by: Zhaoyang Huang Cc: Joonsoo Kim Cc: ke.wang Cc: Minchan Kim Cc: Roman Gushchin Cc: Zhaoyang Huang Signed-off-by: Andrew Morton --- mm/page_alloc.c | 44 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 51e13aa605ecbd..b0b92ce997dc65 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2078,6 +2078,43 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, } +#ifdef CONFIG_CMA +/* + * GFP_MOVABLE allocation could drain UNMOVABLE & RECLAIMABLE page blocks via + * the help of CMA which makes GFP_KERNEL failed. Checking if zone_watermark_ok + * again without ALLOC_CMA to see if to use CMA first. + */ +static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags) +{ + unsigned long watermark; + bool cma_first = false; + + watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); + /* check if GFP_MOVABLE pass previous zone_watermark_ok via the help of CMA */ + if (zone_watermark_ok(zone, order, watermark, 0, alloc_flags & (~ALLOC_CMA))) { + /* + * Balance movable allocations between regular and CMA areas by + * allocating from CMA when over half of the zone's free memory + * is in the CMA area.
+ */ + cma_first = (zone_page_state(zone, NR_FREE_CMA_PAGES) > + zone_page_state(zone, NR_FREE_PAGES) / 2); + } else { + /* + * watermark failed means UNMOVABLE & RECLAIMBLE is not enough + * now, we should use cma first to keep them stay around the + * corresponding watermark + */ + cma_first = true; + } + return cma_first; +} +#else +static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags) +{ + return false; +} +#endif /* * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. @@ -2091,12 +2128,11 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, if (IS_ENABLED(CONFIG_CMA)) { /* * Balance movable allocations between regular and CMA areas by - * allocating from CMA when over half of the zone's free memory - * is in the CMA area. + * allocating from CMA base on judging zone_watermark_ok again + * to see if the latest check got pass via the help of CMA */ if (alloc_flags & ALLOC_CMA && - zone_page_state(zone, NR_FREE_CMA_PAGES) > - zone_page_state(zone, NR_FREE_PAGES) / 2) { + use_cma_first(zone, order, alloc_flags)) { page = __rmqueue_cma_fallback(zone, order); if (page) return page; From d1f84d6fcef82bc35dea74ad28d24e4e2ce5df2e Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Wed, 3 Jan 2024 08:48:36 -0800 Subject: [PATCH 1303/1406] mm: add defines for min/max swappiness Patch series "Add swappiness argument to memory.reclaim", v6. This patch proposes augmenting the memory.reclaim interface with a swappiness= argument that overrides the swappiness value for that instance of proactive reclaim. Userspace proactive reclaimers use the memory.reclaim interface to trigger reclaim. The memory.reclaim interface does not allow for any way to affect the balance of file vs anon during proactive reclaim. The only approach is to adjust the vm.swappiness setting. However, there are a few reasons we look to control the balance of file vs anon during proactive reclaim, separately from reactive reclaim: * Swapout should be limited to manage SSD write endurance. In near-OOM situations we are fine with lots of swap-out to avoid OOMs. As these are typically rare events, they have relatively little impact on write endurance. However, proactive reclaim runs continuously and so its impact on SSD write endurance is more significant. Therefore it is desirable to control swap-out for proactive reclaim separately from reactive reclaim. * Some userspace OOM killers like systemd-oomd[1] support OOM killing on swap exhaustion. This makes sense if the swap exhaustion is triggered due to reactive reclaim but less so if it is triggered due to proactive reclaim (e.g. one could see OOMs when free memory is ample but anon is just particularly cold). Therefore, it's desirable to have proactive reclaim reduce or stop swap-out before the threshold at which OOM killing occurs. In the case of Meta's Senpai proactive reclaimer, we adjust vm.swappiness before writes to memory.reclaim[2]. This has been in production for nearly two years and has addressed our needs to control proactive vs reactive reclaim behavior but is still not ideal for a number of reasons: * vm.swappiness is a global setting, adjusting it can race/interfere with other system administration that wishes to control vm.swappiness. In our case, we need to disable Senpai before adjusting vm.swappiness. * vm.swappiness is stateful - so a crash or restart of Senpai can leave a misconfigured setting.
This requires some additional management to record the "desired" setting and ensure Senpai always adjusts to it. With this patch, we avoid these downsides of adjusting vm.swappiness globally. Previously, this exact interface addition was proposed by Yosry[3]. In response, Roman proposed instead an interface to specify precise file/anon/slab reclaim amounts[4]. More recently Huan proposed this as well[5] and others similarly questioned if this was the proper interface. Previous proposals sought to use this to allow proactive reclaimers to effectively perform a custom reclaim algorithm by issuing proactive reclaim with different settings to control file vs anon reclaim (e.g. to only reclaim anon from some applications). Responses argued that adjusting swappiness is a poor interface for custom reclaim. In contrast, I argue in favor of a swappiness setting not as a way to implement custom reclaim algorithms but rather to bias the balance of anon vs file due to differences between proactive and reactive reclaim. In this context, swappiness is the existing interface for controlling this balance and this patch simply allows for it to be configured differently for proactive vs reactive reclaim. Specifying explicit amounts of anon vs file pages to reclaim feels inappropriate for this purpose. Proactive reclaimers are unaware of the relative age of file vs anon for a cgroup, which makes it difficult to manage proactive reclaim of different memory pools. A proactive reclaimer would need some amount of anon reclaim attempts separate from the amount of file reclaim attempts, which seems brittle given that it's difficult to observe the impact. [1]https://www.freedesktop.org/software/systemd/man/latest/systemd-oomd.service.html [2]https://github.com/facebookincubator/oomd/blob/main/src/oomd/plugins/Senpai.cpp#L585-L598 [3]https://lore.kernel.org/linux-mm/CAJD7tkbDpyoODveCsnaqBBMZEkDvshXJmNdbk51yKSNgD7aGdg@mail.gmail.com/ [4]https://lore.kernel.org/linux-mm/YoPHtHXzpK51F%2F1Z@carbon/ [5]https://lore.kernel.org/lkml/20231108065818.19932-1-link@vivo.com/ This patch (of 2): We use the constants 0 and 200 in a few places in the mm code when referring to the min and max swappiness. This patch adds MIN_SWAPPINESS and MAX_SWAPPINESS #defines to improve clarity. There are no functional changes.
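For intuition about what these constants bound: the reclaim balance in get_scan_count() weights anon scanning by swappiness and file scanning by (MAX_SWAPPINESS - swappiness), each divided by a per-LRU cost estimate. A minimal userspace sketch of that weighting, with made-up equal cost values (illustrative only, not kernel code):

#include <stdio.h>

#define MIN_SWAPPINESS 0
#define MAX_SWAPPINESS 200

int main(void)
{
	int swappiness = 60;	/* the vm_swappiness default */
	unsigned long total_cost = 100, anon_cost = 50, file_cost = 50;
	unsigned long ap, fp;

	/* mirrors the ap/fp computation in get_scan_count() */
	ap = swappiness * (total_cost + 1) / (anon_cost + 1);
	fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1) / (file_cost + 1);
	printf("anon:file scan weight = %lu:%lu\n", ap, fp);	/* 118:277 */
	return 0;
}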
Link: https://lkml.kernel.org/r/20240103164841.2800183-1-schatzberg.dan@gmail.com Link: https://lkml.kernel.org/r/20240103164841.2800183-2-schatzberg.dan@gmail.com Signed-off-by: Dan Schatzberg Acked-by: David Rientjes Acked-by: Chris Li Reviewed-by: Nhat Pham Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tejun Heo Cc: Yosry Ahmed Cc: Yue Zhao Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 ++ mm/memcontrol.c | 2 +- mm/vmscan.c | 12 ++++++------ 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 8d28f6091a320e..56cd072673b2c7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -404,6 +404,8 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #define MEMCG_RECLAIM_MAY_SWAP (1 << 1) #define MEMCG_RECLAIM_PROACTIVE (1 << 2) +#define MIN_SWAPPINESS 0 +#define MAX_SWAPPINESS 200 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cb216d30a22152..7cbb1eef5234d4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4383,7 +4383,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - if (val > 200) + if (val > MAX_SWAPPINESS) return -EINVAL; if (!mem_cgroup_is_root(memcg)) diff --git a/mm/vmscan.c b/mm/vmscan.c index e738a210cafc6d..07e20f16b1c1c8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -183,7 +183,7 @@ struct scan_control { #endif /* - * From 0 .. 200. Higher means more swappy. + * From 0 .. MAX_SWAPPINESS. Higher means more swappy. */ int vm_swappiness = 60; @@ -2405,7 +2405,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, ap = swappiness * (total_cost + 1); ap /= anon_cost + 1; - fp = (200 - swappiness) * (total_cost + 1); + fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1); fp /= file_cost + 1; fraction[0] = ap; @@ -4425,7 +4425,7 @@ static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx { int type, tier; struct ctrl_pos sp, pv; - int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; + int gain[ANON_AND_FILE] = { swappiness, MAX_SWAPPINESS - swappiness }; /* * Compare the first tier of anon with that of file to determine which @@ -4472,7 +4472,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw type = LRU_GEN_ANON; else if (swappiness == 1) type = LRU_GEN_FILE; - else if (swappiness == 200) + else if (swappiness == MAX_SWAPPINESS) type = LRU_GEN_ANON; else if (!(sc->gfp_mask & __GFP_IO)) type = LRU_GEN_FILE; @@ -5410,9 +5410,9 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, lruvec = get_lruvec(memcg, nid); - if (swappiness < 0) + if (swappiness < MIN_SWAPPINESS) swappiness = get_swappiness(lruvec, sc); - else if (swappiness > 200) + else if (swappiness > MAX_SWAPPINESS) goto done; switch (cmd) { From c09a8e005eff6c064e2e9f11549966c36a724fbf Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Wed, 3 Jan 2024 08:48:37 -0800 Subject: [PATCH 1304/1406] mm: add swappiness= arg to memory.reclaim Allow proactive reclaimers to submit an additional swappiness= argument to memory.reclaim. This overrides the global or per-memcg swappiness setting for that reclaim attempt. 
For example: echo "2M swappiness=0" > /sys/fs/cgroup/memory.reclaim will perform reclaim on the rootcg with a swappiness setting of 0 (no swap) regardless of the vm.swappiness sysctl setting. Userspace proactive reclaimers use the memory.reclaim interface to trigger reclaim. The memory.reclaim interface does not allow for any way to affect the balance of file vs anon during proactive reclaim. The only approach is to adjust the vm.swappiness setting. However, there are a few reasons we look to control the balance of file vs anon during proactive reclaim, separately from reactive reclaim: * Swapout should be limited to manage SSD write endurance. In near-OOM situations we are fine with lots of swap-out to avoid OOMs. As these are typically rare events, they have relatively little impact on write endurance. However, proactive reclaim runs continuously and so its impact on SSD write endurance is more significant. Therefore it is desirable to control swap-out for proactive reclaim separately from reactive reclaim. * Some userspace OOM killers like systemd-oomd[1] support OOM killing on swap exhaustion. This makes sense if the swap exhaustion is triggered due to reactive reclaim but less so if it is triggered due to proactive reclaim (e.g. one could see OOMs when free memory is ample but anon is just particularly cold). Therefore, it's desirable to have proactive reclaim reduce or stop swap-out before the threshold at which OOM killing occurs. In the case of Meta's Senpai proactive reclaimer, we adjust vm.swappiness before writes to memory.reclaim[2]. This has been in production for nearly two years and has addressed our needs to control proactive vs reactive reclaim behavior but is still not ideal for a number of reasons: * vm.swappiness is a global setting, adjusting it can race/interfere with other system administration that wishes to control vm.swappiness. In our case, we need to disable Senpai before adjusting vm.swappiness. * vm.swappiness is stateful - so a crash or restart of Senpai can leave a misconfigured setting. This requires some additional management to record the "desired" setting and ensure Senpai always adjusts to it. With this patch, we avoid these downsides of adjusting vm.swappiness globally. [1]https://www.freedesktop.org/software/systemd/man/latest/systemd-oomd.service.html [2]https://github.com/facebookincubator/oomd/blob/main/src/oomd/plugins/Senpai.cpp#L585-L598 Link: https://lkml.kernel.org/r/20240103164841.2800183-3-schatzberg.dan@gmail.com Signed-off-by: Dan Schatzberg Suggested-by: Yosry Ahmed Acked-by: Michal Hocko Acked-by: David Rientjes Acked-by: Chris Li Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tejun Heo Cc: Yue Zhao Cc: Zefan Li Cc: Nhat Pham Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 18 +++++--- include/linux/swap.h | 3 +- mm/memcontrol.c | 57 ++++++++++++++++++++----- mm/vmscan.c | 25 +++++++++-- 4 files changed, 81 insertions(+), 22 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 17e6e956515640..0270517ade47cf 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1296,17 +1296,10 @@ PAGE_SIZE multiple when read back. This is a simple interface to trigger memory reclaim in the target cgroup. - This file accepts a single key, the number of bytes to reclaim.
- No nested keys are currently supported. - Example:: echo "1G" > memory.reclaim - The interface can be later extended with nested keys to - configure the reclaim behavior. For example, specify the - type of memory to reclaim from (anon, file, ..). - Please note that the kernel can over or under reclaim from the target cgroup. If less bytes are reclaimed than the specified amount, -EAGAIN is returned. @@ -1318,6 +1311,17 @@ PAGE_SIZE multiple when read back. This means that the networking layer will not adapt based on reclaim induced by memory.reclaim. +The following nested keys are defined. + + ========== ================================ + swappiness Swappiness value to reclaim with + ========== ================================ + + Specifying a swappiness value instructs the kernel to perform + the reclaim with that swappiness value. Note that this has the + same semantics as vm.swappiness applied to memcg reclaim with + all the existing limitations and potential future extensions. + memory.peak A read-only single value file which exists on non-root cgroups. diff --git a/include/linux/swap.h b/include/linux/swap.h index 56cd072673b2c7..25f6368be078a9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -409,7 +409,8 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options); + unsigned int reclaim_options, + int *swappiness); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7cbb1eef5234d4..95c3fccb321bd1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -2474,7 +2475,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, psi_memstall_enter(&pflags); nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, - MEMCG_RECLAIM_MAY_SWAP); + MEMCG_RECLAIM_MAY_SWAP, + NULL); psi_memstall_leave(&pflags); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); @@ -2780,7 +2782,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, - gfp_mask, reclaim_options); + gfp_mask, reclaim_options, NULL); psi_memstall_leave(&pflags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) @@ -3706,7 +3708,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, } if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) { + memsw ? 
0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) { ret = -EBUSY; break; } @@ -3820,7 +3822,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) return -EINTR; if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - MEMCG_RECLAIM_MAY_SWAP)) + MEMCG_RECLAIM_MAY_SWAP, NULL)) nr_retries--; } @@ -6786,7 +6788,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, } reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL); if (!reclaimed && !nr_retries--) break; @@ -6835,7 +6837,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (nr_reclaims) { if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL)) nr_reclaims--; continue; } @@ -6965,19 +6967,50 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, return nbytes; } +enum { + MEMORY_RECLAIM_SWAPPINESS = 0, + MEMORY_RECLAIM_NULL, +}; + +static const match_table_t tokens = { + { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"}, + { MEMORY_RECLAIM_NULL, NULL }, +}; + static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned long nr_to_reclaim, nr_reclaimed = 0; + int swappiness = -1; unsigned int reclaim_options; - int err; + char *old_buf, *start; + substring_t args[MAX_OPT_ARGS]; buf = strstrip(buf); - err = page_counter_memparse(buf, "", &nr_to_reclaim); - if (err) - return err; + + old_buf = buf; + nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; + if (buf == old_buf) + return -EINVAL; + + buf = strstrip(buf); + + while ((start = strsep(&buf, " ")) != NULL) { + if (!strlen(start)) + continue; + switch (match_token(start, tokens, args)) { + case MEMORY_RECLAIM_SWAPPINESS: + if (match_int(&args[0], &swappiness)) + return -EINVAL; + if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS) + return -EINVAL; + break; + default: + return -EINVAL; + } + } reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; while (nr_reclaimed < nr_to_reclaim) { @@ -6997,7 +7030,9 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, lru_add_drain_all(); reclaimed = try_to_free_mem_cgroup_pages(memcg, - batch_size, GFP_KERNEL, reclaim_options); + batch_size, GFP_KERNEL, + reclaim_options, + swappiness == -1 ? NULL : &swappiness); if (!reclaimed && !nr_retries--) return -EAGAIN; diff --git a/mm/vmscan.c b/mm/vmscan.c index 07e20f16b1c1c8..87df3a48bdd769 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -92,6 +92,11 @@ struct scan_control { unsigned long anon_cost; unsigned long file_cost; +#ifdef CONFIG_MEMCG + /* Swappiness value for proactive reclaim. Always use sc_swappiness()! */ + int *proactive_swappiness; +#endif + /* Can active folios be deactivated as part of reclaim? 
*/ #define DEACTIVATE_ANON 1 #define DEACTIVATE_FILE 2 @@ -227,6 +232,13 @@ static bool writeback_throttling_sane(struct scan_control *sc) #endif return false; } + +static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg) +{ + if (sc->proactive && sc->proactive_swappiness) + return *sc->proactive_swappiness; + return mem_cgroup_swappiness(memcg); +} #else static bool cgroup_reclaim(struct scan_control *sc) { @@ -242,6 +254,11 @@ static bool writeback_throttling_sane(struct scan_control *sc) { return true; } + +static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg) +{ + return READ_ONCE(vm_swappiness); +} #endif static void set_task_reclaim_state(struct task_struct *task, @@ -2329,7 +2346,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct mem_cgroup *memcg = lruvec_memcg(lruvec); unsigned long anon_cost, file_cost, total_cost; - int swappiness = mem_cgroup_swappiness(memcg); + int swappiness = sc_swappiness(sc, memcg); u64 fraction[ANON_AND_FILE]; u64 denominator = 0; /* gcc */ enum scan_balance scan_balance; @@ -2610,7 +2627,7 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) return 0; - return mem_cgroup_swappiness(memcg); + return sc_swappiness(sc, memcg); } static int get_nr_gens(struct lruvec *lruvec, int type) @@ -6490,12 +6507,14 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options) + unsigned int reclaim_options, + int *swappiness) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), + .proactive_swappiness = swappiness, .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), .reclaim_idx = MAX_NR_ZONES - 1, From 4e39eae1e4bd36c7fc73b4d162711213846019ea Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 10 Oct 2023 15:55:49 +0100 Subject: [PATCH 1305/1406] bounds: support non-power-of-two CONFIG_NR_CPUS ilog2() rounds down, so for example when PowerPC 85xx sets CONFIG_NR_CPUS to 24, we will only allocate 4 bits to store the number of CPUs instead of 5. Use bits_per() instead, which rounds up. Found by code inspection. The effect of this would probably be a misaccounting when doing NUMA balancing, so to a user, it would only be a performance penalty. The effects may be more wide-spread; it's hard to tell. 
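To make the rounding difference concrete, here is a small userspace sketch with equivalent helpers (models of the kernel's ilog2()/bits_per(), not the kernel implementations):

#include <stdio.h>

/* floor(log2(n)), like ilog2() for runtime values */
static int ilog2_u(unsigned long n)
{
	int bits = -1;

	while (n) {
		n >>= 1;
		bits++;
	}
	return bits;
}

/* number of bits needed to represent n, like bits_per() */
static int bits_per_u(unsigned long n)
{
	return n < 2 ? 1 : ilog2_u(n) + 1;
}

int main(void)
{
	/* CPU ids for CONFIG_NR_CPUS=24 run 0..23 (binary 10111),
	 * so 5 bits are needed; ilog2(24) == 4 under-allocates. */
	printf("ilog2(24) = %d, bits_per(24) = %d\n",
	       ilog2_u(24), bits_per_u(24));
	return 0;
}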
Link: https://lkml.kernel.org/r/20231010145549.1244748-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Fixes: 90572890d202 ("mm: numa: Change page last {nid,pid} into {cpu,pid}") Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton --- kernel/bounds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bounds.c b/kernel/bounds.c index b529182e8b04fc..c5a9fcd2d62281 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -19,7 +19,7 @@ int main(void) DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); #ifdef CONFIG_SMP - DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); + DEFINE(NR_CPUS_BITS, bits_per(CONFIG_NR_CPUS)); #endif DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); #ifdef CONFIG_LRU_GEN From 5f4876d54b6b69dacfb339728ec0a8e3e83fff34 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 9 Jan 2024 15:16:30 -0700 Subject: [PATCH 1306/1406] arch and include: update LLVM Phabricator links reviews.llvm.org was LLVM's Phabricator instance for code review. It has been abandoned in favor of GitHub pull requests. While the majority of links in the kernel sources still work because of the work Fangrui has done turning the dynamic Phabricator instance into a static archive, there are some issues with that work, so preemptively convert all the links in the kernel sources to point to the commit on GitHub. Most of the commits have the corresponding differential review link in the commit message itself so there should not be any loss of fidelity in the relevant information. Link: https://discourse.llvm.org/t/update-on-github-pull-requests/71540/172 Link: https://lkml.kernel.org/r/20240109-update-llvm-links-v1-2-eb09b59db071@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Conor Dooley Reviewed-by: Kees Cook Acked-by: Fangrui Song Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Daniel Borkmann Cc: Mykola Lysenko Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 4 ++-- arch/riscv/Kconfig | 2 +- arch/riscv/include/asm/ftrace.h | 2 +- include/linux/compiler-clang.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index aa7c1d43513968..5a8acca4dbf495 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -382,7 +382,7 @@ config BROKEN_GAS_INST config BUILTIN_RETURN_ADDRESS_STRIPS_PAC bool # Clang's __builtin_return_adddress() strips the PAC since 12.0.0 - # https://reviews.llvm.org/D75044 + # https://github.com/llvm/llvm-project/commit/2a96f47c5ffca84cd774ad402cacd137f4bf45e2 default y if CC_IS_CLANG && (CLANG_VERSION >= 120000) # GCC's __builtin_return_address() strips the PAC since 11.1.0, # and this was backported to 10.2.0, 9.4.0, 8.5.0, but not earlier @@ -2222,7 +2222,7 @@ config STACKPROTECTOR_PER_TASK config UNWIND_PATCH_PAC_INTO_SCS bool "Enable shadow call stack dynamically using code patching" - # needs Clang with https://reviews.llvm.org/D111780 incorporated + # needs Clang with https://github.com/llvm/llvm-project/commit/de07cde67b5d205d58690be012106022aea6d2b3 incorporated depends on CC_IS_CLANG && CLANG_VERSION >= 150000 depends on ARM64_PTR_AUTH_KERNEL && CC_HAS_BRANCH_PROT_PAC_RET depends on SHADOW_CALL_STACK diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index bffbd869a06828..69d24f51392206 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -312,7 +312,7 @@ config AS_HAS_INSN def_bool $(as-instr,.insn r 51$(comma) 0$(comma) 0$(comma) t0$(comma) t0$(comma) zero) config
AS_HAS_OPTION_ARCH - # https://reviews.llvm.org/D123515 + # https://github.com/llvm/llvm-project/commit/9e8ed3403c191ab9c4903e8eeb8f732ff8a43cb4 def_bool y depends on $(as-instr, .option arch$(comma) +m) depends on !$(as-instr, .option arch$(comma) -i) diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index 32917212295234..06874fb1311e5e 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -15,7 +15,7 @@ /* * Clang prior to 13 had "mcount" instead of "_mcount": - * https://reviews.llvm.org/D98881 + * https://github.com/llvm/llvm-project/commit/ef58ae86ba778ed7d01cd3f6bd6d08f943abab44 */ #if defined(CONFIG_CC_IS_GCC) || CONFIG_CLANG_VERSION >= 130000 #define MCOUNT_NAME _mcount diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index ddab1ef22beef3..f0a47afef12581 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -9,7 +9,7 @@ * Clang prior to 17 is being silly and considers many __cleanup() variables * as unused (because they are, their sole purpose is to go out of scope). * - * https://reviews.llvm.org/D152180 + * https://github.com/llvm/llvm-project/commit/877210faa447f4cc7db87812f8ed80e398fedd61 */ #undef __cleanup #define __cleanup(func) __maybe_unused __attribute__((__cleanup__(func))) From 0477e90d9cc9b572789e2e39959cde089304fd8b Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 9 Jan 2024 15:16:31 -0700 Subject: [PATCH 1307/1406] treewide: update LLVM Bugzilla links LLVM moved their issue tracker from their own Bugzilla instance to GitHub issues. While all of the links are still valid, they may not necessarily show the most up to date information around the issues, as all updates will occur on GitHub, not Bugzilla. Another complication is that the Bugzilla issue number is not always the same as the GitHub issue number. Thankfully, LLVM maintains this mapping through two shortlinks: https://llvm.org/bz -> https://bugs.llvm.org/show_bug.cgi?id= https://llvm.org/pr -> https://github.com/llvm/llvm-project/issues/ Switch all "https://bugs.llvm.org/show_bug.cgi?id=" links to the "https://llvm.org/pr" shortlink so that the links show the most up to date information. Each migrated issue links back to the Bugzilla entry, so there should be no loss of fidelity of information here. 
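The conversion is mechanical; a one-liner along these lines (hypothetical, shown only to illustrate the transformation, not necessarily how the patch was generated) reproduces the tree-wide change:

  git grep -lF 'bugs.llvm.org/show_bug.cgi?id=' | \
	xargs sed -i 's|https://bugs\.llvm\.org/show_bug\.cgi?id=|https://llvm.org/pr|g'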
Link: https://lkml.kernel.org/r/20240109-update-llvm-links-v1-3-eb09b59db071@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Acked-by: Fangrui Song Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Daniel Borkmann Cc: Mykola Lysenko Signed-off-by: Andrew Morton --- arch/powerpc/Makefile | 4 ++-- arch/powerpc/kvm/book3s_hv_nested.c | 2 +- arch/s390/include/asm/ftrace.h | 2 +- arch/x86/power/Makefile | 2 +- crypto/blake2b_generic.c | 2 +- drivers/firmware/efi/libstub/Makefile | 2 +- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 2 +- drivers/media/test-drivers/vicodec/codec-fwht.c | 2 +- drivers/regulator/Kconfig | 2 +- include/asm-generic/vmlinux.lds.h | 2 +- lib/Kconfig.kasan | 2 +- lib/raid6/Makefile | 2 +- lib/stackinit_kunit.c | 2 +- mm/slab_common.c | 2 +- net/bridge/br_multicast.c | 2 +- security/Kconfig | 2 +- 16 files changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 051247027da0ba..457cee9b03ee04 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -144,11 +144,11 @@ CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mno-pointers-to-nested-functions) CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mlong-double-128) # Clang unconditionally reserves r2 on ppc32 and does not support the flag -# https://bugs.llvm.org/show_bug.cgi?id=39555 +# https://llvm.org/pr39555 CFLAGS-$(CONFIG_PPC32) := $(call cc-option, -ffixed-r2) # Clang doesn't support -mmultiple / -mno-multiple -# https://bugs.llvm.org/show_bug.cgi?id=39556 +# https://llvm.org/pr39556 CFLAGS-$(CONFIG_PPC32) += $(call cc-option, $(MULTIPLEWORD)) CFLAGS-$(CONFIG_PPC32) += $(call cc-option,-mno-readonly-in-sdata) diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 5c375ec1a3c608..05f5220960c63b 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -55,7 +55,7 @@ void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) hr->dawrx1 = vcpu->arch.dawrx1; } -/* Use noinline_for_stack due to https://bugs.llvm.org/show_bug.cgi?id=49610 */ +/* Use noinline_for_stack due to https://llvm.org/pr49610 */ static noinline_for_stack void byteswap_pt_regs(struct pt_regs *regs) { unsigned long *addr = (unsigned long *) regs; diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index 5a82b08f03cd3e..621f23d5ae30a6 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -9,7 +9,7 @@ #ifndef __ASSEMBLY__ #ifdef CONFIG_CC_IS_CLANG -/* https://bugs.llvm.org/show_bug.cgi?id=41424 */ +/* https://llvm.org/pr41424 */ #define ftrace_return_address(n) 0UL #else #define ftrace_return_address(n) __builtin_return_address(n) diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index 379777572bc9fe..e0cd7afd53022a 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -5,7 +5,7 @@ CFLAGS_cpu.o := -fno-stack-protector # Clang may incorrectly inline functions with stack protector enabled into -# __restore_processor_state(): https://bugs.llvm.org/show_bug.cgi?id=47479 +# __restore_processor_state(): https://llvm.org/pr47479 CFLAGS_REMOVE_cpu.o := $(CC_FLAGS_LTO) obj-$(CONFIG_PM_SLEEP) += cpu.o diff --git a/crypto/blake2b_generic.c b/crypto/blake2b_generic.c index 6704c035588967..32e380b714b6cc 100644 --- a/crypto/blake2b_generic.c +++ b/crypto/blake2b_generic.c @@ -102,7 +102,7 @@ static void blake2b_compress_one_generic(struct blake2b_state *S, ROUND(10); ROUND(11); #ifdef CONFIG_CC_IS_CLANG -#pragma nounroll /* 
https://bugs.llvm.org/show_bug.cgi?id=45803 */ +#pragma nounroll /* https://llvm.org/pr45803 */ #endif for (i = 0; i < 8; ++i) S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 73f4810f6db38e..31eb1e287ce161 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -105,7 +105,7 @@ lib-y := $(patsubst %.o,%.stub.o,$(lib-y)) # Even when -mbranch-protection=none is set, Clang will generate a # .note.gnu.property for code-less object files (like lib/ctype.c), # so work around this by explicitly removing the unwanted section. -# https://bugs.llvm.org/show_bug.cgi?id=46480 +# https://llvm.org/pr46480 STUBCOPY_FLAGS-y += --remove-section=.note.gnu.property STUBCOPY_RELOC-$(CONFIG_X86_32) := R_386_32 diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 2d688dca26bedb..78a2773b74f2f0 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -610,7 +610,7 @@ static uint32_t sdma_v4_4_2_rb_cntl(struct amdgpu_ring *ring, uint32_t rb_cntl) /* Set ring buffer size in dwords */ uint32_t rb_bufsz = order_base_2(ring->ring_size / 4); - barrier(); /* work around https://bugs.llvm.org/show_bug.cgi?id=42576 */ + barrier(); /* work around https://llvm.org/pr42576 */ rb_cntl = REG_SET_FIELD(rb_cntl, SDMA_GFX_RB_CNTL, RB_SIZE, rb_bufsz); #ifdef __BIG_ENDIAN rb_cntl = REG_SET_FIELD(rb_cntl, SDMA_GFX_RB_CNTL, RB_SWAP_ENABLE, 1); diff --git a/drivers/media/test-drivers/vicodec/codec-fwht.c b/drivers/media/test-drivers/vicodec/codec-fwht.c index 1ce682e1b85c32..fd75457d03b202 100644 --- a/drivers/media/test-drivers/vicodec/codec-fwht.c +++ b/drivers/media/test-drivers/vicodec/codec-fwht.c @@ -49,7 +49,7 @@ static const uint8_t zigzag[64] = { /* * noinline_for_stack to work around - * https://bugs.llvm.org/show_bug.cgi?id=38809 + * https://llvm.org/pr38809 */ static int noinline_for_stack rlc(const s16 *in, __be16 *output, int blocktype) diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index 550145f82726e9..7db0a29b5b8dcd 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -288,7 +288,7 @@ config REGULATOR_CROS_EC config REGULATOR_DA903X tristate "Dialog Semiconductor DA9030/DA9034 regulators" depends on PMIC_DA903X - depends on !CC_IS_CLANG # https://bugs.llvm.org/show_bug.cgi?id=38789 + depends on !CC_IS_CLANG # https://llvm.org/pr38789 help Say y here to support the BUCKs and LDOs regulators found on Dialog Semiconductor DA9030/DA9034 PMIC. diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 5dd3a61d673d4f..f7749d0f2562f1 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -984,7 +984,7 @@ * -fsanitize=thread produce unwanted sections (.eh_frame * and .init_array.*), but CONFIG_CONSTRUCTORS wants to * keep any .init_array.* sections. - * https://bugs.llvm.org/show_bug.cgi?id=46478 + * https://llvm.org/pr46478 */ #ifdef CONFIG_UNWIND_TABLES #define DISCARD_EH_FRAME diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index e6eda054ab275f..98016e137b7f09 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -158,7 +158,7 @@ config KASAN_STACK out-of-bounds bugs in stack variables. With Clang, stack instrumentation has a problem that causes excessive - stack usage, see https://bugs.llvm.org/show_bug.cgi?id=38809. Thus, + stack usage, see https://llvm.org/pr38809. 
Thus, with Clang, this option is deemed unsafe. This option is always disabled when compile-testing with Clang to diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 1c5420ff254e84..385a94aa0b999b 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -21,7 +21,7 @@ altivec_flags += -isystem $(shell $(CC) -print-file-name=include) ifdef CONFIG_CC_IS_CLANG # clang ppc port does not yet support -maltivec when -msoft-float is # enabled. A future release of clang will resolve this -# https://bugs.llvm.org/show_bug.cgi?id=31177 +# https://llvm.org/pr31177 CFLAGS_REMOVE_altivec1.o += -msoft-float CFLAGS_REMOVE_altivec2.o += -msoft-float CFLAGS_REMOVE_altivec4.o += -msoft-float diff --git a/lib/stackinit_kunit.c b/lib/stackinit_kunit.c index 05947a2feb93c0..7a10e1d1725817 100644 --- a/lib/stackinit_kunit.c +++ b/lib/stackinit_kunit.c @@ -404,7 +404,7 @@ static noinline int leaf_switch_2_none(unsigned long sp, bool fill, * These are expected to fail for most configurations because neither * GCC nor Clang have a way to perform initialization of variables in * non-code areas (i.e. in a switch statement before the first "case"). - * https://bugs.llvm.org/show_bug.cgi?id=44916 + * https://llvm.org/pr44916 */ DEFINE_TEST_DRIVER(switch_1_none, uint64_t, SCALAR, ALWAYS_FAIL); DEFINE_TEST_DRIVER(switch_2_none, uint64_t, SCALAR, ALWAYS_FAIL); diff --git a/mm/slab_common.c b/mm/slab_common.c index 238293b1dbe14b..954af676d79ee8 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -651,7 +651,7 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name, struct kmem_cache * kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init = -{ /* initialization for https://bugs.llvm.org/show_bug.cgi?id=42570 */ }; +{ /* initialization for https://llvm.org/pr42570 */ }; EXPORT_SYMBOL(kmalloc_caches); #ifdef CONFIG_RANDOM_KMALLOC_CACHES diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 2d7b7324295885..9a1cb5079a7a07 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -5053,7 +5053,7 @@ void br_multicast_uninit_stats(struct net_bridge *br) free_percpu(br->mcast_stats); } -/* noinline for https://bugs.llvm.org/show_bug.cgi?id=45802#c9 */ +/* noinline for https://llvm.org/pr45802#c9 */ static noinline_for_stack void mcast_stats_add_dir(u64 *dst, u64 *src) { dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX]; diff --git a/security/Kconfig b/security/Kconfig index 52c9af08ad35d3..606a87c29a0170 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -142,7 +142,7 @@ config HARDENED_USERCOPY config FORTIFY_SOURCE bool "Harden common str/mem functions against buffer overflows" depends on ARCH_HAS_FORTIFY_SOURCE - # https://bugs.llvm.org/show_bug.cgi?id=41459 + # https://llvm.org/pr41459 depends on !CC_IS_CLANG || CLANG_VERSION >= 120001 # https://github.com/llvm/llvm-project/issues/53645 depends on !CC_IS_CLANG || !X86_32 From bcbcb8764af4030cd0fbb36d1b9e5da1bf867b0c Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Mon, 8 Jan 2024 23:51:32 +0800 Subject: [PATCH 1308/1406] selftests: add eventfd selftests This adds the promised selftest for eventfd. It will verify the flags of eventfd2, including EFD_CLOEXEC, EFD_NONBLOCK and EFD_SEMAPHORE. 
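For background on the least obvious of these flags: EFD_SEMAPHORE makes each read() return 1 and decrement the counter by one, instead of returning and clearing the whole counter. A minimal standalone sketch of that behavior (illustrative, separate from the selftest below):

#include <stdint.h>
#include <stdio.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	uint64_t val;
	int fd = eventfd(3, EFD_SEMAPHORE);	/* counter starts at 3 */

	if (fd < 0)
		return 1;
	/* returns 1 and leaves two counts; without EFD_SEMAPHORE this
	 * read would return 3 and reset the counter to 0 */
	if (read(fd, &val, sizeof(val)) == sizeof(val))
		printf("read %llu\n", (unsigned long long)val);
	close(fd);
	return 0;
}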
Link: https://lkml.kernel.org/r/tencent_3C9A298878D22B5D8F79DC2FEE99BB4A8F05@qq.com Signed-off-by: Wen Yang Cc: Shuah Khan Cc: Javier Martinez Canillas Cc: Christian Brauner Cc: Pengfei Xu Cc: Miklos Szeredi Cc: Andrei Vagin Cc: Mathieu Desnoyers Cc: Steven Rostedt Signed-off-by: Andrew Morton --- .../selftests/filesystems/eventfd/.gitignore | 2 + .../selftests/filesystems/eventfd/Makefile | 7 + .../filesystems/eventfd/eventfd_test.c | 186 ++++++++++++++++++ 3 files changed, 195 insertions(+) create mode 100644 tools/testing/selftests/filesystems/eventfd/.gitignore create mode 100644 tools/testing/selftests/filesystems/eventfd/Makefile create mode 100644 tools/testing/selftests/filesystems/eventfd/eventfd_test.c diff --git a/tools/testing/selftests/filesystems/eventfd/.gitignore b/tools/testing/selftests/filesystems/eventfd/.gitignore new file mode 100644 index 00000000000000..483faf59fe4adb --- /dev/null +++ b/tools/testing/selftests/filesystems/eventfd/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +eventfd_test diff --git a/tools/testing/selftests/filesystems/eventfd/Makefile b/tools/testing/selftests/filesystems/eventfd/Makefile new file mode 100644 index 00000000000000..0a8e3910df1572 --- /dev/null +++ b/tools/testing/selftests/filesystems/eventfd/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 + +CFLAGS += $(KHDR_INCLUDES) +LDLIBS += -lpthread +TEST_GEN_PROGS := eventfd_test + +include ../../lib.mk diff --git a/tools/testing/selftests/filesystems/eventfd/eventfd_test.c b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c new file mode 100644 index 00000000000000..f142a137526cda --- /dev/null +++ b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../../kselftest_harness.h" + +struct error { + int code; + char msg[512]; +}; + +static int error_set(struct error *err, int code, const char *fmt, ...) +{ + va_list args; + int r; + + if (code == 0 || !err || err->code != 0) + return code; + + err->code = code; + va_start(args, fmt); + r = vsnprintf(err->msg, sizeof(err->msg), fmt, args); + assert((size_t)r < sizeof(err->msg)); + va_end(args); + + return code; +} + +static inline int sys_eventfd2(unsigned int count, int flags) +{ + return syscall(__NR_eventfd2, count, flags); +} + +TEST(eventfd01) +{ + int fd, flags; + + fd = sys_eventfd2(0, 0); + ASSERT_GE(fd, 0); + + flags = fcntl(fd, F_GETFL); + // since the kernel automatically added O_RDWR. 
+ EXPECT_EQ(flags, O_RDWR); + + close(fd); +} + +TEST(eventfd02) +{ + int fd, flags; + + fd = sys_eventfd2(0, EFD_CLOEXEC); + ASSERT_GE(fd, 0); + + flags = fcntl(fd, F_GETFD); + ASSERT_GT(flags, -1); + EXPECT_EQ(flags, FD_CLOEXEC); + + close(fd); +} + +TEST(eventfd03) +{ + int fd, flags; + + fd = sys_eventfd2(0, EFD_NONBLOCK); + ASSERT_GE(fd, 0); + + flags = fcntl(fd, F_GETFL); + ASSERT_GT(flags, -1); + EXPECT_EQ(flags & EFD_NONBLOCK, EFD_NONBLOCK); + EXPECT_EQ(flags & O_RDWR, O_RDWR); + + close(fd); +} + +TEST(eventfd04) +{ + int fd, flags; + + fd = sys_eventfd2(0, EFD_CLOEXEC|EFD_NONBLOCK); + ASSERT_GE(fd, 0); + + flags = fcntl(fd, F_GETFL); + ASSERT_GT(flags, -1); + EXPECT_EQ(flags & EFD_NONBLOCK, EFD_NONBLOCK); + EXPECT_EQ(flags & O_RDWR, O_RDWR); + + flags = fcntl(fd, F_GETFD); + ASSERT_GT(flags, -1); + EXPECT_EQ(flags, FD_CLOEXEC); + + close(fd); +} + +static inline void trim_newline(char *str) +{ + char *pos = strrchr(str, '\n'); + + if (pos) + *pos = '\0'; +} + +static int verify_fdinfo(int fd, struct error *err, const char *prefix, + size_t prefix_len, const char *expect, ...) +{ + char buffer[512] = {0, }; + char path[512] = {0, }; + va_list args; + FILE *f; + char *line = NULL; + size_t n = 0; + int found = 0; + int r; + + va_start(args, expect); + r = vsnprintf(buffer, sizeof(buffer), expect, args); + assert((size_t)r < sizeof(buffer)); + va_end(args); + + snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd); + f = fopen(path, "re"); + if (!f) + return error_set(err, -1, "fdinfo open failed for %d", fd); + + while (getline(&line, &n, f) != -1) { + char *val; + + if (strncmp(line, prefix, prefix_len)) + continue; + + found = 1; + + val = line + prefix_len; + r = strcmp(val, buffer); + if (r != 0) { + trim_newline(line); + trim_newline(buffer); + error_set(err, -1, "%s '%s' != '%s'", + prefix, val, buffer); + } + break; + } + + free(line); + fclose(f); + + if (found == 0) + return error_set(err, -1, "%s not found for fd %d", + prefix, fd); + + return 0; +} + +TEST(eventfd05) +{ + struct error err = {0}; + int fd, ret; + + fd = sys_eventfd2(0, EFD_SEMAPHORE); + ASSERT_GE(fd, 0); + + ret = fcntl(fd, F_GETFL); + ASSERT_GT(ret, -1); + EXPECT_EQ(ret & O_RDWR, O_RDWR); + + // The semaphore could only be obtained from fdinfo. + ret = verify_fdinfo(fd, &err, "eventfd-semaphore: ", 19, "1\n"); + if (ret != 0) + ksft_print_msg("eventfd-semaphore check failed, msg: %s\n", + err.msg); + EXPECT_EQ(ret, 0); + + close(fd); +} + +TEST_HARNESS_MAIN From c94f05da54c8af21ac1c815761951b138b606fc9 Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Thu, 4 Jan 2024 17:49:33 +0100 Subject: [PATCH 1309/1406] list: add hlist_count_nodes() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a generic hlist_count_nodes() function and use it in two drivers. This patch (of 3): Add a function to count nodes in a hlist. hlist_count_nodes() is similar to list_count_nodes(). 
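As a usage sketch (a hypothetical caller, not taken from this series), counting the entries in one bucket of a chained hash table becomes a one-liner:

	#include <linux/list.h>
	#include <linux/printk.h>

	struct item {
		struct hlist_node link;
	};

	static void demo(void)
	{
		HLIST_HEAD(bucket);
		struct item a, b;

		hlist_add_head(&a.link, &bucket);
		hlist_add_head(&b.link, &bucket);
		/* walks the chain, so this is O(n) in the bucket length */
		pr_info("%zu nodes\n", hlist_count_nodes(&bucket));
	}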
Link: https://lkml.kernel.org/r/20240104164937.424320-1-pierre.gondois@arm.com Link: https://lkml.kernel.org/r/20240104164937.424320-2-pierre.gondois@arm.com Signed-off-by: Pierre Gondois Reviewed-by: Carlos Llamas Acked-by: Coly Li Acked-by: Marco Elver Reviewed-by: Andy Shevchenko Cc: Arve Hjønnevåg Cc: Christian Brauner Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Jani Nikula Cc: Joel Fernandes (Google) Cc: Kees Cook Cc: Kent Overstreet Cc: Martijn Coenen Cc: Suren Baghdasaryan Cc: Todd Kjos Signed-off-by: Andrew Morton --- include/linux/list.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/linux/list.h b/include/linux/list.h index 059aa1fff41e9c..523b7c4d000a1f 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -1195,4 +1195,19 @@ static inline void hlist_splice_init(struct hlist_head *from, pos && ({ n = pos->member.next; 1; }); \ pos = hlist_entry_safe(n, typeof(*pos), member)) +/** + * hlist_count_nodes - count nodes in the hlist + * @head: the head for your hlist. + */ +static inline size_t hlist_count_nodes(struct hlist_head *head) +{ + struct hlist_node *pos; + size_t count = 0; + + hlist_for_each(pos, head) + count++; + + return count; +} + #endif From 198865fcf6c51348d1153837a4098816b4d93923 Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Thu, 4 Jan 2024 17:49:34 +0100 Subject: [PATCH 1310/1406] binder: use of hlist_count_nodes() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make use of the newly added hlist_count_nodes(). Link: https://lkml.kernel.org/r/20240104164937.424320-3-pierre.gondois@arm.com Signed-off-by: Pierre Gondois Acked-by: Carlos Llamas Acked-by: Marco Elver Reviewed-by: Andy Shevchenko Cc: Arve Hjønnevåg Cc: Christian Brauner Cc: Coly Li Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Jani Nikula Cc: Joel Fernandes (Google) Cc: Kees Cook Cc: Kent Overstreet Cc: Martijn Coenen Cc: Suren Baghdasaryan Cc: Todd Kjos Signed-off-by: Andrew Morton --- drivers/android/binder.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index eca24f41556df0..bad28cf4201041 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -6086,9 +6086,7 @@ static void print_binder_node_nilocked(struct seq_file *m, struct binder_work *w; int count; - count = 0; - hlist_for_each_entry(ref, &node->refs, node_entry) - count++; + count = hlist_count_nodes(&node->refs); seq_printf(m, " node %d: u%016llx c%016llx hs %d hw %d ls %d lw %d is %d iw %d tr %d", node->debug_id, (u64)node->ptr, (u64)node->cookie, From e8cdfe165db3f2d8e3a6f3100488a1c9d38c4c2c Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Thu, 4 Jan 2024 17:49:35 +0100 Subject: [PATCH 1311/1406] bcache: use of hlist_count_nodes() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make use of the newly added hlist_count_nodes(). 
Link: https://lkml.kernel.org/r/20240104164937.424320-4-pierre.gondois@arm.com Signed-off-by: Pierre Gondois Acked-by: Coly Li Acked-by: Marco Elver Reviewed-by: Andy Shevchenko Cc: Arve Hjønnevåg Cc: Carlos Llamas Cc: Christian Brauner Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Jani Nikula Cc: Joel Fernandes (Google) Cc: Kees Cook Cc: Kent Overstreet Cc: Martijn Coenen Cc: Suren Baghdasaryan Cc: Todd Kjos Signed-off-by: Andrew Morton --- drivers/md/bcache/sysfs.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index a438efb660699b..6956beb55326f5 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -702,13 +702,7 @@ static unsigned int bch_cache_max_chain(struct cache_set *c) for (h = c->bucket_hash; h < c->bucket_hash + (1 << BUCKET_HASH_BITS); h++) { - unsigned int i = 0; - struct hlist_node *p; - - hlist_for_each(p, h) - i++; - - ret = max(ret, i); + ret = max(ret, hlist_count_nodes(h)); } mutex_unlock(&c->bucket_lock); From afa56006dfa06dd1f45921f595d1e8b6f261f251 Mon Sep 17 00:00:00 2001 From: Yongzhen Zhang Date: Mon, 8 Jan 2024 09:56:04 +0800 Subject: [PATCH 1312/1406] ocfs2: Spelling fix Modify reques to request in the comment. Link: https://lkml.kernel.org/r/20240108015604.38377-1-zhangyongzhen@kylinos.cn Signed-off-by: Yongzhen Zhang Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/dlmglue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 64a6ef638495c2..cb40cafbc06237 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1615,7 +1615,7 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb, unlock: lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); - /* ocfs2_unblock_lock reques on seeing OCFS2_LOCK_UPCONVERT_FINISHING */ + /* ocfs2_unblock_lock request on seeing OCFS2_LOCK_UPCONVERT_FINISHING */ kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED); spin_unlock_irqrestore(&lockres->l_lock, flags); From 205bd162f92eebbc0d1924e33681a6ced0e9901b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 7 Jan 2024 14:01:55 -0800 Subject: [PATCH 1313/1406] lib/win_minmax: fix header comments Don't use "/**" kernel-doc comment marker for non-kernel-doc comment. Correct the filename but omit the path since we know where it is and it could change (but not likely). Link: https://lkml.kernel.org/r/20240107220155.29013-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/win_minmax.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/win_minmax.h b/include/linux/win_minmax.h index 4ca2842d2842d0..6a5bb052fcc27f 100644 --- a/include/linux/win_minmax.h +++ b/include/linux/win_minmax.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/** - * lib/minmax.c: windowed min/max tracker by Kathleen Nichols. +/* + * win_minmax.h: windowed min/max tracker by Kathleen Nichols. 
* */ #ifndef MINMAX_H From 636a378df7419cf7c61c5acfe68388e02431f6d8 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Sun, 7 Jan 2024 17:16:41 +0800 Subject: [PATCH 1314/1406] panic: suppress gnu_printf warning With GCC 13.2.1 and W=1, there is a compile warning like this: kernel/panic.c: In function `__warn': kernel/panic.c:676:17: warning: function `__warn' might be a candidate for `gnu_printf' format attribute [-Wsuggest-attribute=format] 676 | vprintk(args->fmt, args->args); | ^~~~~~~ Adding the usual __printf(x,y) attribute can't fix it, so add a workaround that disables -Wsuggest-attribute=format to silence it. Link: https://lkml.kernel.org/r/20240107091641.579849-1-bhe@redhat.com Signed-off-by: Baoquan He Signed-off-by: Andrew Morton --- kernel/panic.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/panic.c b/kernel/panic.c index 2807639aab51d1..d49b68184c563e 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -666,8 +666,13 @@ void __warn(const char *file, int line, void *caller, unsigned taint, pr_warn("WARNING: CPU: %d PID: %d at %pS\n", raw_smp_processor_id(), current->pid, caller); +#pragma GCC diagnostic push +#ifndef __clang__ +#pragma GCC diagnostic ignored "-Wsuggest-attribute=format" +#endif if (args) vprintk(args->fmt, args->args); +#pragma GCC diagnostic pop print_modules(); From 7625fcfab1f17e76614bd758cca01ed2d2629f9e Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Wed, 10 Jan 2024 16:12:12 +0800 Subject: [PATCH 1315/1406] lib min_heap: optimize number of calls to min_heapify() Patch series "lib min_heap: Min heap optimizations". The purpose of this patch series is to enhance the existing min heap implementation. The optimization focuses on both the heap construction process and the number of comparisons made during the heapify operation. This patch (of 2): Improve the heap construction process by reducing unnecessary heapify operations. Specifically, adjust the starting condition from n / 2 to n / 2 - 1 in the loop that iterates over all non-leaf elements. Link: https://lkml.kernel.org/r/20240110081213.2289636-1-visitorckw@gmail.com Link: https://lkml.kernel.org/r/20240110081213.2289636-2-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 44077837385f89..18a581310eb350 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -70,7 +70,7 @@ void min_heapify_all(struct min_heap *heap, { int i; - for (i = heap->nr / 2; i >= 0; i--) + for (i = heap->nr / 2 - 1; i >= 0; i--) min_heapify(heap, i, func); } From c9b2ddf771ebdd0ca202ee3fb56a3a897071a512 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Wed, 10 Jan 2024 16:12:13 +0800 Subject: [PATCH 1316/1406] lib min_heap: optimize number of comparisons in min_heapify() Optimize the min_heapify() function, resulting in a significant reduction of approximately 50% in the number of comparisons for large random inputs, while maintaining identical results. The current implementation performs two comparisons per level to identify the minimum among three elements. In contrast, the proposed bottom-up variation uses only one comparison per level to assess two children until reaching the leaves. Then, it sifts up until the correct position is determined.
Typically, the process of sifting down proceeds to the leaf level, resulting in O(1) secondary comparisons instead of log2(n). This optimization significantly reduces the number of costly indirect function calls and improves overall performance. Link: https://lkml.kernel.org/r/20240110081213.2289636-3-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 42 +++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 18a581310eb350..d52daf45861b9a 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -35,31 +35,33 @@ static __always_inline void min_heapify(struct min_heap *heap, int pos, const struct min_heap_callbacks *func) { - void *left, *right, *parent, *smallest; + void *left, *right; void *data = heap->data; + void *root = data + pos * func->elem_size; + int i = pos, j; + /* Find the sift-down path all the way to the leaves. */ for (;;) { - if (pos * 2 + 1 >= heap->nr) + if (i * 2 + 2 >= heap->nr) break; + left = data + (i * 2 + 1) * func->elem_size; + right = data + (i * 2 + 2) * func->elem_size; + i = func->less(left, right) ? i * 2 + 1 : i * 2 + 2; + } - left = data + ((pos * 2 + 1) * func->elem_size); - parent = data + (pos * func->elem_size); - smallest = parent; - if (func->less(left, smallest)) - smallest = left; - - if (pos * 2 + 2 < heap->nr) { - right = data + ((pos * 2 + 2) * func->elem_size); - if (func->less(right, smallest)) - smallest = right; - } - if (smallest == parent) - break; - func->swp(smallest, parent); - if (smallest == left) - pos = (pos * 2) + 1; - else - pos = (pos * 2) + 2; + /* Special case for the last leaf with no sibling. */ + if (i * 2 + 2 == heap->nr) + i = i * 2 + 1; + + /* Backtrack to the correct location. */ + while (i != pos && func->less(root, data + i * func->elem_size)) + i = (i - 1) / 2; + + /* Shift the element into its correct place. */ + j = i; + while (i != pos) { + i = (i - 1) / 2; + func->swp(data + i * func->elem_size, data + j * func->elem_size); } } From 48211155f0a0f0bb897e9c8808686fd380feb442 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Mon, 15 Jan 2024 15:46:41 +0000 Subject: [PATCH 1317/1406] sysctl: allow change system v ipc sysctls inside ipc namespace Patch series "Allow to change ipc/mq sysctls inside ipc namespace", v3. Right now ipc and mq limits count as per ipc namespace, but only real root can change them. By default, the current values of these limits are such that it can only be reduced. Since only root can change the values, it is impossible to reduce these limits in the rootless container. We can allow limit changes within ipc namespace because mq parameters are limited by RLIMIT_MSGQUEUE and ipc parameters are not limited to anything other than cgroups. This patch (of 3): Rootless containers are not allowed to modify kernel IPC parameters. All default limits are set to such high values that in fact there are no limits at all. All limits are not inherited and are initialized to default values when a new ipc_namespace is created. 
For new ipc_namespace: size_t ipc_ns.shm_ctlmax = SHMMAX; // (ULONG_MAX - (1UL << 24)) size_t ipc_ns.shm_ctlall = SHMALL; // (ULONG_MAX - (1UL << 24)) int ipc_ns.shm_ctlmni = IPCMNI; // (1 << 15) int ipc_ns.shm_rmid_forced = 0; unsigned int ipc_ns.msg_ctlmax = MSGMAX; // 8192 unsigned int ipc_ns.msg_ctlmni = MSGMNI; // 32000 unsigned int ipc_ns.msg_ctlmnb = MSGMNB; // 16384 The shm_tot (total amount of shared pages) has also ceased to be global; it is located in ipc_namespace and is not inherited from anywhere. In such conditions, it cannot be said that these limits limit anything. The real limiter for them is cgroups. If we allow rootless containers to change these parameters, then the limits can only be reduced. Link: https://lkml.kernel.org/r/cover.1705333426.git.legion@kernel.org Link: https://lkml.kernel.org/r/d2f4603305cbfed58a24755aa61d027314b73a45.1705333426.git.legion@kernel.org Signed-off-by: Alexey Gladkov Signed-off-by: Eric W. Biederman Link: https://lkml.kernel.org/r/e2d84d3ec0172cfff759e6065da84ce0cc2736f8.1663756794.git.legion@kernel.org Cc: Christian Brauner Cc: Joel Granados Cc: Kees Cook Cc: Luis Chamberlain Cc: Manfred Spraul Cc: Davidlohr Bueso Signed-off-by: Andrew Morton --- ipc/ipc_sysctl.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 8c62e443f78b3c..01c4a50d22b2d2 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "util.h" static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, @@ -190,25 +191,57 @@ static int set_is_seen(struct ctl_table_set *set) return &current->nsproxy->ipc_ns->ipc_set == set; } +static void ipc_set_ownership(struct ctl_table_header *head, + struct ctl_table *table, + kuid_t *uid, kgid_t *gid) +{ + struct ipc_namespace *ns = + container_of(head->set, struct ipc_namespace, ipc_set); + + kuid_t ns_root_uid = make_kuid(ns->user_ns, 0); + kgid_t ns_root_gid = make_kgid(ns->user_ns, 0); + + *uid = uid_valid(ns_root_uid) ? ns_root_uid : GLOBAL_ROOT_UID; + *gid = gid_valid(ns_root_gid) ?
ns_root_gid : GLOBAL_ROOT_GID; +} + static int ipc_permissions(struct ctl_table_header *head, struct ctl_table *table) { int mode = table->mode; #ifdef CONFIG_CHECKPOINT_RESTORE - struct ipc_namespace *ns = current->nsproxy->ipc_ns; + struct ipc_namespace *ns = + container_of(head->set, struct ipc_namespace, ipc_set); if (((table->data == &ns->ids[IPC_SEM_IDS].next_id) || (table->data == &ns->ids[IPC_MSG_IDS].next_id) || (table->data == &ns->ids[IPC_SHM_IDS].next_id)) && checkpoint_restore_ns_capable(ns->user_ns)) mode = 0666; + else #endif - return mode; + { + kuid_t ns_root_uid; + kgid_t ns_root_gid; + + ipc_set_ownership(head, table, &ns_root_uid, &ns_root_gid); + + if (uid_eq(current_euid(), ns_root_uid)) + mode >>= 6; + + else if (in_egroup_p(ns_root_gid)) + mode >>= 3; + } + + mode &= 7; + + return (mode << 6) | (mode << 3) | mode; } static struct ctl_table_root set_root = { .lookup = set_lookup, .permissions = ipc_permissions, + .set_ownership = ipc_set_ownership, }; bool setup_ipc_sysctls(struct ipc_namespace *ns) From 7c57aa7c4a3cbab3730174942ff6a0a6f959dc0c Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Mon, 15 Jan 2024 15:46:42 +0000 Subject: [PATCH 1318/1406] docs: add information about ipc sysctls limitations After 25b21cb2f6d6 ("[PATCH] IPC namespace core") and 4e9823111bdc ("[PATCH] IPC namespace - shm") the shared memory page count stopped being global and started counting per ipc namespace. The documentation and shmget(2) still say that shmall is a global option. shmget(2): SHMALL System-wide limit on the total amount of shared memory, measured in units of the system page size. On Linux, this limit can be read and modified via /proc/sys/kernel/shmall. I think the changes made in 2006 should be documented. Link: https://lkml.kernel.org/r/09e99911071766958af488beb4e8a728a4f12135.1705333426.git.legion@kernel.org Signed-off-by: Alexey Gladkov Signed-off-by: Eric W. Biederman Acked-by: "Eric W. Biederman" Link: https://lkml.kernel.org/r/ede20ddf7be48b93e8084c3be2e920841ee1a641.1663756794.git.legion@kernel.org Cc: Christian Brauner Cc: Davidlohr Bueso Cc: Joel Granados Cc: Kees Cook Cc: Luis Chamberlain Cc: Manfred Spraul Signed-off-by: Andrew Morton --- Documentation/admin-guide/sysctl/kernel.rst | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 6584a1f9bfe39d..bc578663619d6e 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -594,6 +594,9 @@ default (``MSGMNB``). ``msgmni`` is the maximum number of IPC queues. 32000 by default (``MSGMNI``). +All of these parameters are set per ipc namespace. The maximum number of bytes +in POSIX message queues is limited by ``RLIMIT_MSGQUEUE``. This limit is +respected hierarchically in each user namespace. msg_next_id, sem_next_id, and shm_next_id (System V IPC) ======================================================== @@ -1274,15 +1277,20 @@ are doing anyway :) shmall ====== -This parameter sets the total amount of shared memory pages that -can be used system wide. Hence, ``shmall`` should always be at least -``ceil(shmmax/PAGE_SIZE)``. +This parameter sets the total amount of shared memory pages that can be used +inside an ipc namespace. The shared memory pages counting occurs for each ipc +namespace separately and is not inherited. Hence, ``shmall`` should always be at +least ``ceil(shmmax/PAGE_SIZE)``.
If you are not sure what the default ``PAGE_SIZE`` is on your Linux system, you can run the following command:: # getconf PAGE_SIZE +To reduce or disable the ability to allocate shared memory, you must create a +new ipc namespace, set this parameter to the required value, and prohibit the +creation of a new ipc namespace in the current user namespace; alternatively, +cgroups can be used. shmmax ====== From 856baf3015c5b37ac52cba504bbbdf575ce856e6 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Mon, 15 Jan 2024 15:46:43 +0000 Subject: [PATCH 1319/1406] sysctl: allow changing limits for POSIX message queues All parameters of POSIX message queues (queues_max/msg_max/msgsize_max) end up being limited by RLIMIT_MSGQUEUE. The code in mqueue_get_inode is where that limiting happens. The RLIMIT_MSGQUEUE is bound to the user namespace and is counted hierarchically. We can allow root in the user namespace to modify the POSIX message queue parameters. Link: https://lkml.kernel.org/r/6ad67f23d1459a4f4339f74aa73bac0ecf3995e1.1705333426.git.legion@kernel.org Signed-off-by: Alexey Gladkov Signed-off-by: Eric W. Biederman Link: https://lkml.kernel.org/r/7eb21211c8622e91d226e63416b1b93c079f60ee.1663756794.git.legion@kernel.org Cc: Christian Brauner Cc: Davidlohr Bueso Cc: Joel Granados Cc: Kees Cook Cc: Luis Chamberlain Cc: Manfred Spraul Signed-off-by: Andrew Morton --- ipc/mq_sysctl.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c index ebb5ed81c151a8..21fba3a6edaf7a 100644 --- a/ipc/mq_sysctl.c +++ b/ipc/mq_sysctl.c @@ -12,6 +12,7 @@ #include #include #include +#include static int msg_max_limit_min = MIN_MSGMAX; static int msg_max_limit_max = HARD_MSGMAX; @@ -76,8 +77,43 @@ static int set_is_seen(struct ctl_table_set *set) return &current->nsproxy->ipc_ns->mq_set == set; } +static void mq_set_ownership(struct ctl_table_header *head, + struct ctl_table *table, + kuid_t *uid, kgid_t *gid) +{ + struct ipc_namespace *ns = + container_of(head->set, struct ipc_namespace, mq_set); + + kuid_t ns_root_uid = make_kuid(ns->user_ns, 0); + kgid_t ns_root_gid = make_kgid(ns->user_ns, 0); + + *uid = uid_valid(ns_root_uid) ? ns_root_uid : GLOBAL_ROOT_UID; + *gid = gid_valid(ns_root_gid) ? ns_root_gid : GLOBAL_ROOT_GID; +} + +static int mq_permissions(struct ctl_table_header *head, struct ctl_table *table) +{ + int mode = table->mode; + kuid_t ns_root_uid; + kgid_t ns_root_gid; + + mq_set_ownership(head, table, &ns_root_uid, &ns_root_gid); + + if (uid_eq(current_euid(), ns_root_uid)) + mode >>= 6; + + else if (in_egroup_p(ns_root_gid)) + mode >>= 3; + + mode &= 7; + + return (mode << 6) | (mode << 3) | mode; +} + static struct ctl_table_root set_root = { .lookup = set_lookup, + .permissions = mq_permissions, + .set_ownership = mq_set_ownership, }; bool setup_mq_sysctls(struct ipc_namespace *ns) From ee1c21d892cb3c7354c4bf8b719a144d3d8999fe Mon Sep 17 00:00:00 2001 From: Li zeming Date: Mon, 15 Jan 2024 14:25:19 +0800 Subject: [PATCH 1320/1406] user_namespace: Remove unnecessary NULL values from kbuf kbuf is assigned before it is used, so it does not need to be initialized to NULL.
Link: https://lkml.kernel.org/r/20240115062519.31298-1-zeming@nfschina.com Signed-off-by: Li zeming Cc: Randy Dunlap Signed-off-by: Andrew Morton --- kernel/user_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index ce4d99df5f0eb4..0b0b95418b16a7 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -931,7 +931,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; - char *kbuf = NULL, *pos, *next_line; + char *kbuf, *pos, *next_line; ssize_t ret; /* Only allow < page size writes at the beginning of the file */ From 6844833946858ed6ef471ea3e166ec4ddd39fd5b Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sat, 13 Jan 2024 11:13:51 +0800 Subject: [PATCH 1321/1406] lib/sort: optimize heapsort for equal elements in sift-down path Patch series "lib/sort: Optimize the number of swaps and comparisons". This patch series aims to optimize the heapsort algorithm, specifically targeting a reduction in the number of swaps and comparisons required. This patch (of 2): Currently, when searching for the sift-down path and encountering equal elements, the algorithm chooses the left child. However, considering that the height of the right subtree may be one less than that of the left subtree, selecting the right child in such cases can potentially reduce the number of comparisons and swaps. For instance, when sorting an array of 10,000 identical elements, the current implementation requires 247,209 comparisons. With this patch, the number of comparisons can be reduced to 227,241. Link: https://lkml.kernel.org/r/20240113031352.2395118-1-visitorckw@gmail.com Link: https://lkml.kernel.org/r/20240113031352.2395118-2-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Signed-off-by: Andrew Morton --- lib/sort.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/sort.c b/lib/sort.c index b399bf10d6759b..fe4efd4a1410f7 100644 --- a/lib/sort.c +++ b/lib/sort.c @@ -262,7 +262,7 @@ void sort_r(void *base, size_t num, size_t size, * average, 3/4 worst-case.) */ for (b = a; c = 2*b + size, (d = c + size) < n;) - b = do_cmp(base + c, base + d, cmp_func, priv) >= 0 ? c : d; + b = do_cmp(base + c, base + d, cmp_func, priv) > 0 ? c : d; if (d == n) /* Special case last leaf with no sibling */ b = c; From 188ccf4e4556178efbe4960cbec1468830cbc20d Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sat, 13 Jan 2024 11:13:52 +0800 Subject: [PATCH 1322/1406] lib/sort: Optimize heapsort with double-pop variation Instead of popping only the maximum element from the heap during each iteration, we now pop the two largest elements at once. Although this introduces an additional comparison to determine the second largest element, it enables a reduction in the height of the tree by one during the heapify operations starting from root's left/right child. This reduction in tree height by one leads to a decrease of one comparison and one swap. This optimization results in saving approximately 0.5 * n swaps without increasing the number of comparisons. Additionally, the heap size during heapify is now one less than the original size, offering a chance for further reduction in comparisons and swaps. The following experimental data is based on the array generated using get_random_u32(). 
| N | swaps (old) | swaps (new) | comparisons (old) | comparisons (new) | |-------|-------------|-------------|-------------------|-------------------| | 1000 | 9054 | 8569 | 10328 | 10320 | | 2000 | 20137 | 19182 | 22634 | 22587 | | 3000 | 32062 | 30623 | 35833 | 35752 | | 4000 | 44274 | 42282 | 49332 | 49306 | | 5000 | 57195 | 54676 | 63300 | 63294 | | 6000 | 70205 | 67202 | 77599 | 77557 | | 7000 | 83276 | 79831 | 92113 | 92032 | | 8000 | 96630 | 92678 | 106635 | 106617 | | 9000 | 110349 | 105883 | 121505 | 121404 | | 10000 | 124165 | 119202 | 136628 | 136617 | Link: https://lkml.kernel.org/r/20240113031352.2395118-3-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: Ching-Chun (Jim) Huang Cc: George Spelvin Signed-off-by: Andrew Morton --- lib/sort.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/lib/sort.c b/lib/sort.c index fe4efd4a1410f7..a0509088f82aa5 100644 --- a/lib/sort.c +++ b/lib/sort.c @@ -215,6 +215,7 @@ void sort_r(void *base, size_t num, size_t size, /* pre-scale counters for performance */ size_t n = num * size, a = (num/2) * size; const unsigned int lsbit = size & -size; /* Used to find parent */ + size_t shift = 0; if (!a) /* num < 2 || size == 0 */ return; @@ -242,12 +243,21 @@ void sort_r(void *base, size_t num, size_t size, for (;;) { size_t b, c, d; - if (a) /* Building heap: sift down --a */ - a -= size; - else if (n -= size) /* Sorting: Extract root to --n */ + if (a) /* Building heap: sift down a */ + a -= size << shift; + else if (n > 3 * size) { /* Sorting: Extract two largest elements */ + n -= size; do_swap(base, base + n, size, swap_func, priv); - else /* Sort complete */ + shift = do_cmp(base + size, base + 2 * size, cmp_func, priv) <= 0; + a = size << shift; + n -= size; + do_swap(base + a, base + n, size, swap_func, priv); + } else if (n > size) { /* Sorting: Extract root */ + n -= size; + do_swap(base, base + n, size, swap_func, priv); + } else { /* Sort complete */ break; + } /* * Sift element at "a" down into heap. This is the From 6fe06801ddf75890aff58e8099b47c2d15d017f0 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Wed, 17 Jan 2024 06:16:36 +0000 Subject: [PATCH 1323/1406] kprobes: use synchronize_rcu_tasks_rude in kprobe_optimizer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a deadlock scenario in kprobe_optimizer(): pid A pid B pid C kprobe_optimizer() do_exit() perf_kprobe_init() mutex_lock(&kprobe_mutex) exit_tasks_rcu_start() mutex_lock(&kprobe_mutex) synchronize_rcu_tasks() zap_pid_ns_processes() // waiting kprobe_mutex // waiting tasks_rcu_exit_srcu kernel_wait4() // waiting pid C exit To avoid this deadlock loop, use synchronize_rcu_tasks_rude() in kprobe_optimizer() rather than synchronize_rcu_tasks(). synchronize_rcu_tasks_rude() can also promise that all preempted tasks have scheduled, but it will not wait tasks_rcu_exit_srcu. Link: https://lkml.kernel.org/r/20240117061636.288412-1-chenzhongjin@huawei.com Fixes: a30b85df7d59 ("kprobes: Use synchronize_rcu_tasks() for optprobe with CONFIG_PREEMPT=y") Signed-off-by: Chen Zhongjin Cc: Anil S Keshavamurthy Cc: David S. Miller Cc: Douglas Anderson Cc: Eric DeVolder Cc: Jakob Koschel Cc: Juerg Haefliger Cc: "Masami Hiramatsu (Google)" Cc: Michael Ellerman (powerpc) Cc: Mickaël Salaün Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Petr Mladek Cc: Rick Edgecombe Cc: Thomas Gleixner Cc: Yang Jihong Signed-off-by: Andrew Morton --- arch/Kconfig | 2 +- kernel/kprobes.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index a5af0edd3eb8f3..f5bc5533869915 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -104,7 +104,7 @@ config STATIC_CALL_SELFTEST config OPTPROBES def_bool y depends on KPROBES && HAVE_OPTPROBES - select TASKS_RCU if PREEMPTION + select TASKS_RUDE_RCU config KPROBES_ON_FTRACE def_bool y diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9d9095e8179286..263281dd620ca6 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -623,7 +623,7 @@ static void kprobe_optimizer(struct work_struct *work) * Note that on non-preemptive kernel, this is transparently converted * to synchronoze_sched() to wait for all interrupts to have completed. */ - synchronize_rcu_tasks(); + synchronize_rcu_tasks_rude(); /* Step 3: Optimize kprobes after quiesence period */ do_optimize_kprobes(); From 62852a1bb6018cdde546848815b842b86919bb12 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 17 Jan 2024 12:33:11 -0800 Subject: [PATCH 1324/1406] kprobes-use-synchronize_rcu_tasks_rude-in-kprobe_optimizer-fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit unrelated comment typo fix Cc: Anil S Keshavamurthy Cc: Chen Zhongjin Cc: David S. Miller Cc: Douglas Anderson Cc: Eric DeVolder Cc: Jakob Koschel Cc: Juerg Haefliger Cc: "Masami Hiramatsu (Google)" Cc: Michael Ellerman Cc: Mickaël Salaün Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Petr Mladek Cc: Rick Edgecombe Cc: Thomas Gleixner Cc: Yang Jihong Signed-off-by: Andrew Morton --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 263281dd620ca6..0bfbcb83214794 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -621,7 +621,7 @@ static void kprobe_optimizer(struct work_struct *work) * instruction is preempted. In that case, such tasks can return * to 2nd-Nth byte of jump instruction. This wait is for avoiding it. * Note that on non-preemptive kernel, this is transparently converted - * to synchronoze_sched() to wait for all interrupts to have completed. + * to synchronize_sched() to wait for all interrupts to have completed. */ synchronize_rcu_tasks_rude(); From 6c5b79e9e546683abb5316c599181f2ffc5f8a54 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 19 Jan 2024 04:13:21 +0800 Subject: [PATCH 1325/1406] flex_proportions: remove unused fprop_local_single The single variant of flex_proportions is not used. Simply remove it. 
Link: https://lkml.kernel.org/r/20240118201321.759174-1-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- include/linux/flex_proportions.h | 32 ------------- lib/flex_proportions.c | 77 -------------------------------- 2 files changed, 109 deletions(-) diff --git a/include/linux/flex_proportions.h b/include/linux/flex_proportions.h index 3e378b1fb0bc82..e9a72fd0bfe78b 100644 --- a/include/linux/flex_proportions.h +++ b/include/linux/flex_proportions.h @@ -38,38 +38,6 @@ int fprop_global_init(struct fprop_global *p, gfp_t gfp); void fprop_global_destroy(struct fprop_global *p); bool fprop_new_period(struct fprop_global *p, int periods); -/* - * ---- SINGLE ---- - */ -struct fprop_local_single { - /* the local events counter */ - unsigned long events; - /* Period in which we last updated events */ - unsigned int period; - raw_spinlock_t lock; /* Protect period and numerator */ -}; - -#define INIT_FPROP_LOCAL_SINGLE(name) \ -{ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ -} - -int fprop_local_init_single(struct fprop_local_single *pl); -void fprop_local_destroy_single(struct fprop_local_single *pl); -void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl); -void fprop_fraction_single(struct fprop_global *p, - struct fprop_local_single *pl, unsigned long *numerator, - unsigned long *denominator); - -static inline -void fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl) -{ - unsigned long flags; - - local_irq_save(flags); - __fprop_inc_single(p, pl); - local_irq_restore(flags); -} - /* * ---- PERCPU ---- */ diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c index 83332fefa6f42e..84ecccddc77182 100644 --- a/lib/flex_proportions.c +++ b/lib/flex_proportions.c @@ -83,83 +83,6 @@ bool fprop_new_period(struct fprop_global *p, int periods) return true; } -/* - * ---- SINGLE ---- - */ - -int fprop_local_init_single(struct fprop_local_single *pl) -{ - pl->events = 0; - pl->period = 0; - raw_spin_lock_init(&pl->lock); - return 0; -} - -void fprop_local_destroy_single(struct fprop_local_single *pl) -{ -} - -static void fprop_reflect_period_single(struct fprop_global *p, - struct fprop_local_single *pl) -{ - unsigned int period = p->period; - unsigned long flags; - - /* Fast path - period didn't change */ - if (pl->period == period) - return; - raw_spin_lock_irqsave(&pl->lock, flags); - /* Someone updated pl->period while we were spinning? */ - if (pl->period >= period) { - raw_spin_unlock_irqrestore(&pl->lock, flags); - return; - } - /* Aging zeroed our fraction? 
*/ - if (period - pl->period < BITS_PER_LONG) - pl->events >>= period - pl->period; - else - pl->events = 0; - pl->period = period; - raw_spin_unlock_irqrestore(&pl->lock, flags); -} - -/* Event of type pl happened */ -void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl) -{ - fprop_reflect_period_single(p, pl); - pl->events++; - percpu_counter_add(&p->events, 1); -} - -/* Return fraction of events of type pl */ -void fprop_fraction_single(struct fprop_global *p, - struct fprop_local_single *pl, - unsigned long *numerator, unsigned long *denominator) -{ - unsigned int seq; - s64 num, den; - - do { - seq = read_seqcount_begin(&p->sequence); - fprop_reflect_period_single(p, pl); - num = pl->events; - den = percpu_counter_read_positive(&p->events); - } while (read_seqcount_retry(&p->sequence, seq)); - - /* - * Make fraction <= 1 and denominator > 0 even in presence of percpu - * counter errors - */ - if (den <= num) { - if (num) - den = num; - else - den = 1; - } - *denominator = den; - *numerator = num; -} - /* * ---- PERCPU ---- */ From 106f90203d722bbb89a8acefa5425b90a48c4beb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Jan 2024 18:16:31 +0100 Subject: [PATCH 1326/1406] ptrace_attach: shift send(SIGSTOP) into ptrace_set_stopped() Turn send_sig_info(SIGSTOP) into send_signal_locked(SIGSTOP) and move it from ptrace_attach() to ptrace_set_stopped(). This looks more logical and avoids lock(siglock) right after unlock(). Link: https://lkml.kernel.org/r/20240122171631.GA29844@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton --- kernel/ptrace.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2fabd497d65988..d5f89f9ef29f65 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -375,10 +375,13 @@ static int check_ptrace_options(unsigned long data) return 0; } -static inline void ptrace_set_stopped(struct task_struct *task) +static inline void ptrace_set_stopped(struct task_struct *task, bool seize) { guard(spinlock)(&task->sighand->siglock); + /* SEIZE doesn't trap tracee on attach */ + if (!seize) + send_signal_locked(SIGSTOP, SEND_SIG_PRIV, task, PIDTYPE_PID); /* * If the task is already STOPPED, set JOBCTL_TRAP_STOP and * TRAPPING, and kick it so that it transits to TRACED. TRAPPING @@ -457,14 +460,8 @@ static int ptrace_attach(struct task_struct *task, long request, return -EPERM; task->ptrace = flags; - ptrace_link(task, current); - - /* SEIZE doesn't trap tracee on attach */ - if (!seize) - send_sig_info(SIGSTOP, SEND_SIG_PRIV, task); - - ptrace_set_stopped(task); + ptrace_set_stopped(task, seize); } } From 105df40102709850de5a34b0eecb6127a320a9f7 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 22 Jan 2024 15:50:43 +0100 Subject: [PATCH 1327/1406] lib: dhry: remove unneeded Patch series "lib: dhry: miscellaneous cleanups". This patch series contains a few miscellaneous cleanups for the Dhrystone benchmark test. This patch (of 3): The Dhrystone benchmark test does not use mutexes. 
Link: https://lkml.kernel.org/r/cover.1705934853.git.geert+renesas@glider.be Link: https://lkml.kernel.org/r/cf8fafaedccf96143f1513745c43a457480bfc24.1705934853.git.geert+renesas@glider.be Signed-off-by: Geert Uytterhoeven Signed-off-by: Andrew Morton --- lib/dhry_run.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/dhry_run.c b/lib/dhry_run.c index f15ac666e9d38b..e6a279dabf848e 100644 --- a/lib/dhry_run.c +++ b/lib/dhry_run.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #define DHRY_VAX 1757 From 612284109df6632c078de82b29f03c6d677ed5e8 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 22 Jan 2024 15:50:44 +0100 Subject: [PATCH 1328/1406] lib: dhry: use ktime_ms_delta() helper Use the existing ktime_ms_delta() helper instead of open-coding the same operation. Link: https://lkml.kernel.org/r/bb43c67a7580de6152f5e6eb225071166d33b6e4.1705934853.git.geert+renesas@glider.be Signed-off-by: Geert Uytterhoeven Signed-off-by: Andrew Morton --- lib/dhry_1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/dhry_1.c b/lib/dhry_1.c index 08edbbb19f573f..ca6c87232c5809 100644 --- a/lib/dhry_1.c +++ b/lib/dhry_1.c @@ -277,7 +277,7 @@ int dhry(int n) dhry_assert_string_eq(Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING"); dhry_assert_string_eq(Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING"); - User_Time = ktime_to_ms(ktime_sub(End_Time, Begin_Time)); + User_Time = ktime_ms_delta(End_Time, Begin_Time); kfree(Ptr_Glob); kfree(Next_Ptr_Glob); From cc003017f1a285a7ecf37ce827d3eba768cda08b Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 22 Jan 2024 15:50:45 +0100 Subject: [PATCH 1329/1406] lib: dhry: add missing closing parenthesis The help text for the Dhrystone benchmark test lacks a matching closing parenthesis. Link: https://lkml.kernel.org/r/772b43271bcb3dd17a6aae671b2084f08c05b079.1705934853.git.geert+renesas@glider.be Signed-off-by: Geert Uytterhoeven Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ef36b829ae1f55..57a4dc50325ff8 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2142,7 +2142,7 @@ config TEST_DHRY To run the benchmark, it needs to be enabled explicitly, either from the kernel command line (when built-in), or from userspace (when - built-in or modular. + built-in or modular). Run once during kernel boot: From 5366db30355c51ef295bde68f33e4813101c63c0 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:49 +0900 Subject: [PATCH 1330/1406] nilfs2: convert segment buffer to use kmap_local In the segment buffer code used for log writing, a CRC calculation routine uses the deprecated kmap_atomic(), so convert it to use kmap_local. 
Link: https://lkml.kernel.org/r/20240122140202.6950-3-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/segbuf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 6e59dc19a73249..dc431b4c34c96c 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -220,9 +220,9 @@ static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, crc = crc32_le(crc, bh->b_data, bh->b_size); } list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size); - kunmap_atomic(kaddr); + kunmap_local(kaddr); } raw_sum->ss_datasum = cpu_to_le32(crc); } From cc9616b129bc1682a5b3645e17ccaabccbed9124 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:50 +0900 Subject: [PATCH 1331/1406] nilfs2: convert nilfs_copy_buffer() to use kmap_local The routine nilfs_copy_buffer() that copies a block buffer still uses the deprecated kmap_atomic(), so convert it to use kmap_local. Link: https://lkml.kernel.org/r/20240122140202.6950-4-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/page.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 5c2eba1987bd70..14e470fb88706a 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -103,11 +103,11 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) struct page *spage = sbh->b_page, *dpage = dbh->b_page; struct buffer_head *bh; - kaddr0 = kmap_atomic(spage); - kaddr1 = kmap_atomic(dpage); + kaddr0 = kmap_local_page(spage); + kaddr1 = kmap_local_page(dpage); memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size); - kunmap_atomic(kaddr1); - kunmap_atomic(kaddr0); + kunmap_local(kaddr1); + kunmap_local(kaddr0); dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS; dbh->b_blocknr = sbh->b_blocknr; From 2ffd8a0a490832ce6fcab798119ca1879a4f98d1 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:51 +0900 Subject: [PATCH 1332/1406] nilfs2: convert metadata file common code to use kmap_local In the common code of metadata files, the new block creation routine nilfs_mdt_insert_new_block() still uses the deprecated kmap_atomic(), so convert it to use kmap_local. 
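One detail visible in the nilfs_copy_buffer() hunk below: kmap_local mappings are stacked per thread, so when two pages are mapped at once the mappings must be released in reverse (LIFO) order. Schematically:

	saddr = kmap_local_page(spage);
	daddr = kmap_local_page(dpage);
	memcpy(daddr + offset, saddr + offset, size);
	kunmap_local(daddr);	/* most recent mapping first */
	kunmap_local(saddr);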
Link: https://lkml.kernel.org/r/20240122140202.6950-5-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/mdt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index e45c01a559c013..4f792a0ad0f0ff 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -47,12 +47,12 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, set_buffer_mapped(bh); - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); memset(kaddr + bh_offset(bh), 0, i_blocksize(inode)); if (init_block) init_block(inode, bh, kaddr); flush_dcache_page(bh->b_page); - kunmap_atomic(kaddr); + kunmap_local(kaddr); set_buffer_uptodate(bh); mark_buffer_dirty(bh); From e95bf2468147ceef5859f7cf80364799d96b195f Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:52 +0900 Subject: [PATCH 1333/1406] nilfs2: convert sufile to use kmap_local Concerning the code of the metadata file sufile for segment management, convert all parts that uses the deprecated kmap_atomic() to use kmap_local. All transformations are directly possible here. Link: https://lkml.kernel.org/r/20240122140202.6950-6-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/sufile.c | 86 +++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 0a8119456c2136..abf05dc5750c79 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -107,11 +107,11 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, struct nilfs_sufile_header *header; void *kaddr; - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = kaddr + bh_offset(header_bh); le64_add_cpu(&header->sh_ncleansegs, ncleanadd); le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(header_bh); } @@ -315,10 +315,10 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out_sem; - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = kaddr + bh_offset(header_bh); last_alloc = le64_to_cpu(header->sh_last_alloc); - kunmap_atomic(kaddr); + kunmap_local(kaddr); nsegments = nilfs_sufile_get_nsegments(sufile); maxsegnum = sui->allocmax; @@ -352,7 +352,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) &su_bh); if (ret < 0) goto out_header; - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage( sufile, segnum, su_bh, kaddr); @@ -363,14 +363,14 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) continue; /* found a clean segment */ nilfs_segment_usage_set_dirty(su); - kunmap_atomic(kaddr); + kunmap_local(kaddr); - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = kaddr + bh_offset(header_bh); le64_add_cpu(&header->sh_ncleansegs, -1); le64_add_cpu(&header->sh_ndirtysegs, 1); header->sh_last_alloc = cpu_to_le64(segnum); - kunmap_atomic(kaddr); + kunmap_local(kaddr); sui->ncleansegs--; mark_buffer_dirty(header_bh); @@ -384,7 +384,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) goto out_header; } - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(su_bh); } @@ -406,16 +406,16 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 
segnum, struct nilfs_segment_usage *su; void *kaddr; - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); if (unlikely(!nilfs_segment_usage_clean(su))) { nilfs_warn(sufile->i_sb, "%s: segment %llu must be clean", __func__, (unsigned long long)segnum); - kunmap_atomic(kaddr); + kunmap_local(kaddr); return; } nilfs_segment_usage_set_dirty(su); - kunmap_atomic(kaddr); + kunmap_local(kaddr); nilfs_sufile_mod_counter(header_bh, -1, 1); NILFS_SUI(sufile)->ncleansegs--; @@ -432,11 +432,11 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, void *kaddr; int clean, dirty; - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); if (su->su_flags == cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)) && su->su_nblocks == cpu_to_le32(0)) { - kunmap_atomic(kaddr); + kunmap_local(kaddr); return; } clean = nilfs_segment_usage_clean(su); @@ -446,7 +446,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, su->su_lastmod = cpu_to_le64(0); su->su_nblocks = cpu_to_le32(0); su->su_flags = cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)); - kunmap_atomic(kaddr); + kunmap_local(kaddr); nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1); NILFS_SUI(sufile)->ncleansegs -= clean; @@ -463,12 +463,12 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, void *kaddr; int sudirty; - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); if (nilfs_segment_usage_clean(su)) { nilfs_warn(sufile->i_sb, "%s: segment %llu is already clean", __func__, (unsigned long long)segnum); - kunmap_atomic(kaddr); + kunmap_local(kaddr); return; } if (unlikely(nilfs_segment_usage_error(su))) @@ -481,7 +481,7 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, (unsigned long long)segnum); nilfs_segment_usage_set_clean(su); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(su_bh); nilfs_sufile_mod_counter(header_bh, 1, sudirty ? 
(u64)-1 : 0); @@ -509,12 +509,12 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) if (ret) goto out_sem; - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); if (unlikely(nilfs_segment_usage_error(su))) { struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh); if (nilfs_segment_is_active(nilfs, segnum)) { nilfs_error(sufile->i_sb, @@ -532,7 +532,7 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) ret = -EIO; } else { nilfs_segment_usage_set_dirty(su); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); brelse(bh); @@ -562,7 +562,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, if (ret < 0) goto out_sem; - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); if (modtime) { /* @@ -573,7 +573,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, su->su_lastmod = cpu_to_le64(modtime); } su->su_nblocks = cpu_to_le32(nblocks); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); @@ -614,7 +614,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) if (ret < 0) goto out_sem; - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = kaddr + bh_offset(header_bh); sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile); sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs); @@ -624,7 +624,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) spin_lock(&nilfs->ns_last_segment_lock); sustat->ss_prot_seq = nilfs->ns_prot_seq; spin_unlock(&nilfs->ns_last_segment_lock); - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(header_bh); out_sem: @@ -640,15 +640,15 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, void *kaddr; int suclean; - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); if (nilfs_segment_usage_error(su)) { - kunmap_atomic(kaddr); + kunmap_local(kaddr); return; } suclean = nilfs_segment_usage_clean(su); nilfs_segment_usage_set_error(su); - kunmap_atomic(kaddr); + kunmap_local(kaddr); if (suclean) { nilfs_sufile_mod_counter(header_bh, -1, 0); @@ -717,7 +717,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile, /* hole */ continue; } - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage( sufile, segnum, su_bh, kaddr); su2 = su; @@ -726,7 +726,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile, ~BIT(NILFS_SEGMENT_USAGE_ERROR)) || nilfs_segment_is_active(nilfs, segnum + j)) { ret = -EBUSY; - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(su_bh); goto out_header; } @@ -738,7 +738,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile, nc++; } } - kunmap_atomic(kaddr); + kunmap_local(kaddr); if (nc > 0) { mark_buffer_dirty(su_bh); ncleaned += nc; @@ -823,10 +823,10 @@ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs) sui->allocmin = 0; } - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = kaddr + bh_offset(header_bh); header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs); - kunmap_atomic(kaddr); + 
kunmap_local(kaddr); mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(sufile); @@ -891,7 +891,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, continue; } - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage( sufile, segnum, su_bh, kaddr); for (j = 0; j < n; @@ -904,7 +904,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, si->sui_flags |= BIT(NILFS_SEGMENT_USAGE_ACTIVE); } - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(su_bh); } ret = nsegs; @@ -973,7 +973,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, goto out_header; for (;;) { - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); su = nilfs_sufile_block_get_segment_usage( sufile, sup->sup_segnum, bh, kaddr); @@ -1010,7 +1010,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, su->su_flags = cpu_to_le32(sup->sup_sui.sui_flags); } - kunmap_atomic(kaddr); + kunmap_local(kaddr); sup = (void *)sup + supsz; if (sup >= supend) @@ -1115,7 +1115,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) continue; } - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) { @@ -1145,7 +1145,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) } if (nblocks >= minlen) { - kunmap_atomic(kaddr); + kunmap_local(kaddr); ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, @@ -1157,7 +1157,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) } ndiscarded += nblocks; - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage( sufile, segnum, su_bh, kaddr); } @@ -1166,7 +1166,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) start = seg_start; nblocks = seg_end - seg_start + 1; } - kunmap_atomic(kaddr); + kunmap_local(kaddr); put_bh(su_bh); } @@ -1246,10 +1246,10 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize, goto failed; sui = NILFS_SUI(sufile); - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = kaddr + bh_offset(header_bh); sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs); - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(header_bh); sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1; From a216e9f09cbb1c3064451cd16b866cfd2f276f29 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:53 +0900 Subject: [PATCH 1334/1406] nilfs2: convert persistent object allocator to use kmap_local Regarding the allocator code that is commonly used in the ondisk inode metadata file ifile and the disk address translation metadata file DAT, convert the parts that use the deprecated kmap_atomic() and kmap() to use kmap_local. Most can be converted directly, but only nilfs_palloc_prepare_alloc_entry() needs to be rewritten to change mapping sections so that multiple kmap_local/kunmap_local calls are nested and disk I/O can be avoided within the mapping sections. 
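The reworked nilfs_palloc_prepare_alloc_entry() follows the shape sketched here (simplified from the hunk below): drop the local mapping before anything that may trigger disk I/O, re-map and re-derive the pointers afterwards, and release nested mappings in LIFO order:

	desc_kaddr = kmap_local_page(desc_bh->b_page);
	/* ... inspect the group descriptor ... */
	kunmap_local(desc_kaddr);	/* unmap before reading the bitmap block */

	ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh);
	if (unlikely(ret < 0))
		return ret;

	desc_kaddr = kmap_local_page(desc_bh->b_page);	/* re-map, re-derive */
	bitmap_kaddr = kmap_local_page(bitmap_bh->b_page);
	/* ... scan the bitmap for a free slot ... */
	kunmap_local(bitmap_kaddr);	/* LIFO: newest mapping first */
	kunmap_local(desc_kaddr);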
Link: https://lkml.kernel.org/r/20240122140202.6950-7-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/alloc.c | 91 ++++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 45 deletions(-) diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 7342de296ec3c6..89caef7513db35 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -525,54 +525,55 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); if (ret < 0) return ret; - desc_kaddr = kmap(desc_bh->b_page); + desc_kaddr = kmap_local_page(desc_bh->b_page); desc = nilfs_palloc_block_get_group_desc( inode, group, desc_bh, desc_kaddr); n = nilfs_palloc_rest_groups_in_desc_block(inode, group, maxgroup); - for (j = 0; j < n; j++, desc++, group++) { + for (j = 0; j < n; j++, desc++, group++, group_offset = 0) { lock = nilfs_mdt_bgl_lock(inode, group); - if (nilfs_palloc_group_desc_nfrees(desc, lock) > 0) { - ret = nilfs_palloc_get_bitmap_block( - inode, group, 1, &bitmap_bh); - if (ret < 0) - goto out_desc; - bitmap_kaddr = kmap(bitmap_bh->b_page); - bitmap = bitmap_kaddr + bh_offset(bitmap_bh); - pos = nilfs_palloc_find_available_slot( - bitmap, group_offset, - entries_per_group, lock); - if (pos >= 0) { - /* found a free entry */ - nilfs_palloc_group_desc_add_entries( - desc, lock, -1); - req->pr_entry_nr = - entries_per_group * group + pos; - kunmap(desc_bh->b_page); - kunmap(bitmap_bh->b_page); - - req->pr_desc_bh = desc_bh; - req->pr_bitmap_bh = bitmap_bh; - return 0; - } - kunmap(bitmap_bh->b_page); - brelse(bitmap_bh); + if (nilfs_palloc_group_desc_nfrees(desc, lock) == 0) + continue; + + kunmap_local(desc_kaddr); + ret = nilfs_palloc_get_bitmap_block(inode, group, 1, + &bitmap_bh); + if (unlikely(ret < 0)) { + brelse(desc_bh); + return ret; } - group_offset = 0; + desc_kaddr = kmap_local_page(desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc( + inode, group, desc_bh, desc_kaddr); + + bitmap_kaddr = kmap_local_page(bitmap_bh->b_page); + bitmap = bitmap_kaddr + bh_offset(bitmap_bh); + pos = nilfs_palloc_find_available_slot( + bitmap, group_offset, entries_per_group, lock); + kunmap_local(bitmap_kaddr); + if (pos >= 0) + goto found; + + brelse(bitmap_bh); } - kunmap(desc_bh->b_page); + kunmap_local(desc_kaddr); brelse(desc_bh); } /* no entries left */ return -ENOSPC; - out_desc: - kunmap(desc_bh->b_page); - brelse(desc_bh); - return ret; +found: + /* found a free entry */ + nilfs_palloc_group_desc_add_entries(desc, lock, -1); + req->pr_entry_nr = entries_per_group * group + pos; + kunmap_local(desc_kaddr); + + req->pr_desc_bh = desc_bh; + req->pr_bitmap_bh = bitmap_bh; + return 0; } /** @@ -606,10 +607,10 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); - desc_kaddr = kmap(req->pr_desc_bh->b_page); + desc_kaddr = kmap_local_page(req->pr_desc_bh->b_page); desc = nilfs_palloc_block_get_group_desc(inode, group, req->pr_desc_bh, desc_kaddr); - bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); + bitmap_kaddr = kmap_local_page(req->pr_bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); lock = nilfs_mdt_bgl_lock(inode, group); @@ -621,8 +622,8 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, else nilfs_palloc_group_desc_add_entries(desc, lock, 1); - kunmap(req->pr_bitmap_bh->b_page); - kunmap(req->pr_desc_bh->b_page); + kunmap_local(bitmap_kaddr); + 
kunmap_local(desc_kaddr); mark_buffer_dirty(req->pr_desc_bh); mark_buffer_dirty(req->pr_bitmap_bh); @@ -647,10 +648,10 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); - desc_kaddr = kmap(req->pr_desc_bh->b_page); + desc_kaddr = kmap_local_page(req->pr_desc_bh->b_page); desc = nilfs_palloc_block_get_group_desc(inode, group, req->pr_desc_bh, desc_kaddr); - bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); + bitmap_kaddr = kmap_local_page(req->pr_bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); lock = nilfs_mdt_bgl_lock(inode, group); @@ -662,8 +663,8 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, else nilfs_palloc_group_desc_add_entries(desc, lock, 1); - kunmap(req->pr_bitmap_bh->b_page); - kunmap(req->pr_desc_bh->b_page); + kunmap_local(bitmap_kaddr); + kunmap_local(desc_kaddr); brelse(req->pr_bitmap_bh); brelse(req->pr_desc_bh); @@ -755,7 +756,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) /* Get the first entry number of the group */ group_min_nr = (__u64)group * epg; - bitmap_kaddr = kmap(bitmap_bh->b_page); + bitmap_kaddr = kmap_local_page(bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(bitmap_bh); lock = nilfs_mdt_bgl_lock(inode, group); @@ -801,7 +802,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) entry_start = rounddown(group_offset, epb); } while (true); - kunmap(bitmap_bh->b_page); + kunmap_local(bitmap_kaddr); mark_buffer_dirty(bitmap_bh); brelse(bitmap_bh); @@ -815,11 +816,11 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) inode->i_ino); } - desc_kaddr = kmap_atomic(desc_bh->b_page); + desc_kaddr = kmap_local_page(desc_bh->b_page); desc = nilfs_palloc_block_get_group_desc( inode, group, desc_bh, desc_kaddr); nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n); - kunmap_atomic(desc_kaddr); + kunmap_local(desc_kaddr); mark_buffer_dirty(desc_bh); nilfs_mdt_mark_dirty(inode); brelse(desc_bh); From f5ffa1ff9d49d45e6b1a9b72f3dffe8f7d70d4a4 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:54 +0900 Subject: [PATCH 1335/1406] nilfs2: convert DAT to use kmap_local Concerning the code of the metadata file DAT for disk address translation, convert all parts that use the deprecated kmap_atomic to use kmap_local. All transformations are directly possible. 
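That every site converts one-to-one rests on a property of the local-mapping API worth spelling out; the snippet below is a sketch, not code from this patch, and the entry accessor is elided:

	void *kaddr;

	kaddr = kmap_local_page(bh->b_page);	/* was: kmap_atomic() */
	/*
	 * kmap_local_page() disables neither preemption nor page faults,
	 * so only a caller that relied on those implicit side effects of
	 * kmap_atomic() would need to add an explicit preempt_disable()
	 * or pagefault_disable() pair around this section. Per the
	 * description above, the DAT accessors need none of that.
	 */
	kunmap_local(kaddr);			/* was: kunmap_atomic() */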
Link: https://lkml.kernel.org/r/20240122140202.6950-8-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/dat.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 9cf6ba58f5859f..8f71f8b0e2188b 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -91,13 +91,13 @@ void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req) struct nilfs_dat_entry *entry; void *kaddr; - kaddr = kmap_atomic(req->pr_entry_bh->b_page); + kaddr = kmap_local_page(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); entry->de_start = cpu_to_le64(NILFS_CNO_MIN); entry->de_end = cpu_to_le64(NILFS_CNO_MAX); entry->de_blocknr = cpu_to_le64(0); - kunmap_atomic(kaddr); + kunmap_local(kaddr); nilfs_palloc_commit_alloc_entry(dat, req); nilfs_dat_commit_entry(dat, req); @@ -115,13 +115,13 @@ static void nilfs_dat_commit_free(struct inode *dat, struct nilfs_dat_entry *entry; void *kaddr; - kaddr = kmap_atomic(req->pr_entry_bh->b_page); + kaddr = kmap_local_page(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); entry->de_start = cpu_to_le64(NILFS_CNO_MIN); entry->de_end = cpu_to_le64(NILFS_CNO_MIN); entry->de_blocknr = cpu_to_le64(0); - kunmap_atomic(kaddr); + kunmap_local(kaddr); nilfs_dat_commit_entry(dat, req); @@ -145,12 +145,12 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, struct nilfs_dat_entry *entry; void *kaddr; - kaddr = kmap_atomic(req->pr_entry_bh->b_page); + kaddr = kmap_local_page(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat)); entry->de_blocknr = cpu_to_le64(blocknr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); nilfs_dat_commit_entry(dat, req); } @@ -167,12 +167,12 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) if (ret < 0) return ret; - kaddr = kmap_atomic(req->pr_entry_bh->b_page); + kaddr = kmap_local_page(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); start = le64_to_cpu(entry->de_start); blocknr = le64_to_cpu(entry->de_blocknr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); if (blocknr == 0) { ret = nilfs_palloc_prepare_free_entry(dat, req); @@ -202,7 +202,7 @@ void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req, sector_t blocknr; void *kaddr; - kaddr = kmap_atomic(req->pr_entry_bh->b_page); + kaddr = kmap_local_page(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); end = start = le64_to_cpu(entry->de_start); @@ -212,7 +212,7 @@ void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req, } entry->de_end = cpu_to_le64(end); blocknr = le64_to_cpu(entry->de_blocknr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); if (blocknr == 0) nilfs_dat_commit_free(dat, req); @@ -227,12 +227,12 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req) sector_t blocknr; void *kaddr; - kaddr = kmap_atomic(req->pr_entry_bh->b_page); + kaddr = kmap_local_page(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); start = le64_to_cpu(entry->de_start); blocknr = le64_to_cpu(entry->de_blocknr); - kunmap_atomic(kaddr); + 
kunmap_local(kaddr); if (start == nilfs_mdt_cno(dat) && blocknr == 0) nilfs_palloc_abort_free_entry(dat, req); @@ -362,7 +362,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) } } - kaddr = kmap_atomic(entry_bh->b_page); + kaddr = kmap_local_page(entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); if (unlikely(entry->de_blocknr == cpu_to_le64(0))) { nilfs_crit(dat->i_sb, @@ -370,13 +370,13 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) __func__, (unsigned long long)vblocknr, (unsigned long long)le64_to_cpu(entry->de_start), (unsigned long long)le64_to_cpu(entry->de_end)); - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(entry_bh); return -EINVAL; } WARN_ON(blocknr == 0); entry->de_blocknr = cpu_to_le64(blocknr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(entry_bh); nilfs_mdt_mark_dirty(dat); @@ -426,7 +426,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) } } - kaddr = kmap_atomic(entry_bh->b_page); + kaddr = kmap_local_page(entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); blocknr = le64_to_cpu(entry->de_blocknr); if (blocknr == 0) { @@ -436,7 +436,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) *blocknrp = blocknr; out: - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(entry_bh); return ret; } @@ -457,7 +457,7 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz, 0, &entry_bh); if (ret < 0) return ret; - kaddr = kmap_atomic(entry_bh->b_page); + kaddr = kmap_local_page(entry_bh->b_page); /* last virtual block number in this block */ first = vinfo->vi_vblocknr; do_div(first, entries_per_block); @@ -473,7 +473,7 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz, vinfo->vi_end = le64_to_cpu(entry->de_end); vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr); } - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(entry_bh); } From 527fa97eeb2c38d2f3bea2ebcc495d41ae282fa6 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:55 +0900 Subject: [PATCH 1336/1406] nilfs2: move nilfs_bmap_write call out of nilfs_write_inode_common Before converting the disk inode management metadata file ifile, the call to nilfs_bmap_write(), the i_device_code setting, and the zero-fill code for inodes on the super root block are moved from nilfs_write_inode_common() to its callers. This cleanup simplifies the role and arguments of nilfs_write_inode_common() and collects calls to nilfs_bmap_write() to the log writing code. Also, add and use a new helper, nilfs_write_root_mdt_inode(), to avoid code duplication in nilfs_segctor_fill_in_super_root(), the routine that exports this data to the super root block's buffer.
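After this change, a call site that used to pass has_bmap = 1 takes the following shape (a sketch mirroring the checkpoint hunk below):

	/* export the common inode fields only */
	nilfs_write_inode_common(ifile, &raw_cp->cp_ifile_inode);
	/* then export the bmap data explicitly, at log writing time */
	nilfs_bmap_write(NILFS_I(ifile)->i_bmap, &raw_cp->cp_ifile_inode);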
Link: https://lkml.kernel.org/r/20240122140202.6950-9-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/inode.c | 38 +++++++++++++++--------------------- fs/nilfs2/nilfs.h | 3 ++- fs/nilfs2/segment.c | 47 +++++++++++++++++++++++++++++++++++++-------- 3 files changed, 57 insertions(+), 31 deletions(-) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 9c334c722fc1c1..b9d40f5e94d32a 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -759,8 +759,18 @@ struct inode *nilfs_iget_for_shadow(struct inode *inode) return s_inode; } +/** + * nilfs_write_inode_common - export common inode information to on-disk inode + * @inode: inode object + * @raw_inode: on-disk inode + * + * This function writes standard information from the on-memory inode @inode + * to @raw_inode on ifile, cpfile or a super root block. Since inode bmap + * data is not exported, nilfs_bmap_write() must be called separately during + * log writing. + */ void nilfs_write_inode_common(struct inode *inode, - struct nilfs_inode *raw_inode, int has_bmap) + struct nilfs_inode *raw_inode) { struct nilfs_inode_info *ii = NILFS_I(inode); @@ -778,21 +788,6 @@ void nilfs_write_inode_common(struct inode *inode, raw_inode->i_flags = cpu_to_le32(ii->i_flags); raw_inode->i_generation = cpu_to_le32(inode->i_generation); - if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) { - struct the_nilfs *nilfs = inode->i_sb->s_fs_info; - - /* zero-fill unused portion in the case of super root block */ - raw_inode->i_xattr = 0; - raw_inode->i_pad = 0; - memset((void *)raw_inode + sizeof(*raw_inode), 0, - nilfs->ns_inode_size - sizeof(*raw_inode)); - } - - if (has_bmap) - nilfs_bmap_write(ii->i_bmap, raw_inode); - else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) - raw_inode->i_device_code = - cpu_to_le64(huge_encode_dev(inode->i_rdev)); /* * When extending inode, nilfs->ns_inode_size should be checked * for substitutions of appended fields. @@ -813,12 +808,11 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags) if (flags & I_DIRTY_DATASYNC) set_bit(NILFS_I_INODE_SYNC, &ii->i_state); - nilfs_write_inode_common(inode, raw_inode, 0); - /* - * XXX: call with has_bmap = 0 is a workaround to avoid - * deadlock of bmap. This delays update of i_bmap to just - * before writing. 
- */ + nilfs_write_inode_common(inode, raw_inode); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + raw_inode->i_device_code = + cpu_to_le64(huge_encode_dev(inode->i_rdev)); nilfs_ifile_unmap_inode(ifile, ino, ibh); } diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 98cffaf0ac1277..2e29b98ba8bab2 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -256,7 +256,8 @@ extern struct inode *nilfs_new_inode(struct inode *, umode_t); extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int); extern void nilfs_set_inode_flags(struct inode *); extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *); -extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int); +void nilfs_write_inode_common(struct inode *inode, + struct nilfs_inode *raw_inode); struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root, unsigned long ino); struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root, diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 2bfb08052d3999..9044596813cc89 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -913,6 +913,7 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) struct the_nilfs *nilfs = sci->sc_super->s_fs_info; struct buffer_head *bh_cp; struct nilfs_checkpoint *raw_cp; + struct inode *ifile; int err; err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0, @@ -941,8 +942,10 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) else nilfs_checkpoint_set_minor(raw_cp); - nilfs_write_inode_common(sci->sc_root->ifile, - &raw_cp->cp_ifile_inode, 1); + ifile = sci->sc_root->ifile; + nilfs_write_inode_common(ifile, &raw_cp->cp_ifile_inode); + nilfs_bmap_write(NILFS_I(ifile)->i_bmap, &raw_cp->cp_ifile_inode); + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); return 0; @@ -977,6 +980,33 @@ static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci) } } +/** + * nilfs_write_root_mdt_inode - export root metadata inode information to + * the on-disk inode + * @inode: inode object of the root metadata file + * @raw_inode: on-disk inode + * + * nilfs_write_root_mdt_inode() writes inode information and bmap data of + * @inode to the inode area of the metadata file allocated on the super root + * block created to finalize the log. Since super root blocks are configured + * each time, this function zero-fills the unused area of @raw_inode. 
+ */ +static void nilfs_write_root_mdt_inode(struct inode *inode, + struct nilfs_inode *raw_inode) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + + nilfs_write_inode_common(inode, raw_inode); + + /* zero-fill unused portion of raw_inode */ + raw_inode->i_xattr = 0; + raw_inode->i_pad = 0; + memset((void *)raw_inode + sizeof(*raw_inode), 0, + nilfs->ns_inode_size - sizeof(*raw_inode)); + + nilfs_bmap_write(NILFS_I(inode)->i_bmap, raw_inode); +} + static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { @@ -998,12 +1028,13 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, nilfs->ns_nongc_ctime : sci->sc_seg_ctime); raw_sr->sr_flags = 0; - nilfs_write_inode_common(nilfs->ns_dat, (void *)raw_sr + - NILFS_SR_DAT_OFFSET(isz), 1); - nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr + - NILFS_SR_CPFILE_OFFSET(isz), 1); - nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr + - NILFS_SR_SUFILE_OFFSET(isz), 1); + nilfs_write_root_mdt_inode(nilfs->ns_dat, (void *)raw_sr + + NILFS_SR_DAT_OFFSET(isz)); + nilfs_write_root_mdt_inode(nilfs->ns_cpfile, (void *)raw_sr + + NILFS_SR_CPFILE_OFFSET(isz)); + nilfs_write_root_mdt_inode(nilfs->ns_sufile, (void *)raw_sr + + NILFS_SR_SUFILE_OFFSET(isz)); + memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz); set_buffer_uptodate(bh_sr); unlock_buffer(bh_sr); From 6c39d7a2dbebe74cdc125f67fbc424e1aab2f720 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:56 +0900 Subject: [PATCH 1337/1406] nilfs2: do not acquire rwsem in nilfs_bmap_write() It is now clear that nilfs_bmap_write() is only used to finalize logs written to disk. Concurrent bmap modification operations are not performed on bmaps in this context. Additionally, this function does not modify data used in read-only operations such as bmap lookups. Therefore, there is no need to acquire bmap->b_sem in nilfs_bmap_write(), so delete it. Link: https://lkml.kernel.org/r/20240122140202.6950-10-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/bmap.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 7a8f166f2c8d84..383f0afa2cea36 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -548,13 +548,10 @@ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) */ void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) { - down_write(&bmap->b_sem); memcpy(raw_inode->i_bmap, bmap->b_u.u_data, NILFS_INODE_BMAP_SIZE * sizeof(__le64)); if (bmap->b_inode->i_ino == NILFS_DAT_INO) bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT; - - up_write(&bmap->b_sem); } void nilfs_bmap_init_gc(struct nilfs_bmap *bmap) From 48afd20d15c4df8c4140b87f226f78d773c8ef80 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:57 +0900 Subject: [PATCH 1338/1406] nilfs2: convert ifile to use kmap_local Convert deprecated kmap() and kmap_atomic() to use kmap_local for the ifile metadata file used to manage disk inodes. In some usages, calls to kmap_local and kunmap_local are split into different helpers, but those usages can be safely changed to local thread kmap. 
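One detail makes the split-helper pattern convertible: kunmap_local() takes the address being unmapped rather than the page, and accepts any address within the mapping, so the unmap helper can receive the mapped object itself. Sketched usage of the reworked helpers below, assuming both calls run in the same task:

	struct nilfs_inode *raw_inode;

	raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
				/* kmap_local_page() behind the scenes */
	/* ... read or fill *raw_inode in this same thread ... */
	nilfs_ifile_unmap_inode(raw_inode);
				/* kunmap_local() on the returned address */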
Link: https://lkml.kernel.org/r/20240122140202.6950-11-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/ifile.c | 4 ++-- fs/nilfs2/ifile.h | 7 +++---- fs/nilfs2/inode.c | 6 +++--- fs/nilfs2/segment.c | 2 +- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index a8a4bc8490b4d8..e9538fa46ff27d 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -115,11 +115,11 @@ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino) return ret; } - kaddr = kmap_atomic(req.pr_entry_bh->b_page); + kaddr = kmap_local_page(req.pr_entry_bh->b_page); raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr, req.pr_entry_bh, kaddr); raw_inode->i_flags = 0; - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(req.pr_entry_bh); brelse(req.pr_entry_bh); diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index 35c5273f48219b..b71ab0a81dc45e 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -21,15 +21,14 @@ static inline struct nilfs_inode * nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) { - void *kaddr = kmap(ibh->b_page); + void *kaddr = kmap_local_page(ibh->b_page); return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr); } -static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino, - struct buffer_head *ibh) +static inline void nilfs_ifile_unmap_inode(struct nilfs_inode *raw_inode) { - kunmap(ibh->b_page); + kunmap_local(raw_inode); } int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b9d40f5e94d32a..a475095a5e80b7 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -520,7 +520,7 @@ static int __nilfs_read_inode(struct super_block *sb, inode, inode->i_mode, huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); } - nilfs_ifile_unmap_inode(root->ifile, ino, bh); + nilfs_ifile_unmap_inode(raw_inode); brelse(bh); up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); nilfs_set_inode_flags(inode); @@ -529,7 +529,7 @@ static int __nilfs_read_inode(struct super_block *sb, return 0; failed_unmap: - nilfs_ifile_unmap_inode(root->ifile, ino, bh); + nilfs_ifile_unmap_inode(raw_inode); brelse(bh); bad_inode: @@ -814,7 +814,7 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags) raw_inode->i_device_code = cpu_to_le64(huge_encode_dev(inode->i_rdev)); - nilfs_ifile_unmap_inode(ifile, ino, ibh); + nilfs_ifile_unmap_inode(raw_inode); } #define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 9044596813cc89..7faefeb5904072 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -966,7 +966,7 @@ static void nilfs_fill_in_file_bmap(struct inode *ifile, raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino, ibh); nilfs_bmap_write(ii->i_bmap, raw_inode); - nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, ibh); + nilfs_ifile_unmap_inode(raw_inode); } } From 69f944a68da4a4249447a289ea089d0e4c7f455e Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:58 +0900 Subject: [PATCH 1339/1406] nilfs2: localize highmem mapping for checkpoint creation within cpfile In order to convert kmap() used in cpfile to kmap_local, first move the checkpoint creation routine, which is one of the places where kmap is used, to the cpfile side and make the page mapping local and temporary. 
And use kmap_local instead of kmap to access the checkpoint entry page (and header block page) when generating a checkpoint. Link: https://lkml.kernel.org/r/20240122140202.6950-12-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/cpfile.c | 74 +++++++++++++++++++++++++++++++++++++++++++++ fs/nilfs2/cpfile.h | 1 + fs/nilfs2/segment.c | 31 ++----------------- 3 files changed, 77 insertions(+), 29 deletions(-) diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 39136637f7155b..f62da80e530a73 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -272,6 +272,80 @@ int nilfs_cpfile_get_checkpoint(struct inode *cpfile, return ret; } +/** + * nilfs_cpfile_create_checkpoint - create a checkpoint entry on cpfile + * @cpfile: checkpoint file inode + * @cno: number of checkpoint to set up + * + * This function creates a checkpoint with the number specified by @cno on + * cpfile. If the specified checkpoint entry already exists due to a past + * failure, it will be reused without returning an error. + * In either case, the buffer of the block containing the checkpoint entry + * and the cpfile inode are made dirty for inclusion in the write log. + * + * Return: 0 on success, or the following negative error code on failure. + * * %-ENOMEM - Insufficient memory available. + * * %-EIO - I/O error (including metadata corruption). + * * %-EROFS - Read only filesystem + */ +int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *header_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; + + if (WARN_ON_ONCE(cno < 1)) + return -EIO; + + down_write(&NILFS_MDT(cpfile)->mi_sem); + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (unlikely(ret < 0)) { + if (ret == -ENOENT) { + nilfs_error(cpfile->i_sb, + "checkpoint creation failed due to metadata corruption."); + ret = -EIO; + } + goto out_sem; + } + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 1, &cp_bh); + if (unlikely(ret < 0)) + goto out_header; + + kaddr = kmap_local_page(cp_bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + /* a newly-created checkpoint */ + nilfs_checkpoint_clear_invalid(cp); + if (!nilfs_cpfile_is_in_first(cpfile, cno)) + nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh, + kaddr, 1); + kunmap_local(kaddr); + + kaddr = kmap_local_page(header_bh->b_page); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, + kaddr); + le64_add_cpu(&header->ch_ncheckpoints, 1); + kunmap_local(kaddr); + mark_buffer_dirty(header_bh); + } else { + kunmap_local(kaddr); + } + + /* Force the buffer and the inode to become dirty */ + mark_buffer_dirty(cp_bh); + brelse(cp_bh); + nilfs_mdt_mark_dirty(cpfile); + +out_header: + brelse(header_bh); + +out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + /** * nilfs_cpfile_put_checkpoint - put a checkpoint * @cpfile: inode of checkpoint file diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index edabb2dc57567c..fcb1a94097b3f9 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -19,6 +19,7 @@ int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, struct nilfs_checkpoint **, struct buffer_head **); +int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno); void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *); int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64); int 
nilfs_cpfile_delete_checkpoint(struct inode *, __u64); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 7faefeb5904072..f38e56aa5aad02 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -880,34 +880,6 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci) nilfs_mdt_clear_dirty(nilfs->ns_dat); } -static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) -{ - struct the_nilfs *nilfs = sci->sc_super->s_fs_info; - struct buffer_head *bh_cp; - struct nilfs_checkpoint *raw_cp; - int err; - - /* XXX: this interface will be changed */ - err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1, - &raw_cp, &bh_cp); - if (likely(!err)) { - /* - * The following code is duplicated with cpfile. But, it is - * needed to collect the checkpoint even if it was not newly - * created. - */ - mark_buffer_dirty(bh_cp); - nilfs_mdt_mark_dirty(nilfs->ns_cpfile); - nilfs_cpfile_put_checkpoint( - nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); - } else if (err == -EINVAL || err == -ENOENT) { - nilfs_error(sci->sc_super, - "checkpoint creation failed due to metadata corruption."); - err = -EIO; - } - return err; -} - static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) { struct the_nilfs *nilfs = sci->sc_super->s_fs_info; @@ -1261,7 +1233,8 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) break; nilfs_sc_cstage_inc(sci); /* Creating a checkpoint */ - err = nilfs_segctor_create_checkpoint(sci); + err = nilfs_cpfile_create_checkpoint(nilfs->ns_cpfile, + nilfs->ns_cno); if (unlikely(err)) break; fallthrough; From 4624272bea8762bbb02c5f82a9f24967555c344d Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:59 +0900 Subject: [PATCH 1340/1406] nilfs2: localize highmem mapping for checkpoint finalization within cpfile Move the checkpoint finalization routine to the cpfile side, and make the page mapping local and temporary. And use kmap_local instead of kmap to access the checkpoint entry page when finalizing a checkpoint. In this conversion, some of the information on the checkpoint entry being rewritten is passed through the arguments of the newly added method nilfs_cpfile_finalize_checkpoint(). Link: https://lkml.kernel.org/r/20240122140202.6950-13-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/cpfile.c | 74 +++++++++++++++++++++++++++++++++++++++++++++ fs/nilfs2/cpfile.h | 3 ++ fs/nilfs2/segment.c | 51 +++---------------------------- 3 files changed, 82 insertions(+), 46 deletions(-) diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index f62da80e530a73..3af77252e08141 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -363,6 +363,80 @@ void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno, brelse(bh); } +/** + * nilfs_cpfile_finalize_checkpoint - fill in a checkpoint entry in cpfile + * @cpfile: checkpoint file inode + * @cno: checkpoint number + * @root: nilfs root object + * @blkinc: number of blocks added by this checkpoint + * @ctime: checkpoint creation time + * @minor: minor checkpoint flag + * + * This function completes the checkpoint entry numbered by @cno in the + * cpfile with the data given by the arguments @root, @blkinc, @ctime, and + * @minor. + * + * Return: 0 on success, or the following negative error code on failure. + * * %-ENOMEM - Insufficient memory available. + * * %-EIO - I/O error (including metadata corruption). 
+ */ +int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno, + struct nilfs_root *root, __u64 blkinc, + time64_t ctime, bool minor) +{ + struct buffer_head *cp_bh; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; + + if (WARN_ON_ONCE(cno < 1)) + return -EIO; + + down_write(&NILFS_MDT(cpfile)->mi_sem); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (unlikely(ret < 0)) { + if (ret == -ENOENT) + goto error; + goto out_sem; + } + + kaddr = kmap_local_page(cp_bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (unlikely(nilfs_checkpoint_invalid(cp))) { + kunmap_local(kaddr); + brelse(cp_bh); + goto error; + } + + cp->cp_snapshot_list.ssl_next = 0; + cp->cp_snapshot_list.ssl_prev = 0; + cp->cp_inodes_count = cpu_to_le64(atomic64_read(&root->inodes_count)); + cp->cp_blocks_count = cpu_to_le64(atomic64_read(&root->blocks_count)); + cp->cp_nblk_inc = cpu_to_le64(blkinc); + cp->cp_create = cpu_to_le64(ctime); + cp->cp_cno = cpu_to_le64(cno); + + if (minor) + nilfs_checkpoint_set_minor(cp); + else + nilfs_checkpoint_clear_minor(cp); + + nilfs_write_inode_common(root->ifile, &cp->cp_ifile_inode); + nilfs_bmap_write(NILFS_I(root->ifile)->i_bmap, &cp->cp_ifile_inode); + + kunmap_local(kaddr); + brelse(cp_bh); +out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; + +error: + nilfs_error(cpfile->i_sb, + "checkpoint finalization failed due to metadata corruption."); + ret = -EIO; + goto out_sem; +} + /** * nilfs_cpfile_delete_checkpoints - delete checkpoints * @cpfile: inode of checkpoint file diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index fcb1a94097b3f9..aa1408a3af010e 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -21,6 +21,9 @@ int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, struct buffer_head **); int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno); void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *); +int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno, + struct nilfs_root *root, __u64 blkinc, + time64_t ctime, bool minor); int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64); int nilfs_cpfile_delete_checkpoint(struct inode *, __u64); int nilfs_cpfile_change_cpmode(struct inode *, __u64, int); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index f38e56aa5aad02..aa5290cb7467cf 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -880,51 +880,6 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci) nilfs_mdt_clear_dirty(nilfs->ns_dat); } -static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) -{ - struct the_nilfs *nilfs = sci->sc_super->s_fs_info; - struct buffer_head *bh_cp; - struct nilfs_checkpoint *raw_cp; - struct inode *ifile; - int err; - - err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0, - &raw_cp, &bh_cp); - if (unlikely(err)) { - if (err == -EINVAL || err == -ENOENT) { - nilfs_error(sci->sc_super, - "checkpoint finalization failed due to metadata corruption."); - err = -EIO; - } - goto failed_ibh; - } - raw_cp->cp_snapshot_list.ssl_next = 0; - raw_cp->cp_snapshot_list.ssl_prev = 0; - raw_cp->cp_inodes_count = - cpu_to_le64(atomic64_read(&sci->sc_root->inodes_count)); - raw_cp->cp_blocks_count = - cpu_to_le64(atomic64_read(&sci->sc_root->blocks_count)); - raw_cp->cp_nblk_inc = - cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc); - raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime); - raw_cp->cp_cno = 
cpu_to_le64(nilfs->ns_cno); - - if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) - nilfs_checkpoint_clear_minor(raw_cp); - else - nilfs_checkpoint_set_minor(raw_cp); - - ifile = sci->sc_root->ifile; - nilfs_write_inode_common(ifile, &raw_cp->cp_ifile_inode); - nilfs_bmap_write(NILFS_I(ifile)->i_bmap, &raw_cp->cp_ifile_inode); - - nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); - return 0; - - failed_ibh: - return err; -} - static void nilfs_fill_in_file_bmap(struct inode *ifile, struct nilfs_inode_info *ii) @@ -2105,7 +2060,11 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) if (mode == SC_LSEG_SR && nilfs_sc_cstage_get(sci) >= NILFS_ST_CPFILE) { - err = nilfs_segctor_fill_in_checkpoint(sci); + err = nilfs_cpfile_finalize_checkpoint( + nilfs->ns_cpfile, nilfs->ns_cno, sci->sc_root, + sci->sc_nblk_inc + sci->sc_nblk_this_inc, + sci->sc_seg_ctime, + !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)); if (unlikely(err)) goto failed_to_write; From 01f6ac4b3c6b7c2bbd67c211f42fb68b98b000f7 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:02:00 +0900 Subject: [PATCH 1341/1406] nilfs2: localize highmem mapping for checkpoint reading within cpfile Move the code for reading from a checkpoint entry that is performed in nilfs_attach_checkpoint() to the cpfile side, and make the page mapping local and temporary. And use kmap_local instead of kmap to access the checkpoint entry page. In order to load the ifile inode information included in the checkpoint entry within the inode lock section of nilfs_ifile_read(), the newly added checkpoint reading method nilfs_cpfile_read_checkpoint() is called indirectly via nilfs_ifile_read() instead of from nilfs_attach_checkpoint(). Link: https://lkml.kernel.org/r/20240122140202.6950-14-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/cpfile.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++ fs/nilfs2/cpfile.h | 2 ++ fs/nilfs2/ifile.c | 17 ++++++++---- fs/nilfs2/ifile.h | 3 +- fs/nilfs2/super.c | 31 ++++----------------- 5 files changed, 87 insertions(+), 34 deletions(-) diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 3af77252e08141..56e38843536b96 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -186,6 +186,74 @@ static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile, nilfs_cpfile_get_blkoff(cpfile, cno)); } +/** + * nilfs_cpfile_read_checkpoint - read a checkpoint entry in cpfile + * @cpfile: checkpoint file inode + * @cno: number of checkpoint entry to read + * @root: nilfs root object + * @ifile: ifile's inode to read and attach to @root + * + * This function imports checkpoint information from the checkpoint file and + * stores it to the inode file given by @ifile and the nilfs root object + * given by @root. + * + * Return: 0 on success, or the following negative error code on failure. + * * %-EINVAL - Invalid checkpoint. + * * %-ENOMEM - Insufficient memory available. + * * %-EIO - I/O error (including metadata corruption). 
+ */ +int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno, + struct nilfs_root *root, struct inode *ifile) +{ + struct buffer_head *cp_bh; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; + + if (cno < 1 || cno > nilfs_mdt_cno(cpfile)) + return -EINVAL; + + down_read(&NILFS_MDT(cpfile)->mi_sem); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (unlikely(ret < 0)) { + if (ret == -ENOENT) + ret = -EINVAL; + goto out_sem; + } + + kaddr = kmap_local_page(cp_bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + ret = -EINVAL; + goto put_cp; + } + + ret = nilfs_read_inode_common(ifile, &cp->cp_ifile_inode); + if (unlikely(ret)) { + /* + * Since this inode is on a checkpoint entry, treat errors + * as metadata corruption. + */ + nilfs_err(cpfile->i_sb, + "ifile inode (checkpoint number=%llu) corrupted", + (unsigned long long)cno); + ret = -EIO; + goto put_cp; + } + + /* Configure the nilfs root object */ + atomic64_set(&root->inodes_count, le64_to_cpu(cp->cp_inodes_count)); + atomic64_set(&root->blocks_count, le64_to_cpu(cp->cp_blocks_count)); + root->ifile = ifile; + +put_cp: + kunmap_local(kaddr); + brelse(cp_bh); +out_sem: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + /** * nilfs_cpfile_get_checkpoint - get a checkpoint * @cpfile: inode of checkpoint file diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index aa1408a3af010e..2cfa14011bc832 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -19,6 +19,8 @@ int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, struct nilfs_checkpoint **, struct buffer_head **); +int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno, + struct nilfs_root *root, struct inode *ifile); int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno); void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *); int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno, diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index e9538fa46ff27d..612e609158b520 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -15,6 +15,7 @@ #include "mdt.h" #include "alloc.h" #include "ifile.h" +#include "cpfile.h" /** * struct nilfs_ifile_info - on-memory private data of ifile @@ -173,14 +174,18 @@ int nilfs_ifile_count_free_inodes(struct inode *ifile, * nilfs_ifile_read - read or get ifile inode * @sb: super block instance * @root: root object + * @cno: number of checkpoint entry to read * @inode_size: size of an inode - * @raw_inode: on-disk ifile inode - * @inodep: buffer to store the inode + * + * Return: 0 on success, or the following negative error code on failure. + * * %-EINVAL - Invalid checkpoint. + * * %-ENOMEM - Insufficient memory available. + * * %-EIO - I/O error (including metadata corruption). 
*/ int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, - size_t inode_size, struct nilfs_inode *raw_inode, - struct inode **inodep) + __u64 cno, size_t inode_size) { + struct the_nilfs *nilfs; struct inode *ifile; int err; @@ -201,13 +206,13 @@ int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, nilfs_palloc_setup_cache(ifile, &NILFS_IFILE_I(ifile)->palloc_cache); - err = nilfs_read_inode_common(ifile, raw_inode); + nilfs = sb->s_fs_info; + err = nilfs_cpfile_read_checkpoint(nilfs->ns_cpfile, cno, root, ifile); if (err) goto failed; unlock_new_inode(ifile); out: - *inodep = ifile; return 0; failed: iget_failed(ifile); diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index b71ab0a81dc45e..625545cc2a989f 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -38,7 +38,6 @@ int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **); int nilfs_ifile_count_free_inodes(struct inode *, u64 *, u64 *); int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, - size_t inode_size, struct nilfs_inode *raw_inode, - struct inode **inodep); + __u64 cno, size_t inode_size); #endif /* _NILFS_IFILE_H */ diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index df8674173b2202..5e630c179a1e29 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -544,8 +544,6 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_root *root; - struct nilfs_checkpoint *raw_cp; - struct buffer_head *bh_cp; int err = -ENOMEM; root = nilfs_find_or_create_root( @@ -557,38 +555,19 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, goto reuse; /* already attached checkpoint */ down_read(&nilfs->ns_segctor_sem); - err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, - &bh_cp); + err = nilfs_ifile_read(sb, root, cno, nilfs->ns_inode_size); up_read(&nilfs->ns_segctor_sem); - if (unlikely(err)) { - if (err == -ENOENT || err == -EINVAL) { - nilfs_err(sb, - "Invalid checkpoint (checkpoint number=%llu)", - (unsigned long long)cno); - err = -EINVAL; - } + if (unlikely(err)) goto failed; - } - - err = nilfs_ifile_read(sb, root, nilfs->ns_inode_size, - &raw_cp->cp_ifile_inode, &root->ifile); - if (err) - goto failed_bh; - - atomic64_set(&root->inodes_count, - le64_to_cpu(raw_cp->cp_inodes_count)); - atomic64_set(&root->blocks_count, - le64_to_cpu(raw_cp->cp_blocks_count)); - - nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); reuse: *rootp = root; return 0; - failed_bh: - nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); failed: + if (err == -EINVAL) + nilfs_err(sb, "Invalid checkpoint (checkpoint number=%llu)", + (unsigned long long)cno); nilfs_put_root(root); return err; From efdf99a0295d1c1de07e99d3bcc38004bf9eb06a Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:02:01 +0900 Subject: [PATCH 1342/1406] nilfs2: remove nilfs_cpfile_{get,put}_checkpoint() All calls to nilfs_cpfile_get_checkpoint() and nilfs_cpfile_put_checkpoint() that call kmap() and kunmap() separately are now gone, so remove these methods. 
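For reference, the shift in calling convention across this series, sketched with the prototypes introduced in the previous patches (argument values elided):

	/* Removed pattern: a mapping created here outlived the call. */
	ret = nilfs_cpfile_get_checkpoint(cpfile, cno, 0, &cp, &bh);
	/* ... caller dereferenced *cp for an arbitrary span ... */
	nilfs_cpfile_put_checkpoint(cpfile, cno, bh);

	/* Replacements: each helper maps and unmaps within cpfile.c. */
	ret = nilfs_cpfile_create_checkpoint(cpfile, cno);
	ret = nilfs_cpfile_read_checkpoint(cpfile, cno, root, ifile);
	ret = nilfs_cpfile_finalize_checkpoint(cpfile, cno, root, blkinc,
					       ctime, minor);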
Link: https://lkml.kernel.org/r/20240122140202.6950-15-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/cpfile.c | 103 --------------------------------------------- fs/nilfs2/cpfile.h | 4 -- 2 files changed, 107 deletions(-) diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 56e38843536b96..b5bad332d630c5 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -254,92 +254,6 @@ int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno, return ret; } -/** - * nilfs_cpfile_get_checkpoint - get a checkpoint - * @cpfile: inode of checkpoint file - * @cno: checkpoint number - * @create: create flag - * @cpp: pointer to a checkpoint - * @bhp: pointer to a buffer head - * - * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint - * specified by @cno. A new checkpoint will be created if @cno is the current - * checkpoint number and @create is nonzero. - * - * Return Value: On success, 0 is returned, and the checkpoint and the - * buffer head of the buffer on which the checkpoint is located are stored in - * the place pointed by @cpp and @bhp, respectively. On error, one of the - * following negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - No such checkpoint. - * - * %-EINVAL - invalid checkpoint. - */ -int nilfs_cpfile_get_checkpoint(struct inode *cpfile, - __u64 cno, - int create, - struct nilfs_checkpoint **cpp, - struct buffer_head **bhp) -{ - struct buffer_head *header_bh, *cp_bh; - struct nilfs_cpfile_header *header; - struct nilfs_checkpoint *cp; - void *kaddr; - int ret; - - if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) || - (cno < nilfs_mdt_cno(cpfile) && create))) - return -EINVAL; - - down_write(&NILFS_MDT(cpfile)->mi_sem); - - ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); - if (ret < 0) - goto out_sem; - ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh); - if (ret < 0) - goto out_header; - kaddr = kmap(cp_bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); - if (nilfs_checkpoint_invalid(cp)) { - if (!create) { - kunmap(cp_bh->b_page); - brelse(cp_bh); - ret = -ENOENT; - goto out_header; - } - /* a newly-created checkpoint */ - nilfs_checkpoint_clear_invalid(cp); - if (!nilfs_cpfile_is_in_first(cpfile, cno)) - nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh, - kaddr, 1); - mark_buffer_dirty(cp_bh); - - kaddr = kmap_atomic(header_bh->b_page); - header = nilfs_cpfile_block_get_header(cpfile, header_bh, - kaddr); - le64_add_cpu(&header->ch_ncheckpoints, 1); - kunmap_atomic(kaddr); - mark_buffer_dirty(header_bh); - nilfs_mdt_mark_dirty(cpfile); - } - - if (cpp != NULL) - *cpp = cp; - *bhp = cp_bh; - - out_header: - brelse(header_bh); - - out_sem: - up_write(&NILFS_MDT(cpfile)->mi_sem); - return ret; -} - /** * nilfs_cpfile_create_checkpoint - create a checkpoint entry on cpfile * @cpfile: checkpoint file inode @@ -414,23 +328,6 @@ int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno) return ret; } -/** - * nilfs_cpfile_put_checkpoint - put a checkpoint - * @cpfile: inode of checkpoint file - * @cno: checkpoint number - * @bh: buffer head - * - * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint - * specified by @cno. @bh must be the buffer head which has been returned by - * a previous call to nilfs_cpfile_get_checkpoint() with @cno. 
- */ -void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno, - struct buffer_head *bh) -{ - kunmap(bh->b_page); - brelse(bh); -} - /** * nilfs_cpfile_finalize_checkpoint - fill in a checkpoint entry in cpfile * @cpfile: checkpoint file inode diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index 2cfa14011bc832..f5b1d59289ebf8 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -16,13 +16,9 @@ #include /* nilfs_inode, nilfs_checkpoint */ -int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, - struct nilfs_checkpoint **, - struct buffer_head **); int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno, struct nilfs_root *root, struct inode *ifile); int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno); -void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *); int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno, struct nilfs_root *root, __u64 blkinc, time64_t ctime, bool minor); From aa7331efe1dad70b3a9904fb8df735b51cbf95f0 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:02:02 +0900 Subject: [PATCH 1343/1406] nilfs2: convert cpfile to use kmap_local Convert all remaining usages of kmap_atomic in cpfile to kmap_local. Link: https://lkml.kernel.org/r/20240122140202.6950-16-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/cpfile.c | 90 +++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index b5bad332d630c5..2c57132584de74 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -460,7 +460,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, continue; } - kaddr = kmap_atomic(cp_bh->b_page); + kaddr = kmap_local_page(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint( cpfile, cno, cp_bh, kaddr); nicps = 0; @@ -482,7 +482,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, cpfile, cp_bh, kaddr, nicps); if (count == 0) { /* make hole */ - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(cp_bh); ret = nilfs_cpfile_delete_checkpoint_block( @@ -497,18 +497,18 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, } } - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(cp_bh); } if (tnicps > 0) { - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps); mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(cpfile); - kunmap_atomic(kaddr); + kunmap_local(kaddr); } brelse(header_bh); @@ -560,7 +560,7 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, } ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno); - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) { if (!nilfs_checkpoint_invalid(cp)) { @@ -570,7 +570,7 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, n++; } } - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh); } @@ -604,10 +604,10 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, ret = nilfs_cpfile_get_header_block(cpfile, &bh); if (ret < 0) goto out; - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); curr = 
le64_to_cpu(header->ch_snapshot_list.ssl_next); - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh); if (curr == 0) { ret = 0; @@ -625,7 +625,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, ret = 0; /* No snapshots (started from a hole block) */ goto out; } - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); while (n < nci) { cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr); curr = ~(__u64)0; /* Terminator */ @@ -641,7 +641,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next); if (curr_blkoff != next_blkoff) { - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh); ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0, &bh); @@ -649,12 +649,12 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, WARN_ON(ret == -ENOENT); goto out; } - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); } curr = next; curr_blkoff = next_blkoff; } - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh); *cnop = curr; ret = n; @@ -763,24 +763,24 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); if (ret < 0) goto out_sem; - kaddr = kmap_atomic(cp_bh->b_page); + kaddr = kmap_local_page(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); if (nilfs_checkpoint_invalid(cp)) { ret = -ENOENT; - kunmap_atomic(kaddr); + kunmap_local(kaddr); goto out_cp; } if (nilfs_checkpoint_snapshot(cp)) { ret = 0; - kunmap_atomic(kaddr); + kunmap_local(kaddr); goto out_cp; } - kunmap_atomic(kaddr); + kunmap_local(kaddr); ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); if (ret < 0) goto out_cp; - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); list = &header->ch_snapshot_list; curr_bh = header_bh; @@ -792,13 +792,13 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev); curr = prev; if (curr_blkoff != prev_blkoff) { - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(curr_bh); ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &curr_bh); if (ret < 0) goto out_header; - kaddr = kmap_atomic(curr_bh->b_page); + kaddr = kmap_local_page(curr_bh->b_page); } curr_blkoff = prev_blkoff; cp = nilfs_cpfile_block_get_checkpoint( @@ -806,7 +806,7 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) list = &cp->cp_snapshot_list; prev = le64_to_cpu(list->ssl_prev); } - kunmap_atomic(kaddr); + kunmap_local(kaddr); if (prev != 0) { ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, @@ -818,29 +818,29 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) get_bh(prev_bh); } - kaddr = kmap_atomic(curr_bh->b_page); + kaddr = kmap_local_page(curr_bh->b_page); list = nilfs_cpfile_block_get_snapshot_list( cpfile, curr, curr_bh, kaddr); list->ssl_prev = cpu_to_le64(cno); - kunmap_atomic(kaddr); + kunmap_local(kaddr); - kaddr = kmap_atomic(cp_bh->b_page); + kaddr = kmap_local_page(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr); cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev); nilfs_checkpoint_set_snapshot(cp); - kunmap_atomic(kaddr); + kunmap_local(kaddr); - kaddr = kmap_atomic(prev_bh->b_page); + kaddr = 
kmap_local_page(prev_bh->b_page); list = nilfs_cpfile_block_get_snapshot_list( cpfile, prev, prev_bh, kaddr); list->ssl_next = cpu_to_le64(cno); - kunmap_atomic(kaddr); + kunmap_local(kaddr); - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); le64_add_cpu(&header->ch_nsnapshots, 1); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(prev_bh); mark_buffer_dirty(curr_bh); @@ -881,23 +881,23 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); if (ret < 0) goto out_sem; - kaddr = kmap_atomic(cp_bh->b_page); + kaddr = kmap_local_page(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); if (nilfs_checkpoint_invalid(cp)) { ret = -ENOENT; - kunmap_atomic(kaddr); + kunmap_local(kaddr); goto out_cp; } if (!nilfs_checkpoint_snapshot(cp)) { ret = 0; - kunmap_atomic(kaddr); + kunmap_local(kaddr); goto out_cp; } list = &cp->cp_snapshot_list; next = le64_to_cpu(list->ssl_next); prev = le64_to_cpu(list->ssl_prev); - kunmap_atomic(kaddr); + kunmap_local(kaddr); ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); if (ret < 0) @@ -921,29 +921,29 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) get_bh(prev_bh); } - kaddr = kmap_atomic(next_bh->b_page); + kaddr = kmap_local_page(next_bh->b_page); list = nilfs_cpfile_block_get_snapshot_list( cpfile, next, next_bh, kaddr); list->ssl_prev = cpu_to_le64(prev); - kunmap_atomic(kaddr); + kunmap_local(kaddr); - kaddr = kmap_atomic(prev_bh->b_page); + kaddr = kmap_local_page(prev_bh->b_page); list = nilfs_cpfile_block_get_snapshot_list( cpfile, prev, prev_bh, kaddr); list->ssl_next = cpu_to_le64(next); - kunmap_atomic(kaddr); + kunmap_local(kaddr); - kaddr = kmap_atomic(cp_bh->b_page); + kaddr = kmap_local_page(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); cp->cp_snapshot_list.ssl_next = cpu_to_le64(0); cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0); nilfs_checkpoint_clear_snapshot(cp); - kunmap_atomic(kaddr); + kunmap_local(kaddr); - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); le64_add_cpu(&header->ch_nsnapshots, -1); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(next_bh); mark_buffer_dirty(prev_bh); @@ -1002,13 +1002,13 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); if (ret < 0) goto out; - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); if (nilfs_checkpoint_invalid(cp)) ret = -ENOENT; else ret = nilfs_checkpoint_snapshot(cp); - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh); out: @@ -1085,12 +1085,12 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) ret = nilfs_cpfile_get_header_block(cpfile, &bh); if (ret < 0) goto out_sem; - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); cpstat->cs_cno = nilfs_mdt_cno(cpfile); cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints); cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots); - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh); out_sem: From 3a22e00424efcc620a4daec7fda4f8df0ed59fc1 Mon Sep 17 00:00:00 2001 
From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:07 -0700 Subject: [PATCH 1344/1406] kbuild: raise the minimum supported version of LLVM to 13.0.1 Patch series "Bump the minimum supported version of LLVM to 13.0.1". This series bumps the minimum supported version of LLVM for building the kernel to 13.0.1. The first patch does the bump and all subsequent patches clean up all the various workarounds and checks for earlier versions. Quoting the first patch's commit message for those that were only on CC for the clean ups: When __builtin_mul_overflow() has arguments that differ in terms of signedness and width, LLVM may generate a libcall to __muloti4 because it performs the checks in terms of 65-bit multiplication. This issue becomes harder to hit (but still possible) after LLVM 12.0.0, which includes a special case for matching widths but different signs. To gain access to this special case, which the kernel can take advantage of when calls to __muloti4 appear, bump the minimum supported version of LLVM for building the kernel to 13.0.1. 13.0.1 was chosen because there is minimal impact to distribution support while allowing a few more workarounds to be dropped in the kernel source than if 12.0.0 were chosen. Looking at container images of up to date distribution versions: archlinux:latest clang version 16.0.6 debian:oldoldstable-slim clang version 7.0.1-8+deb10u2 (tags/RELEASE_701/final) debian:oldstable-slim Debian clang version 11.0.1-2 debian:stable-slim Debian clang version 14.0.6 debian:testing-slim Debian clang version 16.0.6 (19) debian:unstable-slim Debian clang version 16.0.6 (19) fedora:38 clang version 16.0.6 (Fedora 16.0.6-3.fc38) fedora:latest clang version 17.0.6 (Fedora 17.0.6-1.fc39) fedora:rawhide clang version 17.0.6 (Fedora 17.0.6-1.fc40) opensuse/leap:latest clang version 15.0.7 opensuse/tumbleweed:latest clang version 17.0.6 ubuntu:focal clang version 10.0.0-4ubuntu1 ubuntu:latest Ubuntu clang version 14.0.0-1ubuntu1.1 ubuntu:rolling Ubuntu clang version 16.0.6 (15) ubuntu:devel Ubuntu clang version 17.0.6 (3) The only distribution that gets left behind is Debian Bullseye, as the default version is 11.0.1; other distributions either have a newer version than 13.0.1 or one older than the current minimum of 11.0.0. Debian has easy access to more recent LLVM versions through apt.llvm.org, so this is not as much of a concern. There are also the kernel.org LLVM toolchains, which should work with distributions with glibc 2.28 and newer. Another benefit of slimming up the number of supported versions of LLVM for building the kernel is reducing the build capacity needed to support a matrix that builds with each supported version, which allows a matrix to reallocate the freed up build capacity towards something else, such as more configuration combinations. This passes my build matrix with all supported versions. This is based on Andrew's mm-nonmm-unstable to avoid trivial conflicts with my series to update the LLVM links across the repository [1] but I can easily rebase it to linux-kbuild if Masahiro would rather these patches go through there (and defer the conflict resolution to the merge window). [1]: https://lore.kernel.org/20240109-update-llvm-links-v1-0-eb09b59db071@kernel.org/ This patch (of 11): When __builtin_mul_overflow() has arguments that differ in terms of signedness and width, LLVM may generate a libcall to __muloti4 because it performs the checks in terms of 65-bit multiplication. 
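As a hedged aside, the construct at issue can be shown with a minimal userspace program; the types and values below are illustrative and are not taken from any kernel call site:

    #include <stdint.h>
    #include <stdio.h>

    /* mixed signedness and width: s64 multiplicand, u32 multiplier, u64 result */
    int main(void)
    {
            int64_t a = INT64_MAX;
            uint32_t b = 3;
            uint64_t res;

            /* returns nonzero if the infinitely precise product does not fit */
            if (__builtin_mul_overflow(a, b, &res))
                    printf("overflowed\n");
            else
                    printf("product = %llu\n", (unsigned long long)res);
            return 0;
    }

With clang 12 and earlier, a mixed-sign, mixed-width check of this shape is the kind of expression that could lower to a __muloti4 libcall, a compiler-rt symbol the kernel does not provide.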
This issue becomes harder to hit (but still possible) after LLVM 12.0.0, which includes a special case for matching widths but different signs. To gain access to this special case, which the kernel can take advantage of when calls to __muloti4 appear, bump the minimum supported version of LLVM for building the kernel to 13.0.1. 13.0.1 was chosen because there is minimal impact to distribution support while allowing a few more workarounds to be dropped in the kernel source than if 12.0.0 were chosen. Looking at container images of up to date distribution versions: archlinux:latest clang version 16.0.6 debian:oldoldstable-slim clang version 7.0.1-8+deb10u2 (tags/RELEASE_701/final) debian:oldstable-slim Debian clang version 11.0.1-2 debian:stable-slim Debian clang version 14.0.6 debian:testing-slim Debian clang version 16.0.6 (19) debian:unstable-slim Debian clang version 16.0.6 (19) fedora:38 clang version 16.0.6 (Fedora 16.0.6-3.fc38) fedora:latest clang version 17.0.6 (Fedora 17.0.6-1.fc39) fedora:rawhide clang version 17.0.6 (Fedora 17.0.6-1.fc40) opensuse/leap:latest clang version 15.0.7 opensuse/tumbleweed:latest clang version 17.0.6 ubuntu:focal clang version 10.0.0-4ubuntu1 ubuntu:latest Ubuntu clang version 14.0.0-1ubuntu1.1 ubuntu:rolling Ubuntu clang version 16.0.6 (15) ubuntu:devel Ubuntu clang version 17.0.6 (3) The only distribution that gets left behind is Debian Bullseye, as the default version is 11.0.1; other distributions either have a newer version than 13.0.1 or one older than the current minimum of 11.0.0. Debian has easy access to more recent LLVM versions through apt.llvm.org, so this is not as much of a concern. There are also the kernel.org LLVM toolchains, which should work with distributions with glibc 2.28 and newer. Another benefit of slimming up the number of supported versions of LLVM for building the kernel is reducing the build capacity needed to support a matrix that builds with each supported version, which allows a matrix to reallocate the freed up build capacity towards something else, such as more configuration combinations. Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-0-f5ff9bda41c5@kernel.org Closes: https://github.com/ClangBuiltLinux/linux/issues/1975 Link: https://github.com/llvm/llvm-project/issues/38013 Link: https://github.com/llvm/llvm-project/commit/3203143f1356a4e4e3ada231156fc6da6e1a9f9d Link: https://mirrors.edge.kernel.org/pub/tools/llvm/ Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-1-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: Nathan Chancellor Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- Documentation/process/changes.rst | 2 +- scripts/min-tool-version.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index 50b3d1cb11159b..d7306b8cad1378 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -30,7 +30,7 @@ you probably needn't concern yourself with pcmciautils. 
Program Minimal version Command to check the version ====================== =============== ======================================== GNU C 5.1 gcc --version -Clang/LLVM (optional) 11.0.0 clang --version +Clang/LLVM (optional) 13.0.1 clang --version Rust (optional) 1.74.1 rustc --version bindgen (optional) 0.65.1 bindgen --version GNU make 3.82 make --version diff --git a/scripts/min-tool-version.sh b/scripts/min-tool-version.sh index 9faa4d3d91e358..5d17022ee1f6f8 100755 --- a/scripts/min-tool-version.sh +++ b/scripts/min-tool-version.sh @@ -29,7 +29,7 @@ llvm) elif [ "$SRCARCH" = loongarch ]; then echo 18.0.0 else - echo 11.0.0 + echo 13.0.1 fi ;; rustc) From 562ddd4e3a12332f21deb62bcca30536da0b0990 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:08 -0700 Subject: [PATCH 1345/1406] Makefile: drop warn-stack-size plugin opt Now that the minimum supported version of LLVM for building the kernel has been bumped to 13.0.1, the inner ifeq statement is always false, as the build will fail during the configuration stage for older LLVM versions. This effectively reverts commit 24845dcb170e ("Makefile: LTO: have linker check -Wframe-larger-than") and its follow up fix, commit 0236526d76b8 ("Makefile: lto: Pass -warn-stack-size only on LLD < 13.0.0"). Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-2-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- Makefile | 8 -------- 1 file changed, 8 deletions(-) diff --git a/Makefile b/Makefile index 41fa8a2565f54e..6c921a465c85ae 100644 --- a/Makefile +++ b/Makefile @@ -951,14 +951,6 @@ CC_FLAGS_LTO += -fvisibility=hidden # Limit inlining across translation units to reduce binary size KBUILD_LDFLAGS += -mllvm -import-instr-limit=5 - -# Check for frame size exceeding threshold during prolog/epilog insertion -# when using lld < 13.0.0. -ifneq ($(CONFIG_FRAME_WARN),0) -ifeq ($(call test-lt, $(CONFIG_LLD_VERSION), 130000),y) -KBUILD_LDFLAGS += -plugin-opt=-warn-stack-size=$(CONFIG_FRAME_WARN) -endif -endif endif ifdef CONFIG_LTO From 1f6a0b07712adbbce69eb407869f1bad13bcf9c7 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:09 -0700 Subject: [PATCH 1346/1406] x86: drop stack-alignment plugin opt Now that the minimum supported version of LLVM for building the kernel has been bumped to 13.0.1, the inner ifeq statement is always false, as the build will fail during the configuration stage for older LLVM versions. This effectively reverts part of commit b33fff07e3e3 ("x86, build: allow LTO to be selected") and its follow up fix, commit 2398ce80152a ("x86, lto: Pass -stack-alignment only on LLD < 13.0.0"). Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-3-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: "Naveen N. 
Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/Makefile | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index da8f3caf27815e..d5462cee9da93f 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -217,12 +217,6 @@ endif KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE) -ifdef CONFIG_LTO_CLANG -ifeq ($(call test-lt, $(CONFIG_LLD_VERSION), 130000),y) -KBUILD_LDFLAGS += -plugin-opt=-stack-alignment=$(if $(CONFIG_X86_32),4,8) -endif -endif - ifdef CONFIG_X86_NEED_RELOCS LDFLAGS_vmlinux := --emit-relocs --discard-none else From 8dd77795817d0e44a6bac36d283fc238eefe8636 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:10 -0700 Subject: [PATCH 1347/1406] ARM: remove Thumb2 __builtin_thread_pointer workaround for Clang Now that the minimum supported version of LLVM for building the kernel has been bumped to 13.0.1, the conditional expression added to get_current() by commit c1e42efacb9b ("ARM: 9151/1: Thumb2: avoid __builtin_thread_pointer() on Clang") is always true, as the build will fail during the configuration stage for older LLVM versions. Remove it, effectively reverting the aforementioned change. Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-4-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm/include/asm/current.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/arm/include/asm/current.h b/arch/arm/include/asm/current.h index 1e1178bf176da6..5225cb1c803b16 100644 --- a/arch/arm/include/asm/current.h +++ b/arch/arm/include/asm/current.h @@ -18,18 +18,12 @@ static __always_inline __attribute_const__ struct task_struct *get_current(void) { struct task_struct *cur; -#if __has_builtin(__builtin_thread_pointer) && \ - defined(CONFIG_CURRENT_POINTER_IN_TPIDRURO) && \ - !(defined(CONFIG_THUMB2_KERNEL) && \ - defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 130001) +#if __has_builtin(__builtin_thread_pointer) && defined(CONFIG_CURRENT_POINTER_IN_TPIDRURO) /* * Use the __builtin helper when available - this results in better * code, especially when using GCC in combination with the per-task * stack protector, as the compiler will recognize that it needs to * load the TLS register only once in every function. - * - * Clang < 13.0.1 gets this wrong for Thumb2 builds: - * https://github.com/ClangBuiltLinux/linux/issues/1485 */ cur = __builtin_thread_pointer(); #elif defined(CONFIG_CURRENT_POINTER_IN_TPIDRURO) || defined(CONFIG_SMP) From bb3d6dd6ba18343b0a8d70b0a68a35ca3edd01a8 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:11 -0700 Subject: [PATCH 1348/1406] arm64: Kconfig: clean up tautological LLVM version checks Now that the minimum supported version of LLVM for building the kernel has been bumped to 13.0.1, several conditions become tautologies, as they will always be true because the build will fail during the configuration stage for older LLVM versions. Drop them, as they are unnecessary. 
Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-5-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5a8acca4dbf495..cb34e7d780c090 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -383,7 +383,7 @@ config BUILTIN_RETURN_ADDRESS_STRIPS_PAC bool # Clang's __builtin_return_adddress() strips the PAC since 12.0.0 # https://github.com/llvm/llvm-project/commit/2a96f47c5ffca84cd774ad402cacd137f4bf45e2 - default y if CC_IS_CLANG && (CLANG_VERSION >= 120000) + default y if CC_IS_CLANG # GCC's __builtin_return_address() strips the PAC since 11.1.0, # and this was backported to 10.2.0, 9.4.0, 8.5.0, but not earlier # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94891 @@ -1387,7 +1387,6 @@ choice config CPU_BIG_ENDIAN bool "Build big-endian kernel" - depends on !LD_IS_LLD || LLD_VERSION >= 130000 # https://github.com/llvm/llvm-project/commit/1379b150991f70a5782e9a143c2ba5308da1161c depends on AS_IS_GNU || AS_VERSION >= 150000 help @@ -2018,8 +2017,6 @@ config ARM64_BTI_KERNEL depends on !CC_IS_GCC || GCC_VERSION >= 100100 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106671 depends on !CC_IS_GCC - # https://github.com/llvm/llvm-project/commit/a88c722e687e6780dcd6a58718350dc76fcc4cc9 - depends on !CC_IS_CLANG || CLANG_VERSION >= 120000 depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_ARGS) help Build the kernel with Branch Target Identification annotations From 8a3ccb56098e15a72c456e578ef91dadffb37119 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:12 -0700 Subject: [PATCH 1349/1406] powerpc: Kconfig: remove tautology in CONFIG_COMPAT This reverts commit 6fcb574125e6 ("powerpc: Kconfig: disable CONFIG_COMPAT for clang < 12"). Now that the minimum supported version of LLVM for building the kernel has been bumped to 13.0.1, this condition is always true, as the build will fail during the configuration stage for older LLVM versions. Remove it. Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-6-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: "Naveen N. 
Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b9fc064d38d281..86da0d01365a75 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -333,7 +333,6 @@ config PANIC_TIMEOUT config COMPAT bool "Enable support for 32bit binaries" depends on PPC64 - depends on !CC_IS_CLANG || CLANG_VERSION >= 120000 default y if !CPU_LITTLE_ENDIAN select ARCH_WANT_OLD_COMPAT_IPC select COMPAT_OLD_SIGACTION From 1cce442eb7f80e7ce137b5481452a20078b03c74 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:13 -0700 Subject: [PATCH 1350/1406] riscv: remove MCOUNT_NAME workaround Now that the minimum supported version of LLVM for building the kernel has been bumped to 13.0.1, the condition for using _mcount as MCOUNT_NAME is always true, as the build will fail during the configuration stage for older LLVM versions. Replace MCOUNT_NAME with _mcount directly. This effectively reverts commit 7ce047715030 ("riscv: Workaround mcount name prior to clang-13"). Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-7-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/riscv/include/asm/ftrace.h | 14 ++------------ arch/riscv/kernel/mcount.S | 10 +++++----- scripts/recordmcount.pl | 2 +- 3 files changed, 8 insertions(+), 18 deletions(-) diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index 06874fb1311e5e..cf5b63e789fa7c 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -13,19 +13,9 @@ #endif #define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR -/* - * Clang prior to 13 had "mcount" instead of "_mcount": - * https://github.com/llvm/llvm-project/commit/ef58ae86ba778ed7d01cd3f6bd6d08f943abab44 - */ -#if defined(CONFIG_CC_IS_GCC) || CONFIG_CLANG_VERSION >= 130000 -#define MCOUNT_NAME _mcount -#else -#define MCOUNT_NAME mcount -#endif - #define ARCH_SUPPORTS_FTRACE_OPS 1 #ifndef __ASSEMBLY__ -void MCOUNT_NAME(void); +void _mcount(void); static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; @@ -75,7 +65,7 @@ struct dyn_arch_ftrace { * both auipc and jalr at the same time. 
*/ -#define MCOUNT_ADDR ((unsigned long)MCOUNT_NAME) +#define MCOUNT_ADDR ((unsigned long)_mcount) #define JALR_SIGN_MASK (0x00000800) #define JALR_OFFSET_MASK (0x00000fff) #define AUIPC_OFFSET_MASK (0xfffff000) diff --git a/arch/riscv/kernel/mcount.S b/arch/riscv/kernel/mcount.S index d7ec69ac6910c6..3a42f6287909d0 100644 --- a/arch/riscv/kernel/mcount.S +++ b/arch/riscv/kernel/mcount.S @@ -50,8 +50,8 @@ SYM_TYPED_FUNC_START(ftrace_stub) #ifdef CONFIG_DYNAMIC_FTRACE - .global MCOUNT_NAME - .set MCOUNT_NAME, ftrace_stub + .global _mcount + .set _mcount, ftrace_stub #endif ret SYM_FUNC_END(ftrace_stub) @@ -80,7 +80,7 @@ SYM_FUNC_END(return_to_handler) #endif #ifndef CONFIG_DYNAMIC_FTRACE -SYM_FUNC_START(MCOUNT_NAME) +SYM_FUNC_START(_mcount) la t4, ftrace_stub #ifdef CONFIG_FUNCTION_GRAPH_TRACER la t0, ftrace_graph_return @@ -126,6 +126,6 @@ SYM_FUNC_START(MCOUNT_NAME) jalr t5 RESTORE_ABI_STATE ret -SYM_FUNC_END(MCOUNT_NAME) +SYM_FUNC_END(_mcount) #endif -EXPORT_SYMBOL(MCOUNT_NAME) +EXPORT_SYMBOL(_mcount) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index f84df9e383fd0a..0871b2e92584b2 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -352,7 +352,7 @@ $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s_mcount\$"; } elsif ($arch eq "riscv") { $function_regex = "^([0-9a-fA-F]+)\\s+<([^.0-9][0-9a-zA-Z_\\.]+)>:"; - $mcount_regex = "^\\s*([0-9a-fA-F]+):\\sR_RISCV_CALL(_PLT)?\\s_?mcount\$"; + $mcount_regex = "^\\s*([0-9a-fA-F]+):\\sR_RISCV_CALL(_PLT)?\\s_mcount\$"; $type = ".quad"; $alignment = 2; } elsif ($arch eq "csky") { From 433d15aad4ebb1fdff24ed35db4048eadf16bbd0 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:14 -0700 Subject: [PATCH 1351/1406] riscv: Kconfig: remove version dependency from CONFIG_CLANG_SUPPORTS_DYNAMIC_FTRACE Now that the minimum supported version of LLVM for building the kernel has been bumped to 13.0.1, this condition is always true, as the build will fail during the configuration stage for older LLVM versions. Remove it. Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-8-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: "Naveen N. 
Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/riscv/Kconfig | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 69d24f51392206..00edc4ff589c99 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -174,8 +174,6 @@ config RISCV config CLANG_SUPPORTS_DYNAMIC_FTRACE def_bool CC_IS_CLANG - # https://github.com/llvm/llvm-project/commit/6ab8927931851bb42b2c93a00801dc499d7d9b1e - depends on CLANG_VERSION >= 130000 # https://github.com/ClangBuiltLinux/linux/issues/1817 depends on AS_IS_GNU || (AS_IS_LLVM && (LD_IS_LLD || LD_VERSION >= 23600)) From 0524d54aaae3adc73c6401eeab40351164cf7b9b Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:15 -0700 Subject: [PATCH 1352/1406] fortify: drop Clang version check for 12.0.1 or newer Now that the minimum supported version of LLVM for building the kernel has been bumped to 13.0.1, this condition is always true, as the build will fail during the configuration stage for older LLVM versions. Remove it. Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-9-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- security/Kconfig | 2 -- 1 file changed, 2 deletions(-) diff --git a/security/Kconfig b/security/Kconfig index 606a87c29a0170..412e76f1575d0d 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -142,8 +142,6 @@ config HARDENED_USERCOPY config FORTIFY_SOURCE bool "Harden common str/mem functions against buffer overflows" depends on ARCH_HAS_FORTIFY_SOURCE - # https://llvm.org/pr41459 - depends on !CC_IS_CLANG || CLANG_VERSION >= 120001 # https://github.com/llvm/llvm-project/issues/53645 depends on !CC_IS_CLANG || !X86_32 help From 038e996ff5eb8c8489528eca99f0a5aa2d83aa34 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:16 -0700 Subject: [PATCH 1353/1406] lib/Kconfig.debug: update Clang version check in CONFIG_KCOV Now that the minimum supported version of LLVM for building the kernel has been bumped to 13.0.1, this condition can be changed to just CONFIG_CC_IS_CLANG, as the build will fail during the configuration stage for older LLVM versions. Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-10-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: "Naveen N. 
Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 57a4dc50325ff8..d5ffdc92a06fee 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2085,7 +2085,7 @@ config KCOV depends on ARCH_HAS_KCOV depends on CC_HAS_SANCOV_TRACE_PC || GCC_PLUGINS depends on !ARCH_WANTS_NO_INSTR || HAVE_NOINSTR_HACK || \ - GCC_VERSION >= 120000 || CLANG_VERSION >= 130000 + GCC_VERSION >= 120000 || CC_IS_CLANG select DEBUG_FS select GCC_PLUGIN_SANCOV if !CC_HAS_SANCOV_TRACE_PC select OBJTOOL if HAVE_NOINSTR_HACK From aebe7050cb4ad871793e3e55a834226d53aac540 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:17 -0700 Subject: [PATCH 1354/1406] compiler-clang.h: update __diag_clang() macros for minimum version bump The minimum supported version of LLVM for building the kernel has been bumped to 13.0.1. Update the __diag_clang() macros for this bump. Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-11-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/compiler-clang.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index f0a47afef12581..49feac0162a526 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -114,11 +114,7 @@ #define __diag_str(s) __diag_str1(s) #define __diag(s) _Pragma(__diag_str(clang diagnostic s)) -#if CONFIG_CLANG_VERSION >= 110000 -#define __diag_clang_11(s) __diag(s) -#else -#define __diag_clang_11(s) -#endif +#define __diag_clang_13(s) __diag(s) #define __diag_ignore_all(option, comment) \ - __diag_clang(11, ignore, option) + __diag_clang(13, ignore, option) From 106d6d29972f751cae37677cf9457b56bc950762 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 25 Jan 2024 20:46:04 +0500 Subject: [PATCH 1355/1406] selftests/mm: hugetlb_reparenting_test: do not unmount Patch series "selftests/mm: Improve run_vmtests.sh", v3. In this series, I'm trying to add 3 missing tests to vm_runtests.sh which is used to run all the tests in mm suite. These tests weren't running by CIs. While enabling them and through review feedback, I've fixed some problems in tests as well. I've found more flakiness in more tests which I'll be fixing with future patches. hugetlb-read-hwpoison test is being added where it can only run with newly added "-d" (destructive) flag only. Not sure why it is failing again. So once it become stable, we can think of moving it to default set of tests if it doesn't have any side-effect to them. This patch (of 5): Do not unmount the cgroup if it wasn't mounted by the test. The earlier patch had fixed this for charge_reserved_hugetlb, but not for this test. I'm adding fixes tag to that earlier patch. 
Link: https://lkml.kernel.org/r/20240125154608.720072-1-usama.anjum@collabora.com Link: https://lkml.kernel.org/r/20240125154608.720072-2-usama.anjum@collabora.com Fixes: 209376ed2a84 ("selftests/vm: make charge_reserved_hugetlb.sh work with existing cgroup setting") Signed-off-by: Muhammad Usama Anjum Cc: Ryan Roberts Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hugetlb_reparenting_test.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh index 14d26075c8635f..615c4d766c9093 100755 --- a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh +++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh @@ -248,5 +248,7 @@ cleanup echo ALL PASS -umount $CGROUP_ROOT -rm -rf $CGROUP_ROOT +if [[ $do_umount ]]; then + umount $CGROUP_ROOT + rm -rf $CGROUP_ROOT +fi From 932759f045acc358ed100148932af6457ecce5ff Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 25 Jan 2024 20:46:05 +0500 Subject: [PATCH 1356/1406] selftests/mm: run_vmtests: remove sudo and conform to tap Remove sudo as some test running environments may not have sudo available. Instead skip the test if root privileges aren't available in the test. Link: https://lkml.kernel.org/r/20240125154608.720072-3-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Ryan Roberts Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/on-fault-limit.c | 36 ++++++++++----------- tools/testing/selftests/mm/run_vmtests.sh | 2 +- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/mm/on-fault-limit.c b/tools/testing/selftests/mm/on-fault-limit.c index b5888d613f34eb..0ea98ffab35892 100644 --- a/tools/testing/selftests/mm/on-fault-limit.c +++ b/tools/testing/selftests/mm/on-fault-limit.c @@ -5,40 +5,38 @@ #include #include #include +#include "../kselftest.h" -static int test_limit(void) +static void test_limit(void) { - int ret = 1; struct rlimit lims; void *map; - if (getrlimit(RLIMIT_MEMLOCK, &lims)) { - perror("getrlimit"); - return ret; - } + if (getrlimit(RLIMIT_MEMLOCK, &lims)) + ksft_exit_fail_msg("getrlimit: %s\n", strerror(errno)); - if (mlockall(MCL_ONFAULT | MCL_FUTURE)) { - perror("mlockall"); - return ret; - } + if (mlockall(MCL_ONFAULT | MCL_FUTURE)) + ksft_exit_fail_msg("mlockall: %s\n", strerror(errno)); map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0); + + ksft_test_result(map == MAP_FAILED, "Failed mmap\n"); + if (map != MAP_FAILED) - printf("mmap should have failed, but didn't\n"); - else { - ret = 0; munmap(map, 2 * lims.rlim_max); - } - munlockall(); - return ret; } int main(int argc, char **argv) { - int ret = 0; + ksft_print_header(); + ksft_set_plan(1); + + if (getuid()) + ksft_test_result_skip("Require root privileges to run\n"); + else + test_limit(); - ret += test_limit(); - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 246d53a5d7f287..e373d592dbf5cb 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -291,7 +291,7 @@ echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages CATEGORY="compaction" run_test ./compaction_test -CATEGORY="mlock" run_test sudo -u nobody ./on-fault-limit +CATEGORY="mlock" run_test ./on-fault-limit CATEGORY="mmap" run_test ./map_populate From 
8283275e0e11b16baff339c67f2a5986c443e08b Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 1 Feb 2024 18:05:36 +0500 Subject: [PATCH 1357/1406] selftests/mm: on-fault-limit: run test without root privileges otherwise skip The mmap() respects rlimit only for normal users. This test should be run as a normal user, without root privileges. Also add back the sudo -u nobody as run_vmtests.sh is run as root most of the time. Skip the test instead if sudo isn't present to lower the privileges. Link: https://lkml.kernel.org/r/20240201130538.1404897-1-usama.anjum@collabora.com Fixes: b6221771d468 ("selftests/mm: run_vmtests: remove sudo and conform to tap") Signed-off-by: Muhammad Usama Anjum Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/on-fault-limit.c | 6 +++--- tools/testing/selftests/mm/run_vmtests.sh | 7 ++++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/mm/on-fault-limit.c b/tools/testing/selftests/mm/on-fault-limit.c index 0ea98ffab35892..431c1277d83a1d 100644 --- a/tools/testing/selftests/mm/on-fault-limit.c +++ b/tools/testing/selftests/mm/on-fault-limit.c @@ -21,7 +21,7 @@ static void test_limit(void) map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0); - ksft_test_result(map == MAP_FAILED, "Failed mmap\n"); + ksft_test_result(map == MAP_FAILED, "The map failed respecting mlock limits\n"); if (map != MAP_FAILED) munmap(map, 2 * lims.rlim_max); @@ -33,8 +33,8 @@ int main(int argc, char **argv) ksft_print_header(); ksft_set_plan(1); - if (getuid()) - ksft_test_result_skip("Require root privileges to run\n"); + if (!getuid()) + ksft_test_result_skip("The test must be run from a normal user\n"); else test_limit(); diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index e373d592dbf5cb..416bfc8198b30e 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -291,7 +291,12 @@ echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages CATEGORY="compaction" run_test ./compaction_test -CATEGORY="mlock" run_test ./on-fault-limit +if command -v sudo &> /dev/null; +then + CATEGORY="mlock" run_test sudo -u nobody ./on-fault-limit +else + echo "# SKIP ./on-fault-limit" +fi CATEGORY="mmap" run_test ./map_populate From 520f7b34db99f1f2e811eb32cb440a2d6c7e40b3 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 25 Jan 2024 20:46:06 +0500 Subject: [PATCH 1358/1406] selftests/mm: save and restore nr_hugepages value Save and restore nr_hugepages before changing it during the test. A test should not change system wide settings.
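The ksft_*() calls used in these conversions come from the kselftest harness; a minimal sketch of the full TAP flow, assuming tools/testing/selftests/kselftest.h is reachable on the include path, looks like this:

    #include <unistd.h>
    #include "../kselftest.h"       /* relative path as used by the mm tests */

    int main(void)
    {
            ksft_print_header();    /* emits the TAP version line */
            ksft_set_plan(1);       /* declares exactly one test result */

            if (getuid())
                    ksft_test_result_skip("requires root\n");
            else
                    ksft_test_result_pass("running as root\n");

            ksft_finished();        /* prints the summary and exits */
    }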
Link: https://lkml.kernel.org/r/20240125154608.720072-4-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Ryan Roberts Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/charge_reserved_hugetlb.sh | 4 ++++ tools/testing/selftests/mm/hugetlb_reparenting_test.sh | 3 +++ 2 files changed, 7 insertions(+) diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh index e14bdd4455f2d2..d680c00d2853ac 100755 --- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh @@ -11,6 +11,8 @@ if [[ $(id -u) -ne 0 ]]; then exit $ksft_skip fi +nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) + fault_limit_file=limit_in_bytes reservation_limit_file=rsvd.limit_in_bytes fault_usage_file=usage_in_bytes @@ -582,3 +584,5 @@ if [[ $do_umount ]]; then umount $cgroup_path rmdir $cgroup_path fi + +echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh index 615c4d766c9093..11f9bbe7dc222b 100755 --- a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh +++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh @@ -11,6 +11,7 @@ if [[ $(id -u) -ne 0 ]]; then exit $ksft_skip fi +nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) usage_file=usage_in_bytes if [[ "$1" == "-cgroup-v2" ]]; then @@ -252,3 +253,5 @@ if [[ $do_umount ]]; then umount $CGROUP_ROOT rm -rf $CGROUP_ROOT fi + +echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages From 49b145c1d6217a79d4558f455bff23517dd91688 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 25 Jan 2024 20:46:07 +0500 Subject: [PATCH 1359/1406] selftests/mm: protection_keys: save/restore nr_hugepages settings Save and restore nr_hugepages before changing it during the test. A test should not change system wide settings. 
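In the C tests, the same save-and-restore idea typically reads the current value at startup and registers an atexit() handler to write it back; a small hedged sketch follows, with the path and buffer size chosen for illustration:

    #include <stdio.h>
    #include <stdlib.h>

    static char saved[64];
    static const char *path = "/proc/sys/vm/nr_hugepages";

    static void restore_nr_hugepages(void)
    {
            FILE *f = fopen(path, "w");

            if (f) {
                    fputs(saved, f);        /* write back the original value */
                    fclose(f);
            }
    }

    int main(void)
    {
            FILE *f = fopen(path, "r");

            if (!f || !fgets(saved, sizeof(saved), f)) {
                    perror(path);
                    return 1;
            }
            fclose(f);
            atexit(restore_nr_hugepages);   /* runs on any exit() path */

            /* ... test body that may change nr_hugepages goes here ... */
            return 0;
    }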
Link: https://lkml.kernel.org/r/20240125154608.720072-5-usama.anjum@collabora.com Fixes: 5f23f6d082a9 ("x86/pkeys: Add self-tests") Signed-off-by: Muhammad Usama Anjum Cc: Ryan Roberts Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/protection_keys.c | 34 ++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index 48dc151f8fca8a..f822ae31af22e2 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -54,6 +54,7 @@ int test_nr; u64 shadow_pkey_reg; int dprint_in_signal; char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; +char buf[256]; void cat_into_file(char *str, char *file) { @@ -1744,6 +1745,38 @@ void pkey_setup_shadow(void) shadow_pkey_reg = __read_pkey_reg(); } +void restore_settings_atexit(void) +{ + cat_into_file(buf, "/proc/sys/vm/nr_hugepages"); +} + +void save_settings(void) +{ + int fd; + int err; + + if (geteuid()) + return; + + fd = open("/proc/sys/vm/nr_hugepages", O_RDONLY); + if (fd < 0) { + fprintf(stderr, "error opening\n"); + perror("error: "); + exit(__LINE__); + } + + /* -1 to guarantee leaving the trailing \0 */ + err = read(fd, buf, sizeof(buf)-1); + if (err < 0) { + fprintf(stderr, "error reading\n"); + perror("error: "); + exit(__LINE__); + } + + atexit(restore_settings_atexit); + close(fd); +} + int main(void) { int nr_iterations = 22; @@ -1751,6 +1784,7 @@ int main(void) srand((unsigned int)time(NULL)); + save_settings(); setup_handlers(); printf("has pkeys: %d\n", pkeys_supported); From 01198a196594ac88628a219e364d1c1baeb355da Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 25 Jan 2024 20:46:08 +0500 Subject: [PATCH 1360/1406] selftests/mm: run_vmtests.sh: add missing tests Add missing tests to run_vmtests.sh. The mm kselftests are run through run_vmtests.sh. If a test isn't present in this script, it'll not run with run_tests or `make -C tools/testing/selftests/mm run_tests`. Link: https://lkml.kernel.org/r/20240125154608.720072-6-usama.anjum@collabora.com Cc: Ryan Roberts Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 5 +++++ tools/testing/selftests/mm/run_vmtests.sh | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 2453add65d12f8..f3aec7be80730b 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -114,6 +114,11 @@ TEST_PROGS := run_vmtests.sh TEST_FILES := test_vmalloc.sh TEST_FILES += test_hmm.sh TEST_FILES += va_high_addr_switch.sh +TEST_FILES += charge_reserved_hugetlb.sh +TEST_FILES += hugetlb_reparenting_test.sh + +# required by charge_reserved_hugetlb.sh +TEST_FILES += write_hugetlb_memory.sh include ../lib.mk diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 416bfc8198b30e..4b68bcdc7cb7ae 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -19,6 +19,7 @@ usage: ${BASH_SOURCE[0]:-$0} [ options ] -t: specify specific categories to tests to run -h: display this message -n: disable TAP output + -d: run destructive tests The default behavior is to run required tests only. If -a is specified, will run all tests. 
@@ -79,6 +80,7 @@ EOF } RUN_ALL=false +RUN_DESTRUCTIVE_TEST=false TAP_PREFIX="# " while getopts "aht:n" OPT; do @@ -87,6 +89,7 @@ while getopts "aht:n" OPT; do "h") usage ;; "t") VM_SELFTEST_ITEMS=${OPTARG} ;; "n") TAP_PREFIX= ;; + "a") RUN_DESTRUCTIVE_TEST=true ;; esac done shift $((OPTIND -1)) @@ -309,6 +312,11 @@ CATEGORY="process_mrelease" run_test ./mrelease_test CATEGORY="mremap" run_test ./mremap_test CATEGORY="hugetlb" run_test ./thuge-gen +CATEGORY="hugetlb" run_test ./charge_reserved_hugetlb.sh -cgroup-v2 +CATEGORY="hugetlb" run_test ./hugetlb_reparenting_test.sh -cgroup-v2 +if $RUN_DESTRUCTIVE_TEST; then +CATEGORY="hugetlb" run_test ./hugetlb-read-hwpoison +fi if [ $VADDR64 -ne 0 ]; then From da9f0c3179663860c6499db0a0e5a6f44fffc7a5 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 1 Feb 2024 18:05:37 +0500 Subject: [PATCH 1361/1406] selftests/mm: run_vmtests: use correct flag in the code Use correct -d flag as mentioned in comments for destructive tests. Rename variable and update comment for some clarification. Link: https://lkml.kernel.org/r/20240201130538.1404897-2-usama.anjum@collabora.com Fixes: cc7b9955344c ("selftests/mm: run_vmtests.sh: add missing tests") Signed-off-by: Muhammad Usama Anjum Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 4b68bcdc7cb7ae..52f26578a5875b 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -15,7 +15,7 @@ usage() { cat < Date: Wed, 24 Jan 2024 15:27:35 +0100 Subject: [PATCH 1362/1406] init: remove obsolete arch_call_rest_init() wrapper Since commit 3570ee046c46b5dc ("s390/smp: keep the original lowcore for CPU 0"), there is no longer any architecture that needs to override arch_call_rest_init(). Remove the weak wrapper around rest_init(), call rest_init() directly, and make rest_init() static. Link: https://lkml.kernel.org/r/aa10868bfb176eef4abb8bb4a710b85330792694.1706106183.git.geert@linux-m68k.org Signed-off-by: Geert Uytterhoeven Cc: Arnd Bergmann Cc: Ilya Leoshkevich Cc: Josh Poimboeuf Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/start_kernel.h | 2 -- init/main.c | 9 ++------- tools/objtool/noreturns.h | 1 - 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/include/linux/start_kernel.h b/include/linux/start_kernel.h index a9806a44a605c7..09f994ac87df44 100644 --- a/include/linux/start_kernel.h +++ b/include/linux/start_kernel.h @@ -9,7 +9,5 @@ up something else. 
*/ extern asmlinkage void __init __noreturn start_kernel(void); -extern void __init __noreturn arch_call_rest_init(void); -extern void __ref __noreturn rest_init(void); #endif /* _LINUX_START_KERNEL_H */ diff --git a/init/main.c b/init/main.c index e24b0780fdff7a..521f40770e67dd 100644 --- a/init/main.c +++ b/init/main.c @@ -681,7 +681,7 @@ static void __init setup_command_line(char *command_line) static __initdata DECLARE_COMPLETION(kthreadd_done); -noinline void __ref __noreturn rest_init(void) +static noinline void __ref __noreturn rest_init(void) { struct task_struct *tsk; int pid; @@ -822,11 +822,6 @@ static int __init early_randomize_kstack_offset(char *buf) early_param("randomize_kstack_offset", early_randomize_kstack_offset); #endif -void __init __weak __noreturn arch_call_rest_init(void) -{ - rest_init(); -} - static void __init print_unknown_bootoptions(void) { char *unknown_options; @@ -1069,7 +1064,7 @@ void start_kernel(void) kcsan_init(); /* Do the rest non-__init'ed, we're now alive */ - arch_call_rest_init(); + rest_init(); /* * Avoid stack canaries in callers of boot_init_stack_canary for gcc-10 diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index 1685d7ea6a9f70..7cda577da897ca 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -12,7 +12,6 @@ NORETURN(__reiserfs_panic) NORETURN(__stack_chk_fail) NORETURN(__tdx_hypercall_failed) NORETURN(__ubsan_handle_builtin_unreachable) -NORETURN(arch_call_rest_init) NORETURN(arch_cpu_idle_dead) NORETURN(bch2_trans_in_restart_error) NORETURN(bch2_trans_restart_error) From 082a553b79749b9435ba18dc5d8ce9db71fd0ae1 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 2 Feb 2024 21:20:42 +0800 Subject: [PATCH 1363/1406] panic: add option to dump blocked tasks in panic_print For debugging kernel panics and other bugs, there is already an option of panic_print to dump all tasks' call stacks. On today's large servers running many containers, there could be thousands of tasks or more, and this will print out huge amount of call stacks, taking a lot of time (for serial console which is main target user case of panic_print). And in many cases, only those several tasks being blocked are key for the panic, so add an option to only dump blocked tasks' call stacks. Link: https://lkml.kernel.org/r/20240202132042.3609657-1-feng.tang@intel.com Signed-off-by: Feng Tang Tested-by: Guilherme G. Piccoli Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Peter Zijlstra (Intel) Cc: Randy Dunlap Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 1 + Documentation/admin-guide/sysctl/kernel.rst | 1 + kernel/panic.c | 4 ++++ 3 files changed, 6 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 31b3a25680d08c..0f2369e87175a9 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4182,6 +4182,7 @@ bit 4: print ftrace buffer bit 5: print all printk messages in buffer bit 6: print all CPUs backtrace (if available in the arch) + bit 7: print tasks in uninterruptible (blocked) state *Be aware* that this option may print a _lot_ of lines, so there are risks of losing older messages in the log. 
Use this option carefully, maybe worth to setup a diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index bc578663619d6e..c78d79151e7faa 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -853,6 +853,7 @@ bit 3 print locks info if ``CONFIG_LOCKDEP`` is on bit 4 print ftrace buffer bit 5 print all printk messages in buffer bit 6 print all CPUs backtrace (if available in the arch) +bit 7 print tasks in uninterruptible (blocked) state ===== ============================================ So for example to print tasks and memory info on panic, user can:: diff --git a/kernel/panic.c b/kernel/panic.c index d49b68184c563e..d4cb86d97f8f49 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -73,6 +73,7 @@ EXPORT_SYMBOL_GPL(panic_timeout); #define PANIC_PRINT_FTRACE_INFO 0x00000010 #define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020 #define PANIC_PRINT_ALL_CPU_BT 0x00000040 +#define PANIC_PRINT_BLOCKED_TASKS 0x00000080 unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -227,6 +228,9 @@ static void panic_print_sys_info(bool console_flush) if (panic_print & PANIC_PRINT_FTRACE_INFO) ftrace_dump(DUMP_ALL); + + if (panic_print & PANIC_PRINT_BLOCKED_TASKS) + show_state_filter(TASK_UNINTERRUPTIBLE); } void check_panic_on_warn(const char *origin) From 9dea0d051058752e22dee951f0e603b54a8b02da Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 3 Feb 2024 04:31:14 -0800 Subject: [PATCH 1364/1406] panic-add-option-to-dump-blocked-tasks-in-panic_print-fix clarify documentation a little Cc: Feng Tang Cc: Guilherme G. Piccoli Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Peter Zijlstra (Intel) Cc: Randy Dunlap Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/admin-guide/sysctl/kernel.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0f2369e87175a9..800b4b5dcbb4c7 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4182,7 +4182,7 @@ bit 4: print ftrace buffer bit 5: print all printk messages in buffer bit 6: print all CPUs backtrace (if available in the arch) - bit 7: print tasks in uninterruptible (blocked) state + bit 7: print only tasks in uninterruptible (blocked) state *Be aware* that this option may print a _lot_ of lines, so there are risks of losing older messages in the log. Use this option carefully, maybe worth to setup a diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index c78d79151e7faa..a9b71190399d9e 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -853,7 +853,7 @@ bit 3 print locks info if ``CONFIG_LOCKDEP`` is on bit 4 print ftrace buffer bit 5 print all printk messages in buffer bit 6 print all CPUs backtrace (if available in the arch) -bit 7 print tasks in uninterruptible (blocked) state +bit 7 print only tasks in uninterruptible (blocked) state ===== ============================================ So for example to print tasks and memory info on panic, user can:: From a68a3cd6e420b6ea34e686b4b137fd5266e4cdfd Mon Sep 17 00:00:00 2001 From: "Ricardo B. 
Marliere" Date: Sun, 4 Feb 2024 19:39:57 -0300 Subject: [PATCH 1365/1406] const_structs.checkpatch: add bus_type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit d492cc2573a0 ("driver core: device.h: make struct bus_type a const *"), the driver core can properly handle constant struct bus_type. Make sure that new usages of the struct already enter the tree as const. Link: https://lkml.kernel.org/r/20240204-bus_cleanup-checkpatch-v1-1-8d51dcecda20@marliere.net Signed-off-by: Ricardo B. Marliere Suggested-by: Thomas Weißschuh Reviewed-by: Thomas Weißschuh Reviewed-by: Greg Kroah-Hartman Cc: Joe Perches Cc: Andy Whitcroft Cc: Dwaipayan Ray Cc: Lukas Bulwahn Signed-off-by: Andrew Morton --- scripts/const_structs.checkpatch | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/const_structs.checkpatch b/scripts/const_structs.checkpatch index 188412aa275795..7427313adc7aab 100644 --- a/scripts/const_structs.checkpatch +++ b/scripts/const_structs.checkpatch @@ -2,6 +2,7 @@ acpi_dock_ops address_space_operations backlight_ops block_device_operations +bus_type clk_ops comedi_lrange component_ops From ed3b7419d4c991a472c6d171e89f9bb48c2f72c5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 5 Feb 2024 13:26:26 +0100 Subject: [PATCH 1366/1406] fat: Fix uninitialized field in nostale filehandles When fat_encode_fh_nostale() encodes file handle without a parent it stores only first 10 bytes of the file handle. However the length of the file handle must be a multiple of 4 so the file handle is actually 12 bytes long and the last two bytes remain uninitialized. This is not great at we potentially leak uninitialized information with the handle to userspace. Properly initialize the full handle length. Link: https://lkml.kernel.org/r/20240205122626.13701-1-jack@suse.cz Reported-by: syzbot+3ce5dea5b1539ff36769@syzkaller.appspotmail.com Fixes: ea3983ace6b7 ("fat: restructure export_operations") Signed-off-by: Jan Kara Acked-by: OGAWA Hirofumi Cc: Amir Goldstein Cc: Signed-off-by: Andrew Morton --- fs/fat/nfs.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c index c52e63e10d35cd..509eea96a457d4 100644 --- a/fs/fat/nfs.c +++ b/fs/fat/nfs.c @@ -130,6 +130,12 @@ fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp, fid->parent_i_gen = parent->i_generation; type = FILEID_FAT_WITH_PARENT; *lenp = FAT_FID_SIZE_WITH_PARENT; + } else { + /* + * We need to initialize this field because the fh is actually + * 12 bytes long + */ + fid->parent_i_pos_hi = 0; } return type; From 6b1b5b05ffc28db583e2d68f9d34a998f66b10af Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 5 Feb 2024 12:39:30 +0300 Subject: [PATCH 1367/1406] smp: make __smp_processor_id() 0-argument macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit smp_processor_id family of macros never accepted any arguments. #define __smp_processor_id(x) works by accident (see C99 6.10.3 §4). __smp_processor_id() gets 1 (empty) argument and passes it down to raw_smp_processor_id() which doesn't accept arguments. 
Link: https://lkml.kernel.org/r/0037d1f2-8153-4b33-b43e-f4b6ecd710ac@p183 Signed-off-by: Alexey Dobriyan Cc: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton --- include/linux/smp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/smp.h b/include/linux/smp.h index e87520dc2959dd..cc517002c59931 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -261,7 +261,7 @@ static inline int get_boot_cpu_id(void) * regular asm read for the stable. */ #ifndef __smp_processor_id -#define __smp_processor_id(x) raw_smp_processor_id(x) +#define __smp_processor_id() raw_smp_processor_id() #endif #ifdef CONFIG_DEBUG_PREEMPT From 6aebc512bfad6f9c85c9af16909351789e76b082 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 8 Feb 2024 18:30:18 +0900 Subject: [PATCH 1368/1406] nilfs2: MAINTAINERS: drop unreachable project mirror site The hosting site where the nilfs project had a mirror site continues to be in trouble, so we have decided not to use that site. Reflect this in the MAINTAINERS file. Link: https://lkml.kernel.org/r/20240208093018.6334-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index f7c81cea9b69e5..67998eb4c57994 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15446,7 +15446,6 @@ M: Ryusuke Konishi L: linux-nilfs@vger.kernel.org S: Supported W: https://nilfs.sourceforge.io/ -W: https://nilfs.osdn.jp/ T: git https://github.com/konis/nilfs2.git F: Documentation/filesystems/nilfs2.rst F: fs/nilfs2/ From 168adbd72f82df0ff4ddbec4580c55173d5ba4c1 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 8 Feb 2024 02:14:23 +0000 Subject: [PATCH 1369/1406] list: leverage list_is_head() for list_entry_is_head() This is exactly what list_is_head() does. Link: https://lkml.kernel.org/r/20240208021423.15704-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Andy Shevchenko Signed-off-by: Andrew Morton --- include/linux/list.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/list.h b/include/linux/list.h index 523b7c4d000a1f..5f4b0a39cf46a3 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -766,7 +766,7 @@ static inline size_t list_count_nodes(struct list_head *head) * @member: the name of the list_head within the struct. */ #define list_entry_is_head(pos, head, member) \ - (&pos->member == (head)) + list_is_head(&pos->member, (head)) /** * list_for_each_entry - iterate over list of given type From 5e015cb8737e744e19c0e1dadd40143003bdb3be Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 13 Feb 2024 18:27:41 +0200 Subject: [PATCH 1370/1406] dyndbg: replace kstrdup() + strchr() with kstrdup_and_replace() Replace open-coded functionality of kstrdup_and_replace() with a call.
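kstrdup_and_replace() is a kernel helper from lib/string_helpers.c; a userspace re-implementation of its semantics, shown here only to illustrate what the one-line replacement below does, might look like this sketch:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* duplicate src, replacing every occurrence of old with new */
    static char *strdup_and_replace(const char *src, char old, char new)
    {
            char *dst = malloc(strlen(src) + 1);

            if (dst) {
                    strcpy(dst, src);
                    for (char *p = dst; *p; p++)
                            if (*p == old)
                                    *p = new;
            }
            return dst;
    }

    int main(void)
    {
            char *s = strdup_and_replace("class1,class2\n", '\n', '\0');

            if (!s)
                    return 1;       /* mirrors the -ENOMEM path in the patch */
            printf("%s\n", s);      /* the trailing newline now ends the string */
            free(s);
            return 0;
    }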
Link: https://lkml.kernel.org/r/20240213162741.3102810-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Reviewed-by: Luis Chamberlain Cc: Jason Baron Cc: Jim Cromie Signed-off-by: Andrew Morton --- lib/dynamic_debug.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 6fba6423cc10b5..c78f335fa98137 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -640,10 +640,9 @@ static int param_set_dyndbg_classnames(const char *instr, const struct kernel_pa int cls_id, totct = 0; bool wanted; - cl_str = tmp = kstrdup(instr, GFP_KERNEL); - p = strchr(cl_str, '\n'); - if (p) - *p = '\0'; + cl_str = tmp = kstrdup_and_replace(instr, '\n', '\0', GFP_KERNEL); + if (!tmp) + return -ENOMEM; /* start with previously set state-bits, then modify */ curr_bits = old_bits = *dcp->bits; From 56915030c44420cbf64f2d032abd7bb9d0e8ce03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ahelenia=20Ziemia=C5=84ska?= Date: Tue, 13 Feb 2024 15:54:04 +0100 Subject: [PATCH 1371/1406] Normalise "name (ad@dr)" MODULE_AUTHORs to "name <ad@dr>" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Found with git grep 'MODULE_AUTHOR(".*([^)]*@' Fixed with sed -i '/MODULE_AUTHOR(".*([^)]*@/{s/ (/ </;s/)"/>"/;s/) and/> and/}' \ $(git grep -l 'MODULE_AUTHOR(".*([^)]*@') Also: in drivers/media/usb/siano/smsusb.c, normalise ", INC" to ", Inc"; this is what every other MODULE_AUTHOR for this company says, and it's what the header says. In drivers/sbus/char/openprom.c, normalise a double-spaced separator; this is clearly copied from the copyright header, where the names are aligned on consecutive lines thusly: * Linux/SPARC PROM Configuration Driver * Copyright (C) 1996 Thomas K. Dyas (tdyas@noc.rutgers.edu) * Copyright (C) 1996 Eddie C.
Dost (ecd@skynet.be) but the authorship branding is single-line Link: https://lkml.kernel.org/r/mk3geln4azm5binjjlfsgjepow4o73domjv6ajybws3tz22vb3@tarta.nabijaczleweli.xyz Signed-off-by: Ahelenia Ziemiańska Cc: Joe Perches Signed-off-by: Andrew Morton --- arch/sparc/kernel/chmc.c | 2 +- arch/sparc/kernel/ds.c | 2 +- drivers/block/sunvdc.c | 2 +- drivers/char/hw_random/n2-drv.c | 2 +- drivers/char/tpm/st33zp24/i2c.c | 2 +- drivers/char/tpm/st33zp24/spi.c | 2 +- drivers/char/tpm/st33zp24/st33zp24.c | 2 +- drivers/char/tpm/tpm-interface.c | 2 +- drivers/char/tpm/tpm_atmel.c | 2 +- drivers/char/tpm/tpm_i2c_nuvoton.c | 2 +- drivers/char/tpm/tpm_nsc.c | 2 +- drivers/char/tpm/tpm_tis.c | 2 +- drivers/char/tpm/tpm_tis_core.c | 2 +- drivers/char/tpm/tpm_vtpm_proxy.c | 2 +- drivers/crypto/n2_core.c | 2 +- drivers/hwmon/dell-smm-hwmon.c | 2 +- drivers/hwmon/ultra45_env.c | 2 +- drivers/i2c/muxes/i2c-mux-mlxcpld.c | 2 +- drivers/leds/leds-sunfire.c | 2 +- drivers/media/common/siano/smscoreapi.c | 2 +- drivers/media/common/siano/smsdvb-main.c | 2 +- drivers/media/dvb-frontends/cx24117.c | 2 +- drivers/media/usb/siano/smsusb.c | 2 +- drivers/net/ethernet/broadcom/tg3.c | 2 +- drivers/net/ethernet/sun/cassini.c | 2 +- drivers/net/ethernet/sun/niu.c | 2 +- drivers/net/ethernet/sun/sunhme.c | 2 +- drivers/net/ethernet/sun/sunvnet.c | 2 +- drivers/net/ethernet/sun/sunvnet_common.c | 2 +- drivers/net/ppp/pptp.c | 2 +- drivers/platform/x86/compal-laptop.c | 2 +- drivers/platform/x86/intel/oaktrail.c | 2 +- drivers/platform/x86/mlx-platform.c | 2 +- drivers/s390/net/fsm.c | 2 +- drivers/sbus/char/openprom.c | 2 +- drivers/scsi/esp_scsi.c | 2 +- drivers/scsi/jazz_esp.c | 2 +- drivers/scsi/mesh.c | 2 +- drivers/scsi/qlogicpti.c | 2 +- drivers/scsi/sun3x_esp.c | 2 +- drivers/scsi/sun_esp.c | 2 +- drivers/video/fbdev/hgafb.c | 2 +- net/ipv4/gre_demux.c | 2 +- net/ipv6/ip6_gre.c | 2 +- net/iucv/iucv.c | 2 +- net/mpls/mpls_gso.c | 2 +- 46 files changed, 46 insertions(+), 46 deletions(-) diff --git a/arch/sparc/kernel/chmc.c b/arch/sparc/kernel/chmc.c index d5fad5fb04c1d9..00e571c30bb5a9 100644 --- a/arch/sparc/kernel/chmc.c +++ b/arch/sparc/kernel/chmc.c @@ -30,7 +30,7 @@ #define PFX DRV_MODULE_NAME ": " #define DRV_MODULE_VERSION "0.2" -MODULE_AUTHOR("David S. Miller (davem@davemloft.net)"); +MODULE_AUTHOR("David S. Miller "); MODULE_DESCRIPTION("UltraSPARC-III memory controller driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_MODULE_VERSION); diff --git a/arch/sparc/kernel/ds.c b/arch/sparc/kernel/ds.c index 4a5bdb0df7797b..ffdc15588ac2e2 100644 --- a/arch/sparc/kernel/ds.c +++ b/arch/sparc/kernel/ds.c @@ -33,7 +33,7 @@ static char version[] = DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n"; -MODULE_AUTHOR("David S. Miller (davem@davemloft.net)"); +MODULE_AUTHOR("David S. Miller "); MODULE_DESCRIPTION("Sun LDOM domain services driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_MODULE_VERSION); diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 7bf4b48e2282e7..9105d7ec671da2 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -28,7 +28,7 @@ static char version[] = DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n"; -MODULE_AUTHOR("David S. Miller (davem@davemloft.net)"); +MODULE_AUTHOR("David S. 
Miller "); MODULE_DESCRIPTION("Sun LDOM virtual disk client driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_MODULE_VERSION); diff --git a/drivers/char/hw_random/n2-drv.c b/drivers/char/hw_random/n2-drv.c index 2e669e7c14d31c..1b49e3a86d57b7 100644 --- a/drivers/char/hw_random/n2-drv.c +++ b/drivers/char/hw_random/n2-drv.c @@ -29,7 +29,7 @@ static char version[] = DRV_MODULE_NAME " v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n"; -MODULE_AUTHOR("David S. Miller (davem@davemloft.net)"); +MODULE_AUTHOR("David S. Miller "); MODULE_DESCRIPTION("Niagara2 RNG driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_MODULE_VERSION); diff --git a/drivers/char/tpm/st33zp24/i2c.c b/drivers/char/tpm/st33zp24/i2c.c index 661574bb0acf59..45ca33b3dcb268 100644 --- a/drivers/char/tpm/st33zp24/i2c.c +++ b/drivers/char/tpm/st33zp24/i2c.c @@ -167,7 +167,7 @@ static struct i2c_driver st33zp24_i2c_driver = { module_i2c_driver(st33zp24_i2c_driver); -MODULE_AUTHOR("TPM support (TPMsupport@list.st.com)"); +MODULE_AUTHOR("TPM support "); MODULE_DESCRIPTION("STM TPM 1.2 I2C ST33 Driver"); MODULE_VERSION("1.3.0"); MODULE_LICENSE("GPL"); diff --git a/drivers/char/tpm/st33zp24/spi.c b/drivers/char/tpm/st33zp24/spi.c index f5811b301d3b22..5149231f3de28b 100644 --- a/drivers/char/tpm/st33zp24/spi.c +++ b/drivers/char/tpm/st33zp24/spi.c @@ -284,7 +284,7 @@ static struct spi_driver st33zp24_spi_driver = { module_spi_driver(st33zp24_spi_driver); -MODULE_AUTHOR("TPM support (TPMsupport@list.st.com)"); +MODULE_AUTHOR("TPM support "); MODULE_DESCRIPTION("STM TPM 1.2 SPI ST33 Driver"); MODULE_VERSION("1.3.0"); MODULE_LICENSE("GPL"); diff --git a/drivers/char/tpm/st33zp24/st33zp24.c b/drivers/char/tpm/st33zp24/st33zp24.c index a5b554cd477861..c0771980bc2ff1 100644 --- a/drivers/char/tpm/st33zp24/st33zp24.c +++ b/drivers/char/tpm/st33zp24/st33zp24.c @@ -582,7 +582,7 @@ int st33zp24_pm_resume(struct device *dev) EXPORT_SYMBOL(st33zp24_pm_resume); #endif -MODULE_AUTHOR("TPM support (TPMsupport@list.st.com)"); +MODULE_AUTHOR("TPM support "); MODULE_DESCRIPTION("ST33ZP24 TPM 1.2 driver"); MODULE_VERSION("1.3.0"); MODULE_LICENSE("GPL"); diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c index 66b16d26eecc78..757336324c904c 100644 --- a/drivers/char/tpm/tpm-interface.c +++ b/drivers/char/tpm/tpm-interface.c @@ -524,7 +524,7 @@ static void __exit tpm_exit(void) subsys_initcall(tpm_init); module_exit(tpm_exit); -MODULE_AUTHOR("Leendert van Doorn (leendert@watson.ibm.com)"); +MODULE_AUTHOR("Leendert van Doorn "); MODULE_DESCRIPTION("TPM Driver"); MODULE_VERSION("2.0"); MODULE_LICENSE("GPL"); diff --git a/drivers/char/tpm/tpm_atmel.c b/drivers/char/tpm/tpm_atmel.c index 54a6750a675781..9fb2defa9dc421 100644 --- a/drivers/char/tpm/tpm_atmel.c +++ b/drivers/char/tpm/tpm_atmel.c @@ -229,7 +229,7 @@ static void __exit cleanup_atmel(void) module_init(init_atmel); module_exit(cleanup_atmel); -MODULE_AUTHOR("Leendert van Doorn (leendert@watson.ibm.com)"); +MODULE_AUTHOR("Leendert van Doorn "); MODULE_DESCRIPTION("TPM Driver"); MODULE_VERSION("2.0"); MODULE_LICENSE("GPL"); diff --git a/drivers/char/tpm/tpm_i2c_nuvoton.c b/drivers/char/tpm/tpm_i2c_nuvoton.c index 5490f7e0fa4369..3c3ee5f551db1b 100644 --- a/drivers/char/tpm/tpm_i2c_nuvoton.c +++ b/drivers/char/tpm/tpm_i2c_nuvoton.c @@ -654,6 +654,6 @@ static struct i2c_driver i2c_nuvoton_driver = { module_i2c_driver(i2c_nuvoton_driver); -MODULE_AUTHOR("Dan Morav (dan.morav@nuvoton.com)"); +MODULE_AUTHOR("Dan Morav "); MODULE_DESCRIPTION("Nuvoton TPM I2C Driver"); 
MODULE_LICENSE("GPL"); diff --git a/drivers/char/tpm/tpm_nsc.c b/drivers/char/tpm/tpm_nsc.c index 038701d4835130..0f62bbc940daa8 100644 --- a/drivers/char/tpm/tpm_nsc.c +++ b/drivers/char/tpm/tpm_nsc.c @@ -410,7 +410,7 @@ static void __exit cleanup_nsc(void) module_init(init_nsc); module_exit(cleanup_nsc); -MODULE_AUTHOR("Leendert van Doorn (leendert@watson.ibm.com)"); +MODULE_AUTHOR("Leendert van Doorn "); MODULE_DESCRIPTION("TPM Driver"); MODULE_VERSION("2.0"); MODULE_LICENSE("GPL"); diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index 2c52b7905b0706..f301b8fc5fcf7a 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c @@ -428,7 +428,7 @@ static void __exit cleanup_tis(void) module_init(init_tis); module_exit(cleanup_tis); -MODULE_AUTHOR("Leendert van Doorn (leendert@watson.ibm.com)"); +MODULE_AUTHOR("Leendert van Doorn "); MODULE_DESCRIPTION("TPM Driver"); MODULE_VERSION("2.0"); MODULE_LICENSE("GPL"); diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c index 1b350412d8a6be..a57cf698e18d51 100644 --- a/drivers/char/tpm/tpm_tis_core.c +++ b/drivers/char/tpm/tpm_tis_core.c @@ -1361,7 +1361,7 @@ int tpm_tis_resume(struct device *dev) EXPORT_SYMBOL_GPL(tpm_tis_resume); #endif -MODULE_AUTHOR("Leendert van Doorn (leendert@watson.ibm.com)"); +MODULE_AUTHOR("Leendert van Doorn "); MODULE_DESCRIPTION("TPM Driver"); MODULE_VERSION("2.0"); MODULE_LICENSE("GPL"); diff --git a/drivers/char/tpm/tpm_vtpm_proxy.c b/drivers/char/tpm/tpm_vtpm_proxy.c index 30e953988cabe9..11c502039faf58 100644 --- a/drivers/char/tpm/tpm_vtpm_proxy.c +++ b/drivers/char/tpm/tpm_vtpm_proxy.c @@ -711,7 +711,7 @@ static void __exit vtpm_module_exit(void) module_init(vtpm_module_init); module_exit(vtpm_module_exit); -MODULE_AUTHOR("Stefan Berger (stefanb@us.ibm.com)"); +MODULE_AUTHOR("Stefan Berger "); MODULE_DESCRIPTION("vTPM Driver"); MODULE_VERSION("0.1"); MODULE_LICENSE("GPL"); diff --git a/drivers/crypto/n2_core.c b/drivers/crypto/n2_core.c index 7a3083debc2bb6..59d472cb11e750 100644 --- a/drivers/crypto/n2_core.c +++ b/drivers/crypto/n2_core.c @@ -41,7 +41,7 @@ static const char version[] = DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n"; -MODULE_AUTHOR("David S. Miller (davem@davemloft.net)"); +MODULE_AUTHOR("David S. Miller "); MODULE_DESCRIPTION("Niagara2 Crypto driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_MODULE_VERSION); diff --git a/drivers/hwmon/dell-smm-hwmon.c b/drivers/hwmon/dell-smm-hwmon.c index 6d8c0f328b7bbf..88e48e30c893c2 100644 --- a/drivers/hwmon/dell-smm-hwmon.c +++ b/drivers/hwmon/dell-smm-hwmon.c @@ -108,7 +108,7 @@ struct dell_smm_cooling_data { struct dell_smm_data *data; }; -MODULE_AUTHOR("Massimo Dal Zotto (dz@debian.org)"); +MODULE_AUTHOR("Massimo Dal Zotto "); MODULE_AUTHOR("Pali Rohár "); MODULE_DESCRIPTION("Dell laptop SMM BIOS hwmon driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/hwmon/ultra45_env.c b/drivers/hwmon/ultra45_env.c index 9823afb0675a06..2765d5f1b7f05c 100644 --- a/drivers/hwmon/ultra45_env.c +++ b/drivers/hwmon/ultra45_env.c @@ -18,7 +18,7 @@ #define DRV_MODULE_VERSION "0.1" -MODULE_AUTHOR("David S. Miller (davem@davemloft.net)"); +MODULE_AUTHOR("David S. 
Miller "); MODULE_DESCRIPTION("Ultra45 environmental monitor driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_MODULE_VERSION); diff --git a/drivers/i2c/muxes/i2c-mux-mlxcpld.c b/drivers/i2c/muxes/i2c-mux-mlxcpld.c index 3dda00f1df78da..4c6ed1d58c79a3 100644 --- a/drivers/i2c/muxes/i2c-mux-mlxcpld.c +++ b/drivers/i2c/muxes/i2c-mux-mlxcpld.c @@ -187,7 +187,7 @@ static struct platform_driver mlxcpld_mux_driver = { module_platform_driver(mlxcpld_mux_driver); -MODULE_AUTHOR("Michael Shych (michaels@mellanox.com)"); +MODULE_AUTHOR("Michael Shych "); MODULE_DESCRIPTION("Mellanox I2C-CPLD-MUX driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("platform:i2c-mux-mlxcpld"); diff --git a/drivers/leds/leds-sunfire.c b/drivers/leds/leds-sunfire.c index 6fd89efb420aa3..a621e5e5c75c5f 100644 --- a/drivers/leds/leds-sunfire.c +++ b/drivers/leds/leds-sunfire.c @@ -17,7 +17,7 @@ #include #include -MODULE_AUTHOR("David S. Miller (davem@davemloft.net)"); +MODULE_AUTHOR("David S. Miller "); MODULE_DESCRIPTION("Sun Fire LED driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/media/common/siano/smscoreapi.c b/drivers/media/common/siano/smscoreapi.c index 7d4bc2733f2b07..7ebcb10126c9c3 100644 --- a/drivers/media/common/siano/smscoreapi.c +++ b/drivers/media/common/siano/smscoreapi.c @@ -2155,7 +2155,7 @@ module_init(smscore_module_init); module_exit(smscore_module_exit); MODULE_DESCRIPTION("Siano MDTV Core module"); -MODULE_AUTHOR("Siano Mobile Silicon, Inc. (uris@siano-ms.com)"); +MODULE_AUTHOR("Siano Mobile Silicon, Inc. "); MODULE_LICENSE("GPL"); /* This should match what's defined at smscoreapi.h */ diff --git a/drivers/media/common/siano/smsdvb-main.c b/drivers/media/common/siano/smsdvb-main.c index f80caaa333daf5..d893a0e4672b2f 100644 --- a/drivers/media/common/siano/smsdvb-main.c +++ b/drivers/media/common/siano/smsdvb-main.c @@ -1267,5 +1267,5 @@ module_init(smsdvb_module_init); module_exit(smsdvb_module_exit); MODULE_DESCRIPTION("SMS DVB subsystem adaptation module"); -MODULE_AUTHOR("Siano Mobile Silicon, Inc. (uris@siano-ms.com)"); +MODULE_AUTHOR("Siano Mobile Silicon, Inc. "); MODULE_LICENSE("GPL"); diff --git a/drivers/media/dvb-frontends/cx24117.c b/drivers/media/dvb-frontends/cx24117.c index ac6e47d81b9ebd..75fc7ad263d05c 100644 --- a/drivers/media/dvb-frontends/cx24117.c +++ b/drivers/media/dvb-frontends/cx24117.c @@ -1647,7 +1647,7 @@ static const struct dvb_frontend_ops cx24117_ops = { MODULE_DESCRIPTION("DVB Frontend module for Conexant cx24117/cx24132 hardware"); -MODULE_AUTHOR("Luis Alves (ljalvs@gmail.com)"); +MODULE_AUTHOR("Luis Alves "); MODULE_LICENSE("GPL"); MODULE_VERSION("1.1"); MODULE_FIRMWARE(CX24117_DEFAULT_FIRMWARE); diff --git a/drivers/media/usb/siano/smsusb.c b/drivers/media/usb/siano/smsusb.c index 9d9e14c858e670..723510520d092b 100644 --- a/drivers/media/usb/siano/smsusb.c +++ b/drivers/media/usb/siano/smsusb.c @@ -724,5 +724,5 @@ static struct usb_driver smsusb_driver = { module_usb_driver(smsusb_driver); MODULE_DESCRIPTION("Driver for the Siano SMS1xxx USB dongle"); -MODULE_AUTHOR("Siano Mobile Silicon, INC. (uris@siano-ms.com)"); +MODULE_AUTHOR("Siano Mobile Silicon, Inc. 
"); MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 04964bbe08cf33..61a0168f9b0eb2 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -221,7 +221,7 @@ static inline void _tg3_flag_clear(enum TG3_FLAGS flag, unsigned long *bits) #define FIRMWARE_TG3TSO "tigon/tg3_tso.bin" #define FIRMWARE_TG3TSO5 "tigon/tg3_tso5.bin" -MODULE_AUTHOR("David S. Miller (davem@redhat.com) and Jeff Garzik (jgarzik@pobox.com)"); +MODULE_AUTHOR("David S. Miller and Jeff Garzik "); MODULE_DESCRIPTION("Broadcom Tigon3 ethernet driver"); MODULE_LICENSE("GPL"); MODULE_FIRMWARE(FIRMWARE_TG3); diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c index b317b94864554a..bfb90350636717 100644 --- a/drivers/net/ethernet/sun/cassini.c +++ b/drivers/net/ethernet/sun/cassini.c @@ -176,7 +176,7 @@ static char version[] = static int cassini_debug = -1; /* -1 == use CAS_DEF_MSG_ENABLE as value */ static int link_mode; -MODULE_AUTHOR("Adrian Sun (asun@darksunrising.com)"); +MODULE_AUTHOR("Adrian Sun "); MODULE_DESCRIPTION("Sun Cassini(+) ethernet driver"); MODULE_LICENSE("GPL"); MODULE_FIRMWARE("sun/cassini.bin"); diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index 21431f43e4c223..f68aa813d4fb10 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -61,7 +61,7 @@ union niu_page { static char version[] = DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n"; -MODULE_AUTHOR("David S. Miller (davem@davemloft.net)"); +MODULE_AUTHOR("David S. Miller "); MODULE_DESCRIPTION("NIU ethernet driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_MODULE_VERSION); diff --git a/drivers/net/ethernet/sun/sunhme.c b/drivers/net/ethernet/sun/sunhme.c index b983b9c23be68a..50ace461a1af4d 100644 --- a/drivers/net/ethernet/sun/sunhme.c +++ b/drivers/net/ethernet/sun/sunhme.c @@ -59,7 +59,7 @@ #define DRV_NAME "sunhme" -MODULE_AUTHOR("David S. Miller (davem@davemloft.net)"); +MODULE_AUTHOR("David S. Miller "); MODULE_DESCRIPTION("Sun HappyMealEthernet(HME) 10/100baseT ethernet driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c index e220620d0ffc90..2f30715e9b67f0 100644 --- a/drivers/net/ethernet/sun/sunvnet.c +++ b/drivers/net/ethernet/sun/sunvnet.c @@ -44,7 +44,7 @@ static char version[] = DRV_MODULE_NAME " " DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")"; -MODULE_AUTHOR("David S. Miller (davem@davemloft.net)"); +MODULE_AUTHOR("David S. Miller "); MODULE_DESCRIPTION("Sun LDOM virtual network driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_MODULE_VERSION); diff --git a/drivers/net/ethernet/sun/sunvnet_common.c b/drivers/net/ethernet/sun/sunvnet_common.c index 3525d5c0d694ca..dbe51524b275da 100644 --- a/drivers/net/ethernet/sun/sunvnet_common.c +++ b/drivers/net/ethernet/sun/sunvnet_common.c @@ -39,7 +39,7 @@ */ #define VNET_MAX_RETRIES 10 -MODULE_AUTHOR("David S. Miller (davem@davemloft.net)"); +MODULE_AUTHOR("David S. Miller "); MODULE_DESCRIPTION("Sun LDOM virtual network support library"); MODULE_LICENSE("GPL"); MODULE_VERSION("1.1"); diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index 6833ef0c79305f..689687bd2574bc 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -694,6 +694,6 @@ module_init(pptp_init_module); module_exit(pptp_exit_module); MODULE_DESCRIPTION("Point-to-Point Tunneling Protocol"); -MODULE_AUTHOR("D. 
Kozlov (xeb@mail.ru)"); +MODULE_AUTHOR("D. Kozlov "); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO(PF_PPPOX, PX_PROTO_PPTP); diff --git a/drivers/platform/x86/compal-laptop.c b/drivers/platform/x86/compal-laptop.c index 61c745490d714d..5546fb18949130 100644 --- a/drivers/platform/x86/compal-laptop.c +++ b/drivers/platform/x86/compal-laptop.c @@ -1107,7 +1107,7 @@ module_init(compal_init); module_exit(compal_cleanup); MODULE_AUTHOR("Cezary Jackiewicz"); -MODULE_AUTHOR("Roald Frederickx (roald.frederickx@gmail.com)"); +MODULE_AUTHOR("Roald Frederickx "); MODULE_DESCRIPTION("Compal Laptop Support"); MODULE_VERSION(DRIVER_VERSION); MODULE_LICENSE("GPL"); diff --git a/drivers/platform/x86/intel/oaktrail.c b/drivers/platform/x86/intel/oaktrail.c index fa720967e69bfb..217630f40c3f8b 100644 --- a/drivers/platform/x86/intel/oaktrail.c +++ b/drivers/platform/x86/intel/oaktrail.c @@ -365,7 +365,7 @@ static void __exit oaktrail_cleanup(void) module_init(oaktrail_init); module_exit(oaktrail_cleanup); -MODULE_AUTHOR("Yin Kangkai (kangkai.yin@intel.com)"); +MODULE_AUTHOR("Yin Kangkai "); MODULE_DESCRIPTION("Intel Oaktrail Platform ACPI Extras"); MODULE_VERSION(DRIVER_VERSION); MODULE_LICENSE("GPL"); diff --git a/drivers/platform/x86/mlx-platform.c b/drivers/platform/x86/mlx-platform.c index 32981e2ad3b390..9d70146fd7420a 100644 --- a/drivers/platform/x86/mlx-platform.c +++ b/drivers/platform/x86/mlx-platform.c @@ -6659,6 +6659,6 @@ static void __exit mlxplat_exit(void) } module_exit(mlxplat_exit); -MODULE_AUTHOR("Vadim Pasternak (vadimp@mellanox.com)"); +MODULE_AUTHOR("Vadim Pasternak "); MODULE_DESCRIPTION("Mellanox platform driver"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/s390/net/fsm.c b/drivers/s390/net/fsm.c index 0ff61d00feb19c..8672d225ba77fe 100644 --- a/drivers/s390/net/fsm.c +++ b/drivers/s390/net/fsm.c @@ -9,7 +9,7 @@ #include #include -MODULE_AUTHOR("(C) 2000 IBM Corp. by Fritz Elfert (felfert@millenux.com)"); +MODULE_AUTHOR("(C) 2000 IBM Corp. by Fritz Elfert "); MODULE_DESCRIPTION("Finite state machine helper functions"); MODULE_LICENSE("GPL"); diff --git a/drivers/sbus/char/openprom.c b/drivers/sbus/char/openprom.c index 30b9751aad302b..cc178874c4a662 100644 --- a/drivers/sbus/char/openprom.c +++ b/drivers/sbus/char/openprom.c @@ -33,7 +33,7 @@ #include #endif -MODULE_AUTHOR("Thomas K. Dyas (tdyas@noc.rutgers.edu) and Eddie C. Dost (ecd@skynet.be)"); +MODULE_AUTHOR("Thomas K. Dyas and Eddie C. Dost "); MODULE_DESCRIPTION("OPENPROM Configuration Driver"); MODULE_LICENSE("GPL"); MODULE_VERSION("1.0"); diff --git a/drivers/scsi/esp_scsi.c b/drivers/scsi/esp_scsi.c index 97816a0e6240a9..0175d2282b4581 100644 --- a/drivers/scsi/esp_scsi.c +++ b/drivers/scsi/esp_scsi.c @@ -2753,7 +2753,7 @@ static void __exit esp_exit(void) } MODULE_DESCRIPTION("ESP SCSI driver core"); -MODULE_AUTHOR("David S. Miller (davem@davemloft.net)"); +MODULE_AUTHOR("David S. 
Miller "); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); diff --git a/drivers/scsi/jazz_esp.c b/drivers/scsi/jazz_esp.c index 494a671fb5564d..fb04b0b515ab1f 100644 --- a/drivers/scsi/jazz_esp.c +++ b/drivers/scsi/jazz_esp.c @@ -204,6 +204,6 @@ static struct platform_driver esp_jazz_driver = { module_platform_driver(esp_jazz_driver); MODULE_DESCRIPTION("JAZZ ESP SCSI driver"); -MODULE_AUTHOR("Thomas Bogendoerfer (tsbogend@alpha.franken.de)"); +MODULE_AUTHOR("Thomas Bogendoerfer "); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); diff --git a/drivers/scsi/mesh.c b/drivers/scsi/mesh.c index e276583c590c38..0a48da52d1dc40 100644 --- a/drivers/scsi/mesh.c +++ b/drivers/scsi/mesh.c @@ -54,7 +54,7 @@ #define KERN_DEBUG KERN_WARNING #endif -MODULE_AUTHOR("Paul Mackerras (paulus@samba.org)"); +MODULE_AUTHOR("Paul Mackerras "); MODULE_DESCRIPTION("PowerMac MESH SCSI driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/scsi/qlogicpti.c b/drivers/scsi/qlogicpti.c index 5d560d9b894405..6177f4798f3ac9 100644 --- a/drivers/scsi/qlogicpti.c +++ b/drivers/scsi/qlogicpti.c @@ -1468,7 +1468,7 @@ static struct platform_driver qpti_sbus_driver = { module_platform_driver(qpti_sbus_driver); MODULE_DESCRIPTION("QlogicISP SBUS driver"); -MODULE_AUTHOR("David S. Miller (davem@davemloft.net)"); +MODULE_AUTHOR("David S. Miller "); MODULE_LICENSE("GPL"); MODULE_VERSION("2.1"); MODULE_FIRMWARE("qlogic/isp1000.bin"); diff --git a/drivers/scsi/sun3x_esp.c b/drivers/scsi/sun3x_esp.c index 09219c362acccf..e20f314cf3e7d7 100644 --- a/drivers/scsi/sun3x_esp.c +++ b/drivers/scsi/sun3x_esp.c @@ -273,7 +273,7 @@ static struct platform_driver esp_sun3x_driver = { module_platform_driver(esp_sun3x_driver); MODULE_DESCRIPTION("Sun3x ESP SCSI driver"); -MODULE_AUTHOR("Thomas Bogendoerfer (tsbogend@alpha.franken.de)"); +MODULE_AUTHOR("Thomas Bogendoerfer "); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); MODULE_ALIAS("platform:sun3x_esp"); diff --git a/drivers/scsi/sun_esp.c b/drivers/scsi/sun_esp.c index 64a7c2c6c5ff42..5ce6c9d19d1e61 100644 --- a/drivers/scsi/sun_esp.c +++ b/drivers/scsi/sun_esp.c @@ -608,6 +608,6 @@ static struct platform_driver esp_sbus_driver = { module_platform_driver(esp_sbus_driver); MODULE_DESCRIPTION("Sun ESP SCSI driver"); -MODULE_AUTHOR("David S. Miller (davem@davemloft.net)"); +MODULE_AUTHOR("David S. Miller "); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); diff --git a/drivers/video/fbdev/hgafb.c b/drivers/video/fbdev/hgafb.c index 264c8cedba159a..c3bc5b78b749a2 100644 --- a/drivers/video/fbdev/hgafb.c +++ b/drivers/video/fbdev/hgafb.c @@ -670,7 +670,7 @@ static void __exit hgafb_exit(void) * * ------------------------------------------------------------------------- */ -MODULE_AUTHOR("Ferenc Bakonyi (fero@drama.obuda.kando.hu)"); +MODULE_AUTHOR("Ferenc Bakonyi "); MODULE_DESCRIPTION("FBDev driver for Hercules Graphics Adaptor"); MODULE_LICENSE("GPL"); diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index cbb2b4bb0dfac5..3757fd93523f0f 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -217,5 +217,5 @@ module_init(gre_init); module_exit(gre_exit); MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver"); -MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); +MODULE_AUTHOR("D. 
Kozlov "); MODULE_LICENSE("GPL"); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 070d87abf7c028..2dd77c2ba218b2 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -2405,7 +2405,7 @@ static void __exit ip6gre_fini(void) module_init(ip6gre_init); module_exit(ip6gre_fini); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); +MODULE_AUTHOR("D. Kozlov "); MODULE_DESCRIPTION("GRE over IPv6 tunneling device"); MODULE_ALIAS_RTNL_LINK("ip6gre"); MODULE_ALIAS_RTNL_LINK("ip6gretap"); diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 6334f64f04d5f2..1f27bc26c79009 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -1903,6 +1903,6 @@ static void __exit iucv_exit(void) subsys_initcall(iucv_init); module_exit(iucv_exit); -MODULE_AUTHOR("(C) 2001 IBM Corp. by Fritz Elfert (felfert@millenux.com)"); +MODULE_AUTHOR("(C) 2001 IBM Corp. by Fritz Elfert "); MODULE_DESCRIPTION("Linux for S/390 IUCV lowlevel driver"); MODULE_LICENSE("GPL"); diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index 533d082f0701e5..f779b4f23b78d3 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -106,5 +106,5 @@ module_init(mpls_gso_init); module_exit(mpls_gso_exit); MODULE_DESCRIPTION("MPLS GSO support"); -MODULE_AUTHOR("Simon Horman (horms@verge.net.au)"); +MODULE_AUTHOR("Simon Horman "); MODULE_LICENSE("GPL"); From cded526a309ee87482a3048875a75436a3bab150 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Sun, 18 Feb 2024 16:25:51 -0300 Subject: [PATCH 1372/1406] const_structs.checkpatch: add device_type Since commit aed65af1cc2f ("drivers: make device_type const"), the driver core can properly handle constant struct device_type. Make sure that new usages of the struct already enter the tree as const. Link: https://lkml.kernel.org/r/20240218-device_cleanup-checkpatch-v1-1-8b0b89c4f6b1@marliere.net Signed-off-by: Ricardo B. 
Marliere Reviewed-by: Greg Kroah-Hartman Cc: Joe Perches Signed-off-by: Andrew Morton --- scripts/const_structs.checkpatch | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/const_structs.checkpatch b/scripts/const_structs.checkpatch index 7427313adc7aab..fa96cfd16e998d 100644 --- a/scripts/const_structs.checkpatch +++ b/scripts/const_structs.checkpatch @@ -8,6 +8,7 @@ comedi_lrange component_ops dentry_operations dev_pm_ops +device_type dma_map_ops driver_info drm_connector_funcs From 4893c639cc3659cefaa675bf1e59f4e7571afb5c Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 21 Feb 2024 16:41:44 +1100 Subject: [PATCH 1373/1406] Add linux-next specific files for 20240221 Signed-off-by: Stephen Rothwell --- Next/SHA1s | 371 ++ Next/Trees | 373 ++ Next/merge.log | 10879 ++++++++++++++++++++++++++++++++++++++++++++ localversion-next | 1 + 4 files changed, 11624 insertions(+) create mode 100644 Next/SHA1s create mode 100644 Next/Trees create mode 100644 Next/merge.log create mode 100644 localversion-next diff --git a/Next/SHA1s b/Next/SHA1s new file mode 100644 index 00000000000000..8c223fa6fa93be --- /dev/null +++ b/Next/SHA1s @@ -0,0 +1,371 @@ +Name SHA1 +---- ---- +origin fca7526b7d8910c6125cb1ebc3e78ccd5f50ec52 +fixes 2dde18cd1d8fac735875f2e4987f11817cc0bc2c +mm-hotfixes 0eb702ab51ac8e631795cd92f2c672ae40864b21 +kbuild-current b401b621758e46812da61fa58a67c3fd8d91de0d +arc-current 861deac3b092f37b2c5e6871732f3e11486f7082 +arm-current f54e8634d1366926c807e2af6125b33cff555fa7 +arm64-fixes d7b77a0d565b048cb0808fa8a4fb031352b22a01 +arm-soc-fixes 78b6f8e7379b5399d1804f0852bb2ddabd049019 +davinci-current 6613476e225e090cc9aad49be7fa504e290dd33d +drivers-memory-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +sophgo-fixes 41bccc98fb7931d63d03f326a746ac4d429c1dd3 +tee-fixes ceaa837f96adb69c0df0397937cd74991d5d821a +m68k-current e8a7824856def1c8608401b0d7d05566d6e81c95 +powerpc-fixes 20c8c4dafe93e82441583e93bd68c0d256d7bed4 +s390-fixes 124468af7e769a52d27c3290007ac6e2ba346ccd +fscrypt-current 4bcf6f827a79c59806c695dc280e763c5b6a6813 +fsverity-current a075bacde257f755bea0e53400c9f1cdd1b8e8e6 +net 23f9c2c066e7e5052406fb8f04a115d3d0260b22 +bpf 5c138a8a4abe152fcbef1ed40a6a4b5727b2991b +ipsec 983a73da1f996faee9997149eb05b12fa7bd8cbf +netfilter 40b9385dd8e6a0515e1c9cd06a277483556b7286 +ipvs 84443741faab9045d53f022a9ac6a6633067a481 +wireless f78c1375339a291cba492a70eaf12ec501d28a8e +wpan b85ea95d086471afb4ad062012a4d73cd328fa86 +rdma-fixes eb5c7465c3240151cd42a55c7ace9da0026308a1 +sound-current 49cbb7b7d36ec3ba73ce1daf7ae1d71d435453b8 +sound-asoc-fixes 0db0c1770834f39e11a2902e20e1f11a482f4465 +regmap-fixes 2f0dbb24f78a333433a2b875c0b76bf55c119cd4 +regulator-fixes e5d40e9afd84cec01cdbbbfe62d52f89959ab3ee +spi-fixes 269e31aecdd0b70f53a05def79480f15cbcc0fd6 +pci-current 6613476e225e090cc9aad49be7fa504e290dd33d +driver-core.current b401b621758e46812da61fa58a67c3fd8d91de0d +tty.current 3b69e32e151bc4a4e3c785cbdb1f918d5ee337ed +usb.current 69f89168b310878be82d7d97bc0d22068ad858c0 +usb-serial-fixes 54be6c6c5ae8e0d93a6c4641cb7528eb0b6ba478 +phy d4c08d8b23b22807c712208cd05cb047e92e7672 +staging.current 6613476e225e090cc9aad49be7fa504e290dd33d +iio-fixes 78367c32bebfe833cd30c855755d863a4ff3fdee +counter-current c83ccdc9586b3e9882da9e27507c046751999d59 +char-misc.current daaf5286b6d2528a73c651aa2d4059bc1bd67c2e +soundwire-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +thunderbolt-fixes d3d17e23d1a0d1f959b4fa55b35f1802d9c584fa +input-current 4255447ad34c5c3785fcdcf76cfa0271d6e5ed39 +crypto-current 
c0ec2a712daf133d9996a8a1b7ee2d4996080363 +vfio-fixes 4ea95c04fa6b9043a1a301240996aeebe3cb28ec +kselftest-fixes b54761f6e9773350c0d1fb8e1e5aacaba7769d0f +modules-fixes f412eef03938d3a40d4f6f5a79d0f98ed89b596d +dmaengine-fixes a79f949a5ce1d45329d63742c2a995f2b47f9852 +backlight-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +mtd-fixes e6a30d0c48a1e8a68f1cc413bee65302ab03ddfb +mfd-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +v4l-dvb-fixes 346c84e281a963437b9fe9dfcd92c531630289de +reset-fixes 4a6756f56bcf8e64c87144a626ce53aea4899c0e +mips-fixes b401b621758e46812da61fa58a67c3fd8d91de0d +at91-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +omap-fixes 9b6a51aab5f5f9f71d2fa16e8b4d530e1643dfcb +kvm-fixes 9895ceeb5cd61092f147f8d611e2df575879dd6f +kvms390-fixes 83303a4c776ce1032d88df59e811183479acea77 +hwmon-fixes 841c35169323cd833294798e58b9bf63fa4fa1de +nvdimm-fixes 33908660e814203e996f6e775d033c5c32fcf9a7 +cxl-fixes daeacfa75d08954e1a5b71c36a8fbfcdd0b3fec9 +btrfs-fixes eb90d142fc1bc3bc59d42f14e7af7b7e3508f2e0 +vfs-fixes 2c88c16dc20e88dd54d2f6f4d01ae1dce6cc9654 +dma-mapping-fixes d5090484b021794271280ab64d20253883b7f6fd +drivers-x86-fixes 1abdf288b0ef5606f76b6e191fa6df05330e3d7e +samsung-krzk-fixes eab4f56d3e75dad697acf8dc2c8be3c341d6c63e +pinctrl-samsung-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +devicetree-fixes 4e06ec0774f5bebf10e27bc7a5ace4b48ae0fa56 +dt-krzk-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +scsi-fixes 9ddf190a7df77b77817f955fdb9c2ae9d1c9c9a3 +drm-fixes b401b621758e46812da61fa58a67c3fd8d91de0d +drm-intel-fixes b401b621758e46812da61fa58a67c3fd8d91de0d +mmc-fixes 6b1ba3f9040be5efc4396d86c9752cdc564730be +rtc-fixes 08279468a294d8c996a657ecc9e51bd5c084c75d +gnss-fixes 54be6c6c5ae8e0d93a6c4641cb7528eb0b6ba478 +hyperv-fixes 564eac2860bdbe6ac651e6909ac07ecd93d778f3 +soc-fsl-fixes 06c2afb862f9da8dc5efa4b6076a0e48c3fbaaa5 +risc-v-fixes 3951f6add519a8e954bf78691a412f65b24f4715 +riscv-dt-fixes ce6b6d1513965f500a05f3facf223fa01fd74920 +riscv-soc-fixes bf456162601ff701267a33a082350cceee8b1f5f +fpga-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +spdx 6613476e225e090cc9aad49be7fa504e290dd33d +gpio-brgl-fixes ae366ba8576da0135d7d3db2dfa6304f3338d0c2 +gpio-intel-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +pinctrl-intel-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +auxdisplay-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +erofs-fixes d9281660ff3ffb4a05302b485cc59a87e709aefc +kunit-fixes 4b758d70257d5c7830c5411639b03541336c9bf7 +ubifs-fixes 2241ab53cbb5cdb08a6b2d4688feb13971058f65 +memblock-fixes 6a9531c3a88096a26cf3ac582f7ec44f94a7dcb2 +nfsd-fixes 5ea9a7c5fe4149f165f0e3b624fe08df02b6c301 +renesas-fixes 9eab43facdadb7d00456c2657001ae2e5353c814 +perf-current fdd0ae72b34e56eb5e896d067c49a78ecb451032 +efi-fixes e258b85f1c3c9122fe4592a0cf99669c60df35e1 +zstd-fixes 77618db346455129424fadbbaec596a09feaf3bb +battery-fixes 2df70149e73e79783bcbc7db4fa51ecef0e2022c +uml-fixes 73a23d7710331a530e972903318528b75e5a5f58 +iommufd-fixes 28b9f669e10f5584aba9856c5aa9d86d64ec9f69 +rust-fixes b401b621758e46812da61fa58a67c3fd8d91de0d +v9fs-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +w1-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +pmdomain-fixes eb5555d422d0fc325e1574a7353d3c616f82d8b5 +overlayfs-fixes 420332b94119cdc7db4477cc88484691cb92ae71 +i2c-host-fixes eb9f7f654f251b57db310eab90bbae5876898ae3 +drm-misc-fixes eb0d253ff9c74dee30aa92fe460b825eb28acd73 +mm-stable b401b621758e46812da61fa58a67c3fd8d91de0d +mm-nonmm-stable b401b621758e46812da61fa58a67c3fd8d91de0d +mm 
e36e2a37ad2edf087947433fa779a3d7b180eb6b +kbuild ba3b759fb688c09cd9b09852d2728b012cf040ba +clang-format 5a205c6a9f79d14db38006aa2d7c4f4e76b1bfc7 +perf 81901fc0640dc8d0210a24c43edff5018b4cc047 +compiler-attributes 2993eb7a8d34aee6165e1f6676e81cdf1d22aa62 +dma-mapping 7c65aa3cc072cee76f577262fbe381a111a98774 +asm-generic 34b2321cc648a246d08cc51e423532eac690ccf1 +arc 0bb80ecc33a8fb5a682236443c1e740d5c917d1d +arm 8790fade1a19caf714ba1d91ce1fdceb9f2067f2 +arm64 19cf01c42e5b5a86c38de5048a623d4589a38b57 +arm-perf fd185a245155be9cb90839fa451ba8f2c3e4004c +arm-soc 37156e9b997483f1d08db52ef298b9878394eef0 +amlogic 8026dced77f2662a228a6f92ea4bceacc5b735d3 +asahi-soc ffc253263a1375a65fa6c9f62a893e9767fbebfa +aspeed e60f7a99d3789b5d0b24d3c0571b013309e56815 +at91 859f600457ccfe176d7887bc753bcdb83734c1bd +broadcom 412c6bd2c649bcacc057e274dccb0b93f69fd1c7 +davinci 6613476e225e090cc9aad49be7fa504e290dd33d +drivers-memory 2f542c937c48c2bd5a8ddf180b417fbe7152559f +imx-mxs d93b6c641bc962a298bd8629c6669e97942d1231 +mediatek ba90af39ba57b3fe3ecfdba0c87a80d20c7b788d +mvebu 476887312c6082e2c03efc3f016e8134c076108e +omap 0012c1958460386adc5770baf2f53206aed77ff3 +qcom f6265e31fc717283224752dde476128191737d69 +renesas 0c096fb42ae57ad9733eb6c0e0ba8b1d856ddfc3 +reset c3c46acd5be9a3351c163d2869045cab4d5342dc +rockchip 504c4c60e70bf27612acde066f3b3c93cd4f513e +samsung-krzk d9e0e7c68345ccaa06cfe6e3c34400670aef0064 +scmi f49191cdf83499c333b1d2f842c262592c33554a +sophgo 41bccc98fb7931d63d03f326a746ac4d429c1dd3 +stm32 7fd195f01ae52871b04b752447f865e1a6661487 +sunxi 5db172482d9de4ec130c8197881573fe78e77332 +tee 58ea7e692a9e95c8d945d1febf7cec22ca00a8f0 +tegra fc9699999179f0cde82828506ea7e8c4b78e02f9 +ti 0ad5d338af6d04ae8c389120ecef0e9699f4f397 +xilinx 2d81f5ef567ce96f29e698939673226d2d1b0fcb +clk efe5a1b888ab0f6acf723e2a12a4644a599294d0 +clk-imx 6613476e225e090cc9aad49be7fa504e290dd33d +clk-renesas 81a7a88a98062ffcd8d7d5ac3b540a96dbff5490 +csky 2c40c1c6adab90ee4660caf03722b3a3ec67767b +loongarch cca5efe77a6a2d02b3da4960f799fa233e460ab1 +m68k 6b9c045b0602cf64b33ea6da5e6aa6f81dd47ae8 +m68knommu b401b621758e46812da61fa58a67c3fd8d91de0d +microblaze 6613476e225e090cc9aad49be7fa504e290dd33d +mips 3c35da51f77e45f345b229df4b3adebad82ce4ef +openrisc c289330331eb93bc6a3c68b9119ccd7d4285a4a2 +parisc-hd 3a34e3fcdd835cc18e7e54bd835451a82828b72e +powerpc 3281366a8e79a512956382885091565db1036b64 +soc-fsl fb9c384625dd604e8a5be1f42b35e83104b90670 +risc-v cb4ede926134a65bc3bf90ed58dace8451d7e759 +riscv-dt 5669bb5a16a0b8bfc3f2877dbfcd77b62cc9ebf4 +riscv-soc 6613476e225e090cc9aad49be7fa504e290dd33d +s390 cba7aa7faf8624b2e9c865cc125c9c1949775f0d +sh 0a2d3ce0031f504b2e3ad47625e149ad5759ad33 +uml 83aec96c631e0fa75cfe6d6a1b113a32151aaa88 +xtensa 7ab7acb68adf053c78a2cdf32bf1a3dce95912ec +bcachefs c887148ebf9989ce8bdf6f814d4342ba5bf465fa +pidfd a901a3568fd26ca9c4a82d8bc5ed5b3ed844d451 +fscrypt d3a7bd4200762d11c33ebe7e2c47c5813ddc65b4 +afs abcbd3bfbbfe97a8912d0c929d4aa18f50d9bc52 +btrfs d3cfdbb1ea50e9b669524fa4e45553e36de58c15 +ceph dbc347ef7f0c53aa4a5383238a804d7ebbb0b5ca +cifs 0ab0a5fed47634cef2a49b80a35197880aa96658 +configfs 4425c1d9b44ded655d2668e1ce95a62bccf7b21b +ecryptfs a3d78fe3e1ae8c6a1901635c54a1a799656f72c8 +erofs aa12a790d31be14b289d5a2c6f41ca535fcc7841 +exfat 3a7845041eb7235f2fb00ef0960995da5be63b11 +exportfs 42c3732fa8073717dd7d924472f1c0bc5b452fdc +ext3 21174ac99fe471b793a188c710bb96d3477444f4 +ext4 ec9d669eba4c276d00af88951947fe0e82a6b84c +f2fs 21ec68234826b1b54ab980a8df6e33c74cfbee58 +fsverity 8e43fb06e10d2c811797740dd578c5099a3e6378 
+fuse 3f29f1c336c0e8a4bec52f1e5217f88835553e5b +gfs2 6b89b6af459fdd6f2741d0c2e33c67af8193697e +jfs e42e29cc442395d62f1a8963ec2dfb700ba6a5d7 +ksmbd b401b621758e46812da61fa58a67c3fd8d91de0d +nfs 052d534373b7ed33712a63d5e17b2b6cdbce84fd +nfs-anna 57331a59ac0d680f606403eb24edd3c35aecba31 +nfsd 26102396d4e0559e0cf147e1f36123d7ee6afaca +ntfs3 622cd3daa8eae37359a6fd3c07c36d19f66606b5 +orangefs 9bf93dcfc453fae192fe5d7874b89699e8f800ac +overlayfs d17bb4620f90f81d8a8a45c3d025c679a1b5efcd +ubifs adbf4c4954e33e623897058a617c583d65a177f6 +v9fs be3193e58ec210b2a72fb1134c2a0695088a911d +v9fs-ericvh be57855f505003c5cafff40338d5d0f23b00ba4d +xfs 49c379d3a72ab86aafeafebe6b43577acb1ef359 +zonefs 567e629fd296561aacd04547a603b163de3dabbe +iomap 3ac974796e5d94509b85a403449132ea660127c2 +djw-vfs ce85a1e04645b1ed386b074297df27ab5b8801c0 +file-locks e0152e7481c6c63764d6ea8ee41af5cf9dfac5e9 +iversion e0152e7481c6c63764d6ea8ee41af5cf9dfac5e9 +vfs-brauner b630a177e61d081cc1d2c28bde54a17862d4e6b0 +vfs 052d534373b7ed33712a63d5e17b2b6cdbce84fd +printk e7081d5a9d976b84f61f497316d7c940a4a2e67a +pci 5b52c9afa3dd95ea0903b6d16225e5744f20ad5b +pstore 69f381e67d6fe94c4f1416fbd5672b715cae098a +hid 8f0a3ff87887a8994a9c5a680dae4865ec97ceca +i2c 67ec505fae32419354f4172c06c853def2541300 +i2c-host 48acf8292280f257fb0047478153a81471ee7f4d +i3c 8f06fb45853900b4deaa52e2a9e16f3d9550b011 +hwmon-staging e6b33455c319e77613f44862d9d19ef63d208862 +jc_docs 920290fe2a188e6553da135e4ef4f6c44c6138ae +v4l-dvb 8c64f4cdf4e6cc5682c52523713af8c39c94e6d5 +v4l-dvb-next e0b8eb0f6d652981bfd9ba7c619c9d81ed087ad0 +pm 4cb5c331c4dfd553077664717ed061ecc8d2a0f7 +cpufreq-arm 3093fa33539b54db77171d2919352ad4f044a1c5 +cpupower babb46746cc5683fc930fea7d0ef6d5323d6a6cd +devfreq b401b621758e46812da61fa58a67c3fd8d91de0d +pmdomain 713240877a26f3cc035d6531795bd819dfaa633c +opp ace4b31b297dfd7b8c969ff5046c8128c3e025be +thermal 5314b1543787e6cd5d248186fcfd5c5fc4ca2146 +dlm 5beebc1dda47719dac85830c53bca1a0ab497d96 +rdma aafe4cc5096996873817ff4981a3744e8caf7808 +net-next 4934446297c292611d3b6cd388efb215f2ba5698 +bpf-next 7648f0c91eaa3598add9e91991a5483b29da32ee +ipsec-next 1476de6d2b578673e20fb4cf654ff61cf2782873 +mlx5-next d727d27db536faea7178290c677cc0567f647231 +netfilter-next 219eee9c0d16f1b754a8b85275854ab17df0850a +ipvs-next f77581bfda2409a0a3f3a42fa70cab9ef0891e9c +bluetooth 25956d989c603a269aeb04336164acef82da50c5 +wireless-next dd66185c23f71af36397bebfc99ede608dca07b6 +wpan-next 2373699560a754079579b7722b50d1d38de1960e +wpan-staging 2373699560a754079579b7722b50d1d38de1960e +mtd 18af7e357033f1a1cee50db2663ef982b4a2226e +nand 4bd14b2fd8a83a2f5220ba4ef323f741e11bfdfd +spi-nor 3c0e1dfa703cd2a16fbfb1290b0970b61add3cde +crypto 7d42e097607c4d246d99225bf2b195b6167a210c +drm 9ac4beb7578a88baa4f7e6a59eeb5be79d7b011a +drm-ci ad6bfe1b66a5c146ec236847eca7af4c8806d666 +drm-exynos 9ac4beb7578a88baa4f7e6a59eeb5be79d7b011a +drm-misc 1f4c6f11a557642505e5f403e0dfabbaff9c529a +amdgpu 31e0a586f3385134bcad00d8194eb0728cb1a17d +drm-intel bf7626f19d6ff14b9722273e23700400cc4d78ba +drm-tegra 2429b3c529da29d4277d519bd66d034842dcd70c +drm-msm 41c177cf354126a22443b5c80cec9fdd313e67e1 +drm-msm-lumag ffa0c87f172bf7a0132aa960db412f8d63b2f533 +etnaviv c9959996a8fc171bbb2c2d9c7478306f331a6cca +fbdev 72fee6b0a3a4ad6c5131d4c20e8ab7253b16e38b +regmap cfe1cab458f0e6d5dad882aa9b7e93b496d6356a +sound 52592932405cecaaa0f4b8cb41128d014b96858c +ieee1394 41ebb53b1bffb24547e21015ea53f382f922a099 +sound-asoc db38c4ba8be6ba5d792f2be88d5441cc202305b6 +modules d1909c0221739356f31c721de4743e7d219a56cc +input 
d03f030115fe930de1222fef294730ba21b93045 +block 40192a566897f3d9f3b844bbb07bbce56d141cb5 +device-mapper 10e8baf7b3eb350233325d18c175dada7e72f451 +libata 1ab5b472493f530ed9d94cbcd58c4299153c2777 +pcmcia 1bec7691b32710ea27741f0f8b00c1dc98d92930 +mmc 25e69172db8a31ef8564ce1cf755ac5cb8374daa +mfd d5132d176d6f21742ac67fd311ccc61fe830e999 +backlight 770c0f4975fd7b4bb68ca7cf150d3b1c9c864a99 +battery a9b254892ce1a447b06c5019cbf0e9caeb48c138 +regulator 32ca2f8f6696d6e45220631e543cc12f3323af10 +security 1fc5baf574b7b3011b49571811cafdd7e2295e5d +apparmor 8ead196be219adade3bd0d4115cc9b8506643121 +integrity 85445b96429057d87446bcb24ec0cac9ea9c7fdf +selinux 7c655bee5cd8e060983bd89460fffc1f9f780cda +smack 69b6d71052b54fb10feba68564ccb41c0f0ce1e9 +tomoyo 0bb80ecc33a8fb5a682236443c1e740d5c917d1d +tpmdd 4a25541b236f5d8f98c1fd2f8848a290eafdb8a8 +watchdog 41bccc98fb7931d63d03f326a746ac4d429c1dd3 +iommu 05f64ad28da11ccac7a1ec698d93c05e4fbfd3ea +audit aa13b709084a0287ef250a9fbde5993e4dfc3078 +devicetree 2ff94f7ce292a77e77965919d7dccb7ac04a88f5 +dt-krzk 8c82b4eef2972200f6171aaa260d7bba2ad29889 +mailbox cd795fb0c352c1f70e5fa437b01572c8693e1b77 +spi 55072343f1df834879b8bae9e419cd5cbb5f3259 +tip c56ac217a3c00d12e23606e0635cc95d6eb8c36a +clockevents 9256cec7b4f3293c11585326401325b1f81670e1 +edac 4cc8411bb56d11ecb8192ec14f05f5b9e9f23d4a +ftrace 4af12c95cbe888b71e905058c48e8d1e779264b5 +rcu b4c7a9cd36e1d1eb1ce43b4329e359a00d75a355 +kvm 687d8f4c3dea0758afd748968d91288220bbe7e3 +kvm-arm 9e00a15ec81e56e8b330024e606fa9a1fca004e9 +kvms390 7b2411e793673b6282ab7907be26b82234681313 +kvm-ppc 41bccc98fb7931d63d03f326a746ac4d429c1dd3 +kvm-riscv f072b272aa27d57cf7fe6fdedb30fb50f391974e +kvm-x86 ca19f5c9d3c31b8d91f0f8a7f3d71c45d5eaa0f5 +xen-tip fa765c4b4aed2d64266b694520ecb025c862c5a9 +percpu 2d9ad81ef93570bc0d4929d05d0601ea400d6fcf +workqueues fd0a68a2337b79a7bd4dad5e7d9dc726828527af +drivers-x86 c5211eacf3326538fbf31b612e5ea546ca8a3425 +chrome-platform 6613476e225e090cc9aad49be7fa504e290dd33d +chrome-platform-firmware 8a0a62941a042612f7487f6c4ff291f9054ff214 +hsi a0e35a173a86d93040d0e08f9c38526b6cf6c1d1 +leds-lj 12ce20e02e532f101b725d71c52a36c5cc8ad1e6 +ipmi 296455ade1fdcf5f8f8c033201633b60946c589a +driver-core 07749061b837a1268146dc8a620a522253cea877 +usb 3bf0514dc6f36f81ee11b1becd977cb87b4c90c6 +thunderbolt b4734507ac55cc7ea1380e20e83f60fcd7031955 +usb-serial 54be6c6c5ae8e0d93a6c4641cb7528eb0b6ba478 +tty 1643281347f80116a500e6a17726351a7265a55c +char-misc 8d11c6d9b14f7a87f65529cb33edc5fed846ed9d +accel 570a7f66cc7a1b3f3eae63c6c3639bb5b456a928 +coresight c099fdd218a0fedfe87e0b88d2ba0667eab3b3c8 +fastrpc 6613476e225e090cc9aad49be7fa504e290dd33d +fpga ff49b00e9621402cf723c3cb11489dff2d09a738 +icc b9a9c447277f247b453b92eddb5eb2732408dd28 +iio 3cc5ebd3a2d6247aeba81873d6b040d5d87f7db1 +phy-next 505dfc6ba84c85651f8f8a7bf721aadc49049a44 +soundwire 81a7d0c4d059cb5c122110acbeec7bedfb91a741 +extcon b401b621758e46812da61fa58a67c3fd8d91de0d +gnss 54be6c6c5ae8e0d93a6c4641cb7528eb0b6ba478 +vfio 78f70c02bdbccb5e9b0b0c728185d4aeb7044ace +w1 d97d263132a69a0bda54efce3df04e55fa6341f7 +spmi b85ea95d086471afb4ad062012a4d73cd328fa86 +staging 455c5e12a3b7d08c2ab47b7dd54944901c69cdcd +counter-next b6dce0452a0276339392bc5eeb722370a466ba25 +mux 44c026a73be8038f03dbdeef028b642880cf1511 +dmaengine 35b78e2eef2d75c8722bf39d6bd1d89a8e21479e +cgroup 8d4c171f451d384f3a287eb14bd60825d0b2381b +scsi d970d094663aea6906bb5966e952f0e2b30f7ba6 +scsi-mkp 9f3dbcb5632d6876226031d552ef6163bb3ad215 +vhost f16d65124380ac6de8055c4a8e5373a1043bb09b +rpmsg 
929654e8f1add50b01d5a56171a31c311b0a739a +gpio 0bb80ecc33a8fb5a682236443c1e740d5c917d1d +gpio-brgl 56c608c9e773a9d9827643eec352d831f7da6220 +gpio-intel 6613476e225e090cc9aad49be7fa504e290dd33d +pinctrl b3b8c7865c273342ca51c2d2b3f0f788f7a0ba9d +pinctrl-intel 6613476e225e090cc9aad49be7fa504e290dd33d +pinctrl-renesas 97191e536c37359d17d6d32bc29acb911f731e60 +pinctrl-samsung 6613476e225e090cc9aad49be7fa504e290dd33d +pwm 801de0882d8a95aa1b1fe67df1696e037d785656 +ktest 7dc8e24f0e09834341f84d37433840b353d64bc8 +kselftest 6f1a214d446b2f2f9c8c4b96755a8f0316ba4436 +kunit 6613476e225e090cc9aad49be7fa504e290dd33d +kunit-next 08c454e26daab6f843e5883fb96f680f11784fa6 +livepatching 602bf18307981f3bfd9ebf19921791a4256d3fd1 +rtc 6613476e225e090cc9aad49be7fa504e290dd33d +nvdimm bc22374c96d959084bea1287cfc6ea0fd9ca4e40 +at24 6613476e225e090cc9aad49be7fa504e290dd33d +ntb 9341b37ec17a8793e8439e9b18354ba69556b786 +seccomp 56af94aace8a0489fb1a32fd6f1cf0c548fe3911 +fsi c5eeb63edac9497f9a0d46d3b75cf8b293771ecf +slimbus 04b945e4cf81a12365f8207a4d34dbc81ba17413 +nvmem 2c8df24cc166478910c4e9e870adf44d157330fa +xarray 2a15de80dd0f7e04a823291aa9eb49c5294f56af +hyperv ce9ecca0238b140b88f43859b211c9fdfd8e5b70 +auxdisplay 34ddc83dc72030ded90b5ff038cca67354ea8d34 +kgdb 4f41d30cd6dc865c3cbc1a852372321eba6d4e4c +hmm 6613476e225e090cc9aad49be7fa504e290dd33d +cfi 06c2afb862f9da8dc5efa4b6076a0e48c3fbaaa5 +mhi ceeb64f41fe6a1eb9fc56d583983a81f8f3dd058 +memblock 2159bd4e905704b1765b6b883ea15e51ad986a6a +cxl 73bf93edeeea866b0b6efbc8d2595bdaaba7f1a5 +zstd 3f832dfb8a8eafee3cecd479d99651a64a61485a +efi 841c35169323cd833294798e58b9bf63fa4fa1de +unicode 367122c529f35b4655acbe33c0cc4d6d3b32ba71 +slab 7d2ec24bd8a59853c7660d3eac50a3b7ffce8ae3 +random 1f719a2f3fa67665578c759ac34fd3d3690c1a20 +landlock 28c2be13a1e03127a01d8cc6015ab36063cbb715 +rust e3c3d34507c7a146de1c5ce01bd0b2c0018b2609 +sysctl cec030ec414ec94fb25b49da688eea264e054e42 +execve 15fd1dc3dadb4268207fa6797e753541aca09a2a +bitmap 071ad962baf5e857fd965595421cf6fb588610ed +hte b85ea95d086471afb4ad062012a4d73cd328fa86 +kspp f0f4273404295c368efd3054e58dd0ec681ca175 +kspp-gustavo 6613476e225e090cc9aad49be7fa504e290dd33d +nolibc 6613476e225e090cc9aad49be7fa504e290dd33d +tsm f4738f56d1dc62aaba69b33702a5ab098f1b8c63 +iommufd 6613476e225e090cc9aad49be7fa504e290dd33d +header_cleanup 5f4c01f1e3c7b0c8d1e5dd6f080531de7aa5e47b diff --git a/Next/Trees b/Next/Trees new file mode 100644 index 00000000000000..d4956937d1bbe2 --- /dev/null +++ b/Next/Trees @@ -0,0 +1,373 @@ +Trees included into this release: + +Name Type Url +---- ---- --- +origin git git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git#master +fixes git git://git.kernel.org/pub/scm/linux/kernel/git/sfr/next-fixes.git#fixes +mm-hotfixes git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm#mm-hotfixes-unstable +kbuild-current git git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git#fixes +arc-current git git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git#for-curr +arm-current git git://git.armlinux.org.uk/~rmk/linux-arm.git#fixes +arm64-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux#for-next/fixes +arm-soc-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/soc/soc.git#arm/fixes +davinci-current git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git#davinci/for-current +drivers-memory-fixes git https://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-mem-ctrl.git#fixes +sophgo-fixes git https://github.com/sophgo/linux.git#fixes +tee-fixes git 
https://git.linaro.org/people/jens.wiklander/linux-tee.git#fixes +m68k-current git git://git.kernel.org/pub/scm/linux/kernel/git/geert/linux-m68k.git#for-linus +powerpc-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git#fixes +s390-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git#fixes +fscrypt-current git git://git.kernel.org/pub/scm/fs/fscrypt/linux.git#for-current +fsverity-current git git://git.kernel.org/pub/scm/fs/fsverity/linux.git#for-current +net git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git#main +bpf git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git#master +ipsec git git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec.git#master +netfilter git git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf.git#main +ipvs git git://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs.git#main +wireless git git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git#for-next +wpan git git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan.git#master +rdma-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git#for-rc +sound-current git git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git#for-linus +sound-asoc-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git#for-linus +regmap-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regmap.git#for-linus +regulator-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator.git#for-linus +spi-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi.git#for-linus +pci-current git git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git#for-linus +driver-core.current git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git#driver-core-linus +tty.current git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git#tty-linus +usb.current git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git#usb-linus +usb-serial-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/johan/usb-serial.git#usb-linus +phy git git://git.kernel.org/pub/scm/linux/kernel/git/phy/linux-phy.git#fixes +staging.current git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git#staging-linus +iio-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/jic23/iio.git#fixes-togreg +counter-current git git://git.kernel.org/pub/scm/linux/kernel/git/wbg/counter.git#counter-current +char-misc.current git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git#char-misc-linus +soundwire-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/soundwire.git#fixes +thunderbolt-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git#fixes +input-current git git://git.kernel.org/pub/scm/linux/kernel/git/dtor/input.git#for-linus +crypto-current git git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6.git#master +vfio-fixes git git://github.com/awilliam/linux-vfio.git#for-linus +kselftest-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git#fixes +modules-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git#modules-linus +dmaengine-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/dmaengine.git#fixes +backlight-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/lee/backlight.git#for-backlight-fixes +mtd-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git#mtd/fixes +mfd-fixes git 
git://git.kernel.org/pub/scm/linux/kernel/git/lee/mfd.git#for-mfd-fixes +v4l-dvb-fixes git https://git.linuxtv.org/media_stage.git#fixes +reset-fixes git https://git.pengutronix.de/git/pza/linux#reset/fixes +mips-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/mips/linux.git#mips-fixes +at91-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/at91/linux.git#at91-fixes +omap-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap.git#fixes +kvm-fixes git git://git.kernel.org/pub/scm/virt/kvm/kvm.git#master +kvms390-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git#master +hwmon-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/groeck/linux-staging.git#hwmon +nvdimm-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git#libnvdimm-fixes +cxl-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git#fixes +btrfs-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git#next-fixes +vfs-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git#fixes +dma-mapping-fixes git git://git.infradead.org/users/hch/dma-mapping.git#for-linus +drivers-x86-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git#fixes +samsung-krzk-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux.git#fixes +pinctrl-samsung-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/samsung.git#fixes +devicetree-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git#dt/linus +dt-krzk-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-dt.git#fixes +scsi-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git#fixes +drm-fixes git git://git.freedesktop.org/git/drm/drm.git#drm-fixes +drm-intel-fixes git git://anongit.freedesktop.org/drm-intel#for-linux-next-fixes +mmc-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc.git#fixes +rtc-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/abelloni/linux.git#rtc-fixes +gnss-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/johan/gnss.git#gnss-linus +hyperv-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git#hyperv-fixes +soc-fsl-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/leo/linux.git#fix +risc-v-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git#fixes +riscv-dt-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git#riscv-dt-fixes +riscv-soc-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git#riscv-soc-fixes +fpga-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/fpga/linux-fpga.git#fixes +spdx git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/spdx.git#spdx-linus +gpio-brgl-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git#gpio/for-current +gpio-intel-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/andy/linux-gpio-intel.git#fixes +pinctrl-intel-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/intel.git#fixes +auxdisplay-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/andy/linux-auxdisplay.git#fixes +erofs-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git#fixes +kunit-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git#kunit-fixes +ubifs-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git#fixes +memblock-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git#fixes +nfsd-fixes git 
git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux#nfsd-fixes +renesas-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git#fixes +perf-current git git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools#perf-tools +efi-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git#urgent +zstd-fixes git https://github.com/terrelln/linux.git#zstd-linus +battery-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-power-supply.git#fixes +uml-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/uml/linux.git#fixes +iommufd-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd.git#for-rc +rust-fixes git https://github.com/Rust-for-Linux/linux.git#rust-fixes +v9fs-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/ericvh/v9fs.git#fixes/next +w1-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-w1.git#fixes +pmdomain-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/linux-pm.git#fixes +overlayfs-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/overlayfs/vfs.git#ovl-fixes +i2c-host-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/andi.shyti/linux.git#i2c/i2c-host-fixes +drm-misc-fixes git git://anongit.freedesktop.org/drm/drm-misc#for-linux-next-fixes +mm-stable git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm#mm-stable +mm-nonmm-stable git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm#mm-nonmm-stable +mm git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm#mm-everything +kbuild git git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git#for-next +clang-format git https://github.com/ojeda/linux.git#clang-format +perf git git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git#perf-tools-next +compiler-attributes git https://github.com/ojeda/linux.git#compiler-attributes +dma-mapping git git://git.infradead.org/users/hch/dma-mapping.git#for-next +asm-generic git git://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git#master +arc git git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git#for-next +arm git git://git.armlinux.org.uk/~rmk/linux-arm.git#for-next +arm64 git git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux#for-next/core +arm-perf git git://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git#for-next/perf +arm-soc git git://git.kernel.org/pub/scm/linux/kernel/git/soc/soc.git#for-next +amlogic git git://git.kernel.org/pub/scm/linux/kernel/git/amlogic/linux.git#for-next +asahi-soc git https://github.com/AsahiLinux/linux.git#asahi-soc/for-next +aspeed git git://git.kernel.org/pub/scm/linux/kernel/git/joel/bmc.git#for-next +at91 git git://git.kernel.org/pub/scm/linux/kernel/git/at91/linux.git#at91-next +broadcom git https://github.com/Broadcom/stblinux.git#next +davinci git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git#davinci/for-next +drivers-memory git https://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-mem-ctrl.git#for-next +imx-mxs git git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux.git#for-next +mediatek git git://git.kernel.org/pub/scm/linux/kernel/git/mediatek/linux.git#for-next +mvebu git git://git.kernel.org/pub/scm/linux/kernel/git/gclement/mvebu.git#for-next +omap git git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap.git#for-next +qcom git git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux.git#for-next +renesas git git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git#next +reset git 
https://git.pengutronix.de/git/pza/linux#reset/next +rockchip git git://git.kernel.org/pub/scm/linux/kernel/git/mmind/linux-rockchip.git#for-next +samsung-krzk git git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux.git#for-next +scmi git git://git.kernel.org/pub/scm/linux/kernel/git/sudeep.holla/linux.git#for-linux-next +sophgo git https://github.com/sophgo/linux.git#for-next +stm32 git git://git.kernel.org/pub/scm/linux/kernel/git/atorgue/stm32.git#stm32-next +sunxi git git://git.kernel.org/pub/scm/linux/kernel/git/sunxi/linux.git#sunxi/for-next +tee git https://git.linaro.org/people/jens.wiklander/linux-tee.git#next +tegra git git://git.kernel.org/pub/scm/linux/kernel/git/tegra/linux.git#for-next +ti git git://git.kernel.org/pub/scm/linux/kernel/git/ti/linux.git#ti-next +xilinx git git://github.com/Xilinx/linux-xlnx.git#for-next +clk git git://git.kernel.org/pub/scm/linux/kernel/git/clk/linux.git#clk-next +clk-imx git git://git.kernel.org/pub/scm/linux/kernel/git/abelvesa/linux.git#for-next +clk-renesas git git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-drivers.git#renesas-clk +csky git git://github.com/c-sky/csky-linux.git#linux-next +loongarch git git://git.kernel.org/pub/scm/linux/kernel/git/chenhuacai/linux-loongson.git#loongarch-next +m68k git git://git.kernel.org/pub/scm/linux/kernel/git/geert/linux-m68k.git#for-next +m68knommu git git://git.kernel.org/pub/scm/linux/kernel/git/gerg/m68knommu.git#for-next +microblaze git git://git.monstr.eu/linux-2.6-microblaze.git#next +mips git git://git.kernel.org/pub/scm/linux/kernel/git/mips/linux.git#mips-next +openrisc git git://github.com/openrisc/linux.git#for-next +parisc-hd git git://git.kernel.org/pub/scm/linux/kernel/git/deller/parisc-linux.git#for-next +powerpc git git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git#next +soc-fsl git git://git.kernel.org/pub/scm/linux/kernel/git/leo/linux.git#next +risc-v git git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git#for-next +riscv-dt git git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git#riscv-dt-for-next +riscv-soc git git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git#riscv-soc-for-next +s390 git git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git#for-next +sh git git:git.kernel.org/pub/scm/linux/kernel/git/glaubitz/sh-linux.git#for-next +uml git git://git.kernel.org/pub/scm/linux/kernel/git/uml/linux.git#next +xtensa git git://github.com/jcmvbkbc/linux-xtensa.git#xtensa-for-next +bcachefs git https://evilpiepirate.org/git/bcachefs.git#for-next +pidfd git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git#for-next +fscrypt git git://git.kernel.org/pub/scm/fs/fscrypt/linux.git#for-next +afs git git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git#afs-next +btrfs git git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git#for-next +ceph git git://github.com/ceph/ceph-client.git#master +cifs git git://git.samba.org/sfrench/cifs-2.6.git#for-next +configfs git git://git.infradead.org/users/hch/configfs.git#for-next +ecryptfs git git://git.kernel.org/pub/scm/linux/kernel/git/tyhicks/ecryptfs.git#next +erofs git git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git#dev +exfat git git://git.kernel.org/pub/scm/linux/kernel/git/linkinjeon/exfat.git#dev +exportfs git git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux#exportfs-next +ext3 git git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs.git#for_next +ext4 git 
git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git#dev +f2fs git git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs.git#dev +fsverity git git://git.kernel.org/pub/scm/fs/fsverity/linux.git#for-next +fuse git git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git#for-next +gfs2 git git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2.git#for-next +jfs git git://github.com/kleikamp/linux-shaggy.git#jfs-next +ksmbd git https://github.com/smfrench/smb3-kernel.git#ksmbd-for-next +nfs git git://git.linux-nfs.org/projects/trondmy/nfs-2.6.git#linux-next +nfs-anna git git://git.linux-nfs.org/projects/anna/linux-nfs.git#linux-next +nfsd git git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux#nfsd-next +ntfs3 git https://github.com/Paragon-Software-Group/linux-ntfs3.git#master +orangefs git git://git.kernel.org/pub/scm/linux/kernel/git/hubcap/linux#for-next +overlayfs git git://git.kernel.org/pub/scm/linux/kernel/git/overlayfs/vfs.git#overlayfs-next +ubifs git git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git#next +v9fs git git://github.com/martinetd/linux#9p-next +v9fs-ericvh git git://git.kernel.org/pub/scm/linux/kernel/git/ericvh/v9fs.git#ericvh/for-next +xfs git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git#for-next +zonefs git git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs.git#for-next +iomap git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git#iomap-for-next +djw-vfs git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git#vfs-for-next +file-locks git git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux.git#locks-next +iversion git git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux.git#iversion-next +vfs-brauner git git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git#vfs.all +vfs git git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git#for-next +printk git git://git.kernel.org/pub/scm/linux/kernel/git/printk/linux.git#for-next +pci git git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git#next +pstore git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git#for-next/pstore +hid git git://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git#for-next +i2c git git://git.kernel.org/pub/scm/linux/kernel/git/wsa/linux.git#i2c/for-next +i2c-host git git://git.kernel.org/pub/scm/linux/kernel/git/andi.shyti/linux.git#i2c/i2c-host +i3c git git://git.kernel.org/pub/scm/linux/kernel/git/i3c/linux.git#i3c/next +hwmon-staging git git://git.kernel.org/pub/scm/linux/kernel/git/groeck/linux-staging.git#hwmon-next +jc_docs git git://git.lwn.net/linux.git#docs-next +v4l-dvb git git://linuxtv.org/media_tree.git#master +v4l-dvb-next git git://linuxtv.org/mchehab/media-next.git#master +pm git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git#linux-next +cpufreq-arm git git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git#cpufreq/arm/linux-next +cpupower git git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux.git#cpupower +devfreq git git://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/linux.git#devfreq-next +pmdomain git git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/linux-pm.git#next +opp git git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git#opp/linux-next +thermal git git://git.kernel.org/pub/scm/linux/kernel/git/thermal/linux.git#thermal/linux-next +dlm git git://git.kernel.org/pub/scm/linux/kernel/git/teigland/linux-dlm.git#next +rdma git git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git#for-next +net-next git 
git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git#main +bpf-next git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git#for-next +ipsec-next git git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git#master +mlx5-next git git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git#mlx5-next +netfilter-next git git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next.git#main +ipvs-next git git://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs-next.git#main +bluetooth git git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git#master +wireless-next git git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git#for-next +wpan-next git git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan-next.git#master +wpan-staging git git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan-next.git#staging +mtd git git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git#mtd/next +nand git git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git#nand/next +spi-nor git git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git#spi-nor/next +crypto git git://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git#master +drm git git://git.freedesktop.org/git/drm/drm.git#drm-next +drm-ci git git://git.freedesktop.org/git/drm/drm.git#topic/drm-ci +drm-exynos git git://git.kernel.org/pub/scm/linux/kernel/git/daeinki/drm-exynos.git#for-linux-next +drm-misc git git://anongit.freedesktop.org/drm/drm-misc#for-linux-next +amdgpu git https://gitlab.freedesktop.org/agd5f/linux#drm-next +drm-intel git git://anongit.freedesktop.org/drm-intel#for-linux-next +drm-tegra git https://gitlab.freedesktop.org/drm/tegra.git#for-next +drm-msm git https://gitlab.freedesktop.org/drm/msm.git#msm-next +drm-msm-lumag git https://gitlab.freedesktop.org/lumag/msm.git#msm-next-lumag +etnaviv git https://git.pengutronix.de/git/lst/linux#etnaviv/next +fbdev git git://git.kernel.org/pub/scm/linux/kernel/git/deller/linux-fbdev.git#for-next +regmap git git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regmap.git#for-next +sound git git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git#for-next +ieee1394 git https://git.kernel.org/pub/scm/linux/kernel/git/ieee1394/linux1394.git#for-next +sound-asoc git git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git#for-next +modules git git://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git#modules-next +input git git://git.kernel.org/pub/scm/linux/kernel/git/dtor/input.git#next +block git git://git.kernel.dk/linux-block.git#for-next +device-mapper git git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git#for-next +libata git git://git.kernel.org/pub/scm/linux/kernel/git/libata/linux#for-next +pcmcia git git://git.kernel.org/pub/scm/linux/kernel/git/brodo/linux.git#pcmcia-next +mmc git git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc.git#next +mfd git git://git.kernel.org/pub/scm/linux/kernel/git/lee/mfd.git#for-mfd-next +backlight git git://git.kernel.org/pub/scm/linux/kernel/git/lee/backlight.git#for-backlight-next +battery git git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-power-supply.git#for-next +regulator git git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator.git#for-next +security git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/lsm.git#next +apparmor git git://git.kernel.org/pub/scm/linux/kernel/git/jj/linux-apparmor#apparmor-next +integrity git 
git://git.kernel.org/pub/scm/linux/kernel/git/zohar/linux-integrity#next-integrity +selinux git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/selinux.git#next +smack git git://github.com/cschaufler/smack-next#next +tomoyo git https://scm.osdn.net/gitroot/tomoyo/tomoyo-test1.git#master +tpmdd git git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd.git#next +watchdog git git://www.linux-watchdog.org/linux-watchdog-next.git#master +iommu git git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git#next +audit git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/audit.git#next +devicetree git git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git#for-next +dt-krzk git git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-dt.git#for-next +mailbox git git://git.kernel.org/pub/scm/linux/kernel/git/jassibrar/mailbox.git#for-next +spi git git://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi.git#for-next +tip git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git#master +clockevents git git://git.linaro.org/people/daniel.lezcano/linux.git#timers/drivers/next +edac git git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git#edac-for-next +ftrace git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git#for-next +rcu git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git#rcu/next +kvm git git://git.kernel.org/pub/scm/virt/kvm/kvm.git#next +kvm-arm git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git#next +kvms390 git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git#next +kvm-ppc git git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git#topic/ppc-kvm +kvm-riscv git https://github.com/kvm-riscv/linux.git#riscv_kvm_next +kvm-x86 git https://github.com/kvm-x86/linux.git#next +xen-tip git git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git#linux-next +percpu git git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu.git#for-next +workqueues git git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git#for-next +drivers-x86 git git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git#for-next +chrome-platform git git://git.kernel.org/pub/scm/linux/kernel/git/chrome-platform/linux.git#for-next +chrome-platform-firmware git git://git.kernel.org/pub/scm/linux/kernel/git/chrome-platform/linux.git#for-firmware-next +hsi git git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-hsi.git#for-next +leds-lj git git://git.kernel.org/pub/scm/linux/kernel/git/lee/leds.git#for-leds-next +ipmi git git://github.com/cminyard/linux-ipmi.git#for-next +driver-core git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git#driver-core-next +usb git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git#usb-next +thunderbolt git git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git#next +usb-serial git git://git.kernel.org/pub/scm/linux/kernel/git/johan/usb-serial.git#usb-next +tty git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git#tty-next +char-misc git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git#char-misc-next +accel git git://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux.git#habanalabs-next +coresight git git://git.kernel.org/pub/scm/linux/kernel/git/coresight/linux.git#next +fastrpc git git://git.kernel.org/pub/scm/linux/kernel/git/srini/fastrpc.git#for-next +fpga git git://git.kernel.org/pub/scm/linux/kernel/git/fpga/linux-fpga.git#for-next +icc git 
git://git.kernel.org/pub/scm/linux/kernel/git/djakov/icc.git#icc-next +iio git git://git.kernel.org/pub/scm/linux/kernel/git/jic23/iio.git#togreg +phy-next git git://git.kernel.org/pub/scm/linux/kernel/git/phy/linux-phy.git#next +soundwire git git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/soundwire.git#next +extcon git git://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/extcon.git#extcon-next +gnss git git://git.kernel.org/pub/scm/linux/kernel/git/johan/gnss.git#gnss-next +vfio git git://github.com/awilliam/linux-vfio.git#next +w1 git git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-w1.git#for-next +spmi git git://git.kernel.org/pub/scm/linux/kernel/git/sboyd/spmi.git#spmi-next +staging git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git#staging-next +counter-next git git://git.kernel.org/pub/scm/linux/kernel/git/wbg/counter.git#counter-next +mux git https://gitlab.com/peda-linux/mux.git#for-next +dmaengine git git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/dmaengine.git#next +cgroup git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git#for-next +scsi git git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi.git#for-next +scsi-mkp git git://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git#for-next +vhost git git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git#linux-next +rpmsg git git://git.kernel.org/pub/scm/linux/kernel/git/remoteproc/linux.git#for-next +gpio git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-gpio.git#for-next +gpio-brgl git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git#gpio/for-next +gpio-intel git git://git.kernel.org/pub/scm/linux/kernel/git/andy/linux-gpio-intel.git#for-next +pinctrl git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-pinctrl.git#for-next +pinctrl-intel git git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/intel.git#for-next +pinctrl-renesas git git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-drivers.git#renesas-pinctrl +pinctrl-samsung git git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/samsung.git#for-next +pwm git git://git.kernel.org/pub/scm/linux/kernel/git/ukleinek/linux.git#pwm/for-next +ktest git git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-ktest.git#for-next +kselftest git git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git#next +kunit git git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git#test +kunit-next git git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git#kunit +livepatching git git://git.kernel.org/pub/scm/linux/kernel/git/livepatching/livepatching#for-next +rtc git git://git.kernel.org/pub/scm/linux/kernel/git/abelloni/linux.git#rtc-next +nvdimm git git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git#libnvdimm-for-next +at24 git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git#at24/for-next +ntb git https://github.com/jonmason/ntb.git#ntb-next +seccomp git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git#for-next/seccomp +fsi git git://git.kernel.org/pub/scm/linux/kernel/git/joel/fsi.git#next +slimbus git git://git.kernel.org/pub/scm/linux/kernel/git/srini/slimbus.git#for-next +nvmem git git://git.kernel.org/pub/scm/linux/kernel/git/srini/nvmem.git#for-next +xarray git git://git.infradead.org/users/willy/xarray.git#main +hyperv git git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git#hyperv-next +auxdisplay git git://git.kernel.org/pub/scm/linux/kernel/git/andy/linux-auxdisplay.git#for-next 
+kgdb git git://git.kernel.org/pub/scm/linux/kernel/git/danielt/linux.git#kgdb/for-next +hmm git git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git#hmm +cfi git git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git#cfi/next +mhi git git://git.kernel.org/pub/scm/linux/kernel/git/mani/mhi.git#mhi-next +memblock git git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git#for-next +cxl git git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git#next +zstd git https://github.com/terrelln/linux.git#zstd-next +efi git git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git#next +unicode git git://git.kernel.org/pub/scm/linux/kernel/git/krisman/unicode.git#for-next +slab git git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab.git#slab/for-next +random git git://git.kernel.org/pub/scm/linux/kernel/git/crng/random.git#master +landlock git git://git.kernel.org/pub/scm/linux/kernel/git/mic/linux.git#next +rust git https://github.com/Rust-for-Linux/linux.git#rust-next +sysctl git git://git.kernel.org/pub/scm/linux/kernel/git/sysctl/sysctl.git#sysctl-next +execve git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git#for-next/execve +bitmap git https://github.com/norov/linux.git#bitmap-for-next +hte git git://git.kernel.org/pub/scm/linux/kernel/git/pateldipen1984/linux.git#for-next +kspp git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git#for-next/kspp +kspp-gustavo git git://git.kernel.org/pub/scm/linux/kernel/git/gustavoars/linux.git#for-next/kspp +nolibc git git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git#nolibc +tsm git git://git.kernel.org/pub/scm/linux/kernel/git/djbw/linux#tsm-next +iommufd git git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd.git#for-next +header_cleanup git https://evilpiepirate.org/git/bcachefs.git#header_cleanup diff --git a/Next/merge.log b/Next/merge.log new file mode 100644 index 00000000000000..bf6e50ffea3e3f --- /dev/null +++ b/Next/merge.log @@ -0,0 +1,10879 @@ +$ date -R +Wed, 21 Feb 2024 08:56:00 +1100 +$ git checkout master +Already on 'master' +$ git reset --hard stable +Updating files: 100% (7320/7320), done. 
+HEAD is now at b401b621758e Linux 6.8-rc5 +Merging origin/master (fca7526b7d89 drm/tests/drm_buddy: fix build failure on 32-bit targets) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git origin/master +Updating b401b621758e..fca7526b7d89 +Fast-forward (no commit created; -m option ignored) + drivers/gpu/drm/tests/drm_buddy_test.c | 5 ++--- + kernel/sched/membarrier.c | 6 ++++++ + 2 files changed, 8 insertions(+), 3 deletions(-) +Merging fixes/fixes (2dde18cd1d8f Linux 6.5) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/sfr/next-fixes.git fixes/fixes +Already up to date. +Merging mm-hotfixes/mm-hotfixes-unstable (0eb702ab51ac mm/debug_vm_pgtable: fix BUG_ON with pud advanced test) +$ git merge -m Merge branch 'mm-hotfixes-unstable' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-hotfixes/mm-hotfixes-unstable +Merge made by the 'ort' strategy. + .mailmap | 1 + + MAINTAINERS | 13 +- + include/linux/poison.h | 3 + + include/linux/swap.h | 5 + + lib/Kconfig.debug | 1 + + lib/stackdepot.c | 254 ++++++++++++++------------- + mm/damon/core.c | 15 +- + mm/damon/lru_sort.c | 43 ++++- + mm/damon/reclaim.c | 18 +- + mm/damon/sysfs-schemes.c | 4 + + mm/debug_vm_pgtable.c | 8 + + mm/filemap.c | 51 +++--- + mm/kasan/common.c | 8 +- + mm/kasan/generic.c | 71 ++------ + mm/kasan/kasan.h | 10 -- + mm/kasan/quarantine.c | 5 +- + mm/memblock.c | 1 + + mm/memcontrol.c | 10 +- + mm/memory.c | 20 +++ + mm/migrate.c | 8 + + mm/swap.h | 5 + + mm/swap_state.c | 10 +- + mm/swapfile.c | 13 ++ + mm/zswap.c | 15 +- + tools/testing/selftests/mm/uffd-unit-tests.c | 6 + + 25 files changed, 346 insertions(+), 252 deletions(-) +Merging kbuild-current/fixes (b401b621758e Linux 6.8-rc5) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git kbuild-current/fixes +Already up to date. +Merging arc-current/for-curr (861deac3b092 Linux 6.7-rc7) +$ git merge -m Merge branch 'for-curr' of git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git arc-current/for-curr +Already up to date. +Merging arm-current/fixes (f54e8634d136 ARM: 9330/1: davinci: also select PINCTRL) +$ git merge -m Merge branch 'fixes' of git://git.armlinux.org.uk/~rmk/linux-arm.git arm-current/fixes +Already up to date. +Merging arm64-fixes/for-next/fixes (d7b77a0d565b arm64/sme: Restore SMCR_EL1.EZT0 on exit from suspend) +$ git merge -m Merge branch 'for-next/fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux arm64-fixes/for-next/fixes +Auto-merging arch/arm64/include/asm/jump_label.h +Merge made by the 'ort' strategy. + arch/arm64/include/asm/fpsimd.h | 2 ++ + arch/arm64/include/asm/jump_label.h | 12 ++++-------- + arch/arm64/kernel/fpsimd.c | 16 ++++++++++++++++ + arch/arm64/kernel/suspend.c | 3 +++ + drivers/perf/cxl_pmu.c | 10 +++++----- + 5 files changed, 30 insertions(+), 13 deletions(-) +Merging arm-soc-fixes/arm/fixes (78b6f8e7379b dtc: Enable dtc interrupt_provider check) +$ git merge -m Merge branch 'arm/fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/soc/soc.git arm-soc-fixes/arm/fixes +Merge made by the 'ort' strategy. 
+ arch/arm/boot/dts/amazon/alpine.dtsi | 1 - + arch/arm/boot/dts/aspeed/aspeed-g4.dtsi | 14 -------------- + arch/arm/boot/dts/aspeed/aspeed-g5.dtsi | 15 +-------------- + arch/arm/boot/dts/aspeed/aspeed-g6.dtsi | 18 ++---------------- + arch/arm/boot/dts/broadcom/bcm-cygnus.dtsi | 3 +++ + arch/arm/boot/dts/broadcom/bcm-hr2.dtsi | 1 + + arch/arm/boot/dts/broadcom/bcm-nsp.dtsi | 2 ++ + .../boot/dts/intel/ixp/intel-ixp42x-gateway-7001.dts | 2 ++ + .../dts/intel/ixp/intel-ixp42x-goramo-multilink.dts | 2 ++ + arch/arm/boot/dts/marvell/kirkwood-l-50.dts | 2 ++ + arch/arm/boot/dts/nuvoton/nuvoton-wpcm450.dtsi | 2 ++ + arch/arm/boot/dts/nvidia/tegra30-apalis-v1.1.dtsi | 1 - + arch/arm/boot/dts/nvidia/tegra30-apalis.dtsi | 1 - + arch/arm/boot/dts/nvidia/tegra30-colibri.dtsi | 1 - + arch/arm/boot/dts/nxp/imx/imx6q-b850v3.dts | 3 --- + arch/arm/boot/dts/nxp/imx/imx6q-bx50v3.dtsi | 2 +- + arch/arm/boot/dts/nxp/imx/imx6qdl-apalis.dtsi | 1 - + arch/arm/boot/dts/nxp/imx/imx6qdl-colibri.dtsi | 1 - + arch/arm/boot/dts/nxp/imx/imx6qdl-emcon.dtsi | 1 - + arch/arm/boot/dts/nxp/imx/imx6qdl-phytec-pfla02.dtsi | 1 + + .../boot/dts/nxp/imx/imx6qdl-phytec-phycore-som.dtsi | 1 + + arch/arm/boot/dts/nxp/imx/imx7d-pico-dwarf.dts | 1 + + arch/arm/boot/dts/nxp/vf/vf610-zii-dev-rev-b.dts | 1 + + arch/arm/boot/dts/qcom/qcom-sdx55.dtsi | 8 ++++---- + arch/arm/boot/dts/rockchip/rv1108.dtsi | 8 -------- + arch/arm/boot/dts/st/stm32429i-eval.dts | 1 - + arch/arm/boot/dts/st/stm32mp157c-dk2.dts | 1 - + arch/arm/boot/dts/ti/omap/am5729-beagleboneai.dts | 1 - + arch/arm/mach-ep93xx/core.c | 1 + + arch/arm64/boot/dts/amazon/alpine-v2.dtsi | 1 - + arch/arm64/boot/dts/amazon/alpine-v3.dtsi | 1 - + arch/arm64/boot/dts/broadcom/northstar2/ns2.dtsi | 1 + + arch/arm64/boot/dts/broadcom/stingray/stingray.dtsi | 1 + + arch/arm64/boot/dts/freescale/Makefile | 19 +++++++++++++++++++ + .../boot/dts/freescale/imx8mn-var-som-symphony.dts | 11 +++-------- + .../boot/dts/freescale/imx8mp-data-modul-edm-sbc.dts | 2 +- + arch/arm64/boot/dts/freescale/imx8mp-dhcom-pdk3.dts | 10 +++------- + .../boot/dts/freescale/imx8mp-tqma8mpql-mba8mpxl.dts | 9 ++++++++- + arch/arm64/boot/dts/lg/lg1312.dtsi | 1 - + arch/arm64/boot/dts/lg/lg1313.dtsi | 1 - + arch/arm64/boot/dts/marvell/armada-ap80x.dtsi | 1 - + arch/arm64/boot/dts/mediatek/mt8195-demo.dts | 1 + + arch/arm64/boot/dts/qcom/ipq6018.dtsi | 8 ++++---- + arch/arm64/boot/dts/qcom/ipq8074.dtsi | 16 ++++++++-------- + arch/arm64/boot/dts/renesas/ulcb-kf.dtsi | 4 ++++ + arch/arm64/boot/dts/rockchip/px30.dtsi | 2 ++ + arch/arm64/boot/dts/rockchip/rk3328.dtsi | 1 - + .../arm64/boot/dts/rockchip/rk3588-coolpi-cm5-evb.dts | 8 +++++--- + arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi | 4 ++-- + arch/arm64/boot/dts/rockchip/rk3588-evb1-v10.dts | 1 + + arch/arm64/boot/dts/rockchip/rk3588-jaguar.dts | 1 - + arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dts | 8 ++++---- + arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts | 4 ++-- + .../boot/dts/rockchip/rk3588s-indiedroid-nova.dts | 10 +++++----- + drivers/bus/imx-weim.c | 2 +- + scripts/Makefile.lib | 3 +-- + 56 files changed, 103 insertions(+), 125 deletions(-) +Merging davinci-current/davinci/for-current (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'davinci/for-current' of git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git davinci-current/davinci/for-current +Already up to date. 
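Every entry in this log follows the same mechanical pattern visible above: the day's master is checked out and reset to the stable tag, then each tree from the Trees list is folded in with git merge -m "Merge branch '<branch>' of <url>" <remote>/<branch>, which prints "Already up to date.", fast-forwards, or creates an 'ort'-strategy merge commit. A minimal sketch of that loop, assuming remotes named after the Trees entries are already configured and a control file of "name protocol url#branch" lines; the field layout, file name, and loop itself are assumptions, not the actual next-* scripts:

$ git checkout master && git reset --hard stable
$ while read -r name proto spec; do               # hypothetical driver loop
>   repo=${spec%%#*}; branch=${spec##*#}          # split "url#branch"
>   git fetch "$name" &&                          # remote assumed pre-configured
>   git merge -m "Merge branch '$branch' of $repo" "$name/$branch"
> done < Next/Trees

When a merge stops on a conflict, as the drm-misc-fixes merge does later in this log, the run pauses for a manual resolution before continuing.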
+Merging drivers-memory-fixes/fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'fixes' of https://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-mem-ctrl.git drivers-memory-fixes/fixes +Already up to date. +Merging sophgo-fixes/fixes (41bccc98fb79 Linux 6.8-rc2) +$ git merge -m Merge branch 'fixes' of https://github.com/sophgo/linux.git sophgo-fixes/fixes +Already up to date. +Merging tee-fixes/fixes (ceaa837f96ad Linux 6.2-rc8) +$ git merge -m Merge branch 'fixes' of https://git.linaro.org/people/jens.wiklander/linux-tee.git tee-fixes/fixes +Already up to date. +Merging m68k-current/for-linus (e8a7824856de m68k: defconfig: Update defconfigs for v6.8-rc1) +$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/linux-m68k.git m68k-current/for-linus +Merge made by the 'ort' strategy. + arch/m68k/configs/amiga_defconfig | 3 --- + arch/m68k/configs/apollo_defconfig | 3 --- + arch/m68k/configs/atari_defconfig | 3 --- + arch/m68k/configs/bvme6000_defconfig | 3 --- + arch/m68k/configs/hp300_defconfig | 3 --- + arch/m68k/configs/mac_defconfig | 3 --- + arch/m68k/configs/multi_defconfig | 3 --- + arch/m68k/configs/mvme147_defconfig | 3 --- + arch/m68k/configs/mvme16x_defconfig | 3 --- + arch/m68k/configs/q40_defconfig | 3 --- + arch/m68k/configs/sun3_defconfig | 3 --- + arch/m68k/configs/sun3x_defconfig | 3 --- + drivers/zorro/zorro-driver.c | 2 +- + drivers/zorro/zorro.h | 2 +- + 14 files changed, 2 insertions(+), 38 deletions(-) +Merging powerpc-fixes/fixes (20c8c4dafe93 KVM: PPC: Book3S HV: Fix L2 guest reboot failure due to empty 'arch_compat') +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git powerpc-fixes/fixes +Auto-merging arch/powerpc/kernel/iommu.c +Merge made by the 'ort' strategy. + arch/powerpc/include/asm/ppc-pci.h | 10 ++++++++++ + arch/powerpc/kernel/iommu.c | 23 +++++++++++++++++------ + arch/powerpc/kvm/book3s_hv.c | 26 ++++++++++++++++++++++++-- + arch/powerpc/kvm/book3s_hv_nestedv2.c | 20 ++++++++++++++++++-- + arch/powerpc/platforms/pseries/pci_dlpar.c | 4 ++++ + 5 files changed, 73 insertions(+), 10 deletions(-) +Merging s390-fixes/fixes (124468af7e76 s390/configs: update default configurations) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git s390-fixes/fixes +Merge made by the 'ort' strategy. + arch/s390/configs/compat.config | 3 +++ + arch/s390/configs/debug_defconfig | 8 +------- + arch/s390/configs/defconfig | 9 +-------- + arch/s390/configs/zfcpdump_defconfig | 2 +- + 4 files changed, 6 insertions(+), 16 deletions(-) + create mode 100644 arch/s390/configs/compat.config +Merging fscrypt-current/for-current (4bcf6f827a79 fscrypt: check for NULL keyring in fscrypt_put_master_key_activeref()) +$ git merge -m Merge branch 'for-current' of git://git.kernel.org/pub/scm/fs/fscrypt/linux.git fscrypt-current/for-current +Already up to date. +Merging fsverity-current/for-current (a075bacde257 fsverity: don't drop pagecache at end of FS_IOC_ENABLE_VERITY) +$ git merge -m Merge branch 'for-current' of git://git.kernel.org/pub/scm/fs/fsverity/linux.git fsverity-current/for-current +Already up to date. +Merging net/main (23f9c2c066e7 docs: netdev: update the link to the CI repo) +$ git merge -m Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git net/main +Merge made by the 'ort' strategy. 
+ Documentation/process/maintainer-netdev.rst | 2 +- + drivers/net/ethernet/adi/Kconfig | 1 + + drivers/net/ethernet/broadcom/asp2/bcmasp.c | 6 +- + drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c | 3 + + drivers/net/ethernet/cisco/enic/vnic_vic.c | 3 +- + .../net/ethernet/pensando/ionic/ionic_bus_pci.c | 2 +- + drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 20 ----- + include/net/switchdev.h | 3 + + include/net/tcp.h | 2 +- + net/bridge/br_switchdev.c | 86 ++++++++++++++-------- + net/devlink/core.c | 12 ++- + net/ipv4/arp.c | 3 +- + net/ipv4/devinet.c | 21 +++++- + net/ipv4/inet_hashtables.c | 25 ++++++- + net/ipv6/addrconf.c | 21 +++++- + net/ipv6/seg6.c | 20 ++--- + net/iucv/iucv.c | 4 +- + net/mptcp/diag.c | 8 +- + net/mptcp/pm_netlink.c | 69 ++++++++++------- + net/mptcp/pm_userspace.c | 15 ++-- + net/mptcp/protocol.c | 2 +- + net/mptcp/protocol.h | 15 +++- + net/mptcp/subflow.c | 15 ++-- + net/sched/act_mirred.c | 36 ++++----- + net/switchdev/switchdev.c | 73 ++++++++++++++++++ + net/tls/tls_main.c | 2 +- + .../selftests/drivers/net/bonding/bond_options.sh | 2 + + .../testing/selftests/net/forwarding/tc_actions.sh | 3 - + tools/testing/selftests/net/mptcp/diag.sh | 41 +++++++---- + tools/testing/selftests/net/mptcp/pm_netlink.sh | 8 +- + tools/testing/selftests/net/mptcp/simult_flows.sh | 3 +- + tools/testing/selftests/net/mptcp/userspace_pm.sh | 4 +- + 32 files changed, 360 insertions(+), 170 deletions(-) +Merging bpf/master (5c138a8a4abe selftests/bpf: Add negtive test cases for task iter) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git bpf/master +Merge made by the 'ort' strategy. + arch/x86/include/asm/vsyscall.h | 10 ++++ + arch/x86/mm/fault.c | 9 ---- + arch/x86/mm/maccess.c | 10 ++++ + kernel/bpf/helpers.c | 5 +- + kernel/bpf/task_iter.c | 2 + + kernel/bpf/verifier.c | 2 + + net/xdp/xsk.c | 3 +- + scripts/bpf_doc.py | 2 +- + tools/testing/selftests/bpf/prog_tests/iters.c | 1 + + .../selftests/bpf/prog_tests/read_vsyscall.c | 57 ++++++++++++++++++++++ + tools/testing/selftests/bpf/prog_tests/timer.c | 35 ++++++++++++- + tools/testing/selftests/bpf/progs/iters_task.c | 12 ++++- + tools/testing/selftests/bpf/progs/read_vsyscall.c | 45 +++++++++++++++++ + tools/testing/selftests/bpf/progs/timer.c | 34 ++++++++++++- + 14 files changed, 212 insertions(+), 15 deletions(-) + create mode 100644 tools/testing/selftests/bpf/prog_tests/read_vsyscall.c + create mode 100644 tools/testing/selftests/bpf/progs/read_vsyscall.c +Merging ipsec/master (983a73da1f99 xfrm: Pass UDP encapsulation in TX packet offload) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec.git ipsec/master +Merge made by the 'ort' strategy. + net/xfrm/xfrm_device.c | 2 +- + net/xfrm/xfrm_policy.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) +Merging netfilter/main (40b9385dd8e6 enic: Avoid false positive under FORTIFY_SOURCE) +$ git merge -m Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf.git netfilter/main +Already up to date. +Merging ipvs/main (84443741faab netfilter: nf_tables: fix bidirectional offload regression) +$ git merge -m Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs.git ipvs/main +Already up to date. 
+Merging wireless/for-next (f78c1375339a wifi: nl80211: reject iftype change with mesh ID change) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git wireless/for-next +Merge made by the 'ort' strategy. + net/wireless/nl80211.c | 2 ++ + 1 file changed, 2 insertions(+) +Merging wpan/master (b85ea95d0864 Linux 6.7-rc1) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan.git wpan/master +Already up to date. +Merging rdma-fixes/for-rc (eb5c7465c324 RDMA/srpt: fix function pointer cast warnings) +$ git merge -m Merge branch 'for-rc' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git rdma-fixes/for-rc +Auto-merging include/linux/mlx5/mlx5_ifc.h +Merge made by the 'ort' strategy. + drivers/infiniband/hw/bnxt_re/ib_verbs.c | 43 +++++++++++++++++++++----------- + drivers/infiniband/hw/bnxt_re/main.c | 3 --- + drivers/infiniband/hw/bnxt_re/qplib_fp.c | 3 ++- + drivers/infiniband/hw/hfi1/pio.c | 6 ++++- + drivers/infiniband/hw/hfi1/sdma.c | 2 +- + drivers/infiniband/hw/irdma/defs.h | 1 + + drivers/infiniband/hw/irdma/hw.c | 8 ++++++ + drivers/infiniband/hw/irdma/verbs.c | 9 ++++--- + drivers/infiniband/hw/mlx5/cong.c | 6 +++++ + drivers/infiniband/hw/mlx5/devx.c | 2 +- + drivers/infiniband/hw/mlx5/wr.c | 2 +- + drivers/infiniband/hw/qedr/verbs.c | 11 +++++++- + drivers/infiniband/ulp/srpt/ib_srpt.c | 17 ++++++++----- + include/linux/mlx5/mlx5_ifc.h | 2 +- + include/linux/mlx5/qp.h | 5 +++- + 15 files changed, 84 insertions(+), 36 deletions(-) +Merging sound-current/for-linus (49cbb7b7d36e ALSA: ump: Fix the discard error code from snd_ump_legacy_open()) +$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git sound-current/for-linus +Merge made by the 'ort' strategy. + sound/core/ump.c | 4 ++-- + sound/firewire/amdtp-stream.c | 2 +- + sound/pci/hda/patch_realtek.c | 2 ++ + 3 files changed, 5 insertions(+), 3 deletions(-) +Merging sound-asoc-fixes/for-linus (0db0c1770834 ASoC: cs35l56: Workaround for ACPI with broken spk-id-gpios property) +$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git sound-asoc-fixes/for-linus +Already up to date. +Merging regmap-fixes/for-linus (2f0dbb24f78a regmap: kunit: Ensure that changed bytes are actually different) +$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regmap.git regmap-fixes/for-linus +Already up to date. +Merging regulator-fixes/for-linus (e5d40e9afd84 regulator: max5970: Fix regulator child node name) +$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator.git regulator-fixes/for-linus +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. + MAINTAINERS | 2 +- + drivers/regulator/max5970-regulator.c | 8 ++++---- + 2 files changed, 5 insertions(+), 5 deletions(-) +Merging spi-fixes/for-linus (269e31aecdd0 spi-mxs: Fix chipselect glitch) +$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi.git spi-fixes/for-linus +Already up to date. +Merging pci-current/for-linus (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git pci-current/for-linus +Already up to date. 
+Merging driver-core.current/driver-core-linus (b401b621758e Linux 6.8-rc5) +$ git merge -m Merge branch 'driver-core-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git driver-core.current/driver-core-linus +Already up to date. +Merging tty.current/tty-linus (3b69e32e151b serial: amba-pl011: Fix DMA transmission in RS485 mode) +$ git merge -m Merge branch 'tty-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git tty.current/tty-linus +Merge made by the 'ort' strategy. + drivers/tty/hvc/Kconfig | 8 ++++-- + drivers/tty/serial/amba-pl011.c | 60 ++++++++++++++++++++-------------------- + drivers/tty/serial/stm32-usart.c | 4 ++- + 3 files changed, 38 insertions(+), 34 deletions(-) +Merging usb.current/usb-linus (69f89168b310 usb: typec: tpcm: Fix issues with power being removed during reset) +$ git merge -m Merge branch 'usb-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git usb.current/usb-linus +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. + MAINTAINERS | 3 +-- + drivers/usb/cdns3/cdns3-gadget.c | 8 ++++++-- + drivers/usb/cdns3/core.c | 1 - + drivers/usb/cdns3/drd.c | 13 +++++++++---- + drivers/usb/cdns3/drd.h | 6 +++++- + drivers/usb/cdns3/host.c | 16 ++++++++++++++-- + drivers/usb/dwc3/gadget.c | 5 +++++ + drivers/usb/gadget/function/f_ncm.c | 10 +++++++++- + drivers/usb/gadget/udc/omap_udc.c | 3 ++- + drivers/usb/host/uhci-grlib.c | 1 + + drivers/usb/roles/class.c | 29 +++++++++++++++++++++-------- + drivers/usb/typec/tcpm/tcpm.c | 6 ++---- + 12 files changed, 75 insertions(+), 26 deletions(-) +Merging usb-serial-fixes/usb-linus (54be6c6c5ae8 Linux 6.8-rc3) +$ git merge -m Merge branch 'usb-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/johan/usb-serial.git usb-serial-fixes/usb-linus +Already up to date. +Merging phy/fixes (d4c08d8b23b2 phy: qcom-qmp-usb: fix v3 offsets data) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/phy/linux-phy.git phy/fixes +Merge made by the 'ort' strategy. + drivers/phy/freescale/phy-fsl-imx8-mipi-dphy.c | 2 +- + drivers/phy/qualcomm/phy-qcom-eusb2-repeater.c | 160 +++++++++---------------- + drivers/phy/qualcomm/phy-qcom-m31.c | 2 +- + drivers/phy/qualcomm/phy-qcom-qmp-usb.c | 10 +- + 4 files changed, 66 insertions(+), 108 deletions(-) +Merging staging.current/staging-linus (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'staging-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git staging.current/staging-linus +Already up to date. +Merging iio-fixes/fixes-togreg (78367c32bebf iio: adc: ad4130: only set GPIO_CTRL if pin is unused) +$ git merge -m Merge branch 'fixes-togreg' of git://git.kernel.org/pub/scm/linux/kernel/git/jic23/iio.git iio-fixes/fixes-togreg +Already up to date. +Merging counter-current/counter-current (c83ccdc9586b counter: fix privdata alignment) +$ git merge -m Merge branch 'counter-current' of git://git.kernel.org/pub/scm/linux/kernel/git/wbg/counter.git counter-current/counter-current +Merge made by the 'ort' strategy. + drivers/counter/counter-core.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) +Merging char-misc.current/char-misc-linus (daaf5286b6d2 mei: Add Meteor Lake support for IVSC device) +$ git merge -m Merge branch 'char-misc-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git char-misc.current/char-misc-linus +Merge made by the 'ort' strategy. 
+ drivers/misc/mei/hw-me-regs.h | 2 ++ + drivers/misc/mei/pci-me.c | 2 ++ + drivers/misc/mei/vsc-tp.c | 1 + + 3 files changed, 5 insertions(+) +Merging soundwire-fixes/fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/soundwire.git soundwire-fixes/fixes +Already up to date. +Merging thunderbolt-fixes/fixes (d3d17e23d1a0 thunderbolt: Fix NULL pointer dereference in tb_port_update_credits()) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git thunderbolt-fixes/fixes +Merge made by the 'ort' strategy. + drivers/thunderbolt/switch.c | 3 +++ + 1 file changed, 3 insertions(+) +Merging input-current/for-linus (4255447ad34c Input: i8042 - add Fujitsu Lifebook U728 to i8042 quirk table) +$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dtor/input.git input-current/for-linus +Already up to date. +Merging crypto-current/master (c0ec2a712daf crypto: virtio/akcipher - Fix stack overflow on memcpy) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6.git crypto-current/master +Merge made by the 'ort' strategy. + drivers/crypto/virtio/virtio_crypto_akcipher_algs.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) +Merging vfio-fixes/for-linus (4ea95c04fa6b vfio: Drop vfio_file_iommu_group() stub to fudge around a KVM wart) +$ git merge -m Merge branch 'for-linus' of git://github.com/awilliam/linux-vfio.git vfio-fixes/for-linus +Already up to date. +Merging kselftest-fixes/fixes (b54761f6e977 kselftest/seccomp: Report each expectation we assert as a KTAP test) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git kselftest-fixes/fixes +Already up to date. +Merging modules-fixes/modules-linus (f412eef03938 Documentation: livepatch: module-elf-format: Remove local klp_modinfo definition) +$ git merge -m Merge branch 'modules-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git modules-fixes/modules-linus +Already up to date. +Merging dmaengine-fixes/fixes (a79f949a5ce1 dmaengine: fsl-edma: correct max_segment_size setting) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/dmaengine.git dmaengine-fixes/fixes +Merge made by the 'ort' strategy. + drivers/dma/dw-edma/dw-edma-v0-core.c | 17 +++++++++++++++ + drivers/dma/dw-edma/dw-hdma-v0-core.c | 39 +++++++++++++++++++++++------------ + drivers/dma/dw-edma/dw-hdma-v0-regs.h | 2 +- + drivers/dma/fsl-edma-common.c | 2 +- + drivers/dma/fsl-edma-common.h | 5 +++-- + drivers/dma/fsl-edma-main.c | 4 +++- + drivers/dma/fsl-qdma.c | 25 +++++++++++----------- + drivers/dma/idxd/cdev.c | 2 +- + drivers/dma/idxd/debugfs.c | 2 +- + drivers/dma/idxd/idxd.h | 1 - + drivers/dma/idxd/irq.c | 3 +-- + 11 files changed, 67 insertions(+), 35 deletions(-) +Merging backlight-fixes/for-backlight-fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-backlight-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/lee/backlight.git backlight-fixes/for-backlight-fixes +Already up to date. +Merging mtd-fixes/mtd/fixes (e6a30d0c48a1 mtd: rawnand: marvell: fix layouts) +$ git merge -m Merge branch 'mtd/fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git mtd-fixes/mtd/fixes +Merge made by the 'ort' strategy. 
+ drivers/mtd/mtdcore.c | 1 + + drivers/mtd/nand/raw/marvell_nand.c | 13 +++++-------- + drivers/mtd/nand/spi/gigadevice.c | 6 ++++-- + 3 files changed, 10 insertions(+), 10 deletions(-) +Merging mfd-fixes/for-mfd-fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-mfd-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/lee/mfd.git mfd-fixes/for-mfd-fixes +Already up to date. +Merging v4l-dvb-fixes/fixes (346c84e281a9 media: pwm-ir-tx: Depend on CONFIG_HIGH_RES_TIMERS) +$ git merge -m Merge branch 'fixes' of https://git.linuxtv.org/media_stage.git v4l-dvb-fixes/fixes +Already up to date. +Merging reset-fixes/reset/fixes (4a6756f56bcf reset: Fix crash when freeing non-existent optional resets) +$ git merge -m Merge branch 'reset/fixes' of https://git.pengutronix.de/git/pza/linux reset-fixes/reset/fixes +Already up to date. +Merging mips-fixes/mips-fixes (b401b621758e Linux 6.8-rc5) +$ git merge -m Merge branch 'mips-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/mips/linux.git mips-fixes/mips-fixes +Already up to date. +Merging at91-fixes/at91-fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'at91-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/at91/linux.git at91-fixes/at91-fixes +Already up to date. +Merging omap-fixes/fixes (9b6a51aab5f5 ARM: dts: Fix occasional boot hang for am3 usb) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap.git omap-fixes/fixes +Already up to date. +Merging kvm-fixes/master (9895ceeb5cd6 Merge tag 'kvmarm-fixes-6.8-2' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/virt/kvm/kvm.git kvm-fixes/master +Already up to date. +Merging kvms390-fixes/master (83303a4c776c KVM: s390: fix cc for successful PQAP) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git kvms390-fixes/master +Already up to date. +Merging hwmon-fixes/hwmon (841c35169323 Linux 6.8-rc4) +$ git merge -m Merge branch 'hwmon' of git://git.kernel.org/pub/scm/linux/kernel/git/groeck/linux-staging.git hwmon-fixes/hwmon +Already up to date. +Merging nvdimm-fixes/libnvdimm-fixes (33908660e814 ACPI: NFIT: Fix incorrect calculation of idt size) +$ git merge -m Merge branch 'libnvdimm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git nvdimm-fixes/libnvdimm-fixes +Already up to date. +Merging cxl-fixes/fixes (daeacfa75d08 Merge branch 'for-6.8/cxl-cper' into for-6.8/cxl) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git cxl-fixes/fixes +Auto-merging mm/memblock.c +Merge made by the 'ort' strategy. 
+ arch/x86/mm/numa.c | 21 ++++------- + drivers/acpi/apei/ghes.c | 63 ------------------------------- + drivers/cxl/acpi.c | 45 +++++++++++++--------- + drivers/cxl/core/cdat.c | 86 +++++++++++++------------------------------ + drivers/cxl/core/mbox.c | 4 +- + drivers/cxl/core/memdev.c | 63 +++++++++++++++++++++++++++++++ + drivers/cxl/core/pci.c | 49 ++++++++++++++++-------- + drivers/cxl/core/region.c | 62 +++++++++++++++++++++++-------- + drivers/cxl/cxl.h | 2 + + drivers/cxl/cxlmem.h | 10 ++--- + drivers/cxl/mem.c | 56 ---------------------------- + drivers/cxl/pci.c | 57 +--------------------------- + include/linux/cxl-event.h | 18 --------- + include/linux/memblock.h | 2 + + mm/memblock.c | 5 ++- + tools/testing/cxl/Kbuild | 1 + + tools/testing/cxl/test/cxl.c | 63 ++++++++++++++++++++++++++----- + tools/testing/cxl/test/mock.c | 14 +++++++ + tools/testing/cxl/test/mock.h | 1 + + 19 files changed, 288 insertions(+), 334 deletions(-) +Merging btrfs-fixes/next-fixes (eb90d142fc1b Merge branch 'misc-6.8' into next-fixes) +$ git merge -m Merge branch 'next-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git btrfs-fixes/next-fixes +Auto-merging fs/btrfs/extent_io.c +Merge made by the 'ort' strategy. + fs/btrfs/defrag.c | 2 +- + fs/btrfs/extent_io.c | 62 ++++++++++++++++++++++++++++++++++++++-------------- + fs/btrfs/volumes.c | 44 ++++++++++++++++++++++++++++--------- + 3 files changed, 80 insertions(+), 28 deletions(-) +Merging vfs-fixes/fixes (2c88c16dc20e erofs: fix handling kern_mount() failure) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git vfs-fixes/fixes +Auto-merging fs/erofs/fscache.c +Merge made by the 'ort' strategy. + fs/dcache.c | 5 ++++- + fs/erofs/fscache.c | 7 ++++--- + include/linux/dcache.h | 1 + + 3 files changed, 9 insertions(+), 4 deletions(-) +Merging dma-mapping-fixes/for-linus (d5090484b021 swiotlb: do not try to allocate a TLB bigger than MAX_ORDER pages) +$ git merge -m Merge branch 'for-linus' of git://git.infradead.org/users/hch/dma-mapping.git dma-mapping-fixes/for-linus +Already up to date. +Merging drivers-x86-fixes/fixes (1abdf288b0ef platform/x86: touchscreen_dmi: Add info for the TECLAST X16 Plus tablet) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git drivers-x86-fixes/fixes +Already up to date. +Merging samsung-krzk-fixes/fixes (eab4f56d3e75 ARM: dts: exynos4212-tab3: add samsung,invert-vclk flag to fimd) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux.git samsung-krzk-fixes/fixes +Already up to date. +Merging pinctrl-samsung-fixes/fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/samsung.git pinctrl-samsung-fixes/fixes +Already up to date. +Merging devicetree-fixes/dt/linus (4e06ec0774f5 dt-bindings: ufs: samsung,exynos-ufs: Add size constraints on "samsung,sysreg") +$ git merge -m Merge branch 'dt/linus' of git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git devicetree-fixes/dt/linus +Already up to date. +Merging dt-krzk-fixes/fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-dt.git dt-krzk-fixes/fixes +Already up to date. 
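The many "Already up to date." results in this log simply mean the tree's tip is already reachable from the current master, so the merge is a no-op. That can be confirmed without attempting a merge at all; a sketch for the dt-krzk-fixes tree just merged, assuming the same remote names as this log:

$ git merge-base --is-ancestor dt-krzk-fixes/fixes master && echo "already merged"
# exit status 0 means the tree's tip is an ancestor of master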
+Merging scsi-fixes/fixes (9ddf190a7df7 scsi: jazz_esp: Only build if SCSI core is builtin) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git scsi-fixes/fixes +Merge made by the 'ort' strategy. + drivers/scsi/Kconfig | 2 +- + drivers/scsi/scsi.c | 22 ++++++++++++++++++++-- + drivers/scsi/sd.c | 26 +++++++++++++++++++++++++- + drivers/scsi/smartpqi/smartpqi_init.c | 5 ++++- + drivers/target/target_core_pscsi.c | 9 ++++++--- + drivers/ufs/core/ufshcd.c | 2 +- + drivers/usb/storage/scsiglue.c | 7 +++++++ + drivers/usb/storage/uas.c | 7 +++++++ + include/scsi/scsi_device.h | 5 +---- + 9 files changed, 72 insertions(+), 13 deletions(-) +Merging drm-fixes/drm-fixes (b401b621758e Linux 6.8-rc5) +$ git merge -m Merge branch 'drm-fixes' of git://git.freedesktop.org/git/drm/drm.git drm-fixes/drm-fixes +Already up to date. +Merging drm-intel-fixes/for-linux-next-fixes (b401b621758e Linux 6.8-rc5) +$ git merge -m Merge branch 'for-linux-next-fixes' of git://anongit.freedesktop.org/drm-intel drm-intel-fixes/for-linux-next-fixes +Already up to date. +Merging mmc-fixes/fixes (6b1ba3f9040b mmc: mmci: stm32: fix DMA API overlapping mappings warning) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc.git mmc-fixes/fixes +Merge made by the 'ort' strategy. + drivers/mmc/core/mmc.c | 2 ++ + drivers/mmc/host/mmci_stm32_sdmmc.c | 24 ++++++++++++++++++++++++ + 2 files changed, 26 insertions(+) +Merging rtc-fixes/rtc-fixes (08279468a294 rtc: sunplus: fix format string for printing resource) +$ git merge -m Merge branch 'rtc-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/abelloni/linux.git rtc-fixes/rtc-fixes +Already up to date. +Merging gnss-fixes/gnss-linus (54be6c6c5ae8 Linux 6.8-rc3) +$ git merge -m Merge branch 'gnss-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/johan/gnss.git gnss-fixes/gnss-linus +Already up to date. +Merging hyperv-fixes/hyperv-fixes (564eac2860bd hv_utils: Allow implicit ICTIMESYNCFLAG_SYNC) +$ git merge -m Merge branch 'hyperv-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git hyperv-fixes/hyperv-fixes +Merge made by the 'ort' strategy. + drivers/hv/hv_util.c | 31 ++++++++++++++++++++++++++++++- + 1 file changed, 30 insertions(+), 1 deletion(-) +Merging soc-fsl-fixes/fix (06c2afb862f9 Linux 6.5-rc1) +$ git merge -m Merge branch 'fix' of git://git.kernel.org/pub/scm/linux/kernel/git/leo/linux.git soc-fsl-fixes/fix +Already up to date. +Merging risc-v-fixes/fixes (3951f6add519 riscv: Fix arch_tlbbatch_flush() by clearing the batch cpumask) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git risc-v-fixes/fixes +Already up to date. +Merging riscv-dt-fixes/riscv-dt-fixes (ce6b6d151396 riscv: dts: sifive: add missing #interrupt-cells to pmic) +$ git merge -m Merge branch 'riscv-dt-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git riscv-dt-fixes/riscv-dt-fixes +Merge made by the 'ort' strategy. 
+ arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts | 1 + + arch/riscv/boot/dts/starfive/jh7100.dtsi | 12 ++++++------ + arch/riscv/boot/dts/starfive/jh7110.dtsi | 4 ++-- + 3 files changed, 9 insertions(+), 8 deletions(-) +Merging riscv-soc-fixes/riscv-soc-fixes (bf456162601f Merge branches 'riscv-cache-fixes' and 'riscv-firmware-fixes' into riscv-soc-fixes) +$ git merge -m Merge branch 'riscv-soc-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git riscv-soc-fixes/riscv-soc-fixes +Merge made by the 'ort' strategy. + drivers/cache/ax45mp_cache.c | 4 ++++ + drivers/firmware/microchip/mpfs-auto-update.c | 2 +- + drivers/soc/microchip/Kconfig | 2 +- + 3 files changed, 6 insertions(+), 2 deletions(-) +Merging fpga-fixes/fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/fpga/linux-fpga.git fpga-fixes/fixes +Already up to date. +Merging spdx/spdx-linus (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'spdx-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/spdx.git spdx/spdx-linus +Already up to date. +Merging gpio-brgl-fixes/gpio/for-current (ae366ba8576d gpiolib: Handle no pin_ranges in gpiochip_generic_config()) +$ git merge -m Merge branch 'gpio/for-current' of git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git gpio-brgl-fixes/gpio/for-current +Merge made by the 'ort' strategy. + drivers/gpio/gpiolib.c | 5 +++++ + 1 file changed, 5 insertions(+) +Merging gpio-intel-fixes/fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/andy/linux-gpio-intel.git gpio-intel-fixes/fixes +Already up to date. +Merging pinctrl-intel-fixes/fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/intel.git pinctrl-intel-fixes/fixes +Already up to date. +Merging auxdisplay-fixes/fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/andy/linux-auxdisplay.git auxdisplay-fixes/fixes +Already up to date. +Merging erofs-fixes/fixes (d9281660ff3f erofs: relaxed temporary buffers allocation on readahead) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git erofs-fixes/fixes +Already up to date. +Merging kunit-fixes/kunit-fixes (4b758d70257d kunit: make kunit_bus_type const) +$ git merge -m Merge branch 'kunit-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git kunit-fixes/kunit-fixes +Merge made by the 'ort' strategy. + lib/kunit/device.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) +Merging ubifs-fixes/fixes (2241ab53cbb5 Linux 6.2-rc5) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git ubifs-fixes/fixes +Already up to date. +Merging memblock-fixes/fixes (6a9531c3a880 memblock: fix crash when reserved memory is not added to memory) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git memblock-fixes/fixes +Already up to date. +Merging nfsd-fixes/nfsd-fixes (5ea9a7c5fe41 nfsd: don't take fi_lock in nfsd_break_deleg_cb()) +$ git merge -m Merge branch 'nfsd-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux nfsd-fixes/nfsd-fixes +Already up to date. 
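A few merges further down, drm-misc-fixes hits a content conflict in drivers/gpu/drm/tests/drm_buddy_test.c; the "Recorded preimage" and "Recorded resolution" lines there come from git rerere, which saves hand-made conflict resolutions and replays them on later rebuilds. A sketch of that flow, assuming rerere.enabled is set (as that output implies):

$ git config rerere.enabled true                  # record resolutions for reuse
$ git merge -m "Merge branch 'for-linux-next-fixes' of git://anongit.freedesktop.org/drm/drm-misc" drm-misc-fixes/for-linux-next-fixes
# CONFLICT in drm_buddy_test.c; rerere prints "Recorded preimage"
$ $EDITOR drivers/gpu/drm/tests/drm_buddy_test.c  # resolve the conflict by hand
$ git commit --no-edit -v -a                      # rerere prints "Recorded resolution"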
+Merging renesas-fixes/fixes (9eab43facdad soc: renesas: ARCH_R9A07G043 depends on !RISCV_ISA_ZICBOM) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git renesas-fixes/fixes +Already up to date. +Merging perf-current/perf-tools (fdd0ae72b34e perf tools headers: update the asm-generic/unaligned.h copy with the kernel sources) +$ git merge -m Merge branch 'perf-tools' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools perf-current/perf-tools +Already up to date. +Merging efi-fixes/urgent (e258b85f1c3c efivarfs: Request at most 512 bytes for variable names) +$ git merge -m Merge branch 'urgent' of git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git efi-fixes/urgent +Merge made by the 'ort' strategy. + drivers/firmware/efi/capsule-loader.c | 2 +- + fs/efivarfs/vars.c | 16 +++++++++++----- + 2 files changed, 12 insertions(+), 6 deletions(-) +Merging zstd-fixes/zstd-linus (77618db34645 zstd: Fix array-index-out-of-bounds UBSAN warning) +$ git merge -m Merge branch 'zstd-linus' of https://github.com/terrelln/linux.git zstd-fixes/zstd-linus +Already up to date. +Merging battery-fixes/fixes (2df70149e73e power: supply: bq27xxx-i2c: Do not free non existing IRQ) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-power-supply.git battery-fixes/fixes +Merge made by the 'ort' strategy. + drivers/power/supply/Kconfig | 1 + + drivers/power/supply/bq27xxx_battery_i2c.c | 4 +++- + 2 files changed, 4 insertions(+), 1 deletion(-) +Merging uml-fixes/fixes (73a23d771033 um: harddog: fix modular build) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/uml/linux.git uml-fixes/fixes +Already up to date. +Merging iommufd-fixes/for-rc (28b9f669e10f iommufd/iova_bitmap: Consider page offset for the pages to be pinned) +$ git merge -m Merge branch 'for-rc' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd.git iommufd-fixes/for-rc +Merge made by the 'ort' strategy. + drivers/iommu/iommufd/iommufd_test.h | 1 + + drivers/iommu/iommufd/iova_bitmap.c | 68 +++++++++++++++++++---- + drivers/iommu/iommufd/selftest.c | 79 ++++++++++++++++++++------- + tools/testing/selftests/iommu/iommufd.c | 78 +++++++++++++++++++++----- + tools/testing/selftests/iommu/iommufd_utils.h | 39 ++++++++----- + 5 files changed, 205 insertions(+), 60 deletions(-) +Merging rust-fixes/rust-fixes (b401b621758e Linux 6.8-rc5) +$ git merge -m Merge branch 'rust-fixes' of https://github.com/Rust-for-Linux/linux.git rust-fixes/rust-fixes +Already up to date. +Merging v9fs-fixes/fixes/next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'fixes/next' of git://git.kernel.org/pub/scm/linux/kernel/git/ericvh/v9fs.git v9fs-fixes/fixes/next +Already up to date. +Merging w1-fixes/fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-w1.git w1-fixes/fixes +Already up to date. +Merging pmdomain-fixes/fixes (eb5555d422d0 pmdomain: arm: Fix NULL dereference on scmi_perf_domain removal) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/linux-pm.git pmdomain-fixes/fixes +Merge made by the 'ort' strategy. 
+ drivers/pmdomain/arm/scmi_perf_domain.c | 3 +++ + 1 file changed, 3 insertions(+) +Merging overlayfs-fixes/ovl-fixes (420332b94119 ovl: mark xwhiteouts directory with overlay.opaque='x') +$ git merge -m Merge branch 'ovl-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/overlayfs/vfs.git overlayfs-fixes/ovl-fixes +Already up to date. +Merging i2c-host-fixes/i2c/i2c-host-fixes (eb9f7f654f25 i2c: i801: Fix block process call transactions) +$ git merge -m Merge branch 'i2c/i2c-host-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/andi.shyti/linux.git i2c-host-fixes/i2c/i2c-host-fixes +Merge made by the 'ort' strategy. +Merging drm-misc-fixes/for-linux-next-fixes (eb0d253ff9c7 accel/ivpu: Don't enable any tiles by default on VPU40xx) +$ git merge -m Merge branch 'for-linux-next-fixes' of git://anongit.freedesktop.org/drm/drm-misc drm-misc-fixes/for-linux-next-fixes +Auto-merging drivers/gpu/drm/tests/drm_buddy_test.c +CONFLICT (content): Merge conflict in drivers/gpu/drm/tests/drm_buddy_test.c +Recorded preimage for 'drivers/gpu/drm/tests/drm_buddy_test.c' +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +Recorded resolution for 'drivers/gpu/drm/tests/drm_buddy_test.c'. +[master 5b87d6762756] Merge branch 'for-linux-next-fixes' of git://anongit.freedesktop.org/drm/drm-misc +$ git diff -M --stat --summary HEAD^.. + drivers/accel/ivpu/ivpu_hw_40xx.c | 2 +- + drivers/gpu/drm/drm_buddy.c | 4 ++-- + drivers/gpu/drm/meson/meson_encoder_cvbs.c | 1 - + drivers/gpu/drm/meson/meson_encoder_dsi.c | 1 - + drivers/gpu/drm/meson/meson_encoder_hdmi.c | 1 - + drivers/gpu/drm/nouveau/nvkm/subdev/bar/r535.c | 5 ++--- + drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadow.c | 8 +++++++- + 7 files changed, 12 insertions(+), 10 deletions(-) +Merging mm-stable/mm-stable (b401b621758e Linux 6.8-rc5) +$ git merge -m Merge branch 'mm-stable' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-stable/mm-stable +Already up to date. +Merging mm-nonmm-stable/mm-nonmm-stable (b401b621758e Linux 6.8-rc5) +$ git merge -m Merge branch 'mm-nonmm-stable' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-nonmm-stable/mm-nonmm-stable +Already up to date. +Merging mm/mm-everything (e36e2a37ad2e Merge branch 'mm-nonmm-unstable' into mm-everything) +$ git merge -m Merge branch 'mm-everything' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm/mm-everything +Auto-merging MAINTAINERS +Auto-merging net/iucv/iucv.c +Auto-merging scripts/Makefile.lib +Merge made by the 'ort' strategy. 
+ Documentation/ABI/testing/sysfs-bus-dax | 153 ++ + Documentation/ABI/testing/sysfs-kernel-mm-cma | 6 + + Documentation/ABI/testing/sysfs-kernel-mm-damon | 16 +- + .../ABI/testing/sysfs-kernel-mm-mempolicy | 4 + + .../sysfs-kernel-mm-mempolicy-weighted-interleave | 25 + + Documentation/admin-guide/cgroup-v2.rst | 18 +- + Documentation/admin-guide/kdump/vmcoreinfo.rst | 8 +- + Documentation/admin-guide/kernel-parameters.txt | 1 + + Documentation/admin-guide/mm/damon/reclaim.rst | 27 + + Documentation/admin-guide/mm/damon/usage.rst | 156 +- + .../admin-guide/mm/numa_memory_policy.rst | 9 + + Documentation/admin-guide/sysctl/kernel.rst | 15 +- + Documentation/dev-tools/kasan.rst | 20 +- + Documentation/mm/damon/design.rst | 70 +- + Documentation/mm/damon/maintainer-profile.rst | 8 +- + Documentation/mm/page_owner.rst | 45 + + Documentation/process/changes.rst | 2 +- + .../zh_CN/admin-guide/mm/damon/usage.rst | 20 +- + .../translations/zh_CN/dev-tools/kasan.rst | 20 +- + .../zh_TW/admin-guide/mm/damon/usage.rst | 20 +- + .../translations/zh_TW/dev-tools/kasan.rst | 20 +- + MAINTAINERS | 13 +- + Makefile | 8 - + arch/Kconfig | 2 +- + arch/arc/Kconfig | 1 + + arch/arc/include/asm/cachetype.h | 9 + + arch/arm/Kconfig | 1 + + arch/arm/Kconfig.debug | 2 +- + arch/arm/configs/aspeed_g4_defconfig | 2 +- + arch/arm/configs/aspeed_g5_defconfig | 2 +- + arch/arm/include/asm/cachetype.h | 2 + + arch/arm/include/asm/current.h | 8 +- + arch/arm/include/asm/pgtable.h | 2 + + arch/arm/include/asm/ptdump.h | 6 +- + arch/arm/kernel/Makefile | 1 + + arch/arm/kernel/machine_kexec.c | 7 - + arch/arm/kernel/setup.c | 4 +- + arch/arm/kernel/vmcore_info.c | 10 + + arch/arm/mm/init.c | 2 +- + arch/arm/mm/mmu.c | 2 +- + arch/arm64/Kconfig | 20 +- + .../include/asm/{crash_core.h => crash_reserve.h} | 4 +- + arch/arm64/include/asm/kexec.h | 2 +- + arch/arm64/include/asm/pgtable.h | 431 ++++- + arch/arm64/include/asm/ptdump.h | 7 - + arch/arm64/include/asm/tlbflush.h | 13 +- + arch/arm64/kernel/Makefile | 2 +- + arch/arm64/kernel/efi.c | 4 +- + arch/arm64/kernel/machine_kexec.c | 2 +- + arch/arm64/kernel/machine_kexec_file.c | 10 +- + arch/arm64/kernel/mte.c | 2 +- + arch/arm64/kernel/{crash_core.c => vmcore_info.c} | 3 +- + arch/arm64/kvm/guest.c | 2 +- + arch/arm64/mm/Makefile | 1 + + arch/arm64/mm/contpte.c | 404 +++++ + arch/arm64/mm/fault.c | 12 +- + arch/arm64/mm/fixmap.c | 4 +- + arch/arm64/mm/hugetlbpage.c | 47 +- + arch/arm64/mm/init.c | 2 +- + arch/arm64/mm/kasan_init.c | 6 +- + arch/arm64/mm/mmu.c | 18 +- + arch/arm64/mm/pageattr.c | 6 +- + arch/arm64/mm/ptdump.c | 11 +- + arch/arm64/mm/trans_pgd.c | 6 +- + arch/csky/Kconfig | 1 + + arch/csky/include/asm/cachetype.h | 9 + + arch/loongarch/kernel/setup.c | 2 +- + arch/m68k/Kconfig | 1 + + arch/m68k/include/asm/cachetype.h | 9 + + arch/mips/Kconfig | 1 + + arch/mips/include/asm/cachetype.h | 9 + + arch/mips/kernel/setup.c | 17 +- + arch/nios2/Kconfig | 1 + + arch/nios2/include/asm/cachetype.h | 10 + + arch/nios2/include/asm/pgtable.h | 2 + + arch/parisc/Kconfig | 1 + + arch/parisc/include/asm/cachetype.h | 9 + + arch/powerpc/Kconfig | 8 +- + arch/powerpc/Makefile | 4 +- + arch/powerpc/include/asm/pgtable.h | 2 + + arch/powerpc/include/asm/tlb.h | 2 + + arch/powerpc/kernel/setup-common.c | 2 +- + arch/powerpc/kexec/Makefile | 1 + + arch/powerpc/kexec/core.c | 28 - + arch/powerpc/kexec/vmcore_info.c | 32 + + arch/powerpc/kvm/book3s_hv_nested.c | 2 +- + arch/powerpc/mm/hugetlbpage.c | 6 +- + arch/powerpc/mm/mmu_decl.h | 6 - + arch/powerpc/mm/nohash/kaslr_booke.c | 
4 +- + arch/powerpc/mm/pgtable.c | 5 +- + arch/powerpc/mm/pgtable_32.c | 4 - + arch/powerpc/mm/pgtable_64.c | 3 - + arch/powerpc/mm/ptdump/ptdump.c | 21 +- + arch/powerpc/platforms/powernv/opal-core.c | 2 +- + arch/riscv/Kconfig | 6 +- + .../include/asm/{crash_core.h => crash_reserve.h} | 4 +- + arch/riscv/include/asm/ftrace.h | 14 +- + arch/riscv/include/asm/pgtable.h | 2 + + arch/riscv/include/asm/ptdump.h | 22 - + arch/riscv/kernel/Makefile | 2 +- + arch/riscv/kernel/elf_kexec.c | 9 +- + arch/riscv/kernel/mcount.S | 10 +- + arch/riscv/kernel/{crash_core.c => vmcore_info.c} | 3 +- + arch/riscv/mm/init.c | 5 +- + arch/riscv/mm/ptdump.c | 12 +- + arch/s390/Kconfig | 1 + + arch/s390/include/asm/ftrace.h | 2 +- + arch/s390/include/asm/pgtable.h | 2 + + arch/s390/include/asm/ptdump.h | 14 - + arch/s390/include/asm/tlb.h | 30 +- + arch/s390/kernel/Makefile | 1 + + arch/s390/kernel/kexec_elf.c | 2 + + arch/s390/kernel/kexec_image.c | 2 + + arch/s390/kernel/machine_kexec.c | 15 - + arch/s390/kernel/machine_kexec_file.c | 10 + + arch/s390/kernel/vmcore_info.c | 21 + + arch/s390/mm/dump_pagetables.c | 21 +- + arch/s390/mm/init.c | 5 - + arch/s390/mm/pgtable.c | 4 +- + arch/s390/mm/vmem.c | 62 +- + arch/sh/Kconfig | 1 + + arch/sh/include/asm/cachetype.h | 9 + + arch/sh/kernel/Makefile | 1 + + arch/sh/kernel/machine_kexec.c | 14 +- + arch/sh/kernel/setup.c | 2 +- + arch/sh/kernel/vmcore_info.c | 15 + + arch/sparc/Kconfig | 1 + + arch/sparc/include/asm/cachetype.h | 14 + + arch/sparc/include/asm/pgtable_64.h | 2 + + arch/sparc/kernel/chmc.c | 2 +- + arch/sparc/kernel/ds.c | 2 +- + arch/x86/Kconfig | 2 +- + arch/x86/Makefile | 6 - + .../include/asm/{crash_core.h => crash_reserve.h} | 6 +- + arch/x86/include/asm/mmu.h | 2 +- + arch/x86/include/asm/pgtable.h | 13 +- + arch/x86/kernel/Makefile | 6 +- + arch/x86/kernel/alternative.c | 2 +- + arch/x86/kernel/cpu/mshyperv.c | 10 +- + arch/x86/kernel/kexec-bzimage64.c | 4 + + arch/x86/kernel/kvm.c | 4 +- + arch/x86/kernel/machine_kexec_64.c | 3 + + arch/x86/kernel/reboot.c | 4 +- + arch/x86/kernel/setup.c | 2 +- + arch/x86/kernel/smp.c | 2 +- + .../kernel/{crash_core_32.c => vmcore_info_32.c} | 2 +- + .../kernel/{crash_core_64.c => vmcore_info_64.c} | 2 +- + arch/x86/mm/dump_pagetables.c | 24 +- + arch/x86/mm/init_32.c | 2 - + arch/x86/mm/init_64.c | 2 - + arch/x86/mm/tlb.c | 37 +- + arch/x86/power/Makefile | 2 +- + arch/x86/xen/enlighten_hvm.c | 4 + + arch/x86/xen/mmu_pv.c | 4 +- + arch/xtensa/Kconfig | 1 + + arch/xtensa/include/asm/cachetype.h | 10 + + crypto/blake2b_generic.c | 2 +- + drivers/android/binder.c | 4 +- + drivers/base/cacheinfo.c | 50 +- + drivers/base/cpu.c | 6 +- + drivers/base/memory.c | 23 +- + drivers/base/node.c | 4 + + drivers/block/sunvdc.c | 2 +- + drivers/block/zram/zcomp.c | 5 +- + drivers/block/zram/zram_drv.c | 2 +- + drivers/char/hw_random/n2-drv.c | 2 +- + drivers/char/tpm/st33zp24/i2c.c | 2 +- + drivers/char/tpm/st33zp24/spi.c | 2 +- + drivers/char/tpm/st33zp24/st33zp24.c | 2 +- + drivers/char/tpm/tpm-interface.c | 2 +- + drivers/char/tpm/tpm_atmel.c | 2 +- + drivers/char/tpm/tpm_i2c_nuvoton.c | 2 +- + drivers/char/tpm/tpm_nsc.c | 2 +- + drivers/char/tpm/tpm_tis.c | 2 +- + drivers/char/tpm/tpm_tis_core.c | 2 +- + drivers/char/tpm/tpm_vtpm_proxy.c | 2 +- + drivers/cpuidle/cpuidle.c | 2 +- + drivers/crypto/n2_core.c | 2 +- + drivers/dax/bus.c | 295 +++- + drivers/dax/super.c | 14 + + drivers/firmware/efi/libstub/Makefile | 2 +- + drivers/firmware/qemu_fw_cfg.c | 14 +- + drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 2 +- + 
drivers/hwmon/dell-smm-hwmon.c | 2 +- + drivers/hwmon/ultra45_env.c | 2 +- + drivers/i2c/muxes/i2c-mux-mlxcpld.c | 2 +- + drivers/leds/leds-sunfire.c | 2 +- + drivers/md/bcache/sysfs.c | 8 +- + drivers/md/dm.c | 17 +- + drivers/media/common/siano/smscoreapi.c | 2 +- + drivers/media/common/siano/smsdvb-main.c | 2 +- + drivers/media/dvb-frontends/cx24117.c | 2 +- + drivers/media/test-drivers/vicodec/codec-fwht.c | 2 +- + drivers/media/usb/siano/smsusb.c | 2 +- + drivers/net/ethernet/broadcom/tg3.c | 2 +- + drivers/net/ethernet/sun/cassini.c | 2 +- + drivers/net/ethernet/sun/niu.c | 2 +- + drivers/net/ethernet/sun/sunhme.c | 2 +- + drivers/net/ethernet/sun/sunvnet.c | 2 +- + drivers/net/ethernet/sun/sunvnet_common.c | 2 +- + drivers/net/ppp/pptp.c | 2 +- + drivers/nvdimm/pmem.c | 23 +- + drivers/of/kexec.c | 2 + + drivers/platform/x86/compal-laptop.c | 2 +- + drivers/platform/x86/intel/oaktrail.c | 2 +- + drivers/platform/x86/mlx-platform.c | 2 +- + drivers/regulator/Kconfig | 2 +- + drivers/s390/block/dcssblk.c | 11 +- + drivers/s390/char/sclp_cmd.c | 44 +- + drivers/s390/net/fsm.c | 2 +- + drivers/sbus/char/openprom.c | 2 +- + drivers/scsi/esp_scsi.c | 2 +- + drivers/scsi/jazz_esp.c | 2 +- + drivers/scsi/mesh.c | 2 +- + drivers/scsi/qlogicpti.c | 2 +- + drivers/scsi/sun3x_esp.c | 2 +- + drivers/scsi/sun_esp.c | 2 +- + drivers/video/fbdev/hgafb.c | 2 +- + fs/Kconfig | 2 +- + fs/fat/nfs.c | 6 + + fs/fuse/virtio_fs.c | 15 +- + fs/nilfs2/alloc.c | 89 +- + fs/nilfs2/bmap.c | 3 - + fs/nilfs2/cpfile.c | 323 ++-- + fs/nilfs2/cpfile.h | 10 +- + fs/nilfs2/dat.c | 38 +- + fs/nilfs2/ifile.c | 21 +- + fs/nilfs2/ifile.h | 10 +- + fs/nilfs2/inode.c | 44 +- + fs/nilfs2/mdt.c | 4 +- + fs/nilfs2/nilfs.h | 3 +- + fs/nilfs2/page.c | 8 +- + fs/nilfs2/segbuf.c | 4 +- + fs/nilfs2/segment.c | 121 +- + fs/nilfs2/sufile.c | 86 +- + fs/nilfs2/super.c | 31 +- + fs/ocfs2/dlmglue.c | 2 +- + fs/proc/Kconfig | 2 +- + fs/proc/kcore.c | 2 +- + fs/proc/task_mmu.c | 17 +- + fs/userfaultfd.c | 86 +- + include/asm-generic/tlb.h | 40 +- + include/asm-generic/vmlinux.lds.h | 2 +- + include/linux/buildid.h | 2 +- + include/linux/cacheinfo.h | 6 + + include/linux/cma.h | 6 +- + include/linux/compiler-clang.h | 10 +- + include/linux/compiler_types.h | 9 +- + include/linux/crash_core.h | 158 +- + include/linux/crash_reserve.h | 48 + + include/linux/damon.h | 89 +- + include/linux/dax.h | 17 +- + include/linux/efi.h | 5 + + include/linux/flex_proportions.h | 32 - + include/linux/gfp.h | 2 +- + include/linux/highmem.h | 14 + + include/linux/hugetlb.h | 2 +- + include/linux/kexec.h | 47 +- + include/linux/list.h | 17 +- + include/linux/list_lru.h | 20 +- + include/linux/memcontrol.h | 1 + + include/linux/memory.h | 9 + + include/linux/memory_hotplug.h | 24 +- + include/linux/memremap.h | 3 +- + include/linux/min_heap.h | 42 +- + include/linux/mm.h | 18 +- + include/linux/mm_types.h | 37 +- + include/linux/mmu_context.h | 2 +- + include/linux/mmzone.h | 6 +- + include/linux/moduleloader.h | 8 + + include/linux/padata.h | 2 + + include/linux/page_counter.h | 2 +- + include/linux/pgtable.h | 137 +- + include/linux/ptdump.h | 10 + + include/linux/sched.h | 10 +- + include/linux/sched/mm.h | 55 +- + include/linux/smp.h | 2 +- + include/linux/stackdepot.h | 58 + + include/linux/start_kernel.h | 2 - + include/linux/swap.h | 5 +- + include/linux/swapops.h | 13 + + include/linux/userfaultfd_k.h | 75 +- + include/linux/vmalloc.h | 1 - + include/linux/vmcore_info.h | 81 + + include/linux/win_minmax.h | 4 +- + include/linux/zswap.h | 11 +- + 
include/trace/events/compaction.h | 6 +- + include/trace/events/oom.h | 19 +- + include/uapi/linux/mempolicy.h | 1 + + init/initramfs.c | 2 +- + init/main.c | 16 +- + ipc/ipc_sysctl.c | 37 +- + ipc/mq_sysctl.c | 36 + + kernel/Kconfig.kexec | 12 +- + kernel/Makefile | 5 +- + kernel/bounds.c | 2 +- + kernel/crash_core.c | 816 +++------ + kernel/crash_reserve.c | 464 ++++++ + kernel/dma/contiguous.c | 6 - + kernel/{crash_dump.c => elfcorehdr.c} | 0 + kernel/events/uprobes.c | 2 +- + kernel/kallsyms_selftest.c | 1 - + kernel/kexec.c | 11 +- + kernel/kexec_core.c | 250 +-- + kernel/kexec_file.c | 15 +- + kernel/kexec_internal.h | 2 + + kernel/kprobes.c | 4 +- + kernel/ksysfs.c | 10 +- + kernel/module/main.c | 9 +- + kernel/padata.c | 14 +- + kernel/panic.c | 9 + + kernel/printk/printk.c | 4 +- + kernel/ptrace.c | 13 +- + kernel/sched/fair.c | 6 + + kernel/user_namespace.c | 2 +- + kernel/vmcore_info.c | 230 +++ + lib/Kconfig.debug | 4 +- + lib/Kconfig.kasan | 2 +- + lib/Kconfig.ubsan | 14 + + lib/buildid.c | 2 +- + lib/dhry_1.c | 2 +- + lib/dhry_run.c | 1 - + lib/dynamic_debug.c | 7 +- + lib/flex_proportions.c | 77 - + lib/maple_tree.c | 6 +- + lib/raid6/Makefile | 2 +- + lib/sort.c | 20 +- + lib/stackdepot.c | 65 +- + lib/stackinit_kunit.c | 2 +- + lib/test_ubsan.c | 37 + + lib/test_xarray.c | 230 +++ + lib/ubsan.c | 68 + + lib/ubsan.h | 4 + + mm/Kconfig | 37 +- + mm/cma.c | 28 +- + mm/cma.h | 5 + + mm/cma_sysfs.c | 15 + + mm/compaction.c | 361 ++-- + mm/damon/Kconfig | 7 +- + mm/damon/core.c | 120 +- + mm/damon/dbgfs.c | 26 +- + mm/damon/reclaim.c | 53 + + mm/damon/sysfs-common.h | 8 +- + mm/damon/sysfs-schemes.c | 146 +- + mm/damon/sysfs.c | 54 +- + mm/filemap.c | 4 +- + mm/huge_memory.c | 197 ++- + mm/hugetlb.c | 298 ++-- + mm/internal.h | 13 +- + mm/kasan/common.c | 2 +- + mm/kasan/kasan_test.c | 82 +- + mm/kasan/kasan_test_module.c | 4 +- + mm/kasan/report.c | 2 +- + mm/khugepaged.c | 22 +- + mm/kmsan/hooks.c | 50 +- + mm/list_lru.c | 20 +- + mm/memcontrol.c | 154 +- + mm/memory-tiers.c | 26 +- + mm/memory.c | 466 ++++-- + mm/memory_hotplug.c | 34 +- + mm/mempolicy.c | 532 +++++- + mm/migrate.c | 7 +- + mm/mm_init.c | 1 + + mm/mmap.c | 110 +- + mm/mmu_gather.c | 121 +- + mm/mprotect.c | 4 +- + mm/nommu.c | 2 - + mm/oom_kill.c | 6 +- + mm/page_alloc.c | 95 +- + mm/page_owner.c | 200 ++- + mm/ptdump.c | 22 + + mm/readahead.c | 6 +- + mm/rmap.c | 14 +- + mm/shmem.c | 18 + + mm/show_mem.c | 43 + + mm/slab_common.c | 2 +- + mm/sparse.c | 3 +- + mm/swap.c | 12 +- + mm/swap_slots.c | 3 + + mm/swap_state.c | 15 +- + mm/swapfile.c | 29 +- + mm/userfaultfd.c | 488 ++++-- + mm/vmalloc.c | 1094 +++++++++--- + mm/vmscan.c | 205 ++- + mm/z3fold.c | 9 +- + mm/zsmalloc.c | 118 +- + mm/zswap.c | 1763 +++++++++----------- + net/bridge/br_multicast.c | 2 +- + net/ipv4/gre_demux.c | 2 +- + net/ipv6/ip6_gre.c | 2 +- + net/iucv/iucv.c | 2 +- + net/mpls/mpls_gso.c | 2 +- + scripts/Makefile.lib | 3 + + scripts/Makefile.ubsan | 3 + + scripts/const_structs.checkpatch | 2 + + scripts/gdb/linux/vmalloc.py | 56 +- + scripts/min-tool-version.sh | 2 +- + scripts/recordmcount.pl | 2 +- + security/Kconfig | 2 - + tools/mm/Makefile | 9 +- + tools/mm/thpmaps | 675 ++++++++ + tools/objtool/noreturns.h | 1 - + tools/testing/selftests/cgroup/test_zswap.c | 122 +- + tools/testing/selftests/damon/.gitignore | 2 + + tools/testing/selftests/damon/Makefile | 5 + + tools/testing/selftests/damon/_chk_dependency.sh | 20 +- + tools/testing/selftests/damon/_damon_sysfs.py | 77 +- + tools/testing/selftests/damon/_debugfs_common.sh | 7 + 
+ .../selftests/damon/damos_apply_interval.py | 67 + + tools/testing/selftests/damon/damos_quota.py | 67 + + .../selftests/damon/debugfs_empty_targets.sh | 12 +- + .../selftests/damon/debugfs_target_ids_pid_leak.c | 68 + + .../selftests/damon/debugfs_target_ids_pid_leak.sh | 22 + + ...debugfs_target_ids_read_before_terminate_race.c | 80 + + ...ebugfs_target_ids_read_before_terminate_race.sh | 14 + + .../selftests/filesystems/eventfd/.gitignore | 2 + + .../testing/selftests/filesystems/eventfd/Makefile | 7 + + .../selftests/filesystems/eventfd/eventfd_test.c | 186 +++ + tools/testing/selftests/memfd/memfd_test.c | 10 - + tools/testing/selftests/mm/.gitignore | 1 + + tools/testing/selftests/mm/Makefile | 6 + + .../selftests/mm/charge_reserved_hugetlb.sh | 4 + + tools/testing/selftests/mm/compaction_test.c | 37 +- + tools/testing/selftests/mm/hugetlb_madv_vs_map.c | 124 ++ + .../selftests/mm/hugetlb_reparenting_test.sh | 9 +- + tools/testing/selftests/mm/ksm_functional_tests.c | 4 +- + tools/testing/selftests/mm/map_fixed_noreplace.c | 96 +- + tools/testing/selftests/mm/map_hugetlb.c | 42 +- + tools/testing/selftests/mm/map_populate.c | 37 +- + tools/testing/selftests/mm/mlock-random-test.c | 136 +- + tools/testing/selftests/mm/mlock2-tests.c | 282 ++-- + tools/testing/selftests/mm/mlock2.h | 11 +- + tools/testing/selftests/mm/mrelease_test.c | 80 +- + tools/testing/selftests/mm/mremap_dontunmap.c | 32 +- + tools/testing/selftests/mm/on-fault-limit.c | 38 +- + tools/testing/selftests/mm/protection_keys.c | 34 + + tools/testing/selftests/mm/run_vmtests.sh | 29 +- + tools/testing/selftests/mm/split_huge_page_test.c | 163 +- + tools/testing/selftests/mm/thuge-gen.c | 147 +- + tools/testing/selftests/mm/transhuge-stress.c | 36 +- + tools/testing/selftests/mm/virtual_address_range.c | 44 +- + tools/testing/selftests/mm/vm_util.c | 6 +- + 442 files changed, 12033 insertions(+), 5756 deletions(-) + create mode 100644 Documentation/ABI/testing/sysfs-bus-dax + create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy + create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave + create mode 100644 arch/arc/include/asm/cachetype.h + create mode 100644 arch/arm/kernel/vmcore_info.c + rename arch/arm64/include/asm/{crash_core.h => crash_reserve.h} (81%) + rename arch/arm64/kernel/{crash_core.c => vmcore_info.c} (92%) + create mode 100644 arch/arm64/mm/contpte.c + create mode 100644 arch/csky/include/asm/cachetype.h + create mode 100644 arch/m68k/include/asm/cachetype.h + create mode 100644 arch/mips/include/asm/cachetype.h + create mode 100644 arch/nios2/include/asm/cachetype.h + create mode 100644 arch/parisc/include/asm/cachetype.h + create mode 100644 arch/powerpc/kexec/vmcore_info.c + rename arch/riscv/include/asm/{crash_core.h => crash_reserve.h} (78%) + delete mode 100644 arch/riscv/include/asm/ptdump.h + rename arch/riscv/kernel/{crash_core.c => vmcore_info.c} (88%) + delete mode 100644 arch/s390/include/asm/ptdump.h + create mode 100644 arch/s390/kernel/vmcore_info.c + create mode 100644 arch/sh/include/asm/cachetype.h + create mode 100644 arch/sh/kernel/vmcore_info.c + create mode 100644 arch/sparc/include/asm/cachetype.h + rename arch/x86/include/asm/{crash_core.h => crash_reserve.h} (92%) + rename arch/x86/kernel/{crash_core_32.c => vmcore_info_32.c} (90%) + rename arch/x86/kernel/{crash_core_64.c => vmcore_info_64.c} (94%) + create mode 100644 arch/xtensa/include/asm/cachetype.h + create mode 100644 include/linux/crash_reserve.h + create mode 
100644 include/linux/vmcore_info.h + create mode 100644 kernel/crash_reserve.c + rename kernel/{crash_dump.c => elfcorehdr.c} (100%) + create mode 100644 kernel/vmcore_info.c + create mode 100644 tools/mm/thpmaps + create mode 100644 tools/testing/selftests/damon/damos_apply_interval.py + create mode 100644 tools/testing/selftests/damon/damos_quota.py + create mode 100644 tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c + create mode 100644 tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh + create mode 100644 tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c + create mode 100644 tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh + create mode 100644 tools/testing/selftests/filesystems/eventfd/.gitignore + create mode 100644 tools/testing/selftests/filesystems/eventfd/Makefile + create mode 100644 tools/testing/selftests/filesystems/eventfd/eventfd_test.c + create mode 100644 tools/testing/selftests/mm/hugetlb_madv_vs_map.c +Merging kbuild/for-next (ba3b759fb688 kconfig: lxdialog: fix cursor render in checklist) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git kbuild/for-next +Auto-merging Makefile +Auto-merging arch/parisc/Kconfig +Auto-merging lib/Kconfig.debug +Auto-merging mm/Kconfig +Auto-merging scripts/Makefile.lib +Merge made by the 'ort' strategy. + .gitignore | 1 + + Documentation/kbuild/kconfig.rst | 363 ++++++++++----------- + Makefile | 6 +- + arch/hexagon/Kconfig | 12 +- + arch/parisc/Kconfig | 4 +- + arch/riscv/kernel/tests/Kconfig.debug | 2 +- + arch/x86/kvm/Kconfig | 2 +- + arch/x86/xen/Kconfig | 1 - + drivers/acpi/Kconfig | 1 - + drivers/iommu/intel/Kconfig | 2 +- + drivers/md/Kconfig | 1 - + init/Kconfig | 2 +- + kernel/module/Kconfig | 3 +- + lib/Kconfig.debug | 2 +- + mm/Kconfig | 7 +- + net/dccp/ccids/Kconfig | 2 +- + scripts/Kbuild.include | 6 - + scripts/Makefile.build | 20 +- + scripts/Makefile.dtbinst | 32 +- + scripts/Makefile.lib | 6 + + scripts/kconfig/array_size.h | 11 + + scripts/kconfig/conf.c | 12 +- + scripts/kconfig/confdata.c | 91 +----- + scripts/kconfig/expr.h | 24 +- + scripts/kconfig/hashtable.h | 48 +++ + scripts/kconfig/internal.h | 12 + + scripts/kconfig/lexer.l | 128 ++++---- + scripts/kconfig/list.h | 254 ++++++++++---- + scripts/kconfig/list_types.h | 17 + + scripts/kconfig/lkc.h | 6 +- + scripts/kconfig/lkc_proto.h | 15 - + scripts/kconfig/lxdialog/checklist.c | 5 +- + scripts/kconfig/mconf.c | 1 + + scripts/kconfig/menu.c | 24 +- + scripts/kconfig/nconf.c | 1 + + scripts/kconfig/parser.y | 109 ++++--- + scripts/kconfig/preprocess.c | 23 +- + scripts/kconfig/preprocess.h | 19 ++ + scripts/kconfig/qconf.cc | 2 +- + scripts/kconfig/symbol.c | 45 +-- + .../tests/err_recursive_dep/expected_stderr | 24 +- + scripts/kconfig/util.c | 47 ++- + scripts/package/builddeb | 48 +-- + scripts/package/debian/rules | 63 +++- + scripts/package/kernel.spec | 35 +- + 45 files changed, 857 insertions(+), 682 deletions(-) + create mode 100644 scripts/kconfig/array_size.h + create mode 100644 scripts/kconfig/hashtable.h + create mode 100644 scripts/kconfig/list_types.h + create mode 100644 scripts/kconfig/preprocess.h +Merging clang-format/clang-format (5a205c6a9f79 clang-format: Update with v6.7-rc4's `for_each` macro list) +$ git merge -m Merge branch 'clang-format' of https://github.com/ojeda/linux.git clang-format/clang-format +Already up to date. 
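The drm-misc-fixes merge further up is the one entry here that did not merge cleanly, and its output shows git's rerere machinery at work: "Recorded preimage" is printed when the conflicted state of drivers/gpu/drm/tests/drm_buddy_test.c is saved, and "Recorded resolution" when the hand-resolved result is committed, so the same conflict can be replayed automatically the next time the tree is rebuilt. A minimal sketch of that flow, assuming rerere has been enabled for the repository:

    git config rerere.enabled true
    git merge drm-misc-fixes/for-linux-next-fixes
    # CONFLICT (content): ... Recorded preimage for '<file>'
    $EDITOR drivers/gpu/drm/tests/drm_buddy_test.c   # resolve by hand, once
    git commit --no-edit -v -a
    # Recorded resolution for '<file>': reused on future merges
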
+Merging perf/perf-tools-next (81901fc0640d perf build: Cleanup perf register configuration) +$ git merge -m Merge branch 'perf-tools-next' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git perf/perf-tools-next +Merge made by the 'ort' strategy. + tools/perf/Documentation/perf-report.txt | 29 +- + tools/perf/Documentation/perf-script-python.txt | 4 +- + tools/perf/Documentation/perf-stat.txt | 11 + + tools/perf/Documentation/perf-top.txt | 32 +- + tools/perf/Documentation/perf.txt | 2 + + tools/perf/Documentation/tips.txt | 31 +- + tools/perf/Makefile.config | 31 +- + tools/perf/Makefile.perf | 4 +- + tools/perf/arch/arm/util/perf_regs.c | 7 +- + tools/perf/arch/arm/util/pmu.c | 3 + + tools/perf/arch/arm64/util/machine.c | 2 + + tools/perf/arch/arm64/util/mem-events.c | 39 +- + tools/perf/arch/arm64/util/mem-events.h | 7 + + tools/perf/arch/arm64/util/perf_regs.c | 7 +- + tools/perf/arch/csky/util/perf_regs.c | 7 +- + tools/perf/arch/loongarch/util/perf_regs.c | 7 +- + tools/perf/arch/mips/util/perf_regs.c | 7 +- + tools/perf/arch/powerpc/util/Build | 1 + + tools/perf/arch/powerpc/util/kvm-stat.c | 2 +- + tools/perf/arch/powerpc/util/mem-events.c | 16 +- + tools/perf/arch/powerpc/util/mem-events.h | 7 + + tools/perf/arch/powerpc/util/perf_regs.c | 7 +- + tools/perf/arch/powerpc/util/pmu.c | 12 + + tools/perf/arch/riscv/util/perf_regs.c | 7 +- + tools/perf/arch/s390/util/perf_regs.c | 7 +- + tools/perf/arch/x86/tests/dwarf-unwind.c | 1 + + tools/perf/arch/x86/tests/hybrid.c | 5 +- + tools/perf/arch/x86/util/mem-events.c | 99 +- + tools/perf/arch/x86/util/mem-events.h | 10 + + tools/perf/arch/x86/util/perf_regs.c | 7 +- + tools/perf/arch/x86/util/pmu.c | 19 +- + tools/perf/arch/x86/util/tsc.c | 4 +- + tools/perf/builtin-c2c.c | 45 +- + tools/perf/builtin-mem.c | 48 +- + tools/perf/builtin-record.c | 19 +- + tools/perf/builtin-report.c | 4 +- + tools/perf/builtin-sched.c | 220 +-- + tools/perf/builtin-script.c | 21 +- + tools/perf/builtin-stat.c | 52 +- + tools/perf/builtin-top.c | 2 +- + tools/perf/builtin-version.c | 1 + + tools/perf/pmu-events/arch/powerpc/mapfile.csv | 1 + + tools/perf/pmu-events/jevents.py | 23 +- + tools/perf/tests/expand-cgroup.c | 3 +- + tools/perf/tests/maps.c | 3 + + tools/perf/tests/parse-events.c | 9 +- + tools/perf/tests/pmu-events.c | 22 +- + .../perf/tests/shell/lib/perf_json_output_lint.py | 4 +- + .../perf/tests/shell/lib/perf_metric_validation.py | 229 ++-- + tools/perf/tests/shell/lib/stat_output.sh | 12 + + tools/perf/tests/shell/stat+csv_output.sh | 2 + + tools/perf/tests/shell/stat+json_output.sh | 13 + + tools/perf/tests/shell/stat+std_output.sh | 4 +- + tools/perf/tests/shell/stat_bpf_counters.sh | 12 +- + tools/perf/tests/shell/stat_metrics_values.sh | 4 +- + tools/perf/tests/shell/test_arm_callgraph_fp.sh | 6 + + tools/perf/tests/thread-maps-share.c | 8 +- + tools/perf/tests/vmlinux-kallsyms.c | 10 +- + tools/perf/util/annotate-data.c | 119 +- + tools/perf/util/annotate-data.h | 8 +- + tools/perf/util/annotate.c | 153 ++- + tools/perf/util/annotate.h | 12 +- + tools/perf/util/bpf-event.c | 1 + + tools/perf/util/callchain.c | 2 +- + tools/perf/util/cpumap.c | 33 +- + tools/perf/util/cpumap.h | 19 +- + tools/perf/util/data.c | 10 +- + tools/perf/util/data.h | 6 +- + tools/perf/util/debug.c | 3 + + tools/perf/util/debug.h | 1 + + tools/perf/util/dwarf-aux.c | 187 ++- + tools/perf/util/dwarf-aux.h | 18 + + tools/perf/util/env.h | 1 + + tools/perf/util/event.c | 4 +- + tools/perf/util/evsel.c | 34 +- + tools/perf/util/evsel.h | 1 + 
+ tools/perf/util/expr.c | 20 +- + tools/perf/util/expr.l | 9 + + tools/perf/util/machine.c | 34 +- + tools/perf/util/map.c | 9 +- + tools/perf/util/maps.c | 1396 ++++++++++++-------- + tools/perf/util/maps.h | 65 +- + tools/perf/util/mem-events.c | 221 ++-- + tools/perf/util/mem-events.h | 19 +- + tools/perf/util/parse-events.c | 96 +- + tools/perf/util/parse-events.h | 14 +- + tools/perf/util/parse-events.y | 2 - + tools/perf/util/parse-regs-options.c | 8 +- + tools/perf/util/perf-regs-arch/perf_regs_aarch64.c | 4 - + tools/perf/util/perf-regs-arch/perf_regs_arm.c | 4 - + tools/perf/util/perf-regs-arch/perf_regs_csky.c | 4 - + .../perf/util/perf-regs-arch/perf_regs_loongarch.c | 4 - + tools/perf/util/perf-regs-arch/perf_regs_mips.c | 4 - + tools/perf/util/perf-regs-arch/perf_regs_powerpc.c | 4 - + tools/perf/util/perf-regs-arch/perf_regs_riscv.c | 4 - + tools/perf/util/perf-regs-arch/perf_regs_s390.c | 4 - + tools/perf/util/perf-regs-arch/perf_regs_x86.c | 4 - + tools/perf/util/perf_regs.c | 11 +- + tools/perf/util/perf_regs.h | 34 +- + tools/perf/util/pmu.c | 25 +- + tools/perf/util/pmu.h | 7 + + tools/perf/util/pmus.c | 6 - + tools/perf/util/pmus.h | 1 - + tools/perf/util/probe-event.c | 1 + + tools/perf/util/python-ext-sources | 1 + + tools/perf/util/python.c | 1 + + .../util/scripting-engines/trace-event-python.c | 8 +- + tools/perf/util/session.c | 11 + + tools/perf/util/session.h | 2 + + tools/perf/util/setup.py | 1 + + tools/perf/util/sort.c | 2 +- + tools/perf/util/srcline.c | 2 + + tools/perf/util/stat-display.c | 15 +- + tools/perf/util/stat-shadow.c | 7 +- + tools/perf/util/stat.h | 1 + + tools/perf/util/symbol-elf.c | 79 +- + tools/perf/util/symbol.c | 31 +- + tools/perf/util/thread.c | 2 +- + tools/perf/util/thread_map.c | 2 +- + tools/perf/util/trace-event-parse.c | 113 ++ + tools/perf/util/trace-event.h | 3 + + tools/perf/util/unwind-libdw.c | 2 +- + tools/perf/util/unwind-libunwind-local.c | 2 +- + tools/perf/util/unwind-libunwind.c | 7 +- + 124 files changed, 2692 insertions(+), 1437 deletions(-) + create mode 100644 tools/perf/arch/arm64/util/mem-events.h + create mode 100644 tools/perf/arch/powerpc/util/mem-events.h + create mode 100644 tools/perf/arch/powerpc/util/pmu.c + create mode 100644 tools/perf/arch/x86/util/mem-events.h +Merging compiler-attributes/compiler-attributes (2993eb7a8d34 Compiler Attributes: counted_by: fixup clang URL) +$ git merge -m Merge branch 'compiler-attributes' of https://github.com/ojeda/linux.git compiler-attributes/compiler-attributes +Merge made by the 'ort' strategy. + include/linux/compiler_attributes.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) +Merging dma-mapping/for-next (7c65aa3cc072 dma-debug: fix kernel-doc warnings) +$ git merge -m Merge branch 'for-next' of git://git.infradead.org/users/hch/dma-mapping.git dma-mapping/for-next +Already up to date. +Merging asm-generic/master (34b2321cc648 MAINTAINERS: Add Andreas Larsson as co-maintainer for arch/sparc) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git asm-generic/master +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. +Merging arc/for-next (0bb80ecc33a8 Linux 6.6-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git arc/for-next +Already up to date. 
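"Already up to date." means the named tip is already an ancestor of the current HEAD, so there is nothing to merge; this is typical for fixes branches whose contents have gone upstream since the last run. The same containment test can be scripted, using any branch from this log as the example:

    if git merge-base --is-ancestor arc/for-next HEAD; then
        echo "already merged"
    fi
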
+Merging arm/for-next (8790fade1a19 Merge branches 'misc' and 'fixes' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.armlinux.org.uk/~rmk/linux-arm.git arm/for-next +Already up to date. +Merging arm64/for-next/core (19cf01c42e5b Merge branch 'for-next/stage1-lpa2' into for-next/core) +$ git merge -m Merge branch 'for-next/core' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux arm64/for-next/core +Auto-merging Documentation/arch/arm64/silicon-errata.rst +Auto-merging Makefile +Auto-merging arch/arm64/Kconfig +Auto-merging arch/arm64/include/asm/cpufeature.h +Auto-merging arch/arm64/include/asm/pgtable.h +Auto-merging arch/arm64/kernel/Makefile +Auto-merging arch/arm64/kernel/signal.c +Auto-merging arch/arm64/mm/fault.c +Auto-merging arch/arm64/mm/fixmap.c +Auto-merging arch/arm64/mm/init.c +Auto-merging arch/arm64/mm/kasan_init.c +Auto-merging arch/arm64/mm/mmu.c +Auto-merging arch/arm64/mm/ptdump.c +Auto-merging arch/x86/Makefile +Auto-merging mm/mmap.c +Merge made by the 'ort' strategy. + Documentation/arch/arm64/silicon-errata.rst | 5 +- + Documentation/rust/arch-support.rst | 1 + + Makefile | 1 - + arch/arm64/Kconfig | 64 ++-- + arch/arm64/Makefile | 4 + + arch/arm64/configs/defconfig | 1 - + arch/arm64/include/asm/archrandom.h | 2 - + arch/arm64/include/asm/assembler.h | 55 ++- + arch/arm64/include/asm/brk-imm.h | 2 + + arch/arm64/include/asm/cpufeature.h | 116 +++++++ + arch/arm64/include/asm/esr.h | 13 +- + arch/arm64/include/asm/exception.h | 2 +- + arch/arm64/include/asm/fixmap.h | 2 +- + arch/arm64/include/asm/io.h | 12 +- + arch/arm64/include/asm/kasan.h | 2 - + arch/arm64/include/asm/kernel-pgtable.h | 103 +++--- + arch/arm64/include/asm/kvm_emulate.h | 10 +- + arch/arm64/include/asm/memory.h | 31 +- + arch/arm64/include/asm/mman.h | 36 ++ + arch/arm64/include/asm/mmu.h | 40 ++- + arch/arm64/include/asm/mmu_context.h | 83 ++--- + arch/arm64/include/asm/pgalloc.h | 53 ++- + arch/arm64/include/asm/pgtable-hwdef.h | 33 +- + arch/arm64/include/asm/pgtable-prot.h | 20 +- + arch/arm64/include/asm/pgtable-types.h | 6 + + arch/arm64/include/asm/pgtable.h | 237 +++++++++++-- + arch/arm64/include/asm/scs.h | 36 +- + arch/arm64/include/asm/setup.h | 3 - + arch/arm64/include/asm/tlb.h | 3 + + arch/arm64/kernel/Makefile | 13 +- + arch/arm64/kernel/cpufeature.c | 102 +++--- + arch/arm64/kernel/entry-common.c | 36 +- + arch/arm64/kernel/head.S | 463 +++----------------------- + arch/arm64/kernel/image-vars.h | 35 ++ + arch/arm64/kernel/kaslr.c | 4 +- + arch/arm64/kernel/module.c | 2 +- + arch/arm64/kernel/pi/Makefile | 27 +- + arch/arm64/kernel/{ => pi}/idreg-override.c | 80 +++-- + arch/arm64/kernel/pi/kaslr_early.c | 82 ++--- + arch/arm64/kernel/pi/map_kernel.c | 276 +++++++++++++++ + arch/arm64/kernel/pi/map_range.c | 105 ++++++ + arch/arm64/kernel/{ => pi}/patch-scs.c | 36 +- + arch/arm64/kernel/pi/pi.h | 36 ++ + arch/arm64/kernel/pi/relacheck.c | 130 ++++++++ + arch/arm64/kernel/pi/relocate.c | 64 ++++ + arch/arm64/kernel/probes/kprobes.c | 21 +- + arch/arm64/kernel/probes/kprobes_trampoline.S | 78 +---- + arch/arm64/kernel/setup.c | 22 -- + arch/arm64/kernel/signal.c | 39 +-- + arch/arm64/kernel/sleep.S | 3 - + arch/arm64/kernel/vmlinux.lds.S | 17 +- + arch/arm64/kvm/mmu.c | 17 +- + arch/arm64/mm/fault.c | 30 +- + arch/arm64/mm/fixmap.c | 39 +-- + arch/arm64/mm/init.c | 2 +- + arch/arm64/mm/kasan_init.c | 165 +++++++-- + arch/arm64/mm/mmap.c | 4 + + arch/arm64/mm/mmu.c | 253 ++++++++------ + arch/arm64/mm/pgd.c | 17 +- + arch/arm64/mm/proc.S | 122 +++++-- 
+ arch/arm64/mm/ptdump.c | 77 +++-- + arch/arm64/tools/cpucaps | 1 + + arch/arm64/tools/sysreg | 8 +- + arch/loongarch/Makefile | 1 + + arch/x86/Makefile | 1 + + include/linux/mman.h | 15 + + mm/mmap.c | 3 + + rust/Makefile | 6 +- + scripts/Makefile | 4 +- + scripts/generate_rust_target.rs | 4 +- + 70 files changed, 2102 insertions(+), 1314 deletions(-) + rename arch/arm64/kernel/{ => pi}/idreg-override.c (83%) + create mode 100644 arch/arm64/kernel/pi/map_kernel.c + create mode 100644 arch/arm64/kernel/pi/map_range.c + rename arch/arm64/kernel/{ => pi}/patch-scs.c (89%) + create mode 100644 arch/arm64/kernel/pi/pi.h + create mode 100644 arch/arm64/kernel/pi/relacheck.c + create mode 100644 arch/arm64/kernel/pi/relocate.c +Merging arm-perf/for-next/perf (fd185a245155 perf/arm_cspmu: Add devicetree support) +$ git merge -m Merge branch 'for-next/perf' of git://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git arm-perf/for-next/perf +Auto-merging drivers/perf/arm-cmn.c +Merge made by the 'ort' strategy. + .../bindings/perf/arm,coresight-pmu.yaml | 39 +++++ + drivers/perf/alibaba_uncore_drw_pmu.c | 6 +- + drivers/perf/amlogic/meson_g12_ddr_pmu.c | 6 +- + drivers/perf/arm-cci.c | 8 +- + drivers/perf/arm-ccn.c | 6 +- + drivers/perf/arm-cmn.c | 14 +- + drivers/perf/arm_cspmu/arm_cspmu.c | 159 ++++++++++++--------- + drivers/perf/arm_cspmu/arm_cspmu.h | 1 + + drivers/perf/arm_cspmu/nvidia_cspmu.c | 6 - + drivers/perf/arm_dmc620_pmu.c | 6 +- + drivers/perf/arm_dsu_pmu.c | 6 +- + drivers/perf/arm_smmuv3_pmu.c | 6 +- + drivers/perf/arm_spe_pmu.c | 5 +- + drivers/perf/fsl_imx8_ddr_perf.c | 5 +- + drivers/perf/fsl_imx9_ddr_perf.c | 6 +- + drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c | 5 +- + drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c | 5 +- + drivers/perf/hisilicon/hisi_uncore_hha_pmu.c | 5 +- + drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c | 5 +- + drivers/perf/hisilicon/hisi_uncore_pa_pmu.c | 5 +- + drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c | 5 +- + drivers/perf/marvell_cn10k_ddr_pmu.c | 5 +- + drivers/perf/marvell_cn10k_tad_pmu.c | 6 +- + drivers/perf/qcom_l2_pmu.c | 5 +- + drivers/perf/thunderx2_pmu.c | 5 +- + drivers/perf/xgene_pmu.c | 6 +- + 26 files changed, 178 insertions(+), 158 deletions(-) + create mode 100644 Documentation/devicetree/bindings/perf/arm,coresight-pmu.yaml +Merging arm-soc/for-next (37156e9b9974 soc: document merges) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/soc/soc.git arm-soc/for-next +Auto-merging MAINTAINERS +Auto-merging arch/arm64/configs/defconfig +Merge made by the 'ort' strategy. 
+ .../devicetree/bindings/i2c/i2c-exynos5.yaml | 1 + + .../memory-controllers/nvidia,tegra20-emc.yaml | 2 +- + .../soc/samsung/samsung,exynos-sysreg.yaml | 2 + + MAINTAINERS | 2 - + arch/arm/arm-soc-for-next-contents.txt | 39 +++++ + arch/arm/configs/multi_v7_defconfig | 18 +- + arch/arm/configs/shmobile_defconfig | 2 - + arch/arm64/configs/defconfig | 1 + + arch/arm64/configs/virt.config | 4 + + drivers/memory/emif.c | 65 +++---- + drivers/soc/mediatek/Kconfig | 9 + + drivers/soc/mediatek/Makefile | 1 + + drivers/soc/mediatek/mtk-socinfo.c | 191 +++++++++++++++++++++ + drivers/soc/renesas/Kconfig | 17 +- + drivers/soc/renesas/rcar-rst.c | 1 + + drivers/soc/renesas/renesas-soc.c | 8 + + drivers/soc/tegra/fuse/fuse-tegra30.c | 3 +- + 17 files changed, 315 insertions(+), 51 deletions(-) + create mode 100644 arch/arm/arm-soc-for-next-contents.txt + create mode 100644 drivers/soc/mediatek/mtk-socinfo.c +Merging amlogic/for-next (8026dced77f2 Merge branch 'v6.9/arm64-dt' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/amlogic/linux.git amlogic/for-next +Merge made by the 'ort' strategy. + Documentation/devicetree/bindings/arm/amlogic.yaml | 1 + + .../devicetree/bindings/vendor-prefixes.yaml | 2 + + arch/arm/boot/dts/amlogic/meson.dtsi | 6 +- + arch/arm/boot/dts/amlogic/meson8.dtsi | 1 - + arch/arm/boot/dts/amlogic/meson8b.dtsi | 1 - + arch/arm64/boot/dts/amlogic/Makefile | 6 + + arch/arm64/boot/dts/amlogic/amlogic-c3.dtsi | 7 + + arch/arm64/boot/dts/amlogic/amlogic-t7.dtsi | 2 +- + arch/arm64/boot/dts/amlogic/meson-a1-ad402.dts | 2 +- + arch/arm64/boot/dts/amlogic/meson-a1.dtsi | 2 + + .../dts/amlogic/meson-axg-jethome-jethub-j1xx.dtsi | 30 +- + arch/arm64/boot/dts/amlogic/meson-axg-s400.dts | 16 +- + arch/arm64/boot/dts/amlogic/meson-axg.dtsi | 8 + + arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi | 3 + + .../boot/dts/amlogic/meson-g12a-fbx8am-brcm.dtso | 35 ++ + .../dts/amlogic/meson-g12a-fbx8am-realtek.dtso | 25 ++ + arch/arm64/boot/dts/amlogic/meson-g12a-fbx8am.dts | 462 +++++++++++++++++++++ + .../boot/dts/amlogic/meson-g12a-radxa-zero.dts | 12 +- + arch/arm64/boot/dts/amlogic/meson-g12a-sei510.dts | 14 +- + arch/arm64/boot/dts/amlogic/meson-g12a-u200.dts | 16 +- + arch/arm64/boot/dts/amlogic/meson-g12a-x96-max.dts | 14 +- + .../boot/dts/amlogic/meson-g12b-odroid-n2.dtsi | 2 +- + arch/arm64/boot/dts/amlogic/meson-g12b-odroid.dtsi | 20 +- + arch/arm64/boot/dts/amlogic/meson-g12b-w400.dtsi | 10 +- + .../boot/dts/amlogic/meson-gx-libretech-pc.dtsi | 12 +- + .../arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi | 8 +- + .../boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts | 6 +- + .../arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts | 8 +- + arch/arm64/boot/dts/amlogic/meson-gxbb-p200.dts | 4 +- + arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi | 6 +- + .../boot/dts/amlogic/meson-gxbb-vega-s95.dtsi | 8 +- + arch/arm64/boot/dts/amlogic/meson-gxbb-wetek.dtsi | 8 +- + .../dts/amlogic/meson-gxl-s805x-libretech-ac.dts | 8 +- + .../boot/dts/amlogic/meson-gxl-s805x-p241.dts | 8 +- + .../amlogic/meson-gxl-s905w-jethome-jethub-j80.dts | 8 +- + .../dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts | 6 +- + .../amlogic/meson-gxl-s905x-libretech-cc-v2.dts | 12 +- + .../dts/amlogic/meson-gxl-s905x-libretech-cc.dts | 6 +- + .../dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts | 6 +- + .../boot/dts/amlogic/meson-gxl-s905x-p212.dtsi | 8 +- + .../boot/dts/amlogic/meson-gxm-khadas-vim2.dts | 8 +- + .../dts/amlogic/meson-gxm-s912-libretech-pc.dts | 2 +- + 
arch/arm64/boot/dts/amlogic/meson-khadas-vim3.dtsi | 16 +- + .../dts/amlogic/meson-libretech-cottonwood.dtsi | 6 +- + arch/arm64/boot/dts/amlogic/meson-sm1-ac2xx.dtsi | 10 +- + .../arm64/boot/dts/amlogic/meson-sm1-bananapi.dtsi | 14 +- + .../boot/dts/amlogic/meson-sm1-odroid-hc4.dts | 4 +- + arch/arm64/boot/dts/amlogic/meson-sm1-odroid.dtsi | 20 +- + arch/arm64/boot/dts/amlogic/meson-sm1-sei610.dts | 12 +- + 49 files changed, 722 insertions(+), 189 deletions(-) + create mode 100644 arch/arm64/boot/dts/amlogic/meson-g12a-fbx8am-brcm.dtso + create mode 100644 arch/arm64/boot/dts/amlogic/meson-g12a-fbx8am-realtek.dtso + create mode 100644 arch/arm64/boot/dts/amlogic/meson-g12a-fbx8am.dts +Merging asahi-soc/asahi-soc/for-next (ffc253263a13 Linux 6.6) +$ git merge -m Merge branch 'asahi-soc/for-next' of https://github.com/AsahiLinux/linux.git asahi-soc/asahi-soc/for-next +Already up to date. +Merging aspeed/for-next (e60f7a99d378 ARM: dts: aspeed: minerva: add sgpio line name) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/joel/bmc.git aspeed/for-next +Merge made by the 'ort' strategy. + .../devicetree/bindings/arm/aspeed/aspeed.yaml | 4 + + arch/arm/boot/dts/aspeed/Makefile | 6 +- + .../dts/aspeed/aspeed-bmc-asrock-e3c256d4i.dts | 322 ++++++++++++ + .../dts/aspeed/aspeed-bmc-asrock-spc621d8hm3.dts | 324 ++++++++++++ + .../boot/dts/aspeed/aspeed-bmc-asrock-x570d4u.dts | 377 +++++++++++++ + .../boot/dts/aspeed/aspeed-bmc-facebook-harma.dts | 585 +++++++++++++++++++++ + .../dts/aspeed/aspeed-bmc-facebook-minerva-cmc.dts | 265 ---------- + .../dts/aspeed/aspeed-bmc-facebook-minerva.dts | 543 +++++++++++++++++++ + 8 files changed, 2160 insertions(+), 266 deletions(-) + create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-e3c256d4i.dts + create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-spc621d8hm3.dts + create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-x570d4u.dts + create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-harma.dts + delete mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva-cmc.dts + create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +Merging at91/at91-next (859f600457cc Merge branch 'clk-microchip' into at91-next) +$ git merge -m Merge branch 'at91-next' of git://git.kernel.org/pub/scm/linux/kernel/git/at91/linux.git at91/at91-next +Merge made by the 'ort' strategy. + .../bindings/net/can/microchip,mpfs-can.yaml | 6 +- + .../at91sam9g25-gardena-smart-gateway.dts | 2 + + arch/arm/boot/dts/microchip/at91sam9x5ek.dtsi | 2 + + drivers/clk/microchip/clk-mpfs.c | 154 ++++++++++++--------- + include/dt-bindings/clock/microchip,mpfs-clock.h | 5 + + 5 files changed, 100 insertions(+), 69 deletions(-) +Merging broadcom/next (412c6bd2c649 Merge branch 'soc/next' into next) +$ git merge -m Merge branch 'next' of https://github.com/Broadcom/stblinux.git broadcom/next +Merge made by the 'ort' strategy. + Documentation/devicetree/bindings/bus/brcm,gisb-arb.yaml | 1 + + arch/arm/include/debug/brcmstb.S | 8 +++++--- + .../boot/dts/broadcom/bcmbca/bcm4908-asus-gt-ac5300.dts | 13 +++++++------ + arch/arm64/boot/dts/broadcom/bcmbca/bcm4908.dtsi | 3 --- + drivers/bus/brcmstb_gisb.c | 15 +++++++++++++++ + 5 files changed, 28 insertions(+), 12 deletions(-) +Merging davinci/davinci/for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'davinci/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git davinci/davinci/for-next +Already up to date. 
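The statistics printed after each successful merge come from the "git diff -M --stat --summary HEAD^.." invocation visible after the conflicted merge above: -M turns on rename detection, so a moved file shows up as "rename {old => new} (81%)" with its similarity score rather than as a delete/create pair, and --summary appends the "create mode 100644", "delete mode", and rename lines after the per-file change counts. To reproduce the stat block for the most recent merge commit:

    git diff -M --stat --summary HEAD^..HEAD
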
+Merging drivers-memory/for-next (2f542c937c48 dt-bindings: memory-controllers: narrow regex for unit address to hex numbers) +$ git merge -m Merge branch 'for-next' of https://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-mem-ctrl.git drivers-memory/for-next +Already up to date. +Merging imx-mxs/for-next (d93b6c641bc9 Merge branch 'imx/defconfig' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux.git imx-mxs/for-next +Auto-merging arch/arm64/boot/dts/freescale/Makefile +Auto-merging arch/arm64/configs/defconfig +Merge made by the 'ort' strategy. + Documentation/devicetree/bindings/arm/fsl.yaml | 40 +- + arch/arm/boot/dts/nxp/imx/Makefile | 2 + + arch/arm/boot/dts/nxp/imx/imx6dl-sielaff.dts | 533 +++++++++++++++++++ + .../boot/dts/nxp/imx/imx6q-apalis-eval-v1.2.dts | 200 ++++++++ + arch/arm/boot/dts/nxp/imx/imx6q-apalis-eval.dts | 108 +--- + arch/arm/boot/dts/nxp/imx/imx6q-apalis-eval.dtsi | 120 +++++ + .../arm/boot/dts/nxp/imx/imx6qdl-hummingboard.dtsi | 7 +- + .../boot/dts/nxp/imx/imx6qdl-hummingboard2.dtsi | 5 + + arch/arm/boot/dts/nxp/imx/imx6qdl-skov-cpu.dtsi | 10 +- + .../boot/dts/nxp/imx/imx6sl-tolino-shine2hd.dts | 6 +- + arch/arm/boot/dts/nxp/imx/imx6ul.dtsi | 1 + + .../dts/nxp/imx/imx6ull-dhcom-som-cfg-sdcard.dtsi | 4 +- + arch/arm/boot/dts/nxp/imx/imx6ull-dhcom-som.dtsi | 2 +- + arch/arm/boot/dts/nxp/imx/imx6ull-dhcor-som.dtsi | 7 +- + arch/arm/boot/dts/nxp/imx/imx7-mba7.dtsi | 315 +++++++----- + arch/arm/boot/dts/nxp/imx/imx7-tqma7.dtsi | 144 +++--- + arch/arm/boot/dts/nxp/imx/imx7d-mba7.dts | 94 ++-- + arch/arm/boot/dts/nxp/ls/ls1021a.dtsi | 1 + + arch/arm/configs/imx_v6_v7_defconfig | 1 + + arch/arm/mach-imx/mmdc.c | 6 +- + arch/arm64/boot/dts/freescale/Makefile | 6 + + arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi | 10 +- + arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi | 1 - + .../boot/dts/freescale/imx8-apalis-eval-v1.1.dtsi | 26 + + .../boot/dts/freescale/imx8-apalis-eval-v1.2.dtsi | 124 +++++ + .../arm64/boot/dts/freescale/imx8-apalis-eval.dtsi | 22 - + arch/arm64/boot/dts/freescale/imx8-ss-audio.dtsi | 330 ++++++++++++ + arch/arm64/boot/dts/freescale/imx8-ss-gpu0.dtsi | 27 + + arch/arm64/boot/dts/freescale/imx8dxl-evk.dts | 101 ++++ + arch/arm64/boot/dts/freescale/imx8dxl-ss-adma.dtsi | 12 + + .../boot/dts/freescale/imx8dxp-tqma8xdp-mba8xx.dts | 16 + + .../arm64/boot/dts/freescale/imx8dxp-tqma8xdp.dtsi | 24 + + arch/arm64/boot/dts/freescale/imx8dxp.dtsi | 24 + + arch/arm64/boot/dts/freescale/imx8mm-evk.dtsi | 36 ++ + .../boot/dts/freescale/imx8mm-kontron-bl-osm-s.dts | 295 ++++------- + .../arm64/boot/dts/freescale/imx8mm-kontron-bl.dts | 38 +- + .../boot/dts/freescale/imx8mm-kontron-osm-s.dtsi | 567 ++++++++++++++++++++- + .../boot/dts/freescale/imx8mm-kontron-sl.dtsi | 4 +- + .../boot/dts/freescale/imx8mm-venice-gw71xx.dtsi | 40 +- + .../boot/dts/freescale/imx8mm-venice-gw7901.dts | 14 +- + arch/arm64/boot/dts/freescale/imx8mn-evk.dtsi | 36 ++ + .../boot/dts/freescale/imx8mn-rve-gateway.dts | 2 +- + .../dts/freescale/imx8mp-data-modul-edm-sbc.dts | 16 + + .../dts/freescale/imx8mp-phyboard-pollux-rdk.dts | 74 +++ + .../boot/dts/freescale/imx8mp-venice-gw71xx.dtsi | 10 +- + arch/arm64/boot/dts/freescale/imx8mp-verdin.dtsi | 3 +- + arch/arm64/boot/dts/freescale/imx8mp.dtsi | 12 +- + .../boot/dts/freescale/imx8qm-apalis-eval-v1.2.dts | 16 + + .../boot/dts/freescale/imx8qm-apalis-eval.dts | 2 +- + .../dts/freescale/imx8qm-apalis-v1.1-eval-v1.2.dts | 26 + + 
.../boot/dts/freescale/imx8qm-apalis-v1.1-eval.dts | 2 +- + arch/arm64/boot/dts/freescale/imx8qm-ss-conn.dtsi | 5 + + arch/arm64/boot/dts/freescale/imx8qm-ss-dma.dtsi | 29 +- + arch/arm64/boot/dts/freescale/imx8qm.dtsi | 41 ++ + .../boot/dts/freescale/imx8qxp-tqma8xqp-mba8xx.dts | 16 + + .../arm64/boot/dts/freescale/imx8qxp-tqma8xqp.dtsi | 14 + + arch/arm64/boot/dts/freescale/imx8qxp.dtsi | 8 + + .../boot/dts/freescale/imx93-phyboard-segin.dts | 117 +++++ + .../boot/dts/freescale/imx93-phycore-som.dtsi | 126 +++++ + arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi | 4 +- + .../boot/dts/freescale/imx93-var-som-symphony.dts | 351 +++++++++++++ + arch/arm64/boot/dts/freescale/imx93-var-som.dtsi | 111 ++++ + arch/arm64/boot/dts/freescale/imx93.dtsi | 4 +- + arch/arm64/boot/dts/freescale/mba8xx.dtsi | 553 ++++++++++++++++++++ + arch/arm64/boot/dts/freescale/tqma8xx.dtsi | 265 ++++++++++ + arch/arm64/configs/defconfig | 8 + + 66 files changed, 4500 insertions(+), 674 deletions(-) + create mode 100644 arch/arm/boot/dts/nxp/imx/imx6dl-sielaff.dts + create mode 100644 arch/arm/boot/dts/nxp/imx/imx6q-apalis-eval-v1.2.dts + create mode 100644 arch/arm/boot/dts/nxp/imx/imx6q-apalis-eval.dtsi + create mode 100644 arch/arm64/boot/dts/freescale/imx8-apalis-eval-v1.1.dtsi + create mode 100644 arch/arm64/boot/dts/freescale/imx8-apalis-eval-v1.2.dtsi + create mode 100644 arch/arm64/boot/dts/freescale/imx8-ss-gpu0.dtsi + create mode 100644 arch/arm64/boot/dts/freescale/imx8dxp-tqma8xdp-mba8xx.dts + create mode 100644 arch/arm64/boot/dts/freescale/imx8dxp-tqma8xdp.dtsi + create mode 100644 arch/arm64/boot/dts/freescale/imx8dxp.dtsi + create mode 100644 arch/arm64/boot/dts/freescale/imx8qm-apalis-eval-v1.2.dts + create mode 100644 arch/arm64/boot/dts/freescale/imx8qm-apalis-v1.1-eval-v1.2.dts + create mode 100644 arch/arm64/boot/dts/freescale/imx8qxp-tqma8xqp-mba8xx.dts + create mode 100644 arch/arm64/boot/dts/freescale/imx8qxp-tqma8xqp.dtsi + create mode 100644 arch/arm64/boot/dts/freescale/imx93-phyboard-segin.dts + create mode 100644 arch/arm64/boot/dts/freescale/imx93-phycore-som.dtsi + create mode 100644 arch/arm64/boot/dts/freescale/imx93-var-som-symphony.dts + create mode 100644 arch/arm64/boot/dts/freescale/imx93-var-som.dtsi + create mode 100644 arch/arm64/boot/dts/freescale/mba8xx.dtsi + create mode 100644 arch/arm64/boot/dts/freescale/tqma8xx.dtsi +Merging mediatek/for-next (ba90af39ba57 arm64: dts: mediatek: mt8183-pico6: Fix wake-on-X event node names) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mediatek/linux.git mediatek/for-next +Auto-merging Documentation/devicetree/bindings/vendor-prefixes.yaml +Auto-merging arch/arm64/boot/dts/mediatek/mt8183-kukui.dtsi +Auto-merging arch/arm64/boot/dts/mediatek/mt8192-asurada.dtsi +Auto-merging arch/arm64/boot/dts/mediatek/mt8195-demo.dts +Merge made by the 'ort' strategy. 
+ .../devicetree/bindings/arm/mediatek.yaml | 198 ++- + .../bindings/media/mediatek,vcodec-encoder.yaml | 31 +- + .../bindings/media/mediatek-jpeg-encoder.yaml | 3 +- + .../devicetree/bindings/vendor-prefixes.yaml | 2 + + arch/arm64/boot/dts/mediatek/Makefile | 14 + + arch/arm64/boot/dts/mediatek/mt2712-evb.dts | 4 +- + arch/arm64/boot/dts/mediatek/mt2712e.dtsi | 2 +- + arch/arm64/boot/dts/mediatek/mt6797.dtsi | 8 +- + .../boot/dts/mediatek/mt7622-bananapi-bpi-r64.dts | 13 + + arch/arm64/boot/dts/mediatek/mt7622-rfb1.dts | 25 + + .../boot/dts/mediatek/mt7981b-xiaomi-ax3000t.dts | 15 + + arch/arm64/boot/dts/mediatek/mt7981b.dtsi | 105 ++ + .../dts/mediatek/mt7986a-acelink-ew-7886cax.dts | 173 ++ + .../dts/mediatek/mt7986a-bananapi-bpi-r3-nand.dtso | 2 +- + .../boot/dts/mediatek/mt7986a-bananapi-bpi-r3.dts | 2 +- + arch/arm64/boot/dts/mediatek/mt7986a-rfb.dts | 31 +- + arch/arm64/boot/dts/mediatek/mt7986a.dtsi | 222 +-- + arch/arm64/boot/dts/mediatek/mt7986b-rfb.dts | 31 +- + .../boot/dts/mediatek/mt7988a-bananapi-bpi-r4.dts | 11 + + arch/arm64/boot/dts/mediatek/mt7988a.dtsi | 136 ++ + .../boot/dts/mediatek/mt8173-elm-hana-rev7.dts | 2 +- + arch/arm64/boot/dts/mediatek/mt8173-elm.dtsi | 3 +- + arch/arm64/boot/dts/mediatek/mt8173-evb.dts | 2 +- + arch/arm64/boot/dts/mediatek/mt8173.dtsi | 19 +- + .../dts/mediatek/mt8183-kukui-jacuzzi-pico6.dts | 8 +- + .../boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi | 25 +- + .../boot/dts/mediatek/mt8183-kukui-kakadu.dtsi | 4 + + .../boot/dts/mediatek/mt8183-kukui-kodama.dtsi | 4 + + .../boot/dts/mediatek/mt8183-kukui-krane.dtsi | 4 + + arch/arm64/boot/dts/mediatek/mt8183-kukui.dtsi | 7 +- + arch/arm64/boot/dts/mediatek/mt8183-pumpkin.dts | 2 +- + arch/arm64/boot/dts/mediatek/mt8183.dtsi | 11 +- + .../boot/dts/mediatek/mt8186-corsola-krabby.dtsi | 129 ++ + .../mediatek/mt8186-corsola-magneton-sku393216.dts | 39 + + .../mediatek/mt8186-corsola-magneton-sku393217.dts | 39 + + .../mediatek/mt8186-corsola-magneton-sku393218.dts | 26 + + .../mediatek/mt8186-corsola-rusty-sku196608.dts | 26 + + .../mediatek/mt8186-corsola-steelix-sku131072.dts | 18 + + .../mediatek/mt8186-corsola-steelix-sku131073.dts | 18 + + .../boot/dts/mediatek/mt8186-corsola-steelix.dtsi | 199 +++ + .../mt8186-corsola-tentacool-sku327681.dts | 57 + + .../mt8186-corsola-tentacool-sku327683.dts | 24 + + .../mt8186-corsola-tentacruel-sku262144.dts | 44 + + .../mt8186-corsola-tentacruel-sku262148.dts | 26 + + arch/arm64/boot/dts/mediatek/mt8186-corsola.dtsi | 1681 ++++++++++++++++++++ + arch/arm64/boot/dts/mediatek/mt8186.dtsi | 93 +- + arch/arm64/boot/dts/mediatek/mt8192-asurada.dtsi | 5 +- + arch/arm64/boot/dts/mediatek/mt8192.dtsi | 10 +- + .../boot/dts/mediatek/mt8195-cherry-tomato-r1.dts | 4 + + .../boot/dts/mediatek/mt8195-cherry-tomato-r2.dts | 4 + + .../boot/dts/mediatek/mt8195-cherry-tomato-r3.dts | 4 + + arch/arm64/boot/dts/mediatek/mt8195-cherry.dtsi | 27 +- + arch/arm64/boot/dts/mediatek/mt8195-demo.dts | 18 +- + arch/arm64/boot/dts/mediatek/mt8195-evb.dts | 12 + + arch/arm64/boot/dts/mediatek/mt8195.dtsi | 128 +- + .../boot/dts/mediatek/mt8395-genio-1200-evk.dts | 17 +- + .../boot/dts/mediatek/mt8395-radxa-nio-12l.dts | 825 ++++++++++ + drivers/soc/mediatek/Kconfig | 1 + + 58 files changed, 4287 insertions(+), 306 deletions(-) + create mode 100644 arch/arm64/boot/dts/mediatek/mt7981b-xiaomi-ax3000t.dts + create mode 100644 arch/arm64/boot/dts/mediatek/mt7981b.dtsi + create mode 100644 arch/arm64/boot/dts/mediatek/mt7986a-acelink-ew-7886cax.dts + create mode 100644 
arch/arm64/boot/dts/mediatek/mt7988a-bananapi-bpi-r4.dts + create mode 100644 arch/arm64/boot/dts/mediatek/mt7988a.dtsi + create mode 100644 arch/arm64/boot/dts/mediatek/mt8186-corsola-krabby.dtsi + create mode 100644 arch/arm64/boot/dts/mediatek/mt8186-corsola-magneton-sku393216.dts + create mode 100644 arch/arm64/boot/dts/mediatek/mt8186-corsola-magneton-sku393217.dts + create mode 100644 arch/arm64/boot/dts/mediatek/mt8186-corsola-magneton-sku393218.dts + create mode 100644 arch/arm64/boot/dts/mediatek/mt8186-corsola-rusty-sku196608.dts + create mode 100644 arch/arm64/boot/dts/mediatek/mt8186-corsola-steelix-sku131072.dts + create mode 100644 arch/arm64/boot/dts/mediatek/mt8186-corsola-steelix-sku131073.dts + create mode 100644 arch/arm64/boot/dts/mediatek/mt8186-corsola-steelix.dtsi + create mode 100644 arch/arm64/boot/dts/mediatek/mt8186-corsola-tentacool-sku327681.dts + create mode 100644 arch/arm64/boot/dts/mediatek/mt8186-corsola-tentacool-sku327683.dts + create mode 100644 arch/arm64/boot/dts/mediatek/mt8186-corsola-tentacruel-sku262144.dts + create mode 100644 arch/arm64/boot/dts/mediatek/mt8186-corsola-tentacruel-sku262148.dts + create mode 100644 arch/arm64/boot/dts/mediatek/mt8186-corsola.dtsi + create mode 100644 arch/arm64/boot/dts/mediatek/mt8395-radxa-nio-12l.dts +Merging mvebu/for-next (476887312c60 Merge branch 'mvebu/drivers' into mvebu/for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gclement/mvebu.git mvebu/for-next +Merge made by the 'ort' strategy. +Merging omap/for-next (0012c1958460 Merge branches 'sgx-for-v6.9' and 'omap-for-v6.9/soc' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap.git omap/for-next +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. 
+ .../{img,powervr.yaml => img,powervr-rogue.yaml} | 4 +- + .../devicetree/bindings/gpu/img,powervr-sgx.yaml | 138 +++++++++++++++++++++ + MAINTAINERS | 3 +- + arch/arm/boot/dts/ti/omap/am33xx.dtsi | 9 +- + arch/arm/boot/dts/ti/omap/am3517.dtsi | 11 +- + arch/arm/boot/dts/ti/omap/am4372.dtsi | 6 + + arch/arm/boot/dts/ti/omap/dra7.dtsi | 9 +- + arch/arm/boot/dts/ti/omap/omap34xx.dtsi | 11 +- + arch/arm/boot/dts/ti/omap/omap36xx.dtsi | 9 +- + arch/arm/boot/dts/ti/omap/omap4.dtsi | 9 +- + arch/arm/boot/dts/ti/omap/omap5.dtsi | 9 +- + arch/arm/mach-omap2/am33xx-restart.c | 2 +- + arch/arm/mach-omap2/clkt2xxx_virt_prcm_set.c | 2 +- + arch/arm/mach-omap2/clockdomain.c | 4 +- + arch/arm/mach-omap2/cm33xx.c | 2 +- + arch/arm/mach-omap2/cminst44xx.c | 2 +- + arch/arm/mach-omap2/omap-secure.c | 4 +- + arch/arm/mach-omap2/omap_hwmod.c | 9 +- + arch/arm/mach-omap2/omap_hwmod_common_data.c | 6 +- + arch/arm/mach-omap2/pmic-cpcap.c | 24 ++-- + arch/arm/mach-omap2/powerdomain.c | 2 +- + arch/arm/mach-omap2/prm44xx.c | 2 +- + arch/arm/mach-omap2/prm_common.c | 4 +- + arch/arm/mach-omap2/wd_timer.c | 4 +- + arch/arm64/boot/dts/ti/k3-am65-main.dtsi | 7 ++ + 25 files changed, 231 insertions(+), 61 deletions(-) + rename Documentation/devicetree/bindings/gpu/{img,powervr.yaml => img,powervr-rogue.yaml} (91%) + create mode 100644 Documentation/devicetree/bindings/gpu/img,powervr-sgx.yaml +Merging qcom/for-next (f6265e31fc71 Merge branches 'arm32-for-6.9', 'arm64-defconfig-for-6.9', 'arm64-fixes-for-6.8', 'arm64-for-6.9', 'clk-for-6.9' and 'drivers-for-6.9' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux.git qcom/for-next +Auto-merging arch/arm/boot/dts/qcom/qcom-sdx55.dtsi +Auto-merging arch/arm64/boot/dts/qcom/ipq6018.dtsi +Auto-merging arch/arm64/boot/dts/qcom/ipq8074.dtsi +Auto-merging arch/arm64/configs/defconfig +Merge made by the 'ort' strategy. 
+ .../devicetree/bindings/arm/msm/qcom,saw2.txt | 58 - + Documentation/devicetree/bindings/arm/qcom.yaml | 58 +- + .../bindings/clock/qcom,gcc-sc8180x.yaml | 7 + + .../devicetree/bindings/clock/qcom,gpucc.yaml | 9 + + .../devicetree/bindings/clock/qcom,q6sstopcc.yaml | 2 +- + .../devicetree/bindings/clock/qcom,sc7180-mss.yaml | 61 - + .../bindings/clock/qcom,sm8450-camcc.yaml | 2 + + .../bindings/clock/qcom,sm8450-gpucc.yaml | 2 + + .../bindings/clock/qcom,sm8550-dispcc.yaml | 7 +- + .../bindings/clock/qcom,sm8550-tcsr.yaml | 1 + + .../bindings/clock/qcom,sm8650-dispcc.yaml | 106 - + .../devicetree/bindings/soc/qcom/qcom,pbs.yaml | 46 + + .../bindings/soc/qcom/qcom,pmic-glink.yaml | 2 + + .../bindings/soc/qcom/qcom,rpm-master-stats.yaml | 2 + + .../soc/qcom/{qcom,spm.yaml => qcom,saw2.yaml} | 46 +- + arch/arm/Makefile | 4 +- + arch/arm/boot/dts/qcom/Makefile | 1 + + arch/arm/boot/dts/qcom/qcom-apq8026-lg-lenok.dts | 38 + + .../dts/qcom/qcom-apq8026-samsung-matisse-wifi.dts | 452 +--- + arch/arm/boot/dts/qcom/qcom-apq8064.dtsi | 70 +- + arch/arm/boot/dts/qcom/qcom-apq8084.dtsi | 13 +- + arch/arm/boot/dts/qcom/qcom-ipq4019-ap.dk01.1.dtsi | 146 +- + arch/arm/boot/dts/qcom/qcom-ipq4019.dtsi | 35 +- + arch/arm/boot/dts/qcom/qcom-ipq8064.dtsi | 12 +- + .../qcom/qcom-msm8226-samsung-matisse-common.dtsi | 457 ++++ + arch/arm/boot/dts/qcom/qcom-msm8226.dtsi | 764 +++--- + arch/arm/boot/dts/qcom/qcom-msm8660.dtsi | 17 +- + arch/arm/boot/dts/qcom/qcom-msm8926-htc-memul.dts | 15 +- + .../dts/qcom/qcom-msm8926-samsung-matisselte.dts | 37 + + arch/arm/boot/dts/qcom/qcom-msm8960-pins.dtsi | 21 + + .../dts/qcom/qcom-msm8960-samsung-expressatt.dts | 71 +- + arch/arm/boot/dts/qcom/qcom-msm8960.dtsi | 48 +- + arch/arm/boot/dts/qcom/qcom-msm8974.dtsi | 33 +- + arch/arm/boot/dts/qcom/qcom-sdx55.dtsi | 32 +- + arch/arm/boot/dts/qcom/qcom-sdx65.dtsi | 48 +- + arch/arm/mach-qcom/Kconfig | 41 +- + arch/arm64/boot/dts/qcom/Makefile | 6 + + .../dts/qcom/apq8016-sbc-d3-camera-mezzanine.dts | 8 +- + arch/arm64/boot/dts/qcom/ipq5332.dtsi | 8 +- + arch/arm64/boot/dts/qcom/ipq6018.dtsi | 159 ++ + arch/arm64/boot/dts/qcom/ipq8074.dtsi | 16 + + arch/arm64/boot/dts/qcom/ipq9574.dtsi | 12 +- + .../boot/dts/qcom/msm8216-samsung-fortuna3g.dts | 11 + + .../dts/qcom/msm8916-samsung-fortuna-common.dtsi | 203 ++ + .../boot/dts/qcom/msm8916-samsung-gprimeltecan.dts | 27 + + .../dts/qcom/msm8916-samsung-grandprimelte.dts | 16 + + .../dts/qcom/msm8916-samsung-rossa-common.dtsi | 16 + + arch/arm64/boot/dts/qcom/msm8916-samsung-rossa.dts | 16 + + arch/arm64/boot/dts/qcom/msm8916.dtsi | 9 + + arch/arm64/boot/dts/qcom/msm8939.dtsi | 11 +- + arch/arm64/boot/dts/qcom/msm8953.dtsi | 155 +- + .../boot/dts/qcom/msm8994-msft-lumia-octagon.dtsi | 2 +- + .../dts/qcom/msm8994-sony-xperia-kitakami.dtsi | 2 +- + arch/arm64/boot/dts/qcom/msm8994.dtsi | 4 +- + arch/arm64/boot/dts/qcom/msm8996.dtsi | 18 +- + arch/arm64/boot/dts/qcom/msm8998.dtsi | 26 +- + .../boot/dts/qcom/{pm2250.dtsi => pm4125.dtsi} | 38 +- + arch/arm64/boot/dts/qcom/pmi632.dtsi | 39 + + arch/arm64/boot/dts/qcom/qcm2290.dtsi | 7 + + arch/arm64/boot/dts/qcom/qcm6490-fairphone-fp5.dts | 56 +- + arch/arm64/boot/dts/qcom/qcm6490-idp.dts | 39 +- + arch/arm64/boot/dts/qcom/qcs404.dtsi | 16 + + arch/arm64/boot/dts/qcom/qcs6490-rb3gen2.dts | 23 +- + arch/arm64/boot/dts/qcom/qrb2210-rb1.dts | 96 +- + arch/arm64/boot/dts/qcom/qrb4210-rb2.dts | 50 +- + arch/arm64/boot/dts/qcom/sa8295p-adp.dts | 68 + + arch/arm64/boot/dts/qcom/sa8540p-ride.dts | 4 +- + arch/arm64/boot/dts/qcom/sa8540p.dtsi | 3 
+ + arch/arm64/boot/dts/qcom/sa8775p.dtsi | 121 +- + arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi | 1 + + arch/arm64/boot/dts/qcom/sc7180.dtsi | 86 +- + arch/arm64/boot/dts/qcom/sc7280-chrome-common.dtsi | 28 + + arch/arm64/boot/dts/qcom/sc7280-herobrine.dtsi | 1 + + arch/arm64/boot/dts/qcom/sc7280-idp-ec-h1.dtsi | 1 + + arch/arm64/boot/dts/qcom/sc7280.dtsi | 125 +- + arch/arm64/boot/dts/qcom/sc8180x.dtsi | 141 +- + .../dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts | 39 +- + arch/arm64/boot/dts/qcom/sc8280xp-pmics.dtsi | 39 +- + arch/arm64/boot/dts/qcom/sc8280xp.dtsi | 601 ++++- + .../arm64/boot/dts/qcom/sda660-inforce-ifc6560.dts | 5 + + arch/arm64/boot/dts/qcom/sdm450-motorola-ali.dts | 2 +- + arch/arm64/boot/dts/qcom/sdm450.dtsi | 14 + + .../boot/dts/qcom/sdm630-sony-xperia-nile.dtsi | 16 + + arch/arm64/boot/dts/qcom/sdm630.dtsi | 62 +- + arch/arm64/boot/dts/qcom/sdm632.dtsi | 8 + + .../arm64/boot/dts/qcom/sdm660-xiaomi-lavender.dts | 6 + + arch/arm64/boot/dts/qcom/sdm670.dtsi | 14 +- + arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi | 1 + + arch/arm64/boot/dts/qcom/sdm845-db845c.dts | 2 +- + .../arm64/boot/dts/qcom/sdm845-oneplus-common.dtsi | 8 +- + arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts | 2 +- + arch/arm64/boot/dts/qcom/sdm845.dtsi | 63 +- + arch/arm64/boot/dts/qcom/sm4450.dtsi | 2 +- + arch/arm64/boot/dts/qcom/sm6115.dtsi | 98 +- + arch/arm64/boot/dts/qcom/sm6125.dtsi | 17 +- + arch/arm64/boot/dts/qcom/sm6350.dtsi | 597 ++++- + arch/arm64/boot/dts/qcom/sm6375.dtsi | 12 +- + arch/arm64/boot/dts/qcom/sm7125-xiaomi-common.dtsi | 26 + + arch/arm64/boot/dts/qcom/sm7125-xiaomi-curtana.dts | 16 + + arch/arm64/boot/dts/qcom/sm7225-fairphone-fp4.dts | 61 +- + arch/arm64/boot/dts/qcom/sm8150.dtsi | 115 +- + arch/arm64/boot/dts/qcom/sm8250.dtsi | 107 +- + arch/arm64/boot/dts/qcom/sm8350.dtsi | 87 +- + arch/arm64/boot/dts/qcom/sm8450-hdk.dts | 6 +- + arch/arm64/boot/dts/qcom/sm8450.dtsi | 81 +- + arch/arm64/boot/dts/qcom/sm8550-hdk.dts | 1306 ++++++++++ + arch/arm64/boot/dts/qcom/sm8550-mtp.dts | 11 +- + arch/arm64/boot/dts/qcom/sm8550-qrd.dts | 53 +- + arch/arm64/boot/dts/qcom/sm8550.dtsi | 187 +- + arch/arm64/boot/dts/qcom/sm8650-mtp.dts | 157 +- + arch/arm64/boot/dts/qcom/sm8650-qrd.dts | 441 +++- + arch/arm64/boot/dts/qcom/sm8650.dtsi | 81 +- + arch/arm64/boot/dts/qcom/x1e80100-crd.dts | 450 ++++ + arch/arm64/boot/dts/qcom/x1e80100-qcp.dts | 175 +- + arch/arm64/boot/dts/qcom/x1e80100.dtsi | 1777 +++++++++++++- + arch/arm64/configs/defconfig | 17 +- + drivers/clk/qcom/Kconfig | 45 +- + drivers/clk/qcom/Makefile | 5 +- + drivers/clk/qcom/camcc-sc7180.c | 12 +- + drivers/clk/qcom/camcc-sc7280.c | 12 +- + drivers/clk/qcom/camcc-sc8280xp.c | 6 +- + drivers/clk/qcom/camcc-sdm845.c | 12 +- + drivers/clk/qcom/camcc-sm6350.c | 12 +- + drivers/clk/qcom/camcc-sm8550.c | 10 +- + drivers/clk/qcom/camcc-x1e80100.c | 2486 ++++++++++++++++++++ + drivers/clk/qcom/clk-alpha-pll.c | 16 + + drivers/clk/qcom/clk-alpha-pll.h | 4 + + drivers/clk/qcom/clk-branch.h | 6 + + drivers/clk/qcom/dispcc-qcm2290.c | 16 +- + drivers/clk/qcom/dispcc-sc7180.c | 12 +- + drivers/clk/qcom/dispcc-sc7280.c | 19 +- + drivers/clk/qcom/dispcc-sc8280xp.c | 16 +- + drivers/clk/qcom/dispcc-sdm845.c | 14 +- + drivers/clk/qcom/dispcc-sm6115.c | 4 +- + drivers/clk/qcom/dispcc-sm6125.c | 12 +- + drivers/clk/qcom/dispcc-sm6350.c | 12 +- + drivers/clk/qcom/dispcc-sm6375.c | 12 +- + drivers/clk/qcom/dispcc-sm8250.c | 134 +- + drivers/clk/qcom/dispcc-sm8450.c | 19 +- + drivers/clk/qcom/dispcc-sm8550.c | 19 +- + 
drivers/clk/qcom/dispcc-sm8650.c | 16 +- + drivers/clk/qcom/dispcc-x1e80100.c | 1718 ++++++++++++++ + drivers/clk/qcom/gcc-ipq6018.c | 17 + + drivers/clk/qcom/gcc-msm8953.c | 4 + + drivers/clk/qcom/gcc-sa8775p.c | 29 +- + drivers/clk/qcom/gcc-sc7180.c | 22 +- + drivers/clk/qcom/gcc-sc7280.c | 20 +- + drivers/clk/qcom/gcc-sc8180x.c | 62 +- + drivers/clk/qcom/gcc-sc8280xp.c | 29 +- + drivers/clk/qcom/gcc-sdm845.c | 1 + + drivers/clk/qcom/gcc-sdx55.c | 12 +- + drivers/clk/qcom/gcc-sdx65.c | 13 +- + drivers/clk/qcom/gcc-sdx75.c | 10 +- + drivers/clk/qcom/gcc-sm4450.c | 32 +- + drivers/clk/qcom/gcc-sm6375.c | 11 +- + drivers/clk/qcom/gcc-sm7150.c | 25 +- + drivers/clk/qcom/gcc-sm8150.c | 352 +-- + drivers/clk/qcom/gcc-sm8250.c | 23 +- + drivers/clk/qcom/gcc-sm8350.c | 24 +- + drivers/clk/qcom/gcc-sm8450.c | 25 +- + drivers/clk/qcom/gcc-sm8550.c | 25 +- + drivers/clk/qcom/gcc-sm8650.c | 20 +- + drivers/clk/qcom/gcc-x1e80100.c | 16 +- + drivers/clk/qcom/gdsc.c | 12 +- + drivers/clk/qcom/gpucc-sa8775p.c | 12 +- + drivers/clk/qcom/gpucc-sc7180.c | 12 +- + drivers/clk/qcom/gpucc-sc7280.c | 21 +- + drivers/clk/qcom/gpucc-sc8280xp.c | 10 +- + drivers/clk/qcom/gpucc-sdm845.c | 12 +- + drivers/clk/qcom/gpucc-sm8150.c | 12 +- + drivers/clk/qcom/gpucc-sm8250.c | 12 +- + drivers/clk/qcom/gpucc-sm8350.c | 12 +- + drivers/clk/qcom/gpucc-sm8550.c | 22 +- + drivers/clk/qcom/gpucc-x1e80100.c | 656 ++++++ + drivers/clk/qcom/lpasscorecc-sc7180.c | 7 +- + drivers/clk/qcom/mss-sc7180.c | 140 -- + drivers/clk/qcom/reset.c | 27 +- + drivers/clk/qcom/reset.h | 2 +- + drivers/clk/qcom/tcsrcc-x1e80100.c | 285 +++ + drivers/clk/qcom/videocc-sc7180.c | 12 +- + drivers/clk/qcom/videocc-sc7280.c | 12 +- + drivers/clk/qcom/videocc-sdm845.c | 12 +- + drivers/clk/qcom/videocc-sm8150.c | 14 +- + drivers/clk/qcom/videocc-sm8250.c | 22 +- + drivers/clk/qcom/videocc-sm8350.c | 14 +- + drivers/clk/qcom/videocc-sm8450.c | 29 +- + drivers/clk/qcom/videocc-sm8550.c | 29 +- + drivers/iommu/Kconfig | 2 +- + drivers/pmdomain/qcom/rpmhpd.c | 1 - + drivers/soc/qcom/Kconfig | 9 + + drivers/soc/qcom/Makefile | 2 + + drivers/soc/qcom/apr.c | 2 +- + drivers/soc/qcom/llcc-qcom.c | 2 + + drivers/soc/qcom/qcom-pbs.c | 236 ++ + drivers/soc/qcom/qcom_aoss.c | 103 +- + drivers/soc/qcom/smem.c | 11 - + drivers/soc/qcom/smp2p.c | 6 +- + drivers/soc/qcom/socinfo.c | 7 +- + drivers/soc/qcom/spm.c | 254 +- + drivers/soc/qcom/trace-aoss.h | 48 + + include/dt-bindings/arm/qcom,ids.h | 5 + + include/dt-bindings/clock/qcom,gcc-msm8953.h | 4 + + include/dt-bindings/clock/qcom,gcc-sc8180x.h | 2 + + include/dt-bindings/clock/qcom,gcc-sm8150.h | 3 + + include/dt-bindings/clock/qcom,x1e80100-camcc.h | 135 ++ + include/dt-bindings/clock/qcom,x1e80100-dispcc.h | 98 + + include/dt-bindings/clock/qcom,x1e80100-gpucc.h | 41 + + include/dt-bindings/clock/qcom,x1e80100-tcsr.h | 23 + + include/dt-bindings/reset/qcom,x1e80100-gpucc.h | 19 + + include/linux/soc/qcom/apr.h | 2 +- + include/linux/soc/qcom/qcom-pbs.h | 30 + + include/soc/qcom/qcom-spmi-pmic.h | 2 +- + include/soc/qcom/spm.h | 23 +- + 213 files changed, 16290 insertions(+), 2793 deletions(-) + delete mode 100644 Documentation/devicetree/bindings/arm/msm/qcom,saw2.txt + delete mode 100644 Documentation/devicetree/bindings/clock/qcom,sc7180-mss.yaml + delete mode 100644 Documentation/devicetree/bindings/clock/qcom,sm8650-dispcc.yaml + create mode 100644 Documentation/devicetree/bindings/soc/qcom/qcom,pbs.yaml + rename Documentation/devicetree/bindings/soc/qcom/{qcom,spm.yaml => qcom,saw2.yaml} (53%) + create 
mode 100644 arch/arm/boot/dts/qcom/qcom-msm8226-samsung-matisse-common.dtsi + create mode 100644 arch/arm/boot/dts/qcom/qcom-msm8926-samsung-matisselte.dts + create mode 100644 arch/arm/boot/dts/qcom/qcom-msm8960-pins.dtsi + create mode 100644 arch/arm64/boot/dts/qcom/msm8216-samsung-fortuna3g.dts + create mode 100644 arch/arm64/boot/dts/qcom/msm8916-samsung-fortuna-common.dtsi + create mode 100644 arch/arm64/boot/dts/qcom/msm8916-samsung-gprimeltecan.dts + create mode 100644 arch/arm64/boot/dts/qcom/msm8916-samsung-grandprimelte.dts + create mode 100644 arch/arm64/boot/dts/qcom/msm8916-samsung-rossa-common.dtsi + create mode 100644 arch/arm64/boot/dts/qcom/msm8916-samsung-rossa.dts + rename arch/arm64/boot/dts/qcom/{pm2250.dtsi => pm4125.dtsi} (56%) + create mode 100644 arch/arm64/boot/dts/qcom/sdm450.dtsi + create mode 100644 arch/arm64/boot/dts/qcom/sm7125-xiaomi-curtana.dts + create mode 100644 arch/arm64/boot/dts/qcom/sm8550-hdk.dts + create mode 100644 drivers/clk/qcom/camcc-x1e80100.c + create mode 100644 drivers/clk/qcom/dispcc-x1e80100.c + create mode 100644 drivers/clk/qcom/gpucc-x1e80100.c + delete mode 100644 drivers/clk/qcom/mss-sc7180.c + create mode 100644 drivers/clk/qcom/tcsrcc-x1e80100.c + create mode 100644 drivers/soc/qcom/qcom-pbs.c + create mode 100644 drivers/soc/qcom/trace-aoss.h + create mode 100644 include/dt-bindings/clock/qcom,x1e80100-camcc.h + create mode 100644 include/dt-bindings/clock/qcom,x1e80100-dispcc.h + create mode 100644 include/dt-bindings/clock/qcom,x1e80100-gpucc.h + create mode 100644 include/dt-bindings/clock/qcom,x1e80100-tcsr.h + create mode 100644 include/dt-bindings/reset/qcom,x1e80100-gpucc.h + create mode 100644 include/linux/soc/qcom/qcom-pbs.h +Merging renesas/next (0c096fb42ae5 Merge branch 'renesas-dts-for-v6.9' into renesas-next) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git renesas/next +Auto-merging arch/arm64/boot/dts/renesas/ulcb-kf.dtsi +Merge made by the 'ort' strategy. 
+ .../bindings/clock/renesas,cpg-mssr.yaml | 1 + + .../bindings/power/renesas,rcar-sysc.yaml | 1 + + .../devicetree/bindings/reset/renesas,rst.yaml | 1 + + .../bindings/soc/renesas/renesas-soc.yaml | 73 +++ + .../devicetree/bindings/soc/renesas/renesas.yaml | 25 +- + arch/arm/boot/dts/renesas/r8a73a4-ape6evm.dts | 12 + + arch/arm/boot/dts/renesas/r8a73a4.dtsi | 23 +- + arch/arm/boot/dts/renesas/r8a7740.dtsi | 2 + + arch/arm/boot/dts/renesas/r8a7778.dtsi | 11 +- + arch/arm/boot/dts/renesas/r8a7779.dtsi | 9 +- + arch/arm/boot/dts/renesas/r8a7790-lager.dts | 1 + + arch/arm/boot/dts/renesas/r8a7790-stout.dts | 1 + + arch/arm/boot/dts/renesas/r8a7791-koelsch.dts | 1 + + arch/arm/boot/dts/renesas/r8a7791-porter.dts | 1 + + arch/arm/boot/dts/renesas/r8a7792-blanche.dts | 1 + + arch/arm/boot/dts/renesas/r8a7793-gose.dts | 1 + + arch/arm/boot/dts/renesas/r8a7794-alt.dts | 1 + + arch/arm/boot/dts/renesas/r8a7794-silk.dts | 1 + + arch/arm64/boot/dts/renesas/Makefile | 5 + + arch/arm64/boot/dts/renesas/r8a774a1.dtsi | 11 +- + arch/arm64/boot/dts/renesas/r8a774b1.dtsi | 11 +- + arch/arm64/boot/dts/renesas/r8a774c0.dtsi | 11 +- + arch/arm64/boot/dts/renesas/r8a774e1.dtsi | 11 +- + arch/arm64/boot/dts/renesas/r8a77951.dtsi | 11 +- + arch/arm64/boot/dts/renesas/r8a77960.dtsi | 11 +- + arch/arm64/boot/dts/renesas/r8a77961.dtsi | 11 +- + arch/arm64/boot/dts/renesas/r8a77965.dtsi | 11 +- + arch/arm64/boot/dts/renesas/r8a77970.dtsi | 11 +- + arch/arm64/boot/dts/renesas/r8a77980.dtsi | 17 +- + arch/arm64/boot/dts/renesas/r8a77990.dtsi | 11 +- + arch/arm64/boot/dts/renesas/r8a77995.dtsi | 11 +- + arch/arm64/boot/dts/renesas/r8a779a0.dtsi | 21 +- + arch/arm64/boot/dts/renesas/r8a779f0.dtsi | 17 +- + .../boot/dts/renesas/r8a779g0-white-hawk-cpu.dts | 13 + + .../boot/dts/renesas/r8a779g0-white-hawk-cpu.dtsi | 368 +----------- + .../arm64/boot/dts/renesas/r8a779g0-white-hawk.dts | 58 +- + arch/arm64/boot/dts/renesas/r8a779g0.dtsi | 105 ++-- + .../dts/renesas/r8a779g2-white-hawk-single.dts | 26 + + arch/arm64/boot/dts/renesas/r8a779g2.dtsi | 12 + + .../boot/dts/renesas/r8a779h0-gray-hawk-single.dts | 230 +++++++ + arch/arm64/boot/dts/renesas/r8a779h0.dtsi | 664 +++++++++++++++++++++ + arch/arm64/boot/dts/renesas/r9a07g043u.dtsi | 69 +++ + arch/arm64/boot/dts/renesas/r9a08g045.dtsi | 19 + + arch/arm64/boot/dts/renesas/rzg3s-smarc-som.dtsi | 9 + + arch/arm64/boot/dts/renesas/rzg3s-smarc.dtsi | 53 ++ + arch/arm64/boot/dts/renesas/ulcb-kf.dtsi | 81 +-- + arch/arm64/boot/dts/renesas/white-hawk-common.dtsi | 65 ++ + .../boot/dts/renesas/white-hawk-cpu-common.dtsi | 375 ++++++++++++ + ...e-hawk-csi-dsi.dtsi => white-hawk-csi-dsi.dtsi} | 2 +- + ...hawk-ethernet.dtsi => white-hawk-ethernet.dtsi} | 2 +- + .../dt-bindings/clock/renesas,r8a779h0-cpg-mssr.h | 96 +++ + include/dt-bindings/power/renesas,r8a779h0-sysc.h | 49 ++ + 52 files changed, 2086 insertions(+), 557 deletions(-) + create mode 100644 Documentation/devicetree/bindings/soc/renesas/renesas-soc.yaml + create mode 100644 arch/arm64/boot/dts/renesas/r8a779g0-white-hawk-cpu.dts + create mode 100644 arch/arm64/boot/dts/renesas/r8a779g2-white-hawk-single.dts + create mode 100644 arch/arm64/boot/dts/renesas/r8a779g2.dtsi + create mode 100644 arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts + create mode 100644 arch/arm64/boot/dts/renesas/r8a779h0.dtsi + create mode 100644 arch/arm64/boot/dts/renesas/white-hawk-common.dtsi + create mode 100644 arch/arm64/boot/dts/renesas/white-hawk-cpu-common.dtsi + rename 
arch/arm64/boot/dts/renesas/{r8a779g0-white-hawk-csi-dsi.dtsi => white-hawk-csi-dsi.dtsi} (97%) + rename arch/arm64/boot/dts/renesas/{r8a779g0-white-hawk-ethernet.dtsi => white-hawk-ethernet.dtsi} (76%) + create mode 100644 include/dt-bindings/clock/renesas,r8a779h0-cpg-mssr.h + create mode 100644 include/dt-bindings/power/renesas,r8a779h0-sysc.h +Merging reset/reset/next (c3c46acd5be9 dt-bindings: reset: hisilicon,hi3660-reset: Drop providers and consumers from example) +$ git merge -m Merge branch 'reset/next' of https://git.pengutronix.de/git/pza/linux reset/reset/next +Already up to date. +Merging rockchip/for-next (504c4c60e70b Merge branch 'v6.9-armsoc/dts64' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mmind/linux-rockchip.git rockchip/for-next +Merge made by the 'ort' strategy. + .../devicetree/bindings/arm/rockchip.yaml | 47 +- + .../devicetree/bindings/soc/rockchip/grf.yaml | 22 + + arch/arm/boot/dts/rockchip/rk3128-xpi-3128.dts | 29 + + arch/arm/boot/dts/rockchip/rk3128.dtsi | 60 ++ + arch/arm/boot/dts/rockchip/rk322x.dtsi | 16 +- + arch/arm/boot/dts/rockchip/rk3288.dtsi | 16 +- + arch/arm64/boot/dts/rockchip/Makefile | 8 + + .../boot/dts/rockchip/px30-ringneck-haikou.dts | 1 + + arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi | 6 + + arch/arm64/boot/dts/rockchip/rk3328-rock-pi-e.dts | 4 +- + arch/arm64/boot/dts/rockchip/rk3328.dtsi | 11 +- + .../boot/dts/rockchip/rk3399-kobol-helios64.dts | 3 - + .../arm64/boot/dts/rockchip/rk3399-puma-haikou.dts | 3 +- + arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4a.dts | 2 +- + arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4b.dts | 2 +- + arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4c.dts | 2 +- + arch/arm64/boot/dts/rockchip/rk3399.dtsi | 82 +- + .../boot/dts/rockchip/rk3566-anbernic-rg-arc-d.dts | 60 ++ + .../boot/dts/rockchip/rk3566-anbernic-rg-arc-s.dts | 19 + + .../boot/dts/rockchip/rk3566-anbernic-rg-arc.dtsi | 237 ++++++ + .../boot/dts/rockchip/rk3566-anbernic-rg353x.dtsi | 74 ++ + .../boot/dts/rockchip/rk3566-anbernic-rg503.dts | 74 ++ + .../boot/dts/rockchip/rk3566-anbernic-rgxx3.dtsi | 74 -- + .../boot/dts/rockchip/rk3566-pinetab2-v0.1.dts | 28 + + .../boot/dts/rockchip/rk3566-pinetab2-v2.0.dts | 48 ++ + arch/arm64/boot/dts/rockchip/rk3566-pinetab2.dtsi | 943 +++++++++++++++++++++ + .../dts/rockchip/rk3566-powkiddy-rgb10max3.dts | 87 ++ + .../boot/dts/rockchip/rk3566-powkiddy-rgb30.dts | 18 + + .../boot/dts/rockchip/rk3566-powkiddy-rk2023.dts | 18 + + .../boot/dts/rockchip/rk3566-powkiddy-rk2023.dtsi | 18 +- + .../dts/rockchip/rk3588-edgeble-neu6a-common.dtsi | 466 ++++++++++ + .../boot/dts/rockchip/rk3588-edgeble-neu6a-io.dts | 10 +- + .../boot/dts/rockchip/rk3588-edgeble-neu6a-io.dtsi | 232 +++++ + .../dts/rockchip/rk3588-edgeble-neu6a-wifi.dtso | 56 ++ + .../boot/dts/rockchip/rk3588-edgeble-neu6a.dtsi | 25 +- + .../boot/dts/rockchip/rk3588-edgeble-neu6b-io.dts | 76 +- + .../boot/dts/rockchip/rk3588-edgeble-neu6b.dtsi | 383 +-------- + arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dts | 31 +- + .../boot/dts/rockchip/rk3588-orangepi-5-plus.dts | 1 - + .../arm64/boot/dts/rockchip/rk3588-quartzpro64.dts | 1 - + arch/arm64/boot/dts/rockchip/rk3588-rock-5b.dts | 8 +- + .../boot/dts/rockchip/rk3588s-indiedroid-nova.dts | 8 + + .../arm64/boot/dts/rockchip/rk3588s-nanopi-r6c.dts | 14 + + .../arm64/boot/dts/rockchip/rk3588s-nanopi-r6s.dts | 764 +++++++++++++++++ + arch/arm64/boot/dts/rockchip/rk3588s-rock-5a.dts | 1 - + drivers/clk/rockchip/clk-rk3568.c | 1 + + 46 files changed, 
3461 insertions(+), 628 deletions(-) + create mode 100644 arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg-arc-d.dts + create mode 100644 arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg-arc-s.dts + create mode 100644 arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg-arc.dtsi + create mode 100644 arch/arm64/boot/dts/rockchip/rk3566-pinetab2-v0.1.dts + create mode 100644 arch/arm64/boot/dts/rockchip/rk3566-pinetab2-v2.0.dts + create mode 100644 arch/arm64/boot/dts/rockchip/rk3566-pinetab2.dtsi + create mode 100644 arch/arm64/boot/dts/rockchip/rk3566-powkiddy-rgb10max3.dts + create mode 100644 arch/arm64/boot/dts/rockchip/rk3588-edgeble-neu6a-common.dtsi + create mode 100644 arch/arm64/boot/dts/rockchip/rk3588-edgeble-neu6a-io.dtsi + create mode 100644 arch/arm64/boot/dts/rockchip/rk3588-edgeble-neu6a-wifi.dtso + create mode 100644 arch/arm64/boot/dts/rockchip/rk3588s-nanopi-r6c.dts + create mode 100644 arch/arm64/boot/dts/rockchip/rk3588s-nanopi-r6s.dts +Merging samsung-krzk/for-next (d9e0e7c68345 Merge branch 'next/dt' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux.git samsung-krzk/for-next +Auto-merging arch/arm/configs/multi_v7_defconfig +Merge made by the 'ort' strategy. + .../bindings/clock/google,gs101-clock.yaml | 32 +- + .../devicetree/bindings/clock/tesla,fsd-clock.yaml | 2 +- + arch/arm/boot/dts/samsung/exynos4412-i9300.dts | 2 +- + arch/arm/boot/dts/samsung/exynos4412-i9305.dts | 2 +- + arch/arm/boot/dts/samsung/exynos4412-n710x.dts | 2 +- + arch/arm/boot/dts/samsung/exynos4412-p4note.dtsi | 53 +- + .../dts/samsung/exynos5420-galaxy-tab-common.dtsi | 34 +- + arch/arm/boot/dts/samsung/exynos5420-peach-pit.dts | 1 + + .../dts/samsung/exynos5422-odroidxu3-common.dtsi | 16 +- + arch/arm/boot/dts/samsung/exynos5800-peach-pi.dts | 1 + + arch/arm/configs/exynos_defconfig | 3 + + arch/arm/configs/multi_v7_defconfig | 3 + + arch/arm/mach-s3c/cpu.h | 2 +- + arch/arm/mach-s3c/s3c6410.c | 2 +- + arch/arm/mach-s3c/s3c64xx.c | 2 +- + arch/arm/mach-s5pv210/pm.c | 2 +- + arch/arm64/boot/dts/exynos/exynos850.dtsi | 64 ++ + arch/arm64/boot/dts/exynos/google/gs101-oriole.dts | 24 + + .../boot/dts/exynos/google/gs101-pinctrl.dtsi | 2 +- + arch/arm64/boot/dts/exynos/google/gs101.dtsi | 131 ++- + arch/arm64/boot/dts/tesla/fsd.dtsi | 2 + + drivers/clk/samsung/clk-exynos850.c | 43 +- + drivers/clk/samsung/clk-gs101.c | 942 ++++++++++++++++++++- + include/dt-bindings/clock/exynos850.h | 2 + + include/dt-bindings/clock/google,gs101.h | 129 +++ + 25 files changed, 1432 insertions(+), 66 deletions(-) +Merging scmi/for-linux-next (f49191cdf834 Merge branches 'for-next/vexpress/updates', 'for-next/ffa/updates' and 'for-next/scmi/updates' of git://git.kernel.org/pub/scm/linux/kernel/git/sudeep.holla/linux into for-linux-next) +$ git merge -m Merge branch 'for-linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/sudeep.holla/linux.git scmi/for-linux-next +Merge made by the 'ort' strategy. 
+ arch/arm/boot/dts/arm/vexpress-v2p-ca9.dts | 4 +- + drivers/clk/clk-scmi.c | 161 +++++++++++++++++------- + drivers/firmware/arm_ffa/bus.c | 2 +- + drivers/firmware/arm_scmi/bus.c | 26 +++- + drivers/firmware/arm_scmi/clock.c | 194 ++++++++++++++++++++++++----- + drivers/firmware/arm_scmi/common.h | 2 +- + drivers/firmware/arm_scmi/driver.c | 99 ++++++++++++++- + drivers/firmware/arm_scmi/notify.c | 17 ++- + drivers/firmware/arm_scmi/notify.h | 4 + + drivers/firmware/arm_scmi/perf.c | 163 +++++++++++++++++++++--- + drivers/firmware/arm_scmi/power.c | 30 ++++- + drivers/firmware/arm_scmi/powercap.c | 45 ++++++- + drivers/firmware/arm_scmi/protocols.h | 5 + + drivers/firmware/arm_scmi/reset.c | 37 ++++-- + drivers/firmware/arm_scmi/sensors.c | 37 +++++- + drivers/firmware/arm_scmi/smc.c | 7 ++ + drivers/firmware/arm_scmi/system.c | 16 +++ + include/linux/arm_ffa.h | 2 +- + include/linux/scmi_protocol.h | 21 +++- + 19 files changed, 749 insertions(+), 123 deletions(-) +Merging sophgo/for-next (41bccc98fb79 Linux 6.8-rc2) +$ git merge -m Merge branch 'for-next' of https://github.com/sophgo/linux.git sophgo/for-next +Already up to date. +Merging stm32/stm32-next (7fd195f01ae5 ARM: dts: stm32: lxa-tac: reduce RGMII interface drive strength) +$ git merge -m Merge branch 'stm32-next' of git://git.kernel.org/pub/scm/linux/kernel/git/atorgue/stm32.git stm32/stm32-next +Merge made by the 'ort' strategy. + arch/arm/boot/dts/st/stm32mp157.dtsi | 2 +- + arch/arm/boot/dts/st/stm32mp157a-dk1-scmi.dts | 2 +- + arch/arm/boot/dts/st/stm32mp157c-dk2-scmi.dts | 2 +- + arch/arm/boot/dts/st/stm32mp157c-ed1-scmi.dts | 2 +- + arch/arm/boot/dts/st/stm32mp157c-ev1-scmi.dts | 2 +- + arch/arm/boot/dts/st/stm32mp157c-lxa-tac-gen2.dts | 2 +- + arch/arm/boot/dts/st/stm32mp15xc-lxa-tac.dtsi | 6 +++++- + 7 files changed, 11 insertions(+), 7 deletions(-) +Merging sunxi/sunxi/for-next (5db172482d9d Merge branch 'sunxi/dt-for-6.9' into sunxi/for-next) +$ git merge -m Merge branch 'sunxi/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/sunxi/linux.git sunxi/sunxi/for-next +Auto-merging Documentation/devicetree/bindings/vendor-prefixes.yaml +Merge made by the 'ort' strategy. 
+ Documentation/devicetree/bindings/arm/sunxi.yaml | 12 + + .../sram/allwinner,sun4i-a10-system-control.yaml | 2 +- + .../devicetree/bindings/vendor-prefixes.yaml | 2 + + arch/arm64/boot/dts/allwinner/Makefile | 2 + + .../boot/dts/allwinner/sun50i-h6-beelink-gs1.dts | 2 + + arch/arm64/boot/dts/allwinner/sun50i-h6-tanix.dtsi | 2 + + arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi | 7 +- + .../sun50i-h616-bigtreetech-cb1-manta.dts | 2 +- + .../dts/allwinner/sun50i-h616-bigtreetech-cb1.dtsi | 4 +- + .../dts/allwinner/sun50i-h616-bigtreetech-pi.dts | 2 +- + arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi | 67 ++++ + .../allwinner/sun50i-h618-longan-module-3h.dtsi | 75 +++++ + .../boot/dts/allwinner/sun50i-h618-longanpi-3h.dts | 144 +++++++++ + .../allwinner/sun50i-h618-transpeed-8k618-t.dts | 23 ++ + .../dts/allwinner/sun50i-h64-remix-mini-pc.dts | 356 +++++++++++++++++++++ + drivers/bus/sunxi-rsb.c | 4 +- + drivers/clk/sunxi/clk-a20-gmac.c | 21 +- + drivers/clk/sunxi/clk-sun9i-cpus.c | 7 +- + drivers/clk/sunxi/clk-usb.c | 9 +- + 19 files changed, 716 insertions(+), 27 deletions(-) + create mode 100644 arch/arm64/boot/dts/allwinner/sun50i-h618-longan-module-3h.dtsi + create mode 100644 arch/arm64/boot/dts/allwinner/sun50i-h618-longanpi-3h.dts + create mode 100644 arch/arm64/boot/dts/allwinner/sun50i-h64-remix-mini-pc.dts +Merging tee/next (58ea7e692a9e Merge branch 'tee_bus_type_for_v6.9' into next) +$ git merge -m Merge branch 'next' of https://git.linaro.org/people/jens.wiklander/linux-tee.git tee/next +Merge made by the 'ort' strategy. + drivers/tee/tee_core.c | 2 +- + include/linux/tee_drv.h | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) +Merging tegra/for-next (fc9699999179 Merge branch for-6.8/arm64/dt into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/tegra/linux.git tegra/for-next +Auto-merging drivers/soc/tegra/fuse/fuse-tegra30.c +Merge made by the 'ort' strategy. + .../bindings/arm/tegra/nvidia,tegra186-pmc.yaml | 54 ++++++---- + arch/arm/boot/dts/nvidia/tegra124-nyan.dtsi | 1 + + arch/arm/boot/dts/nvidia/tegra124-venice2.dts | 1 + + .../dts/nvidia/tegra234-p3737-0000+p3701-0000.dts | 2 +- + drivers/bus/Kconfig | 5 +- + drivers/soc/tegra/Kconfig | 5 + + drivers/soc/tegra/fuse/fuse-tegra.c | 118 ++++++++++++++++----- + drivers/soc/tegra/fuse/fuse-tegra30.c | 20 ++++ + drivers/soc/tegra/fuse/fuse.h | 8 +- + drivers/soc/tegra/fuse/tegra-apbmisc.c | 110 ++++++++++++++++--- + drivers/soc/tegra/pmc.c | 87 +++++++-------- + include/linux/string.h | 1 + + include/soc/tegra/fuse.h | 1 + + include/soc/tegra/pmc.h | 18 ---- + mm/util.c | 17 +++ + 15 files changed, 315 insertions(+), 133 deletions(-) +Merging ti/ti-next (0ad5d338af6d Merge branch 'ti-k3-dts-next' into ti-next) +$ git merge -m Merge branch 'ti-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ti/linux.git ti/ti-next +Auto-merging arch/arm/configs/multi_v7_defconfig +Auto-merging arch/arm64/boot/dts/ti/k3-am65-main.dtsi +Merge made by the 'ort' strategy. 
+ Documentation/devicetree/bindings/arm/ti/k3.yaml | 7 + + arch/arm/boot/dts/ti/keystone/keystone-clocks.dtsi | 2 +- + .../boot/dts/ti/keystone/keystone-k2e-clocks.dtsi | 2 +- + arch/arm/boot/dts/ti/keystone/keystone-k2e-evm.dts | 2 +- + .../boot/dts/ti/keystone/keystone-k2e-netcp.dtsi | 2 +- + arch/arm/boot/dts/ti/keystone/keystone-k2e.dtsi | 2 +- + arch/arm/boot/dts/ti/keystone/keystone-k2g-evm.dts | 2 +- + arch/arm/boot/dts/ti/keystone/keystone-k2g-ice.dts | 2 +- + .../boot/dts/ti/keystone/keystone-k2g-netcp.dtsi | 2 +- + arch/arm/boot/dts/ti/keystone/keystone-k2g.dtsi | 2 +- + .../boot/dts/ti/keystone/keystone-k2hk-clocks.dtsi | 2 +- + .../arm/boot/dts/ti/keystone/keystone-k2hk-evm.dts | 2 +- + .../boot/dts/ti/keystone/keystone-k2hk-netcp.dtsi | 2 +- + arch/arm/boot/dts/ti/keystone/keystone-k2hk.dtsi | 2 +- + .../boot/dts/ti/keystone/keystone-k2l-clocks.dtsi | 2 +- + arch/arm/boot/dts/ti/keystone/keystone-k2l-evm.dts | 2 +- + .../boot/dts/ti/keystone/keystone-k2l-netcp.dtsi | 2 +- + arch/arm/boot/dts/ti/keystone/keystone-k2l.dtsi | 2 +- + arch/arm/boot/dts/ti/keystone/keystone.dtsi | 2 +- + arch/arm64/boot/dts/ti/Makefile | 38 +- + arch/arm64/boot/dts/ti/k3-am62-lp-sk.dts | 4 +- + arch/arm64/boot/dts/ti/k3-am62-main.dtsi | 30 +- + arch/arm64/boot/dts/ti/k3-am62-mcu.dtsi | 4 +- + arch/arm64/boot/dts/ti/k3-am62-phycore-som.dtsi | 5 +- + arch/arm64/boot/dts/ti/k3-am62-thermal.dtsi | 5 +- + arch/arm64/boot/dts/ti/k3-am62-verdin-dahlia.dtsi | 1 - + arch/arm64/boot/dts/ti/k3-am62-verdin-dev.dtsi | 1 - + arch/arm64/boot/dts/ti/k3-am62-verdin-mallow.dtsi | 10 + + arch/arm64/boot/dts/ti/k3-am62-verdin-wifi.dtsi | 1 - + arch/arm64/boot/dts/ti/k3-am62-verdin.dtsi | 59 +- + arch/arm64/boot/dts/ti/k3-am62-wakeup.dtsi | 38 +- + arch/arm64/boot/dts/ti/k3-am62.dtsi | 4 +- + .../dts/ti/k3-am625-beagleplay-csi2-ov5640.dtso | 4 +- + .../ti/k3-am625-beagleplay-csi2-tevi-ov5640.dtso | 4 +- + arch/arm64/boot/dts/ti/k3-am625-beagleplay.dts | 54 +- + .../boot/dts/ti/k3-am625-phyboard-lyra-rdk.dts | 6 +- + arch/arm64/boot/dts/ti/k3-am625-sk.dts | 4 +- + arch/arm64/boot/dts/ti/k3-am625.dtsi | 4 +- + arch/arm64/boot/dts/ti/k3-am62a-main.dtsi | 80 +- + arch/arm64/boot/dts/ti/k3-am62a-mcu.dtsi | 4 +- + arch/arm64/boot/dts/ti/k3-am62a-thermal.dtsi | 5 +- + arch/arm64/boot/dts/ti/k3-am62a-wakeup.dtsi | 4 +- + arch/arm64/boot/dts/ti/k3-am62a.dtsi | 4 +- + arch/arm64/boot/dts/ti/k3-am62a7-sk.dts | 123 ++- + arch/arm64/boot/dts/ti/k3-am62a7.dtsi | 4 +- + arch/arm64/boot/dts/ti/k3-am62p-main.dtsi | 48 +- + arch/arm64/boot/dts/ti/k3-am62p-mcu.dtsi | 6 +- + arch/arm64/boot/dts/ti/k3-am62p-thermal.dtsi | 5 +- + arch/arm64/boot/dts/ti/k3-am62p-wakeup.dtsi | 5 +- + arch/arm64/boot/dts/ti/k3-am62p.dtsi | 4 +- + arch/arm64/boot/dts/ti/k3-am62p5-sk.dts | 11 +- + arch/arm64/boot/dts/ti/k3-am62p5.dtsi | 4 +- + .../dts/ti/k3-am62x-phyboard-lyra-gpio-fan.dtso | 50 ++ + arch/arm64/boot/dts/ti/k3-am62x-sk-common.dtsi | 8 +- + .../arm64/boot/dts/ti/k3-am62x-sk-csi2-imx219.dtso | 4 +- + .../arm64/boot/dts/ti/k3-am62x-sk-csi2-ov5640.dtso | 4 +- + .../boot/dts/ti/k3-am62x-sk-csi2-tevi-ov5640.dtso | 4 +- + arch/arm64/boot/dts/ti/k3-am62x-sk-hdmi-audio.dtso | 4 +- + arch/arm64/boot/dts/ti/k3-am64-main.dtsi | 69 +- + arch/arm64/boot/dts/ti/k3-am64-mcu.dtsi | 4 +- + arch/arm64/boot/dts/ti/k3-am64-phycore-som.dtsi | 13 +- + arch/arm64/boot/dts/ti/k3-am64-thermal.dtsi | 5 +- + arch/arm64/boot/dts/ti/k3-am64.dtsi | 4 +- + .../boot/dts/ti/k3-am642-evm-icssg1-dualemac.dtso | 79 ++ + arch/arm64/boot/dts/ti/k3-am642-evm.dts | 119 ++- + 
.../boot/dts/ti/k3-am642-phyboard-electra-rdk.dts | 30 +- + arch/arm64/boot/dts/ti/k3-am642-sk.dts | 14 +- + .../boot/dts/ti/k3-am642-tqma64xxl-mbax4xxl.dts | 1 - + arch/arm64/boot/dts/ti/k3-am642.dtsi | 4 +- + .../dts/ti/k3-am65-iot2050-arduino-connector.dtsi | 768 ++++++++++++++++++ + .../boot/dts/ti/k3-am65-iot2050-common-pg1.dtsi | 7 +- + .../boot/dts/ti/k3-am65-iot2050-common-pg2.dtsi | 27 +- + arch/arm64/boot/dts/ti/k3-am65-iot2050-common.dtsi | 887 +-------------------- + arch/arm64/boot/dts/ti/k3-am65-iot2050-dp.dtsi | 98 +++ + arch/arm64/boot/dts/ti/k3-am65-iot2050-usb3.dtsi | 27 + + arch/arm64/boot/dts/ti/k3-am65-main.dtsi | 37 +- + arch/arm64/boot/dts/ti/k3-am65-mcu.dtsi | 4 +- + arch/arm64/boot/dts/ti/k3-am65-wakeup.dtsi | 4 +- + arch/arm64/boot/dts/ti/k3-am65.dtsi | 4 +- + arch/arm64/boot/dts/ti/k3-am652.dtsi | 4 +- + .../dts/ti/k3-am6528-iot2050-basic-common.dtsi | 8 +- + .../boot/dts/ti/k3-am6528-iot2050-basic-pg2.dts | 4 +- + arch/arm64/boot/dts/ti/k3-am6528-iot2050-basic.dts | 7 +- + .../k3-am654-base-board-rocktech-rk101-panel.dtso | 4 +- + arch/arm64/boot/dts/ti/k3-am654-base-board.dts | 8 +- + arch/arm64/boot/dts/ti/k3-am654-icssg2.dtso | 4 +- + arch/arm64/boot/dts/ti/k3-am654-idk.dtso | 4 +- + .../boot/dts/ti/k3-am654-industrial-thermal.dtsi | 5 +- + arch/arm64/boot/dts/ti/k3-am654-pcie-usb2.dtso | 59 ++ + arch/arm64/boot/dts/ti/k3-am654-pcie-usb3.dtso | 61 ++ + arch/arm64/boot/dts/ti/k3-am654.dtsi | 4 +- + .../dts/ti/k3-am6548-iot2050-advanced-common.dtsi | 2 +- + .../boot/dts/ti/k3-am6548-iot2050-advanced-m2.dts | 22 +- + .../boot/dts/ti/k3-am6548-iot2050-advanced-pg2.dts | 12 +- + .../boot/dts/ti/k3-am6548-iot2050-advanced-sm.dts | 189 +++++ + .../boot/dts/ti/k3-am6548-iot2050-advanced.dts | 3 +- + arch/arm64/boot/dts/ti/k3-am68-sk-base-board.dts | 54 +- + arch/arm64/boot/dts/ti/k3-am68-sk-som.dtsi | 20 +- + arch/arm64/boot/dts/ti/k3-am69-sk.dts | 90 ++- + .../boot/dts/ti/k3-j7200-common-proc-board.dts | 109 ++- + .../dts/ti/k3-j7200-evm-quad-port-eth-exp.dtso | 4 +- + arch/arm64/boot/dts/ti/k3-j7200-main.dtsi | 315 +++++++- + arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi | 57 +- + arch/arm64/boot/dts/ti/k3-j7200-som-p0.dtsi | 47 +- + arch/arm64/boot/dts/ti/k3-j7200-thermal.dtsi | 5 +- + arch/arm64/boot/dts/ti/k3-j7200.dtsi | 4 +- + arch/arm64/boot/dts/ti/k3-j721e-beagleboneai64.dts | 26 +- + .../boot/dts/ti/k3-j721e-common-proc-board.dts | 4 +- + .../boot/dts/ti/k3-j721e-evm-gesi-exp-board.dtso | 4 +- + arch/arm64/boot/dts/ti/k3-j721e-evm-pcie0-ep.dtso | 4 +- + .../dts/ti/k3-j721e-evm-quad-port-eth-exp.dtso | 4 +- + arch/arm64/boot/dts/ti/k3-j721e-main.dtsi | 149 +++- + arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi | 4 +- + .../boot/dts/ti/k3-j721e-sk-csi2-dual-imx219.dtso | 165 ++++ + arch/arm64/boot/dts/ti/k3-j721e-sk.dts | 45 +- + arch/arm64/boot/dts/ti/k3-j721e-som-p0.dtsi | 22 +- + arch/arm64/boot/dts/ti/k3-j721e-thermal.dtsi | 5 +- + arch/arm64/boot/dts/ti/k3-j721e.dtsi | 4 +- + .../boot/dts/ti/k3-j721s2-common-proc-board.dts | 31 +- + .../boot/dts/ti/k3-j721s2-evm-gesi-exp-board.dtso | 4 +- + arch/arm64/boot/dts/ti/k3-j721s2-evm-pcie1-ep.dtso | 4 +- + arch/arm64/boot/dts/ti/k3-j721s2-main.dtsi | 135 +++- + arch/arm64/boot/dts/ti/k3-j721s2-mcu-wakeup.dtsi | 6 +- + arch/arm64/boot/dts/ti/k3-j721s2-som-p0.dtsi | 20 +- + arch/arm64/boot/dts/ti/k3-j721s2-thermal.dtsi | 5 +- + arch/arm64/boot/dts/ti/k3-j721s2.dtsi | 4 +- + arch/arm64/boot/dts/ti/k3-j722s-evm.dts | 251 ++++++ + arch/arm64/boot/dts/ti/k3-j722s.dtsi | 89 +++ + 
arch/arm64/boot/dts/ti/k3-j784s4-evm.dts | 32 +- + arch/arm64/boot/dts/ti/k3-j784s4-main.dtsi | 187 ++++- + arch/arm64/boot/dts/ti/k3-j784s4-mcu-wakeup.dtsi | 6 +- + arch/arm64/boot/dts/ti/k3-j784s4-thermal.dtsi | 5 +- + arch/arm64/boot/dts/ti/k3-j784s4.dtsi | 4 +- + arch/arm64/boot/dts/ti/k3-pinctrl.h | 7 +- + arch/arm64/boot/dts/ti/k3-serdes.h | 4 +- + 135 files changed, 3832 insertions(+), 1408 deletions(-) + create mode 100644 arch/arm64/boot/dts/ti/k3-am62x-phyboard-lyra-gpio-fan.dtso + create mode 100644 arch/arm64/boot/dts/ti/k3-am642-evm-icssg1-dualemac.dtso + create mode 100644 arch/arm64/boot/dts/ti/k3-am65-iot2050-arduino-connector.dtsi + create mode 100644 arch/arm64/boot/dts/ti/k3-am65-iot2050-dp.dtsi + create mode 100644 arch/arm64/boot/dts/ti/k3-am65-iot2050-usb3.dtsi + create mode 100644 arch/arm64/boot/dts/ti/k3-am654-pcie-usb2.dtso + create mode 100644 arch/arm64/boot/dts/ti/k3-am654-pcie-usb3.dtso + create mode 100644 arch/arm64/boot/dts/ti/k3-am6548-iot2050-advanced-sm.dts + create mode 100644 arch/arm64/boot/dts/ti/k3-j721e-sk-csi2-dual-imx219.dtso + create mode 100644 arch/arm64/boot/dts/ti/k3-j722s-evm.dts + create mode 100644 arch/arm64/boot/dts/ti/k3-j722s.dtsi +Merging xilinx/for-next (2d81f5ef567c Merge remote-tracking branch 'git/zynqmp/dt' into for-next) +$ git merge -m Merge branch 'for-next' of git://github.com/Xilinx/linux-xlnx.git xilinx/for-next +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. + .../firmware/xilinx/xlnx,zynqmp-firmware.yaml | 96 +++++++++++++++++++--- + .../devicetree/bindings/fpga/xlnx,versal-fpga.yaml | 2 +- + .../devicetree/bindings/soc/xilinx/xilinx.yaml | 70 +++++++++++++--- + MAINTAINERS | 2 +- + arch/arm/mach-zynq/slcr.c | 5 +- + arch/arm64/boot/dts/xilinx/zynqmp-clk-ccf.dtsi | 16 +++- + .../boot/dts/xilinx/zynqmp-sck-kv-g-revA.dtso | 36 +++++++- + .../boot/dts/xilinx/zynqmp-sck-kv-g-revB.dtso | 37 ++++++++- + .../boot/dts/xilinx/zynqmp-zc1751-xm015-dc1.dts | 2 +- + .../boot/dts/xilinx/zynqmp-zc1751-xm016-dc2.dts | 2 +- + .../boot/dts/xilinx/zynqmp-zc1751-xm019-dc5.dts | 4 +- + arch/arm64/boot/dts/xilinx/zynqmp-zcu100-revC.dts | 2 +- + arch/arm64/boot/dts/xilinx/zynqmp-zcu102-revA.dts | 6 +- + arch/arm64/boot/dts/xilinx/zynqmp-zcu104-revA.dts | 2 +- + arch/arm64/boot/dts/xilinx/zynqmp-zcu104-revC.dts | 2 +- + arch/arm64/boot/dts/xilinx/zynqmp-zcu106-revA.dts | 6 +- + arch/arm64/boot/dts/xilinx/zynqmp-zcu111-revA.dts | 4 +- + arch/arm64/boot/dts/xilinx/zynqmp-zcu1275-revA.dts | 2 +- + arch/arm64/boot/dts/xilinx/zynqmp.dtsi | 85 +++++++++++-------- + 19 files changed, 298 insertions(+), 83 deletions(-) +Merging clk/clk-next (efe5a1b888ab Merge branch 'clk-fixes' into clk-next) +$ git merge -m Merge branch 'clk-next' of git://git.kernel.org/pub/scm/linux/kernel/git/clk/linux.git clk/clk-next +Merge made by the 'ort' strategy. +Merging clk-imx/for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/abelvesa/linux.git clk-imx/for-next +Already up to date. +Merging clk-renesas/renesas-clk (81a7a88a9806 clk: renesas: r8a779h0: Add RPC-IF clock) +$ git merge -m Merge branch 'renesas-clk' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-drivers.git clk-renesas/renesas-clk +Merge made by the 'ort' strategy. 
+ drivers/clk/renesas/Kconfig | 5 + + drivers/clk/renesas/Makefile | 1 + + drivers/clk/renesas/clk-mstp.c | 16 +- + drivers/clk/renesas/r8a779f0-cpg-mssr.c | 2 +- + drivers/clk/renesas/r8a779g0-cpg-mssr.c | 13 +- + drivers/clk/renesas/r8a779h0-cpg-mssr.c | 256 ++++++++++++++++++++++++++ + drivers/clk/renesas/r9a07g043-cpg.c | 37 +++- + drivers/clk/renesas/r9a07g044-cpg.c | 6 +- + drivers/clk/renesas/r9a08g045-cpg.c | 3 + + drivers/clk/renesas/rcar-gen4-cpg.c | 10 +- + drivers/clk/renesas/renesas-cpg-mssr.c | 117 +++++++++++- + drivers/clk/renesas/renesas-cpg-mssr.h | 1 + + drivers/of/base.c | 123 +++++++++---- + include/dt-bindings/clock/r8a779g0-cpg-mssr.h | 1 + + include/linux/of.h | 11 ++ + 15 files changed, 531 insertions(+), 71 deletions(-) + create mode 100644 drivers/clk/renesas/r8a779h0-cpg-mssr.c +Merging csky/linux-next (2c40c1c6adab Merge tag 'usb-6.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb) +$ git merge -m Merge branch 'linux-next' of git://github.com/c-sky/csky-linux.git csky/linux-next +Already up to date. +Merging loongarch/loongarch-next (cca5efe77a6a LoongArch: vDSO: Disable UBSAN instrumentation) +$ git merge -m Merge branch 'loongarch-next' of git://git.kernel.org/pub/scm/linux/kernel/git/chenhuacai/linux-loongson.git loongarch/loongarch-next +Already up to date. +Merging m68k/for-next (6b9c045b0602 m68k: defconfig: Update defconfigs for v6.7-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/linux-m68k.git m68k/for-next +Already up to date. +Merging m68knommu/for-next (b401b621758e Linux 6.8-rc5) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gerg/m68knommu.git m68knommu/for-next +Already up to date. +Merging microblaze/next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'next' of git://git.monstr.eu/linux-2.6-microblaze.git microblaze/next +Already up to date. +Merging mips/mips-next (3c35da51f77e MIPS: TXx9: Use PCI_SET_ERROR_RESPONSE()) +$ git merge -m Merge branch 'mips-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mips/linux.git mips/mips-next +Auto-merging Documentation/devicetree/bindings/vendor-prefixes.yaml +Auto-merging MAINTAINERS +Auto-merging arch/mips/Kconfig +Merge made by the 'ort' strategy. 
+ Documentation/devicetree/bindings/mips/cpus.yaml | 15 +- + .../devicetree/bindings/mips/mobileye.yaml | 32 ++ + .../devicetree/bindings/vendor-prefixes.yaml | 2 + + MAINTAINERS | 23 ++ + arch/mips/Kbuild | 1 + + arch/mips/Kbuild.platforms | 1 + + arch/mips/Kconfig | 135 +++++-- + arch/mips/alchemy/common/clock.c | 2 +- + arch/mips/boot/dts/Makefile | 1 + + arch/mips/boot/dts/mobileye/Makefile | 4 + + arch/mips/boot/dts/mobileye/eyeq5-epm5.dts | 23 ++ + .../mips/boot/dts/mobileye/eyeq5-fixed-clocks.dtsi | 292 ++++++++++++++ + arch/mips/boot/dts/mobileye/eyeq5.dtsi | 124 ++++++ + arch/mips/configs/eyeq5_defconfig | 108 ++++++ + arch/mips/generic/Makefile | 6 +- + arch/mips/include/asm/addrspace.h | 5 + + arch/mips/include/asm/cdmm.h | 2 +- + arch/mips/include/asm/mach-generic/spaces.h | 4 + + arch/mips/include/asm/mips-cm.h | 1 + + arch/mips/include/asm/mipsregs.h | 249 +++++++++--- + arch/mips/include/asm/regdef.h | 91 +++++ + arch/mips/include/asm/smp-cps.h | 9 +- + arch/mips/kernel/cps-vec.S | 54 +-- + arch/mips/kernel/pm-cps.c | 134 +++---- + arch/mips/kernel/smp-cps.c | 141 +++++-- + arch/mips/kernel/traps.c | 13 +- + arch/mips/kvm/entry.c | 431 +++++++++------------ + arch/mips/mm/page.c | 202 +++++----- + arch/mips/mm/tlbex.c | 214 +++++----- + arch/mips/mobileye/Makefile | 1 + + arch/mips/mobileye/Platform | 15 + + arch/mips/mobileye/board-epm5.its.S | 24 ++ + arch/mips/mobileye/vmlinux.its.S | 32 ++ + arch/mips/pci/fixup-ath79.c | 2 +- + arch/mips/pci/fixup-lantiq.c | 9 - + arch/mips/pci/ops-tx4927.c | 18 +- + arch/mips/sgi-ip22/ip22-gio.c | 4 +- + arch/mips/txx9/generic/setup.c | 2 +- + drivers/bus/mips_cdmm.c | 2 +- + drivers/tc/tc-driver.c | 2 +- + include/linux/tc.h | 2 +- + 41 files changed, 1698 insertions(+), 734 deletions(-) + create mode 100644 Documentation/devicetree/bindings/mips/mobileye.yaml + create mode 100644 arch/mips/boot/dts/mobileye/Makefile + create mode 100644 arch/mips/boot/dts/mobileye/eyeq5-epm5.dts + create mode 100644 arch/mips/boot/dts/mobileye/eyeq5-fixed-clocks.dtsi + create mode 100644 arch/mips/boot/dts/mobileye/eyeq5.dtsi + create mode 100644 arch/mips/configs/eyeq5_defconfig + create mode 100644 arch/mips/mobileye/Makefile + create mode 100644 arch/mips/mobileye/Platform + create mode 100644 arch/mips/mobileye/board-epm5.its.S + create mode 100644 arch/mips/mobileye/vmlinux.its.S +Merging openrisc/for-next (c289330331eb openrisc: Remove kernel-doc marker from ioremap comment) +$ git merge -m Merge branch 'for-next' of git://github.com/openrisc/linux.git openrisc/for-next +Already up to date. +Merging parisc-hd/for-next (3a34e3fcdd83 parisc: Fix csum_ipv6_magic on 64-bit systems) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/deller/parisc-linux.git parisc-hd/for-next +Merge made by the 'ort' strategy. + arch/parisc/include/asm/checksum.h | 9 +++++---- + arch/parisc/include/asm/kprobes.h | 3 ++- + arch/parisc/kernel/ftrace.c | 2 +- + arch/parisc/kernel/processor.c | 8 -------- + arch/parisc/kernel/unwind.c | 14 ++++++-------- + 5 files changed, 14 insertions(+), 22 deletions(-) +Merging powerpc/next (3281366a8e79 uapi/auxvec: Define AT_HWCAP3 and AT_HWCAP4 aux vector, entries) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git powerpc/next +Auto-merging arch/powerpc/kernel/setup-common.c +Merge made by the 'ort' strategy. 
+ arch/powerpc/include/asm/ibmebus.h | 2 +- + arch/powerpc/include/asm/macio.h | 2 +- + arch/powerpc/include/asm/mpic.h | 2 +- + arch/powerpc/include/asm/smp.h | 1 + + arch/powerpc/include/asm/vio.h | 2 +- + arch/powerpc/include/asm/vmalloc.h | 4 +-- + arch/powerpc/kernel/prom.c | 22 ++++++++++-- + arch/powerpc/kernel/setup-common.c | 58 +++++++++++++++++++++--------- + arch/powerpc/platforms/pseries/ibmebus.c | 4 +-- + arch/powerpc/platforms/pseries/vio.c | 61 ++++++++++++++++++-------------- + arch/powerpc/sysdev/mpic.c | 2 +- + drivers/macintosh/macio_asic.c | 2 +- + drivers/macintosh/therm_windtunnel.c | 6 ++-- + drivers/macintosh/windfarm_pm112.c | 6 ++-- + drivers/macintosh/windfarm_pm121.c | 5 ++- + drivers/macintosh/windfarm_pm72.c | 7 ++-- + drivers/macintosh/windfarm_pm81.c | 8 ++--- + drivers/macintosh/windfarm_pm91.c | 8 ++--- + drivers/macintosh/windfarm_rm31.c | 7 ++-- + include/uapi/linux/auxvec.h | 2 ++ + 20 files changed, 125 insertions(+), 86 deletions(-) +Merging soc-fsl/next (fb9c384625dd bus: fsl-mc: fsl-mc-allocator: Drop a write-only variable) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/leo/linux.git soc-fsl/next +Already up to date. +Merging risc-v/for-next (cb4ede926134 riscv: Avoid code duplication with generic bitops implementation) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git risc-v/for-next +Auto-merging arch/riscv/Kconfig +Auto-merging arch/riscv/include/asm/bitops.h +CONFLICT (content): Merge conflict in arch/riscv/include/asm/bitops.h +Auto-merging arch/riscv/include/asm/pgtable.h +Auto-merging arch/riscv/include/asm/tlb.h +Auto-merging arch/riscv/mm/init.c +Auto-merging include/linux/mm.h +Auto-merging mm/mmap.c +Resolved 'arch/riscv/include/asm/bitops.h' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +[master d28d59af8253] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git +$ git diff -M --stat --summary HEAD^.. 
+ arch/riscv/Kbuild | 1 + + arch/riscv/Kconfig | 18 +- + arch/riscv/Makefile | 5 + + arch/riscv/crypto/Kconfig | 93 ++++ + arch/riscv/crypto/Makefile | 23 + + arch/riscv/crypto/aes-macros.S | 156 ++++++ + arch/riscv/crypto/aes-riscv64-glue.c | 550 +++++++++++++++++++++ + arch/riscv/crypto/aes-riscv64-zvkned-zvbb-zvkg.S | 312 ++++++++++++ + arch/riscv/crypto/aes-riscv64-zvkned-zvkb.S | 146 ++++++ + arch/riscv/crypto/aes-riscv64-zvkned.S | 180 +++++++ + arch/riscv/crypto/chacha-riscv64-glue.c | 101 ++++ + arch/riscv/crypto/chacha-riscv64-zvkb.S | 294 +++++++++++ + arch/riscv/crypto/ghash-riscv64-glue.c | 168 +++++++ + arch/riscv/crypto/ghash-riscv64-zvkg.S | 72 +++ + arch/riscv/crypto/sha256-riscv64-glue.c | 137 +++++ + .../crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S | 225 +++++++++ + arch/riscv/crypto/sha512-riscv64-glue.c | 133 +++++ + arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S | 203 ++++++++ + arch/riscv/crypto/sm3-riscv64-glue.c | 112 +++++ + arch/riscv/crypto/sm3-riscv64-zvksh-zvkb.S | 123 +++++ + arch/riscv/crypto/sm4-riscv64-glue.c | 107 ++++ + arch/riscv/crypto/sm4-riscv64-zvksed-zvkb.S | 117 +++++ + arch/riscv/include/asm/asm.h | 10 + + arch/riscv/include/asm/bitops.h | 138 +----- + arch/riscv/include/asm/pgalloc.h | 53 +- + arch/riscv/include/asm/pgtable.h | 6 + + arch/riscv/include/asm/tlb.h | 18 + + arch/riscv/include/asm/vector.h | 11 + + arch/riscv/kernel/entry.S | 3 + + arch/riscv/kernel/pi/Makefile | 3 + + arch/riscv/kernel/smpboot.c | 1 - + arch/riscv/kernel/traps.c | 17 +- + arch/riscv/lib/uaccess_vector.S | 1 - + arch/riscv/mm/init.c | 6 + + crypto/Kconfig | 3 + + drivers/clocksource/timer-clint.c | 2 +- + drivers/clocksource/timer-riscv.c | 2 +- + include/asm-generic/bitops/__ffs.h | 8 +- + include/asm-generic/bitops/__fls.h | 8 +- + include/asm-generic/bitops/ffs.h | 8 +- + include/asm-generic/bitops/fls.h | 8 +- + include/linux/mm.h | 2 +- + mm/mmap.c | 2 +- + 43 files changed, 3445 insertions(+), 141 deletions(-) + create mode 100644 arch/riscv/crypto/Kconfig + create mode 100644 arch/riscv/crypto/Makefile + create mode 100644 arch/riscv/crypto/aes-macros.S + create mode 100644 arch/riscv/crypto/aes-riscv64-glue.c + create mode 100644 arch/riscv/crypto/aes-riscv64-zvkned-zvbb-zvkg.S + create mode 100644 arch/riscv/crypto/aes-riscv64-zvkned-zvkb.S + create mode 100644 arch/riscv/crypto/aes-riscv64-zvkned.S + create mode 100644 arch/riscv/crypto/chacha-riscv64-glue.c + create mode 100644 arch/riscv/crypto/chacha-riscv64-zvkb.S + create mode 100644 arch/riscv/crypto/ghash-riscv64-glue.c + create mode 100644 arch/riscv/crypto/ghash-riscv64-zvkg.S + create mode 100644 arch/riscv/crypto/sha256-riscv64-glue.c + create mode 100644 arch/riscv/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S + create mode 100644 arch/riscv/crypto/sha512-riscv64-glue.c + create mode 100644 arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S + create mode 100644 arch/riscv/crypto/sm3-riscv64-glue.c + create mode 100644 arch/riscv/crypto/sm3-riscv64-zvksh-zvkb.S + create mode 100644 arch/riscv/crypto/sm4-riscv64-glue.c + create mode 100644 arch/riscv/crypto/sm4-riscv64-zvksed-zvkb.S +Merging riscv-dt/riscv-dt-for-next (5669bb5a16a0 riscv: dts: microchip: add specific compatible for mpfs pdma) +$ git merge -m Merge branch 'riscv-dt-for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git riscv-dt/riscv-dt-for-next +Auto-merging arch/riscv/boot/dts/starfive/jh7100.dtsi +Auto-merging arch/riscv/boot/dts/starfive/jh7110.dtsi +Merge made by the 'ort' strategy. 
+ .../devicetree/bindings/pwm/opencores,pwm.yaml | 55 +++++++++++ + arch/riscv/boot/dts/microchip/mpfs.dtsi | 6 +- + .../boot/dts/starfive/jh7100-beaglev-starlight.dts | 11 +++ + arch/riscv/boot/dts/starfive/jh7100-common.dtsi | 108 +++++++++++++++++++++ + .../dts/starfive/jh7100-starfive-visionfive-v1.dts | 22 ++++- + arch/riscv/boot/dts/starfive/jh7100.dtsi | 45 +++++++++ + .../dts/starfive/jh7110-starfive-visionfive-2.dtsi | 22 +++++ + arch/riscv/boot/dts/starfive/jh7110.dtsi | 9 ++ + 8 files changed, 274 insertions(+), 4 deletions(-) + create mode 100644 Documentation/devicetree/bindings/pwm/opencores,pwm.yaml +Merging riscv-soc/riscv-soc-for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'riscv-soc-for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git riscv-soc/riscv-soc-for-next +Already up to date. +Merging s390/for-next (cba7aa7faf86 Merge branch 'features' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git s390/for-next +Auto-merging arch/s390/Kconfig +Auto-merging arch/s390/kernel/machine_kexec.c +Auto-merging arch/s390/kvm/vsie.c +Auto-merging include/linux/compiler_attributes.h +Merge made by the 'ort' strategy. + arch/s390/Kconfig | 18 +- + arch/s390/Makefile | 10 +- + arch/s390/boot/.gitignore | 1 + + arch/s390/boot/Makefile | 21 +- + arch/s390/boot/boot.h | 6 + + arch/s390/boot/startup.c | 80 +++- + arch/s390/boot/vmlinux.lds.S | 43 ++ + arch/s390/crypto/chacha-glue.c | 4 +- + arch/s390/crypto/chacha-s390.S | 2 +- + arch/s390/crypto/crc32-vx.c | 11 +- + arch/s390/crypto/crc32-vx.h | 12 + + arch/s390/crypto/{crc32be-vx.S => crc32be-vx.c} | 171 +++----- + arch/s390/crypto/{crc32le-vx.S => crc32le-vx.c} | 239 +++++----- + arch/s390/hypfs/hypfs_diag0c.c | 3 +- + arch/s390/hypfs/hypfs_sprp.c | 4 +- + arch/s390/include/asm/access-regs.h | 38 ++ + arch/s390/include/asm/appldata.h | 4 +- + arch/s390/include/asm/asm-prototypes.h | 2 +- + arch/s390/include/asm/bug.h | 4 +- + arch/s390/include/asm/checksum.h | 29 +- + arch/s390/include/asm/diag.h | 15 +- + arch/s390/include/asm/entry-common.h | 5 +- + .../include/asm/{vx-insn-asm.h => fpu-insn-asm.h} | 71 ++- + arch/s390/include/asm/fpu-insn.h | 486 +++++++++++++++++++++ + arch/s390/include/asm/fpu-types.h | 51 +++ + arch/s390/include/asm/fpu.h | 295 +++++++++++++ + arch/s390/include/asm/fpu/api.h | 126 ------ + arch/s390/include/asm/fpu/internal.h | 67 --- + arch/s390/include/asm/fpu/types.h | 38 -- + arch/s390/include/asm/kvm_host.h | 3 +- + arch/s390/include/asm/lowcore.h | 2 +- + arch/s390/include/asm/pai.h | 3 +- + arch/s390/include/asm/pci.h | 3 +- + arch/s390/include/asm/physmem_info.h | 1 + + arch/s390/include/asm/processor.h | 11 +- + arch/s390/include/asm/ptrace.h | 4 + + arch/s390/include/asm/stacktrace.h | 1 - + arch/s390/include/asm/switch_to.h | 49 --- + arch/s390/include/asm/vx-insn.h | 19 - + arch/s390/kernel/compat_signal.c | 22 +- + arch/s390/kernel/crash_dump.c | 2 +- + arch/s390/kernel/diag.c | 31 +- + arch/s390/kernel/early.c | 3 +- + arch/s390/kernel/entry.S | 19 +- + arch/s390/kernel/entry.h | 1 + + arch/s390/kernel/fpu.c | 380 +++++++--------- + arch/s390/kernel/ipl.c | 3 +- + arch/s390/kernel/machine_kexec.c | 3 +- + arch/s390/kernel/nmi.c | 168 ++----- + arch/s390/kernel/os_info.c | 6 +- + arch/s390/kernel/perf_pai_crypto.c | 79 ++-- + arch/s390/kernel/perf_pai_ext.c | 47 +- + arch/s390/kernel/perf_regs.c | 10 +- + arch/s390/kernel/process.c | 31 +- + arch/s390/kernel/ptrace.c | 101 ++--- + 
arch/s390/kernel/setup.c | 12 +- + arch/s390/kernel/signal.c | 20 +- + arch/s390/kernel/smp.c | 3 +- + arch/s390/kernel/sysinfo.c | 27 +- + arch/s390/kernel/text_amode31.S | 2 +- + arch/s390/kernel/time.c | 6 +- + arch/s390/kernel/traps.c | 12 +- + arch/s390/kernel/uprobes.c | 1 - + arch/s390/kernel/vdso32/Makefile | 2 +- + arch/s390/kernel/vdso32/vdso32.lds.S | 1 - + arch/s390/kernel/vdso64/Makefile | 3 +- + arch/s390/kernel/vdso64/vdso64.lds.S | 1 - + arch/s390/kernel/vmlinux.lds.S | 49 +++ + arch/s390/kvm/gaccess.c | 2 +- + arch/s390/kvm/interrupt.c | 10 +- + arch/s390/kvm/kvm-s390.c | 38 +- + arch/s390/kvm/vsie.c | 3 - + arch/s390/lib/Makefile | 1 + + arch/s390/lib/csum-partial.c | 91 ++++ + arch/s390/mm/extmem.c | 4 +- + arch/s390/mm/mmap.c | 19 +- + arch/s390/pci/pci.c | 20 +- + arch/s390/pci/pci_debug.c | 10 +- + arch/s390/pci/pci_event.c | 15 +- + arch/s390/pci/pci_sysfs.c | 70 +-- + arch/s390/tools/.gitignore | 1 + + arch/s390/tools/Makefile | 5 + + arch/s390/tools/relocs.c | 385 ++++++++++++++++ + drivers/pci/hotplug/s390_pci_hpc.c | 65 ++- + drivers/s390/char/vmur.c | 4 +- + drivers/s390/char/zcore.c | 1 - + drivers/s390/cio/ccwgroup.c | 4 +- + drivers/s390/cio/chsc.c | 4 +- + drivers/s390/cio/chsc_sch.c | 20 +- + drivers/s390/cio/cmf.c | 6 +- + drivers/s390/cio/css.c | 4 +- + drivers/s390/cio/device.c | 4 +- + drivers/s390/cio/scm.c | 4 +- + drivers/s390/crypto/ap_bus.c | 45 +- + drivers/s390/crypto/ap_debug.h | 4 +- + drivers/s390/crypto/ap_queue.c | 31 +- + drivers/s390/crypto/pkey_api.c | 187 ++++---- + drivers/s390/crypto/vfio_ap_drv.c | 2 +- + drivers/s390/crypto/zcrypt_api.c | 60 ++- + drivers/s390/crypto/zcrypt_ccamisc.c | 214 ++++----- + drivers/s390/crypto/zcrypt_debug.h | 4 +- + drivers/s390/crypto/zcrypt_ep11misc.c | 127 +++--- + drivers/s390/crypto/zcrypt_msgtype50.c | 14 +- + drivers/s390/crypto/zcrypt_msgtype6.c | 45 +- + include/linux/compiler_attributes.h | 12 + + lib/raid6/s390vx.uc | 62 +-- + 106 files changed, 2871 insertions(+), 1723 deletions(-) + create mode 100644 arch/s390/crypto/crc32-vx.h + rename arch/s390/crypto/{crc32be-vx.S => crc32be-vx.c} (56%) + rename arch/s390/crypto/{crc32le-vx.S => crc32le-vx.c} (52%) + create mode 100644 arch/s390/include/asm/access-regs.h + rename arch/s390/include/asm/{vx-insn-asm.h => fpu-insn-asm.h} (86%) + create mode 100644 arch/s390/include/asm/fpu-insn.h + create mode 100644 arch/s390/include/asm/fpu-types.h + create mode 100644 arch/s390/include/asm/fpu.h + delete mode 100644 arch/s390/include/asm/fpu/api.h + delete mode 100644 arch/s390/include/asm/fpu/internal.h + delete mode 100644 arch/s390/include/asm/fpu/types.h + delete mode 100644 arch/s390/include/asm/switch_to.h + delete mode 100644 arch/s390/include/asm/vx-insn.h + create mode 100644 arch/s390/lib/csum-partial.c + create mode 100644 arch/s390/tools/relocs.c +Merging sh/for-next (0a2d3ce0031f sh: hd64461: Make setup_hd64461 static) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/glaubitz/sh-linux.git sh/for-next +Merge made by the 'ort' strategy. + arch/sh/cchips/hd6446x/hd64461.c | 2 +- + arch/sh/drivers/dma/dma-sysfs.c | 2 +- + drivers/sh/intc/core.c | 2 +- + drivers/sh/intc/internals.h | 2 +- + 4 files changed, 4 insertions(+), 4 deletions(-) +Merging uml/next (83aec96c631e um: Mark 32bit syscall helpers as clobbering memory) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/uml/linux.git uml/next +Already up to date.
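The "Resolved 'arch/riscv/include/asm/bitops.h' using previous resolution" line in the riscv merge above comes from git's rerere (reuse recorded resolution) machinery, which replays a conflict fix recorded during an earlier merge of the same trees. A minimal sketch of that workflow, assuming rerere is enabled in the merging repository (the branch and file names below are illustrative, not taken from this log):

$ git config rerere.enabled true     # record and reuse conflict resolutions
$ git merge some/for-next            # first run: conflict, pre-image recorded
# ... resolve the conflicted file by hand ...
$ git add path/to/conflicted.h
$ git commit --no-edit               # resolution saved under .git/rr-cache
$ git merge some/for-next            # later run hits the same conflict;
# git prints "Resolved 'path/to/conflicted.h' using previous resolution."
# but still exits non-zero so the replayed result can be reviewed:
$ git commit --no-edit -v -a

This is why the riscv merge above still reports "Automatic merge failed; fix conflicts and then commit the result" and is followed by an explicit `git commit --no-edit -v -a`: rerere applies the recorded fix to the working tree but deliberately leaves it unstaged for inspection before the merge commit is made.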
+Merging xtensa/xtensa-for-next (7ab7acb68adf xtensa: fix MAKE_PC_FROM_RA second argument) +$ git merge -m Merge branch 'xtensa-for-next' of git://github.com/jcmvbkbc/linux-xtensa.git xtensa/xtensa-for-next +Merge made by the 'ort' strategy. + arch/xtensa/include/asm/processor.h | 8 ++++---- + arch/xtensa/include/asm/ptrace.h | 2 +- + arch/xtensa/kernel/process.c | 5 +++-- + arch/xtensa/kernel/stacktrace.c | 3 ++- + 4 files changed, 10 insertions(+), 8 deletions(-) +Merging bcachefs/for-next (c887148ebf99 thread_with_file: add f_ops.flush) +$ git merge -m Merge branch 'for-next' of https://evilpiepirate.org/git/bcachefs.git bcachefs/for-next +Auto-merging MAINTAINERS +Auto-merging drivers/md/bcache/sysfs.c +Auto-merging include/linux/sched.h +Auto-merging include/linux/sched/mm.h +Auto-merging lib/Kconfig.debug +Auto-merging lib/sort.c +Merge made by the 'ort' strategy. + MAINTAINERS | 39 + + drivers/md/bcache/Kconfig | 1 + + drivers/md/bcache/bcache.h | 1 + + drivers/md/bcache/bset.c | 6 +- + drivers/md/bcache/bset.h | 1 + + drivers/md/bcache/btree.c | 6 +- + drivers/md/bcache/super.c | 7 + + drivers/md/bcache/sysfs.c | 25 +- + drivers/md/bcache/util.c | 30 - + drivers/md/bcache/util.h | 52 +- + fs/bcachefs/Kconfig | 11 +- + fs/bcachefs/Makefile | 5 - + fs/bcachefs/alloc_background.c | 156 +++- + fs/bcachefs/alloc_background.h | 1 + + fs/bcachefs/alloc_foreground.c | 13 +- + fs/bcachefs/backpointers.c | 140 ++-- + fs/bcachefs/bbpos_types.h | 2 +- + fs/bcachefs/bcachefs.h | 23 +- + fs/bcachefs/bcachefs_format.h | 53 +- + fs/bcachefs/bset.c | 2 +- + fs/bcachefs/btree_cache.c | 39 +- + fs/bcachefs/btree_gc.c | 23 +- + fs/bcachefs/btree_io.c | 19 +- + fs/bcachefs/btree_iter.c | 24 +- + fs/bcachefs/btree_journal_iter.c | 50 +- + fs/bcachefs/btree_journal_iter.h | 14 +- + fs/bcachefs/btree_locking.c | 3 +- + fs/bcachefs/btree_locking.h | 2 +- + fs/bcachefs/btree_types.h | 11 +- + fs/bcachefs/btree_update.c | 23 + + fs/bcachefs/btree_update.h | 3 +- + fs/bcachefs/btree_update_interior.c | 13 +- + fs/bcachefs/btree_write_buffer.c | 2 + + fs/bcachefs/btree_write_buffer_types.h | 2 +- + fs/bcachefs/buckets.c | 32 +- + fs/bcachefs/chardev.c | 57 +- + fs/bcachefs/checksum.c | 2 +- + fs/bcachefs/compress.c | 14 +- + fs/bcachefs/debug.c | 6 +- + fs/bcachefs/dirent.c | 145 ++-- + fs/bcachefs/dirent.h | 6 +- + fs/bcachefs/ec.c | 4 +- + fs/bcachefs/errcode.h | 10 +- + fs/bcachefs/error.c | 14 +- + fs/bcachefs/error.h | 2 +- + fs/bcachefs/fifo.h | 4 +- + fs/bcachefs/fs-common.c | 74 +- + fs/bcachefs/fs-io-direct.c | 2 + + fs/bcachefs/fs.c | 224 ++++-- + fs/bcachefs/fsck.c | 858 ++++++++++++++------- + fs/bcachefs/fsck.h | 1 + + fs/bcachefs/inode.c | 40 +- + fs/bcachefs/inode.h | 19 + + fs/bcachefs/io_read.c | 4 +- + fs/bcachefs/io_write.c | 20 +- + fs/bcachefs/journal.c | 306 +++++--- + fs/bcachefs/journal.h | 7 +- + fs/bcachefs/journal_io.c | 374 +++++---- + fs/bcachefs/journal_io.h | 35 +- + fs/bcachefs/journal_reclaim.c | 37 +- + fs/bcachefs/journal_sb.c | 2 +- + fs/bcachefs/journal_seq_blacklist.c | 6 +- + fs/bcachefs/journal_types.h | 36 +- + fs/bcachefs/lru.c | 4 +- + fs/bcachefs/migrate.c | 8 +- + fs/bcachefs/nocow_locking.c | 2 +- + fs/bcachefs/opts.h | 5 + + fs/bcachefs/rebalance.c | 4 +- + fs/bcachefs/recovery.c | 2 +- + fs/bcachefs/recovery_types.h | 2 + + fs/bcachefs/replicas.c | 19 +- + fs/bcachefs/replicas.h | 3 +- + fs/bcachefs/sb-clean.c | 16 - + fs/bcachefs/sb-downgrade.c | 11 +- + fs/bcachefs/sb-errors_types.h | 20 +- + fs/bcachefs/sb-members.h | 2 +- + fs/bcachefs/str_hash.h | 15 +- + 
fs/bcachefs/subvolume.c | 187 ++++- + fs/bcachefs/subvolume.h | 9 +- + fs/bcachefs/subvolume_format.h | 4 +- + fs/bcachefs/subvolume_types.h | 2 +- + fs/bcachefs/super-io.c | 13 +- + fs/bcachefs/super-io.h | 2 +- + fs/bcachefs/super.c | 66 +- + fs/bcachefs/sysfs.c | 4 +- + fs/bcachefs/thread_with_file.c | 299 ------- + fs/bcachefs/thread_with_file.h | 41 - + fs/bcachefs/thread_with_file_types.h | 16 - + fs/bcachefs/util.c | 370 +-------- + fs/bcachefs/util.h | 151 +--- + {fs/bcachefs => include/linux}/darray.h | 61 +- + include/linux/darray_types.h | 22 + + {fs/bcachefs => include/linux}/eytzinger.h | 58 +- + {fs/bcachefs => include/linux}/mean_and_variance.h | 14 +- + include/linux/mempool.h | 13 + + include/linux/sched.h | 4 +- + include/linux/sched/mm.h | 60 +- + include/linux/thread_with_file.h | 79 ++ + include/linux/thread_with_file_types.h | 25 + + include/linux/time_stats.h | 167 ++++ + kernel/hung_task.c | 1 + + lib/Kconfig | 7 + + lib/Kconfig.debug | 9 + + lib/Makefile | 5 +- + {fs/bcachefs => lib}/darray.c | 12 +- + lib/math/Kconfig | 3 + + lib/math/Makefile | 2 + + {fs/bcachefs => lib/math}/mean_and_variance.c | 31 +- + {fs/bcachefs => lib/math}/mean_and_variance_test.c | 83 +- + lib/sort.c | 89 +++ + lib/thread_with_file.c | 454 +++++++++++ + lib/time_stats.c | 373 +++++++++ + mm/mempool.c | 13 + + 113 files changed, 3759 insertions(+), 2246 deletions(-) + delete mode 100644 fs/bcachefs/thread_with_file.c + delete mode 100644 fs/bcachefs/thread_with_file.h + delete mode 100644 fs/bcachefs/thread_with_file_types.h + rename {fs/bcachefs => include/linux}/darray.h (66%) + create mode 100644 include/linux/darray_types.h + rename {fs/bcachefs => include/linux}/eytzinger.h (77%) + rename {fs/bcachefs => include/linux}/mean_and_variance.h (96%) + create mode 100644 include/linux/thread_with_file.h + create mode 100644 include/linux/thread_with_file_types.h + create mode 100644 include/linux/time_stats.h + rename {fs/bcachefs => lib}/darray.c (56%) + rename {fs/bcachefs => lib/math}/mean_and_variance.c (90%) + rename {fs/bcachefs => lib/math}/mean_and_variance_test.c (78%) + create mode 100644 lib/thread_with_file.c + create mode 100644 lib/time_stats.c +Merging pidfd/for-next (a901a3568fd2 Merge tag 'iomap-6.5-merge-1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git pidfd/for-next +Already up to date. +Merging fscrypt/for-next (d3a7bd420076 fscrypt: clear keyring before calling key_put()) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/fs/fscrypt/linux.git fscrypt/for-next +Merge made by the 'ort' strategy. + fs/crypto/keyring.c | 8 ++++++-- + fs/crypto/keysetup.c | 5 ++++- + 2 files changed, 10 insertions(+), 3 deletions(-) +Merging afs/afs-next (abcbd3bfbbfe afs: trace: Log afs_make_call(), including server address) +$ git merge -m Merge branch 'afs-next' of git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git afs/afs-next +Already up to date. +Merging btrfs/for-next (d3cfdbb1ea50 Merge branch 'for-next-next-v6.8-20240213' into for-next-20240213) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git btrfs/for-next +Auto-merging fs/btrfs/defrag.c +Auto-merging fs/btrfs/extent_io.c +Auto-merging fs/btrfs/volumes.c +Merge made by the 'ort' strategy. 
+ fs/btrfs/accessors.c | 15 +- + fs/btrfs/accessors.h | 11 +- + fs/btrfs/acl.c | 1 - + fs/btrfs/acl.h | 11 ++ + fs/btrfs/async-thread.c | 1 - + fs/btrfs/async-thread.h | 3 + + fs/btrfs/backref.c | 20 ++- + fs/btrfs/backref.h | 16 +- + fs/btrfs/bio.c | 17 +- + fs/btrfs/bio.h | 2 + + fs/btrfs/block-group.c | 39 ++-- + fs/btrfs/block-group.h | 14 ++ + fs/btrfs/block-rsv.c | 1 - + fs/btrfs/block-rsv.h | 7 + + fs/btrfs/btrfs_inode.h | 25 ++- + fs/btrfs/compression.c | 132 ++++++++------ + fs/btrfs/compression.h | 53 +++--- + fs/btrfs/ctree.c | 6 +- + fs/btrfs/ctree.h | 28 ++- + fs/btrfs/defrag.c | 155 ++++++++-------- + fs/btrfs/defrag.h | 10 ++ + fs/btrfs/delalloc-space.c | 2 - + fs/btrfs/delalloc-space.h | 4 + + fs/btrfs/delayed-inode.c | 4 +- + fs/btrfs/delayed-inode.h | 8 + + fs/btrfs/delayed-ref.h | 10 ++ + fs/btrfs/dev-replace.c | 2 - + fs/btrfs/dev-replace.h | 4 + + fs/btrfs/dir-item.h | 6 + + fs/btrfs/disk-io.c | 41 +++-- + fs/btrfs/disk-io.h | 20 ++- + fs/btrfs/export.c | 12 +- + fs/btrfs/export.h | 4 + + fs/btrfs/extent-io-tree.c | 2 +- + fs/btrfs/extent-io-tree.h | 7 + + fs/btrfs/extent-tree.c | 50 ++++-- + fs/btrfs/extent-tree.h | 10 ++ + fs/btrfs/extent_io.c | 298 +++++++++++++++++++----------- + fs/btrfs/extent_io.h | 46 +++-- + fs/btrfs/extent_map.c | 20 ++- + fs/btrfs/extent_map.h | 8 + + fs/btrfs/file-item.c | 6 - + fs/btrfs/file-item.h | 13 ++ + fs/btrfs/file.c | 43 +++-- + fs/btrfs/file.h | 15 ++ + fs/btrfs/free-space-cache.c | 6 +- + fs/btrfs/free-space-cache.h | 15 +- + fs/btrfs/free-space-tree.c | 56 +++--- + fs/btrfs/free-space-tree.h | 6 + + fs/btrfs/fs.h | 67 ++++++- + fs/btrfs/inode-item.c | 1 - + fs/btrfs/inode-item.h | 5 +- + fs/btrfs/inode.c | 379 +++++++++++++++++++++------------------ + fs/btrfs/ioctl.c | 50 +++--- + fs/btrfs/ioctl.h | 9 + + fs/btrfs/locking.c | 3 +- + fs/btrfs/locking.h | 8 +- + fs/btrfs/lru_cache.h | 2 + + fs/btrfs/lzo.c | 91 +++++----- + fs/btrfs/messages.c | 2 - + fs/btrfs/misc.h | 2 + + fs/btrfs/ordered-data.c | 1 - + fs/btrfs/ordered-data.h | 15 ++ + fs/btrfs/orphan.c | 1 - + fs/btrfs/orphan.h | 5 + + fs/btrfs/print-tree.h | 3 + + fs/btrfs/props.c | 3 +- + fs/btrfs/props.h | 7 +- + fs/btrfs/qgroup.c | 6 +- + fs/btrfs/qgroup.h | 17 +- + fs/btrfs/raid-stripe-tree.c | 1 - + fs/btrfs/raid-stripe-tree.h | 5 + + fs/btrfs/raid56.c | 31 +++- + fs/btrfs/raid56.h | 9 + + fs/btrfs/rcu-string.h | 6 + + fs/btrfs/ref-verify.h | 9 + + fs/btrfs/reflink.c | 12 +- + fs/btrfs/reflink.h | 4 +- + fs/btrfs/relocation.c | 107 +++++------ + fs/btrfs/relocation.h | 9 + + fs/btrfs/root-tree.c | 17 +- + fs/btrfs/root-tree.h | 10 ++ + fs/btrfs/scrub.c | 9 +- + fs/btrfs/scrub.h | 6 + + fs/btrfs/send.c | 74 +++++--- + fs/btrfs/send.h | 8 +- + fs/btrfs/space-info.c | 166 ++++++++++++++++- + fs/btrfs/space-info.h | 37 ++++ + fs/btrfs/subpage.h | 5 + + fs/btrfs/super.c | 4 +- + fs/btrfs/super.h | 7 + + fs/btrfs/sysfs.c | 132 +++++++++++++- + fs/btrfs/sysfs.h | 9 + + fs/btrfs/tests/extent-io-tests.c | 28 ++- + fs/btrfs/tests/inode-tests.c | 40 ++--- + fs/btrfs/transaction.c | 2 - + fs/btrfs/transaction.h | 17 +- + fs/btrfs/tree-checker.c | 8 +- + fs/btrfs/tree-checker.h | 2 + + fs/btrfs/tree-log.c | 95 ++++++---- + fs/btrfs/tree-log.h | 33 ++++ + fs/btrfs/tree-mod-log.h | 8 +- + fs/btrfs/ulist.c | 1 - + fs/btrfs/ulist.h | 1 + + fs/btrfs/uuid-tree.c | 3 +- + fs/btrfs/uuid-tree.h | 5 + + fs/btrfs/verity.c | 1 - + fs/btrfs/verity.h | 7 + + fs/btrfs/volumes.c | 26 ++- + fs/btrfs/volumes.h | 49 ++++- + fs/btrfs/xattr.h | 6 +- + fs/btrfs/zlib.c | 112 ++++++------ + 
fs/btrfs/zoned.c | 2 - + fs/btrfs/zoned.h | 15 ++ + fs/btrfs/zstd.c | 154 +++++++--------- + 115 files changed, 2232 insertions(+), 1022 deletions(-) +Merging ceph/master (dbc347ef7f0c ceph: add ceph_cap_unlink_work to fire check_caps() immediately) +$ git merge -m Merge branch 'master' of git://github.com/ceph/ceph-client.git ceph/master +Already up to date. +Merging cifs/for-next (0ab0a5fed476 cifs: allow changing password during remount) +$ git merge -m Merge branch 'for-next' of git://git.samba.org/sfrench/cifs-2.6.git cifs/for-next +Merge made by the 'ort' strategy. + fs/smb/client/cifs_debug.c | 2 ++ + fs/smb/client/cifsglob.h | 1 + + fs/smb/client/fs_context.c | 23 ++++++++++++++++++----- + fs/smb/client/smb2pdu.c | 5 +++++ + 4 files changed, 26 insertions(+), 5 deletions(-) +Merging configfs/for-next (4425c1d9b44d configfs: improve item creation performance) +$ git merge -m Merge branch 'for-next' of git://git.infradead.org/users/hch/configfs.git configfs/for-next +Auto-merging fs/configfs/inode.c +Merge made by the 'ort' strategy. + fs/configfs/configfs_internal.h | 4 ++-- + fs/configfs/dir.c | 42 +++++++++++++++++++++++++++++++---------- + fs/configfs/inode.c | 24 ----------------------- + 3 files changed, 34 insertions(+), 36 deletions(-) +Merging ecryptfs/next (a3d78fe3e1ae fs: ecryptfs: comment typo fix) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/tyhicks/ecryptfs.git ecryptfs/next +Auto-merging fs/ecryptfs/crypto.c +Auto-merging fs/ecryptfs/read_write.c +Merge made by the 'ort' strategy. + fs/ecryptfs/crypto.c | 2 +- + fs/ecryptfs/keystore.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) +Merging erofs/dev (aa12a790d31b erofs: make erofs_{err,info}() support NULL sb parameter) +$ git merge -m Merge branch 'dev' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git erofs/dev +Already up to date. +Merging exfat/dev (3a7845041eb7 exfat: fix appending discontinuous clusters to empty file) +$ git merge -m Merge branch 'dev' of git://git.kernel.org/pub/scm/linux/kernel/git/linkinjeon/exfat.git exfat/dev +Merge made by the 'ort' strategy. + fs/exfat/file.c | 35 +++++++++++++++++++++-------------- + 1 file changed, 21 insertions(+), 14 deletions(-) +Merging exportfs/exportfs-next (42c3732fa807 fs: Create a generic is_dot_dotdot() utility) +$ git merge -m Merge branch 'exportfs-next' of git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux exportfs/exportfs-next +Auto-merging fs/ecryptfs/crypto.c +Auto-merging fs/f2fs/f2fs.h +Auto-merging fs/namei.c +Auto-merging include/linux/fs.h +Merge made by the 'ort' strategy. + fs/crypto/fname.c | 8 +------- + fs/ecryptfs/crypto.c | 10 ---------- + fs/exportfs/expfs.c | 2 +- + fs/f2fs/f2fs.h | 11 ----------- + fs/namei.c | 6 ++---- + include/linux/fs.h | 11 +++++++++++ + 6 files changed, 15 insertions(+), 33 deletions(-) +Merging ext3/for_next (21174ac99fe4 Pull MAINTAINER file git tree addition.) +$ git merge -m Merge branch 'for_next' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs.git ext3/for_next +Auto-merging MAINTAINERS +Auto-merging fs/ext4/ext4.h +Auto-merging fs/ext4/super.c +Auto-merging fs/f2fs/f2fs.h +Auto-merging include/linux/fs.h +Auto-merging mm/shmem.c +Merge made by the 'ort' strategy. 
+ MAINTAINERS | 1 + + fs/ext2/balloc.c | 2 +- + fs/ext2/ext2.h | 2 +- + fs/ext2/inode.c | 2 +- + fs/ext2/super.c | 2 +- + fs/ext2/xattr.c | 2 +- + fs/ext4/ext4.h | 2 +- + fs/ext4/super.c | 2 +- + fs/f2fs/f2fs.h | 2 +- + fs/f2fs/super.c | 2 +- + fs/isofs/inode.c | 18 ++++- + fs/jfs/jfs_incore.h | 2 +- + fs/jfs/super.c | 2 +- + fs/notify/fsnotify.c | 28 +++++--- + fs/ocfs2/inode.h | 2 +- + fs/ocfs2/quota_global.c | 12 ++++ + fs/ocfs2/quota_local.c | 3 + + fs/ocfs2/super.c | 2 +- + fs/quota/dquot.c | 172 +++++++++++++++++++++++++++-------------------- + fs/quota/quota_tree.c | 152 +++++++++++++++++++++++++++++------------ + fs/quota/quota_v1.c | 6 ++ + fs/quota/quota_v2.c | 35 ++++++++-- + fs/reiserfs/reiserfs.h | 2 +- + fs/reiserfs/super.c | 2 +- + fs/udf/dir.c | 2 +- + fs/udf/inode.c | 2 +- + fs/udf/namei.c | 23 ++++--- + fs/udf/super.c | 44 +++++++----- + include/linux/fs.h | 2 +- + include/linux/fsnotify.h | 12 +++- + include/linux/shmem_fs.h | 2 +- + mm/shmem.c | 2 +- + 32 files changed, 357 insertions(+), 189 deletions(-) +Merging ext4/dev (ec9d669eba4c ext4: make ext4_set_iomap() recognize IOMAP_DELALLOC map type) +$ git merge -m Merge branch 'dev' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git ext4/dev +Already up to date. +Merging f2fs/dev (21ec68234826 f2fs: fix to avoid potential panic during recovery) +$ git merge -m Merge branch 'dev' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs.git f2fs/dev +Auto-merging fs/f2fs/f2fs.h +Auto-merging fs/f2fs/super.c +Merge made by the 'ort' strategy. + Documentation/ABI/testing/sysfs-fs-f2fs | 47 ++++++++++---------- + Documentation/filesystems/f2fs.rst | 47 ++++++++++---------- + fs/f2fs/checkpoint.c | 19 ++++++-- + fs/f2fs/compress.c | 45 +++++++++++-------- + fs/f2fs/data.c | 36 ++++++++------- + fs/f2fs/dir.c | 5 +-- + fs/f2fs/f2fs.h | 78 ++++++++++++++++++++++----------- + fs/f2fs/file.c | 51 ++++++++++++++------- + fs/f2fs/namei.c | 11 ++--- + fs/f2fs/node.c | 2 +- + fs/f2fs/recovery.c | 33 +++++++------- + fs/f2fs/segment.c | 4 +- + fs/f2fs/super.c | 57 ++++++++++++++---------- + 13 files changed, 254 insertions(+), 181 deletions(-) +Merging fsverity/for-next (8e43fb06e10d fsverity: remove hash page spin lock) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/fs/fsverity/linux.git fsverity/for-next +Merge made by the 'ort' strategy. + fs/verity/fsverity_private.h | 1 - + fs/verity/open.c | 1 - + fs/verity/verify.c | 48 ++++++++++++++++++++++---------------------- + 3 files changed, 24 insertions(+), 26 deletions(-) +Merging fuse/for-next (3f29f1c336c0 fuse: disable FOPEN_PARALLEL_DIRECT_WRITES with FUSE_DIRECT_IO_ALLOW_MMAP) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git fuse/for-next +Already up to date. +Merging gfs2/for-next (6b89b6af459f Merge tag 'gfs2-v6.8-rc2-revert' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2.git gfs2/for-next +Already up to date. +Merging jfs/jfs-next (e42e29cc4423 Revert "jfs: fix shift-out-of-bounds in dbJoin") +$ git merge -m Merge branch 'jfs-next' of git://github.com/kleikamp/linux-shaggy.git jfs/jfs-next +Already up to date. +Merging ksmbd/ksmbd-for-next (b401b621758e Linux 6.8-rc5) +$ git merge -m Merge branch 'ksmbd-for-next' of https://github.com/smfrench/smb3-kernel.git ksmbd/ksmbd-for-next +Already up to date. 
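
[Editor's note] Almost every non-trivial merge above and below reports "Merge made by the 'ort' strategy": ort has been git's default strategy for two-head merges since git 2.34, replacing 'recursive', so none of these commands select it explicitly. The per-merge summary comes from a rename-detecting diff against the merge's first parent. Both steps can be reproduced as below; the branch and URL are examples taken from the f2fs entry in this log:

    # ort is the default since git 2.34; -s ort only makes the choice explicit.
    git merge -s ort -m "Merge branch 'dev' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs.git" f2fs/dev

    # The summary printed after each merge in this log; -M enables rename
    # detection, which is where entries such as
    # "rename {fs/bcachefs => lib}/darray.c (56%)" come from.
    git diff -M --stat --summary HEAD^..
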
+Merging nfs/linux-next (052d534373b7 Merge tag 'exfat-for-6.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/linkinjeon/exfat) +$ git merge -m Merge branch 'linux-next' of git://git.linux-nfs.org/projects/trondmy/nfs-2.6.git nfs/linux-next +Already up to date. +Merging nfs-anna/linux-next (57331a59ac0d NFSv4.1: Use the nfs_client's rpc timeouts for backchannel) +$ git merge -m Merge branch 'linux-next' of git://git.linux-nfs.org/projects/anna/linux-nfs.git nfs-anna/linux-next +Already up to date. +Merging nfsd/nfsd-next (26102396d4e0 NFSD: Document nfsd_setattr() fill-attributes behavior) +$ git merge -m Merge branch 'nfsd-next' of git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux nfsd/nfsd-next +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. + MAINTAINERS | 1 + + fs/lockd/svc.c | 3 - + fs/nfs/callback.c | 3 - + fs/nfsd/blocklayout.c | 4 +- + fs/nfsd/cache.h | 2 - + fs/nfsd/filecache.c | 76 ++- + fs/nfsd/filecache.h | 1 + + fs/nfsd/netns.h | 29 +- + fs/nfsd/nfs3proc.c | 6 +- + fs/nfsd/nfs3xdr.c | 5 +- + fs/nfsd/nfs4callback.c | 193 +++++-- + fs/nfsd/nfs4layouts.c | 63 ++- + fs/nfsd/nfs4proc.c | 13 +- + fs/nfsd/nfs4state.c | 843 +++++++++++++++++++++-------- + fs/nfsd/nfs4xdr.c | 29 +- + fs/nfsd/nfscache.c | 43 +- + fs/nfsd/nfsctl.c | 17 +- + fs/nfsd/nfsd.h | 3 + + fs/nfsd/nfsfh.c | 3 +- + fs/nfsd/nfsproc.c | 6 +- + fs/nfsd/nfssvc.c | 16 +- + fs/nfsd/pnfs.h | 8 +- + fs/nfsd/state.h | 83 ++- + fs/nfsd/stats.c | 52 +- + fs/nfsd/stats.h | 70 +-- + fs/nfsd/trace.h | 194 ++++++- + fs/nfsd/vfs.c | 84 ++- + fs/nfsd/vfs.h | 4 +- + fs/nfsd/xdr3.h | 2 +- + fs/nfsd/xdr4cb.h | 18 + + include/linux/sunrpc/svc.h | 5 +- + include/linux/sunrpc/svc_rdma.h | 55 +- + include/trace/events/rpcrdma.h | 4 + + include/trace/misc/nfs.h | 34 ++ + net/sunrpc/auth_gss/gss_krb5_crypto.c | 14 +- + net/sunrpc/auth_gss/gss_krb5_mech.c | 11 +- + net/sunrpc/auth_gss/gss_rpc_xdr.c | 27 +- + net/sunrpc/stats.c | 2 +- + net/sunrpc/svc.c | 40 +- + net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 2 +- + net/sunrpc/xprtrdma/svc_rdma_rw.c | 245 ++++++--- + net/sunrpc/xprtrdma/svc_rdma_sendto.c | 151 +++--- + net/sunrpc/xprtrdma/svc_rdma_transport.c | 15 +- + net/sunrpc/xprtsock.c | 9 - + 44 files changed, 1745 insertions(+), 743 deletions(-) +Merging ntfs3/master (622cd3daa8ea fs/ntfs3: Slightly simplify ntfs_inode_printk()) +$ git merge -m Merge branch 'master' of https://github.com/Paragon-Software-Group/linux-ntfs3.git ntfs3/master +Already up to date. +Merging orangefs/for-next (9bf93dcfc453 Julia Lawall reported this null pointer dereference, this should fix it.) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/hubcap/linux orangefs/for-next +Merge made by the 'ort' strategy. + fs/orangefs/orangefs-cache.c | 2 +- + fs/orangefs/orangefs-kernel.h | 10 ---------- + fs/orangefs/super.c | 4 ++-- + 3 files changed, 3 insertions(+), 13 deletions(-) +Merging overlayfs/overlayfs-next (d17bb4620f90 overlayfs.rst: fix ReST formatting) +$ git merge -m Merge branch 'overlayfs-next' of git://git.kernel.org/pub/scm/linux/kernel/git/overlayfs/vfs.git overlayfs/overlayfs-next +Already up to date. +Merging ubifs/next (adbf4c4954e3 ubi: block: fix memleak in ubiblock_create()) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git ubifs/next +Already up to date. 
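
[Editor's note] Two techniques show up in the vfs.all merge a few entries below and are easy to miss in the noise: git rerere replays previously recorded conflict resolutions ("Resolved ... using previous resolution") and records new ones ("Recorded preimage" / "Recorded resolution"), and a prepared fixup patch is afterwards folded into the merge commit itself with git am -3 plus an amend. A condensed sketch of both steps, assuming rerere is enabled in the repository; the branch, file, and patch names are copied from the log below:

    git config rerere.enabled true   # prerequisite for the rerere messages

    git merge vfs-brauner/vfs.all    # conflicts: rerere replays known
                                     # resolutions, records new preimages
    "$EDITOR" init/main.c            # hand-resolve what rerere could not
    git add -A .
    git commit --no-edit -v -a       # rerere now records the new resolution

    git am -3 ../patches/0001-fixup-for-filelock-split-common-fields-into-struct-f.patch
    git reset HEAD^                  # keep the fixup's changes, drop its commit
    git add -A .
    git commit -v -a --amend         # fold the fixup into the merge commit

On the next rebuild the recorded resolution for init/main.c is replayed automatically, which is exactly what the "using previous resolution" lines for fs/bcachefs/super-io.c and fs/nfsd/nfs4layouts.c show.
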
+Merging v9fs/9p-next (be3193e58ec2 9p: Fix read/write debug statements to report server reply) +$ git merge -m Merge branch '9p-next' of git://github.com/martinetd/linux v9fs/9p-next +Merge made by the 'ort' strategy. + net/9p/client.c | 10 +++++----- + net/9p/trans_fd.c | 1 - + 2 files changed, 5 insertions(+), 6 deletions(-) +Merging v9fs-ericvh/ericvh/for-next (be57855f5050 fs/9p: fix dups even in uncached mode) +$ git merge -m Merge branch 'ericvh/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ericvh/v9fs.git v9fs-ericvh/ericvh/for-next +Merge made by the 'ort' strategy. + fs/9p/v9fs.h | 31 ++------ + fs/9p/v9fs_vfs.h | 11 ++- + fs/9p/vfs_dir.c | 4 +- + fs/9p/vfs_inode.c | 150 ++++++-------------------------------- + fs/9p/vfs_inode_dotl.c | 194 +++++++++---------------------------------------- + fs/9p/vfs_super.c | 45 +----------- + 6 files changed, 71 insertions(+), 364 deletions(-) +Merging xfs/for-next (49c379d3a72a xfs: use kvfree for buf in xfs_ioc_getbmap) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git xfs/for-next +Merge made by the 'ort' strategy. + fs/xfs/Makefile | 3 +- + fs/xfs/kmem.c | 30 ----------- + fs/xfs/kmem.h | 83 ------------------------------- + fs/xfs/libxfs/xfs_ag.c | 10 ++-- + fs/xfs/libxfs/xfs_attr.c | 3 +- + fs/xfs/libxfs/xfs_attr_leaf.c | 18 +++---- + fs/xfs/libxfs/xfs_bmap.c | 2 +- + fs/xfs/libxfs/xfs_btree.c | 2 +- + fs/xfs/libxfs/xfs_btree.h | 4 +- + fs/xfs/libxfs/xfs_btree_staging.c | 10 ++-- + fs/xfs/libxfs/xfs_da_btree.c | 22 +++++---- + fs/xfs/libxfs/xfs_defer.c | 23 ++++----- + fs/xfs/libxfs/xfs_dir2.c | 48 ++++++++---------- + fs/xfs/libxfs/xfs_dir2_block.c | 6 +-- + fs/xfs/libxfs/xfs_dir2_sf.c | 16 +++--- + fs/xfs/libxfs/xfs_iext_tree.c | 26 ++++++---- + fs/xfs/libxfs/xfs_inode_fork.c | 29 ++++++----- + fs/xfs/libxfs/xfs_refcount.c | 2 +- + fs/xfs/libxfs/xfs_rmap.c | 2 +- + fs/xfs/scrub/cow_repair.c | 2 +- + fs/xfs/scrub/ialloc_repair.c | 2 +- + fs/xfs/xfs_acl.c | 4 +- + fs/xfs/xfs_attr_item.c | 14 +++--- + fs/xfs/xfs_attr_list.c | 6 +-- + fs/xfs/xfs_bmap_item.c | 7 +-- + fs/xfs/xfs_bmap_util.c | 2 +- + fs/xfs/xfs_buf.c | 48 ++++++++++-------- + fs/xfs/xfs_buf_item.c | 8 +-- + fs/xfs/xfs_buf_item_recover.c | 8 +-- + fs/xfs/xfs_discard.c | 17 +++++-- + fs/xfs/xfs_dquot.c | 2 +- + fs/xfs/xfs_error.c | 8 +-- + fs/xfs/xfs_extent_busy.c | 5 +- + fs/xfs/xfs_extfree_item.c | 8 +-- + fs/xfs/xfs_filestream.c | 6 +-- + fs/xfs/xfs_icache.c | 5 +- + fs/xfs/xfs_icreate_item.c | 2 +- + fs/xfs/xfs_inode.c | 4 +- + fs/xfs/xfs_inode_item.c | 2 +- + fs/xfs/xfs_inode_item_recover.c | 5 +- + fs/xfs/xfs_ioctl.c | 8 +-- + fs/xfs/xfs_iops.c | 2 +- + fs/xfs/xfs_itable.c | 12 ++--- + fs/xfs/xfs_iwalk.c | 9 ++-- + fs/xfs/xfs_linux.h | 14 +++++- + fs/xfs/xfs_log.c | 20 ++++---- + fs/xfs/xfs_log_cil.c | 31 ++++++++---- + fs/xfs/xfs_log_recover.c | 101 ++++++++++++++++++++++++-------------- + fs/xfs/xfs_mount.c | 2 +- + fs/xfs/xfs_mru_cache.c | 17 ++++--- + fs/xfs/xfs_qm.c | 18 ++++--- + fs/xfs/xfs_refcount_item.c | 12 ++--- + fs/xfs/xfs_rmap_item.c | 11 +++-- + fs/xfs/xfs_rtalloc.c | 10 ++-- + fs/xfs/xfs_super.c | 4 +- + fs/xfs/xfs_sysfs.c | 4 -- + fs/xfs/xfs_trace.h | 25 ---------- + fs/xfs/xfs_trans_ail.c | 7 +-- + 58 files changed, 373 insertions(+), 438 deletions(-) + delete mode 100644 fs/xfs/kmem.c + delete mode 100644 fs/xfs/kmem.h +Merging zonefs/for-next (567e629fd296 zonefs: convert zonefs to use the new mount api) +$ git merge -m Merge branch 'for-next' of 
git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs.git zonefs/for-next +Auto-merging fs/zonefs/super.c +Merge made by the 'ort' strategy. + fs/zonefs/super.c | 165 +++++++++++++++++++++++++++++++----------------------- + 1 file changed, 94 insertions(+), 71 deletions(-) +Merging iomap/iomap-for-next (3ac974796e5d iomap: fix short copy in iomap_write_iter()) +$ git merge -m Merge branch 'iomap-for-next' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git iomap/iomap-for-next +Already up to date. +Merging djw-vfs/vfs-for-next (ce85a1e04645 xfs: stabilize fs summary counters for online fsck) +$ git merge -m Merge branch 'vfs-for-next' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git djw-vfs/vfs-for-next +Already up to date. +Merging file-locks/locks-next (e0152e7481c6 Merge tag 'riscv-for-linus-6.6-mw1' of git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux) +$ git merge -m Merge branch 'locks-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux.git file-locks/locks-next +Already up to date. +Merging iversion/iversion-next (e0152e7481c6 Merge tag 'riscv-for-linus-6.6-mw1' of git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux) +$ git merge -m Merge branch 'iversion-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux.git iversion/iversion-next +Already up to date. +Merging vfs-brauner/vfs.all (b630a177e61d Merge branch 'vfs.uuid' into vfs.all) +$ git merge -m Merge branch 'vfs.all' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git vfs-brauner/vfs.all +Auto-merging CREDITS +Auto-merging MAINTAINERS +Auto-merging drivers/block/zram/zram_drv.c +Auto-merging drivers/md/bcache/bcache.h +Auto-merging drivers/md/bcache/super.c +Auto-merging drivers/md/dm.c +Auto-merging drivers/target/target_core_pscsi.c +Auto-merging fs/Kconfig +Auto-merging fs/bcachefs/super-io.c +CONFLICT (content): Merge conflict in fs/bcachefs/super-io.c +Auto-merging fs/btrfs/dev-replace.c +Auto-merging fs/btrfs/ioctl.c +Auto-merging fs/btrfs/volumes.c +Auto-merging fs/btrfs/volumes.h +Auto-merging fs/ext4/ext4.h +Auto-merging fs/ext4/super.c +Auto-merging fs/f2fs/f2fs.h +Auto-merging fs/f2fs/super.c +Auto-merging fs/nfsd/filecache.c +Auto-merging fs/nfsd/nfs4callback.c +Auto-merging fs/nfsd/nfs4layouts.c +CONFLICT (content): Merge conflict in fs/nfsd/nfs4layouts.c +Auto-merging fs/nfsd/nfs4state.c +Auto-merging fs/ntfs3/namei.c +Auto-merging fs/ocfs2/super.c +Auto-merging fs/reiserfs/reiserfs.h +Auto-merging fs/smb/client/file.c +Auto-merging fs/smb/server/smb2pdu.c +Auto-merging fs/xfs/xfs_buf.c +Auto-merging fs/xfs/xfs_mount.c +Auto-merging fs/xfs/xfs_super.c +Auto-merging fs/zonefs/file.c +Auto-merging include/linux/dcache.h +Auto-merging include/linux/fs.h +Auto-merging include/linux/swap.h +Auto-merging init/initramfs.c +Auto-merging init/main.c +CONFLICT (content): Merge conflict in init/main.c +Auto-merging kernel/exit.c +Auto-merging mm/backing-dev.c +Auto-merging mm/filemap.c +Auto-merging mm/shmem.c +Auto-merging mm/swapfile.c +Resolved 'fs/bcachefs/super-io.c' using previous resolution. +Resolved 'fs/nfsd/nfs4layouts.c' using previous resolution. +Recorded preimage for 'init/main.c' +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +Recorded resolution for 'init/main.c'. +[master 1d78d5302715] Merge branch 'vfs.all' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git +$ git diff -M --stat --summary HEAD^.. 
+ CREDITS | 5 + + Documentation/filesystems/files.rst | 2 +- + Documentation/filesystems/index.rst | 1 - + Documentation/filesystems/locking.rst | 2 +- + Documentation/filesystems/ntfs.rst | 466 --- + Documentation/filesystems/vfs.rst | 16 +- + Documentation/userspace-api/ioctl/ioctl-number.rst | 3 +- + MAINTAINERS | 10 - + block/bdev.c | 252 +- + block/blk.h | 4 + + block/fops.c | 48 +- + block/genhd.c | 12 +- + block/ioctl.c | 9 +- + drivers/block/drbd/drbd_int.h | 4 +- + drivers/block/drbd/drbd_nl.c | 58 +- + drivers/block/pktcdvd.c | 68 +- + drivers/block/rnbd/rnbd-srv.c | 28 +- + drivers/block/rnbd/rnbd-srv.h | 2 +- + drivers/block/xen-blkback/blkback.c | 4 +- + drivers/block/xen-blkback/common.h | 4 +- + drivers/block/xen-blkback/xenbus.c | 37 +- + drivers/block/zram/zram_drv.c | 26 +- + drivers/block/zram/zram_drv.h | 2 +- + drivers/md/bcache/bcache.h | 4 +- + drivers/md/bcache/super.c | 74 +- + drivers/md/dm.c | 23 +- + drivers/md/md.c | 12 +- + drivers/md/md.h | 2 +- + drivers/mtd/devices/block2mtd.c | 46 +- + drivers/nvme/target/io-cmd-bdev.c | 16 +- + drivers/nvme/target/nvmet.h | 2 +- + drivers/s390/block/dasd.c | 10 +- + drivers/s390/block/dasd_genhd.c | 36 +- + drivers/s390/block/dasd_int.h | 2 +- + drivers/s390/block/dasd_ioctl.c | 2 +- + drivers/target/target_core_iblock.c | 18 +- + drivers/target/target_core_iblock.h | 2 +- + drivers/target/target_core_pscsi.c | 22 +- + drivers/target/target_core_pscsi.h | 2 +- + fs/9p/vfs_file.c | 40 +- + fs/Kconfig | 8 +- + fs/Makefile | 3 +- + fs/afs/flock.c | 60 +- + fs/afs/internal.h | 6 +- + fs/afs/main.c | 3 +- + fs/afs/server.c | 14 +- + fs/afs/volume.c | 4 +- + fs/attr.c | 2 +- + fs/backing-file.c | 4 +- + fs/bcachefs/super-io.c | 20 +- + fs/bcachefs/super_types.h | 2 +- + fs/btrfs/dev-replace.c | 14 +- + fs/btrfs/ioctl.c | 16 +- + fs/btrfs/volumes.c | 92 +- + fs/btrfs/volumes.h | 4 +- + fs/buffer.c | 10 +- + fs/cachefiles/cache.c | 2 + + fs/cachefiles/daemon.c | 1 + + fs/ceph/locks.c | 74 +- + fs/cramfs/inode.c | 2 +- + fs/dlm/plock.c | 44 +- + fs/efs/super.c | 114 +- + fs/erofs/data.c | 6 +- + fs/erofs/internal.h | 2 +- + fs/erofs/super.c | 16 +- + fs/eventfd.c | 16 +- + fs/eventpoll.c | 8 +- + fs/exec.c | 1 - + fs/ext4/ext4.h | 2 +- + fs/ext4/fsmap.c | 8 +- + fs/ext4/super.c | 54 +- + fs/f2fs/f2fs.h | 2 +- + fs/f2fs/super.c | 14 +- + fs/fat/inode.c | 3 + + fs/fcntl.c | 8 +- + fs/fhandle.c | 2 +- + fs/file_table.c | 83 +- + fs/fs-writeback.c | 25 + + fs/fs_parser.c | 4 +- + fs/fuse/file.c | 14 +- + fs/gfs2/bmap.c | 2 +- + fs/gfs2/file.c | 16 +- + fs/gfs2/ops_fstype.c | 2 +- + fs/hfsplus/wrapper.c | 2 +- + fs/inode.c | 3 +- + fs/internal.h | 4 + + fs/ioctl.c | 33 + + fs/iomap/buffered-io.c | 602 ++-- + fs/jfs/jfs_logmgr.c | 26 +- + fs/jfs/jfs_logmgr.h | 2 +- + fs/jfs/jfs_mount.c | 2 +- + fs/kernfs/mount.c | 4 +- + fs/libfs.c | 125 +- + fs/lockd/clnt4xdr.c | 14 +- + fs/lockd/clntlock.c | 2 +- + fs/lockd/clntproc.c | 65 +- + fs/lockd/clntxdr.c | 14 +- + fs/lockd/svc4proc.c | 10 +- + fs/lockd/svclock.c | 64 +- + fs/lockd/svcproc.c | 10 +- + fs/lockd/svcsubs.c | 24 +- + fs/lockd/xdr.c | 14 +- + fs/lockd/xdr4.c | 14 +- + fs/locks.c | 896 +++--- + fs/mbcache.c | 5 +- + fs/mnt_idmapping.c | 2 +- + fs/netfs/buffered_write.c | 3 + + fs/netfs/direct_write.c | 5 +- + fs/netfs/io.c | 2 + + fs/nfs/blocklayout/blocklayout.h | 2 +- + fs/nfs/blocklayout/dev.c | 66 +- + fs/nfs/delegation.c | 4 +- + fs/nfs/file.c | 22 +- + fs/nfs/nfs3proc.c | 2 +- + fs/nfs/nfs4_fs.h | 2 +- + fs/nfs/nfs4file.c | 2 +- + fs/nfs/nfs4proc.c | 39 +- + fs/nfs/nfs4state.c 
| 22 +- + fs/nfs/nfs4trace.h | 4 +- + fs/nfs/nfs4xdr.c | 8 +- + fs/nfs/write.c | 8 +- + fs/nfsd/filecache.c | 4 +- + fs/nfsd/nfs4callback.c | 2 +- + fs/nfsd/nfs4layouts.c | 35 +- + fs/nfsd/nfs4state.c | 124 +- + fs/nsfs.c | 102 +- + fs/ntfs/Kconfig | 81 - + fs/ntfs/Makefile | 15 - + fs/ntfs/aops.c | 1744 ----------- + fs/ntfs/aops.h | 88 - + fs/ntfs/attrib.c | 2624 ---------------- + fs/ntfs/attrib.h | 102 - + fs/ntfs/bitmap.c | 179 -- + fs/ntfs/bitmap.h | 104 - + fs/ntfs/collate.c | 110 - + fs/ntfs/collate.h | 36 - + fs/ntfs/compress.c | 950 ------ + fs/ntfs/debug.c | 159 - + fs/ntfs/debug.h | 57 - + fs/ntfs/dir.c | 1540 ---------- + fs/ntfs/dir.h | 34 - + fs/ntfs/endian.h | 79 - + fs/ntfs/file.c | 1997 ------------ + fs/ntfs/index.c | 440 --- + fs/ntfs/index.h | 134 - + fs/ntfs/inode.c | 3102 ------------------- + fs/ntfs/inode.h | 310 -- + fs/ntfs/layout.h | 2421 --------------- + fs/ntfs/lcnalloc.c | 1000 ------ + fs/ntfs/lcnalloc.h | 131 - + fs/ntfs/logfile.c | 849 ------ + fs/ntfs/logfile.h | 295 -- + fs/ntfs/malloc.h | 77 - + fs/ntfs/mft.c | 2907 ------------------ + fs/ntfs/mft.h | 110 - + fs/ntfs/mst.c | 189 -- + fs/ntfs/namei.c | 392 --- + fs/ntfs/ntfs.h | 150 - + fs/ntfs/quota.c | 103 - + fs/ntfs/quota.h | 21 - + fs/ntfs/runlist.c | 1893 ------------ + fs/ntfs/runlist.h | 88 - + fs/ntfs/super.c | 3202 -------------------- + fs/ntfs/sysctl.c | 58 - + fs/ntfs/sysctl.h | 27 - + fs/ntfs/time.h | 89 - + fs/ntfs/types.h | 55 - + fs/ntfs/unistr.c | 384 --- + fs/ntfs/upcase.c | 73 - + fs/ntfs/usnjrnl.c | 70 - + fs/ntfs/usnjrnl.h | 191 -- + fs/ntfs/volume.h | 164 - + fs/ntfs3/namei.c | 2 +- + fs/ocfs2/cluster/heartbeat.c | 32 +- + fs/ocfs2/locks.c | 12 +- + fs/ocfs2/stack_user.c | 2 +- + fs/ocfs2/super.c | 4 +- + fs/open.c | 2 +- + fs/overlayfs/super.c | 52 +- + fs/overlayfs/util.c | 18 +- + fs/pidfs.c | 248 ++ + fs/pipe.c | 81 +- + fs/posix_acl.c | 4 +- + fs/reiserfs/journal.c | 38 +- + fs/reiserfs/procfs.c | 2 +- + fs/reiserfs/reiserfs.h | 8 +- + fs/romfs/super.c | 2 +- + fs/select.c | 15 +- + fs/smb/client/cifsfs.c | 5 +- + fs/smb/client/cifssmb.c | 8 +- + fs/smb/client/file.c | 78 +- + fs/smb/client/smb2file.c | 2 +- + fs/smb/server/smb2pdu.c | 44 +- + fs/smb/server/vfs.c | 14 +- + fs/super.c | 18 +- + fs/sysv/itree.c | 10 +- + fs/ubifs/super.c | 2 +- + fs/xfs/xfs_aops.c | 9 +- + fs/xfs/xfs_buf.c | 10 +- + fs/xfs/xfs_buf.h | 4 +- + fs/xfs/xfs_mount.c | 4 +- + fs/xfs/xfs_super.c | 44 +- + fs/zonefs/file.c | 3 +- + include/asm-generic/barrier.h | 2 - + include/linux/backing-dev.h | 1 - + include/linux/blkdev.h | 13 +- + include/linux/dcache.h | 18 +- + include/linux/device-mapper.h | 2 +- + include/linux/file.h | 2 + + include/linux/filelock.h | 129 +- + include/linux/fs.h | 94 +- + include/linux/iomap.h | 19 +- + include/linux/lockd/lockd.h | 8 +- + include/linux/lockd/xdr.h | 2 +- + include/linux/ns_common.h | 2 +- + include/linux/pid.h | 10 +- + include/linux/pidfs.h | 10 + + include/linux/pktcdvd.h | 4 +- + include/linux/poll.h | 4 - + include/linux/proc_ns.h | 2 +- + include/linux/sched/signal.h | 2 - + include/linux/swap.h | 2 +- + include/trace/events/afs.h | 4 +- + include/trace/events/filelock.h | 102 +- + include/uapi/linux/fs.h | 30 +- + include/uapi/linux/magic.h | 1 + + include/uapi/linux/pidfd.h | 8 +- + init/do_mounts.c | 3 + + init/do_mounts.h | 9 + + init/initramfs.c | 8 +- + init/main.c | 2 + + kernel/exit.c | 31 +- + kernel/fork.c | 147 +- + kernel/nsproxy.c | 2 +- + kernel/pid.c | 53 +- + kernel/power/swap.c | 28 +- + kernel/signal.c | 110 +- + lib/iov_iter.c | 60 
+- + mm/backing-dev.c | 25 - + mm/filemap.c | 9 - + mm/shmem.c | 4 +- + mm/swapfile.c | 22 +- + .../selftests/filesystems/overlayfs/dev_in_maps.c | 10 +- + .../move_mount_set_group_test.c | 4 +- + tools/testing/selftests/pidfd/pidfd_getfd_test.c | 31 +- + 245 files changed, 3321 insertions(+), 31867 deletions(-) + delete mode 100644 Documentation/filesystems/ntfs.rst + delete mode 100644 fs/ntfs/Kconfig + delete mode 100644 fs/ntfs/Makefile + delete mode 100644 fs/ntfs/aops.c + delete mode 100644 fs/ntfs/aops.h + delete mode 100644 fs/ntfs/attrib.c + delete mode 100644 fs/ntfs/attrib.h + delete mode 100644 fs/ntfs/bitmap.c + delete mode 100644 fs/ntfs/bitmap.h + delete mode 100644 fs/ntfs/collate.c + delete mode 100644 fs/ntfs/collate.h + delete mode 100644 fs/ntfs/compress.c + delete mode 100644 fs/ntfs/debug.c + delete mode 100644 fs/ntfs/debug.h + delete mode 100644 fs/ntfs/dir.c + delete mode 100644 fs/ntfs/dir.h + delete mode 100644 fs/ntfs/endian.h + delete mode 100644 fs/ntfs/file.c + delete mode 100644 fs/ntfs/index.c + delete mode 100644 fs/ntfs/index.h + delete mode 100644 fs/ntfs/inode.c + delete mode 100644 fs/ntfs/inode.h + delete mode 100644 fs/ntfs/layout.h + delete mode 100644 fs/ntfs/lcnalloc.c + delete mode 100644 fs/ntfs/lcnalloc.h + delete mode 100644 fs/ntfs/logfile.c + delete mode 100644 fs/ntfs/logfile.h + delete mode 100644 fs/ntfs/malloc.h + delete mode 100644 fs/ntfs/mft.c + delete mode 100644 fs/ntfs/mft.h + delete mode 100644 fs/ntfs/mst.c + delete mode 100644 fs/ntfs/namei.c + delete mode 100644 fs/ntfs/ntfs.h + delete mode 100644 fs/ntfs/quota.c + delete mode 100644 fs/ntfs/quota.h + delete mode 100644 fs/ntfs/runlist.c + delete mode 100644 fs/ntfs/runlist.h + delete mode 100644 fs/ntfs/super.c + delete mode 100644 fs/ntfs/sysctl.c + delete mode 100644 fs/ntfs/sysctl.h + delete mode 100644 fs/ntfs/time.h + delete mode 100644 fs/ntfs/types.h + delete mode 100644 fs/ntfs/unistr.c + delete mode 100644 fs/ntfs/upcase.c + delete mode 100644 fs/ntfs/usnjrnl.c + delete mode 100644 fs/ntfs/usnjrnl.h + delete mode 100644 fs/ntfs/volume.h + create mode 100644 fs/pidfs.c + create mode 100644 include/linux/pidfs.h +$ git am -3 ../patches/0001-fixup-for-filelock-split-common-fields-into-struct-f.patch +Applying: fixup for "filelock: split common fields into struct file_lock_core" +$ git reset HEAD^ +Unstaged changes after reset: +M fs/nfsd/nfs4state.c +$ git add -A . +$ git commit -v -a --amend +[master e0a41e2699f5] Merge branch 'vfs.all' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git + Date: Wed Feb 21 10:30:31 2024 +1100 +Merging vfs/for-next (052d534373b7 Merge tag 'exfat-for-6.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/linkinjeon/exfat) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git vfs/for-next +Already up to date. +Merging printk/for-next (e7081d5a9d97 Merge branch 'rework/console-flushing-fixes' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/printk/linux.git printk/for-next +Auto-merging kernel/panic.c +Auto-merging kernel/printk/printk.c +Merge made by the 'ort' strategy. 
+ include/linux/printk.h | 2 + + kernel/panic.c | 8 + + kernel/printk/nbcon.c | 41 +---- + kernel/printk/printk.c | 101 +++++++----- + kernel/printk/printk_ringbuffer.c | 337 ++++++++++++++++++++++++++++++++------ + kernel/printk/printk_ringbuffer.h | 54 +++++- + lib/dump_stack.c | 16 +- + 7 files changed, 420 insertions(+), 139 deletions(-) +Merging pci/next (5b52c9afa3dd Merge branch 'pci/misc') +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git pci/next +Auto-merging MAINTAINERS +Auto-merging drivers/pci/pci.c +Auto-merging drivers/pci/pci.h +Auto-merging drivers/pci/pcie/aspm.c +Auto-merging include/linux/pci.h +Auto-merging lib/Kconfig +Auto-merging lib/Makefile +Merge made by the 'ort' strategy. + .../ABI/testing/sysfs-bus-pci-devices-aer_stats | 18 +- + Documentation/driver-api/device-io.rst | 3 - + Documentation/driver-api/pci/pci.rst | 6 + + MAINTAINERS | 1 - + drivers/ntb/core.c | 8 +- + drivers/pci/Kconfig | 5 + + drivers/pci/Makefile | 7 +- + drivers/pci/controller/dwc/pci-imx6.c | 3 +- + drivers/pci/controller/dwc/pci-keystone.c | 12 +- + drivers/pci/controller/dwc/pci-layerscape-ep.c | 5 +- + drivers/pci/controller/dwc/pcie-keembay.c | 8 +- + drivers/pci/controller/dwc/pcie-rcar-gen4.c | 4 +- + drivers/pci/controller/dwc/pcie-tegra194.c | 10 +- + drivers/pci/controller/dwc/pcie-uniphier-ep.c | 15 +- + drivers/pci/controller/pcie-rcar-ep.c | 14 +- + drivers/pci/devres.c | 448 +++++++++++++++++++ + drivers/pci/endpoint/functions/pci-epf-mhi.c | 21 +- + drivers/pci/endpoint/functions/pci-epf-ntb.c | 6 +- + drivers/pci/endpoint/functions/pci-epf-test.c | 21 +- + drivers/pci/endpoint/functions/pci-epf-vntb.c | 25 +- + drivers/pci/endpoint/pci-epc-core.c | 25 +- + drivers/pci/endpoint/pci-epf-core.c | 20 +- + lib/pci_iomap.c => drivers/pci/iomap.c | 5 +- + drivers/pci/irq.c | 204 +++++++++ + drivers/pci/mmap.c | 29 ++ + drivers/pci/p2pdma.c | 2 +- + drivers/pci/pci-driver.c | 11 +- + drivers/pci/pci-sysfs.c | 29 +- + drivers/pci/pci.c | 484 +-------------------- + drivers/pci/pci.h | 50 ++- + drivers/pci/pcie/Makefile | 2 +- + drivers/pci/pcie/aspm.c | 235 ++++++++++ + drivers/pci/pcie/dpc.c | 2 +- + drivers/pci/pcie/portdrv.h | 2 +- + drivers/pci/probe.c | 66 +-- + drivers/pci/setup-irq.c | 64 --- + drivers/pci/switch/switchtec.c | 4 +- + include/linux/pci-epc.h | 39 +- + include/linux/pci-epf.h | 4 +- + include/linux/pci.h | 2 +- + lib/Kconfig | 3 - + lib/Makefile | 1 - + lib/devres.c | 208 +-------- + 43 files changed, 1175 insertions(+), 956 deletions(-) + create mode 100644 drivers/pci/devres.c + rename lib/pci_iomap.c => drivers/pci/iomap.c (99%) + delete mode 100644 drivers/pci/setup-irq.c +Merging pstore/for-next/pstore (69f381e67d6f pstore/zone: Add a null pointer check to the psz_kmsg_read) +$ git merge -m Merge branch 'for-next/pstore' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git pstore/for-next/pstore +Auto-merging arch/arm64/configs/defconfig +Merge made by the 'ort' strategy. + arch/arm64/configs/defconfig | 1 + + drivers/firmware/efi/efi-pstore.c | 43 +++++++++++++++++++++++++++++++-------- + fs/pstore/ram.c | 1 + + fs/pstore/zone.c | 2 ++ + 4 files changed, 39 insertions(+), 8 deletions(-) +Merging hid/for-next (8f0a3ff87887 Merge branch 'for-6.9/nintendo' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git hid/for-next +Merge made by the 'ort' strategy. 
+ drivers/hid/hid-apple.c | 1 - + drivers/hid/hid-ids.h | 7 + + drivers/hid/hid-lenovo.c | 57 ++++-- + drivers/hid/hid-lg3ff.c | 4 - + drivers/hid/hid-multitouch.c | 1 - + drivers/hid/hid-nintendo.c | 22 --- + drivers/hid/hid-prodikeys.c | 117 ++++-------- + drivers/hid/hid-samsung.c | 437 +++++++++++++++++++++++++++++++++++++++---- + drivers/hid/wacom_wac.c | 8 +- + drivers/hid/wacom_wac.h | 1 - + include/linux/hid.h | 4 +- + 11 files changed, 494 insertions(+), 165 deletions(-) +Merging i2c/i2c/for-next (67ec505fae32 Merge tag 'i2c-host-fixes-6.8-rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/andi.shyti/linux into i2c/for-current) +$ git merge -m Merge branch 'i2c/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/wsa/linux.git i2c/i2c/for-next +Already up to date. +Merging i2c-host/i2c/i2c-host (48acf8292280 i2c: Remove redundant comparison in npcm_i2c_reg_slave) +$ git merge -m Merge branch 'i2c/i2c-host' of git://git.kernel.org/pub/scm/linux/kernel/git/andi.shyti/linux.git i2c-host/i2c/i2c-host +Auto-merging drivers/i2c/busses/i2c-i801.c +Merge made by the 'ort' strategy. + .../devicetree/bindings/i2c/i2c-mux-pca954x.yaml | 30 +++ + .../devicetree/bindings/i2c/renesas,rcar-i2c.yaml | 1 + + drivers/i2c/busses/i2c-hisi.c | 13 +- + drivers/i2c/busses/i2c-i801.c | 228 +++++++++++---------- + drivers/i2c/busses/i2c-imx.c | 62 +----- + drivers/i2c/busses/i2c-npcm7xx.c | 3 - + drivers/i2c/busses/i2c-sh_mobile.c | 27 +-- + drivers/i2c/muxes/i2c-mux-pca954x.c | 43 +++- + 8 files changed, 220 insertions(+), 187 deletions(-) +Merging i3c/i3c/next (8f06fb458539 i3c: Make i3c_bus_type const) +$ git merge -m Merge branch 'i3c/next' of git://git.kernel.org/pub/scm/linux/kernel/git/i3c/linux.git i3c/i3c/next +Merge made by the 'ort' strategy. + Documentation/devicetree/bindings/i3c/aspeed,ast2600-i3c.yaml | 2 +- + Documentation/devicetree/bindings/i3c/cdns,i3c-master.yaml | 2 +- + Documentation/devicetree/bindings/i3c/i3c.yaml | 4 ++-- + Documentation/devicetree/bindings/i3c/mipi-i3c-hci.yaml | 2 +- + Documentation/devicetree/bindings/i3c/silvaco,i3c-master.yaml | 2 +- + Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml | 2 +- + drivers/i3c/internals.h | 2 +- + drivers/i3c/master.c | 2 +- + drivers/i3c/master/dw-i3c-master.c | 4 +++- + 9 files changed, 12 insertions(+), 10 deletions(-) +Merging hwmon-staging/hwmon-next (e6b33455c319 MAINTAINERS: Drop redundant hwmon entries) +$ git merge -m Merge branch 'hwmon-next' of git://git.kernel.org/pub/scm/linux/kernel/git/groeck/linux-staging.git hwmon-staging/hwmon-next +Auto-merging Documentation/devicetree/bindings/vendor-prefixes.yaml +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. 
+ Documentation/ABI/testing/sysfs-class-hwmon | 27 + + .../devicetree/bindings/hwmon/adi,ltc4282.yaml | 159 ++ + .../bindings/hwmon/amphenol,chipcap2.yaml | 77 + + .../devicetree/bindings/hwmon/ti,ina2xx.yaml | 9 + + .../devicetree/bindings/trivial-devices.yaml | 4 + + .../devicetree/bindings/vendor-prefixes.yaml | 4 + + Documentation/hwmon/asus_rog_ryujin.rst | 47 + + Documentation/hwmon/chipcap2.rst | 73 + + Documentation/hwmon/emc2305.rst | 1 - + Documentation/hwmon/index.rst | 7 + + Documentation/hwmon/ltc4282.rst | 133 ++ + Documentation/hwmon/max6620.rst | 2 +- + Documentation/hwmon/mpq8785.rst | 94 ++ + Documentation/hwmon/nct6683.rst | 1 + + Documentation/hwmon/nzxt-kraken3.rst | 74 + + Documentation/hwmon/oxp-sensors.rst | 1 + + Documentation/hwmon/pt5161l.rst | 42 + + Documentation/hwmon/sht3x.rst | 11 + + Documentation/hwmon/surface_fan.rst | 25 + + MAINTAINERS | 167 +- + drivers/hwmon/Kconfig | 65 + + drivers/hwmon/Makefile | 6 + + drivers/hwmon/adm1177.c | 1 - + drivers/hwmon/adt7310.c | 2 +- + drivers/hwmon/adt7410.c | 4 +- + drivers/hwmon/asus_rog_ryujin.c | 609 +++++++ + drivers/hwmon/axi-fan-control.c | 73 +- + drivers/hwmon/chipcap2.c | 822 +++++++++ + drivers/hwmon/coretemp.c | 204 +-- + drivers/hwmon/ds1621.c | 1 - + drivers/hwmon/ds620.c | 1 - + drivers/hwmon/emc1403.c | 2 +- + drivers/hwmon/emc2305.c | 5 - + drivers/hwmon/hwmon.c | 3 + + drivers/hwmon/ina209.c | 1 - + drivers/hwmon/ina238.c | 1 - + drivers/hwmon/ina3221.c | 2 +- + drivers/hwmon/jc42.c | 2 +- + drivers/hwmon/lm83.c | 2 +- + drivers/hwmon/ltc4282.c | 1782 ++++++++++++++++++++ + drivers/hwmon/max127.c | 1 - + drivers/hwmon/max31760.c | 3 +- + drivers/hwmon/max31790.c | 1 - + drivers/hwmon/max31827.c | 1 - + drivers/hwmon/max6621.c | 1 - + drivers/hwmon/max6697.c | 1 - + drivers/hwmon/nct6683.c | 3 + + drivers/hwmon/nct7802.c | 2 +- + drivers/hwmon/nzxt-kraken3.c | 1008 +++++++++++ + drivers/hwmon/occ/p8_i2c.c | 1 - + drivers/hwmon/oxp-sensors.c | 10 + + drivers/hwmon/pmbus/Kconfig | 9 + + drivers/hwmon/pmbus/Makefile | 1 + + drivers/hwmon/pmbus/ir36021.c | 1 - + drivers/hwmon/pmbus/ir38064.c | 2 +- + drivers/hwmon/pmbus/lm25066.c | 2 +- + drivers/hwmon/pmbus/mpq8785.c | 90 + + drivers/hwmon/pmbus/pmbus_core.c | 2 +- + drivers/hwmon/pmbus/tda38640.c | 2 +- + drivers/hwmon/powr1220.c | 1 - + drivers/hwmon/pt5161l.c | 667 ++++++++ + drivers/hwmon/sbrmi.c | 1 - + drivers/hwmon/sbtsi_temp.c | 1 - + drivers/hwmon/sch5627.c | 2 +- + drivers/hwmon/sht3x.c | 66 +- + drivers/hwmon/surface_fan.c | 91 + + drivers/hwmon/tmp401.c | 2 +- + drivers/hwmon/w83773g.c | 1 - + include/linux/hwmon.h | 18 +- + 69 files changed, 6234 insertions(+), 303 deletions(-) + create mode 100644 Documentation/devicetree/bindings/hwmon/adi,ltc4282.yaml + create mode 100644 Documentation/devicetree/bindings/hwmon/amphenol,chipcap2.yaml + create mode 100644 Documentation/hwmon/asus_rog_ryujin.rst + create mode 100644 Documentation/hwmon/chipcap2.rst + create mode 100644 Documentation/hwmon/ltc4282.rst + create mode 100644 Documentation/hwmon/mpq8785.rst + create mode 100644 Documentation/hwmon/nzxt-kraken3.rst + create mode 100644 Documentation/hwmon/pt5161l.rst + create mode 100644 Documentation/hwmon/surface_fan.rst + create mode 100644 drivers/hwmon/asus_rog_ryujin.c + create mode 100644 drivers/hwmon/chipcap2.c + create mode 100644 drivers/hwmon/ltc4282.c + create mode 100644 drivers/hwmon/nzxt-kraken3.c + create mode 100644 drivers/hwmon/pmbus/mpq8785.c + create mode 100644 drivers/hwmon/pt5161l.c + create mode 100644 
drivers/hwmon/surface_fan.c +Merging jc_docs/docs-next (920290fe2a18 Merge branch 'docs-fixes' into docs-mw) +$ git merge -m Merge branch 'docs-next' of git://git.lwn.net/linux.git jc_docs/docs-next +Auto-merging Documentation/admin-guide/kernel-parameters.txt +Auto-merging Documentation/dev-tools/kasan.rst +Auto-merging Documentation/process/changes.rst +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. + Documentation/ABI/testing/sysfs-bus-vdpa | 10 +- + Documentation/Makefile | 1 + + Documentation/RCU/torture.rst | 2 +- + Documentation/admin-guide/README.rst | 63 +- + Documentation/admin-guide/kernel-parameters.txt | 5 + + Documentation/admin-guide/tainted-kernels.rst | 4 +- + Documentation/conf.py | 6 + + Documentation/dev-tools/checkpatch.rst | 4 +- + Documentation/dev-tools/kasan.rst | 21 + + Documentation/doc-guide/kernel-doc.rst | 45 + + Documentation/doc-guide/maintainer-profile.rst | 7 + + Documentation/doc-guide/sphinx.rst | 7 +- + Documentation/driver-api/index.rst | 209 +- + Documentation/fault-injection/index.rst | 2 +- + Documentation/filesystems/proc.rst | 4 +- + .../maintainer/maintainer-entry-profile.rst | 3 + + Documentation/networking/bridge.rst | 2 +- + Documentation/process/changes.rst | 4 +- + Documentation/process/coding-style.rst | 13 +- + .../process/embargoed-hardware-issues.rst | 2 +- + Documentation/process/howto.rst | 4 +- + Documentation/process/researcher-guidelines.rst | 2 +- + Documentation/sphinx/kerneldoc.py | 6 +- + Documentation/sphinx/translations.py | 10 +- + Documentation/staging/rpmsg.rst | 2 +- + Documentation/subsystem-apis.rst | 2 + + Documentation/translations/it_IT/RCU/index.rst | 19 + + Documentation/translations/it_IT/RCU/torture.rst | 369 +++ + .../translations/it_IT/core-api/index.rst | 12 + + .../translations/it_IT/i2c/i2c-protocol.rst | 99 + + Documentation/translations/it_IT/i2c/index.rst | 46 + + Documentation/translations/it_IT/i2c/summary.rst | 64 + + Documentation/translations/it_IT/index.rst | 2 + + Documentation/translations/it_IT/locking/index.rst | 20 + + .../translations/it_IT/locking/lockdep-design.rst | 678 ++++++ + .../translations/it_IT/locking/lockstat.rst | 230 ++ + .../translations/it_IT/locking/locktorture.rst | 181 ++ + .../translations/it_IT/locking/locktypes.rst | 547 +++++ + .../translations/it_IT/networking/netdev-FAQ.rst | 13 - + .../translations/it_IT/process/coding-style.rst | 6 +- + .../translations/it_IT/subsystem-apis.rst | 47 + + .../translations/sp_SP/process/coding-style.rst | 6 +- + .../sp_SP/process/embargoed-hardware-issues.rst | 2 +- + .../sp_SP/process/researcher-guidelines.rst | 2 +- + .../translations/zh_CN/process/coding-style.rst | 4 +- + .../zh_CN/process/embargoed-hardware-issues.rst | 2 +- + .../translations/zh_TW/process/coding-style.rst | 4 +- + .../zh_TW/process/embargoed-hardware-issues.rst | 2 +- + Documentation/userspace-api/index.rst | 47 +- + Documentation/userspace-api/perf_ring_buffer.rst | 830 +++++++ + MAINTAINERS | 8 +- + README | 2 +- + drivers/gpu/drm/drm_gem_vram_helper.c | 44 +- + include/drm/drm_gem_vram_helper.h | 16 +- + scripts/kernel-doc | 2427 ++++++++++---------- + scripts/sphinx-pre-install | 13 +- + 56 files changed, 4743 insertions(+), 1439 deletions(-) + create mode 100644 Documentation/translations/it_IT/RCU/index.rst + create mode 100644 Documentation/translations/it_IT/RCU/torture.rst + create mode 100644 Documentation/translations/it_IT/i2c/i2c-protocol.rst + create mode 100644 Documentation/translations/it_IT/i2c/index.rst + create mode 100644 
Documentation/translations/it_IT/i2c/summary.rst + create mode 100644 Documentation/translations/it_IT/locking/index.rst + create mode 100644 Documentation/translations/it_IT/locking/lockdep-design.rst + create mode 100644 Documentation/translations/it_IT/locking/lockstat.rst + create mode 100644 Documentation/translations/it_IT/locking/locktorture.rst + create mode 100644 Documentation/translations/it_IT/locking/locktypes.rst + delete mode 100644 Documentation/translations/it_IT/networking/netdev-FAQ.rst + create mode 100644 Documentation/translations/it_IT/subsystem-apis.rst + create mode 100644 Documentation/userspace-api/perf_ring_buffer.rst +Merging v4l-dvb/master (8c64f4cdf4e6 media: edia: dvbdev: fix a use-after-free) +$ git merge -m Merge branch 'master' of git://linuxtv.org/media_tree.git v4l-dvb/master +Auto-merging MAINTAINERS +Auto-merging drivers/staging/media/atomisp/pci/atomisp_cmd.c +CONFLICT (content): Merge conflict in drivers/staging/media/atomisp/pci/atomisp_cmd.c +Auto-merging drivers/staging/media/atomisp/pci/atomisp_internal.h +Auto-merging drivers/staging/media/atomisp/pci/atomisp_ioctl.c +Auto-merging drivers/staging/media/atomisp/pci/atomisp_v4l2.c +Resolved 'drivers/staging/media/atomisp/pci/atomisp_cmd.c' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +[master adb9286e9556] Merge branch 'master' of git://linuxtv.org/media_tree.git +$ git diff -M --stat --summary HEAD^.. + Documentation/admin-guide/media/vivid.rst | 2 +- + .../bindings/media/i2c/techwell,tw9900.yaml | 2 +- + .../bindings/media/st,stm32mp25-video-codec.yaml | 49 + + Documentation/driver-api/media/drivers/ccs/ccs.rst | 51 +- + Documentation/userspace-api/media/drivers/ccs.rst | 6 +- + .../userspace-api/media/dvb/legacy_dvb_apis.rst | 1 + + .../userspace-api/media/dvb/legacy_dvb_audio.rst | 1642 +++++++++++++ + .../media/dvb/legacy_dvb_decoder_api.rst | 61 + + .../userspace-api/media/dvb/legacy_dvb_osd.rst | 883 +++++++ + .../userspace-api/media/dvb/legacy_dvb_video.rst | 2430 ++++++++++++++++++++ + .../userspace-api/media/mediactl/media-types.rst | 11 +- + .../media/v4l/vidioc-subdev-g-client-cap.rst | 15 + + MAINTAINERS | 1 + + arch/arm64/boot/dts/st/stm32mp251.dtsi | 12 + + arch/arm64/boot/dts/st/stm32mp255.dtsi | 17 + + drivers/media/cec/core/cec-adap.c | 14 - + drivers/media/cec/core/cec-core.c | 2 +- + drivers/media/common/v4l2-tpg/v4l2-tpg-core.c | 52 +- + drivers/media/dvb-core/dvb_frontend.c | 12 +- + drivers/media/dvb-core/dvbdev.c | 5 + + drivers/media/dvb-frontends/bcm3510.c | 3 +- + drivers/media/dvb-frontends/cx24110.c | 4 +- + drivers/media/dvb-frontends/dvb-pll.c | 6 +- + drivers/media/i2c/Kconfig | 2 + + drivers/media/i2c/adv7180.c | 4 +- + drivers/media/i2c/adv748x/adv748x.h | 1 - + drivers/media/i2c/alvium-csi2.c | 2 +- + drivers/media/i2c/ar0521.c | 6 +- + drivers/media/i2c/ccs/ccs-quirk.h | 8 +- + drivers/media/i2c/imx214.c | 2 +- + drivers/media/i2c/imx274.c | 2 +- + drivers/media/i2c/imx319.c | 53 +- + drivers/media/i2c/imx334.c | 41 +- + drivers/media/i2c/imx355.c | 53 +- + drivers/media/i2c/imx415.c | 672 ++++-- + drivers/media/i2c/isl7998x.c | 2 +- + drivers/media/i2c/max2175.c | 2 +- + drivers/media/i2c/mt9v032.c | 2 +- + drivers/media/i2c/ov08x40.c | 1307 ++--------- + drivers/media/i2c/st-vgxy61.c | 388 ++-- + drivers/media/i2c/tc358743.c | 7 +- + drivers/media/i2c/tc358746.c | 4 +- + drivers/media/i2c/tvp5150.c | 2 +- + drivers/media/mc/mc-devnode.c | 3 +- + drivers/media/mc/mc-entity.c | 93 +- 
+ drivers/media/pci/cx23885/cx23885-video.c | 8 +
+ drivers/media/pci/dt3155/dt3155.h | 1 -
+ drivers/media/pci/intel/ipu3/ipu3-cio2.c | 22 +-
+ drivers/media/platform/cadence/cdns-csi2rx.c | 19 +-
+ .../media/platform/mediatek/jpeg/mtk_jpeg_core.h | 1 -
+ .../media/platform/mediatek/mdp3/mdp_cfg_data.c | 729 +++++-
+ drivers/media/platform/mediatek/mdp3/mdp_reg_aal.h | 25 +
+ .../media/platform/mediatek/mdp3/mdp_reg_color.h | 31 +
+ drivers/media/platform/mediatek/mdp3/mdp_reg_fg.h | 23 +
+ drivers/media/platform/mediatek/mdp3/mdp_reg_hdr.h | 31 +
+ .../media/platform/mediatek/mdp3/mdp_reg_merge.h | 25 +
+ drivers/media/platform/mediatek/mdp3/mdp_reg_ovl.h | 25 +
+ drivers/media/platform/mediatek/mdp3/mdp_reg_pad.h | 21 +
+ .../media/platform/mediatek/mdp3/mdp_reg_rdma.h | 24 +
+ drivers/media/platform/mediatek/mdp3/mdp_reg_rsz.h | 2 +
+ .../media/platform/mediatek/mdp3/mdp_reg_tdshp.h | 34 +
+ .../media/platform/mediatek/mdp3/mdp_reg_wrot.h | 8 +
+ .../media/platform/mediatek/mdp3/mdp_sm_mt8195.h | 283 +++
+ drivers/media/platform/mediatek/mdp3/mtk-img-ipi.h | 4 +
+ .../media/platform/mediatek/mdp3/mtk-mdp3-cfg.h | 2 +
+ .../media/platform/mediatek/mdp3/mtk-mdp3-cmdq.c | 438 +++-
+ .../media/platform/mediatek/mdp3/mtk-mdp3-cmdq.h | 1 +
+ .../media/platform/mediatek/mdp3/mtk-mdp3-comp.c | 895 ++++++-
+ .../media/platform/mediatek/mdp3/mtk-mdp3-comp.h | 93 +-
+ .../media/platform/mediatek/mdp3/mtk-mdp3-core.c | 144 +-
+ .../media/platform/mediatek/mdp3/mtk-mdp3-core.h | 50 +-
+ .../media/platform/mediatek/mdp3/mtk-mdp3-m2m.c | 15 +
+ .../media/platform/mediatek/mdp3/mtk-mdp3-regs.c | 18 +
+ .../media/platform/mediatek/mdp3/mtk-mdp3-regs.h | 1 +
+ .../media/platform/mediatek/mdp3/mtk-mdp3-vpu.c | 3 +-
+ .../mediatek/vcodec/decoder/mtk_vcodec_dec.h | 1 -
+ .../mediatek/vcodec/decoder/vdec/vdec_vp8_req_if.c | 1 -
+ .../vcodec/decoder/vdec/vdec_vp9_req_lat_if.c | 1 -
+ .../platform/mediatek/vcodec/decoder/vdec_vpu_if.h | 1 -
+ .../mediatek/vcodec/encoder/mtk_vcodec_enc.h | 1 -
+ .../platform/nxp/imx8-isi/imx8-isi-crossbar.c | 4 +-
+ drivers/media/platform/nxp/imx8-isi/imx8-isi-hw.c | 8 +-
+ drivers/media/platform/qcom/venus/core.h | 1 -
+ .../platform/samsung/exynos4-is/fimc-capture.c | 52 +-
+ .../media/platform/samsung/exynos4-is/fimc-core.c | 23 +-
+ .../media/platform/samsung/exynos4-is/fimc-core.h | 23 +-
+ .../platform/samsung/exynos4-is/fimc-isp-video.c | 2 +-
+ .../platform/samsung/exynos4-is/fimc-lite-reg.c | 13 +-
+ .../platform/samsung/exynos4-is/fimc-lite-reg.h | 12 +-
+ .../media/platform/samsung/exynos4-is/fimc-lite.c | 2 +-
+ .../media/platform/samsung/exynos4-is/fimc-lite.h | 3 -
+ .../media/platform/samsung/exynos4-is/fimc-m2m.c | 23 +-
+ .../media/platform/samsung/exynos4-is/fimc-reg.c | 38 +-
+ .../media/platform/samsung/exynos4-is/fimc-reg.h | 10 +-
+ drivers/media/platform/samsung/s5p-mfc/s5p_mfc.c | 76 +-
+ .../media/platform/samsung/s5p-mfc/s5p_mfc_cmd.c | 8 +-
+ .../media/platform/samsung/s5p-mfc/s5p_mfc_cmd.h | 2 +-
+ .../platform/samsung/s5p-mfc/s5p_mfc_cmd_v5.c | 6 +-
+ .../platform/samsung/s5p-mfc/s5p_mfc_cmd_v5.h | 2 +-
+ .../platform/samsung/s5p-mfc/s5p_mfc_cmd_v6.c | 8 +-
+ .../platform/samsung/s5p-mfc/s5p_mfc_cmd_v6.h | 2 +-
+ .../platform/samsung/s5p-mfc/s5p_mfc_common.h | 15 +-
+ .../media/platform/samsung/s5p-mfc/s5p_mfc_ctrl.c | 26 +-
+ .../media/platform/samsung/s5p-mfc/s5p_mfc_dec.c | 20 +-
+ .../media/platform/samsung/s5p-mfc/s5p_mfc_dec.h | 3 +-
+ .../media/platform/samsung/s5p-mfc/s5p_mfc_enc.c | 12 +-
+ .../media/platform/samsung/s5p-mfc/s5p_mfc_enc.h | 3 +-
+ .../media/platform/samsung/s5p-mfc/s5p_mfc_opr.c | 7 +-
+ .../platform/samsung/s5p-mfc/s5p_mfc_opr_v5.c | 28 +-
+ .../platform/samsung/s5p-mfc/s5p_mfc_opr_v5.h | 2 +-
+ .../platform/samsung/s5p-mfc/s5p_mfc_opr_v6.c | 36 +-
+ .../platform/samsung/s5p-mfc/s5p_mfc_opr_v6.h | 2 +-
+ .../media/platform/samsung/s5p-mfc/s5p_mfc_pm.c | 51 +-
+ .../media/platform/samsung/s5p-mfc/s5p_mfc_pm.h | 8 +-
+ .../media/platform/ti/j721e-csi2rx/j721e-csi2rx.c | 24 +
+ drivers/media/platform/verisilicon/Kconfig | 14 +-
+ drivers/media/platform/verisilicon/Makefile | 3 +
+ drivers/media/platform/verisilicon/hantro.h | 1 -
+ drivers/media/platform/verisilicon/hantro_drv.c | 4 +
+ .../platform/verisilicon/hantro_g1_h264_dec.c | 2 +-
+ drivers/media/platform/verisilicon/hantro_hw.h | 2 +
+ .../verisilicon/rockchip_vpu2_hw_h264_dec.c | 2 +-
+ .../platform/verisilicon/rockchip_vpu981_regs.h | 2 +-
+ .../media/platform/verisilicon/stm32mp25_vpu_hw.c | 186 ++
+ drivers/media/platform/xilinx/Kconfig | 4 +-
+ drivers/media/tuners/xc4000.c | 4 +-
+ drivers/media/usb/cx231xx/cx231xx-417.c | 2 +-
+ drivers/media/usb/dvb-usb/dvb-usb.h | 2 -
+ drivers/media/usb/em28xx/em28xx-cards.c | 4 +
+ drivers/media/v4l2-core/v4l2-cci.c | 4 +-
+ drivers/media/v4l2-core/v4l2-common.c | 47 +
+ drivers/media/v4l2-core/v4l2-ctrls-core.c | 18 +-
+ drivers/media/v4l2-core/v4l2-mc.c | 23 +-
+ drivers/media/v4l2-core/v4l2-mem2mem.c | 10 +-
+ drivers/staging/media/atomisp/TODO | 10 -
+ drivers/staging/media/atomisp/i2c/gc2235.h | 5 +-
+ drivers/staging/media/atomisp/pci/atomisp_cmd.c | 141 +-
+ .../media/atomisp/pci/atomisp_compat_css20.c | 2 +-
+ drivers/staging/media/atomisp/pci/atomisp_drvfs.c | 148 +-
+ drivers/staging/media/atomisp/pci/atomisp_drvfs.h | 5 +-
+ .../staging/media/atomisp/pci/atomisp_internal.h | 1 +
+ drivers/staging/media/atomisp/pci/atomisp_ioctl.c | 8 -
+ drivers/staging/media/atomisp/pci/atomisp_v4l2.c | 260 +--
+ .../pci/base/circbuf/interface/ia_css_circbuf.h | 2 +-
+ .../media/atomisp/pci/base/circbuf/src/circbuf.c | 6 +-
+ .../staging/media/atomisp/pci/ia_css_acc_types.h | 4 +-
+ drivers/staging/media/atomisp/pci/ia_css_control.h | 29 +-
+ .../staging/media/atomisp/pci/ia_css_firmware.h | 6 +-
+ drivers/staging/media/atomisp/pci/ia_css_irq.h | 6 +-
+ .../atomisp/pci/isp/kernels/hdr/ia_css_hdr_types.h | 4 +-
+ .../kernels/macc/macc_1.0/ia_css_macc_table.host.c | 4 +-
+ .../atomisp/pci/isp2400_input_system_global.h | 2 +-
+ .../atomisp/pci/isp2400_input_system_public.h | 2 +-
+ .../media/atomisp/pci/runtime/binary/src/binary.c | 2 +-
+ .../atomisp/pci/runtime/pipeline/src/pipeline.c | 2 +-
+ .../media/atomisp/pci/runtime/queue/src/queue.c | 22 +-
+ .../media/atomisp/pci/runtime/rmgr/src/rmgr_vbuf.c | 6 +-
+ drivers/staging/media/atomisp/pci/sh_css.c | 35 +-
+ drivers/staging/media/atomisp/pci/sh_css_defs.h | 2 +-
+ drivers/staging/media/atomisp/pci/sh_css_mipi.c | 4 +-
+ .../staging/media/ipu3/include/uapi/intel-ipu3.h | 3 -
+ drivers/staging/media/ipu3/ipu3-v4l2.c | 16 +-
+ drivers/staging/media/meson/vdec/vdec.h | 1 -
+ include/media/cec.h | 2 -
+ include/media/media-entity.h | 6 +-
+ include/media/v4l2-common.h | 32 +-
+ include/media/videobuf2-core.h | 2 +-
+ include/uapi/linux/videodev2.h | 32 +-
+ 168 files changed, 9716 insertions(+), 2903 deletions(-)
+ create mode 100644 Documentation/devicetree/bindings/media/st,stm32mp25-video-codec.yaml
+ create mode 100644 Documentation/userspace-api/media/dvb/legacy_dvb_audio.rst
+ create mode 100644 Documentation/userspace-api/media/dvb/legacy_dvb_decoder_api.rst
+ create mode 100644 Documentation/userspace-api/media/dvb/legacy_dvb_osd.rst
+ create mode 100644 Documentation/userspace-api/media/dvb/legacy_dvb_video.rst
+ create mode 100644 drivers/media/platform/mediatek/mdp3/mdp_reg_aal.h
+ create mode 100644 drivers/media/platform/mediatek/mdp3/mdp_reg_color.h
+ create mode 100644 drivers/media/platform/mediatek/mdp3/mdp_reg_fg.h
+ create mode 100644 drivers/media/platform/mediatek/mdp3/mdp_reg_hdr.h
+ create mode 100644 drivers/media/platform/mediatek/mdp3/mdp_reg_merge.h
+ create mode 100644 drivers/media/platform/mediatek/mdp3/mdp_reg_ovl.h
+ create mode 100644 drivers/media/platform/mediatek/mdp3/mdp_reg_pad.h
+ create mode 100644 drivers/media/platform/mediatek/mdp3/mdp_reg_tdshp.h
+ create mode 100644 drivers/media/platform/mediatek/mdp3/mdp_sm_mt8195.h
+ create mode 100644 drivers/media/platform/verisilicon/stm32mp25_vpu_hw.c
+Merging v4l-dvb-next/master (e0b8eb0f6d65 media: visl: Add codec specific variability on output frames)
+$ git merge -m Merge branch 'master' of git://linuxtv.org/mchehab/media-next.git v4l-dvb-next/master
+Auto-merging MAINTAINERS
+Merge made by the 'ort' strategy.
+ Documentation/admin-guide/media/visl.rst | 12 +-
+ Documentation/driver-api/media/v4l2-subdev.rst | 2 +-
+ MAINTAINERS | 1 +
+ drivers/media/cec/platform/cros-ec/cros-ec-cec.c | 2 +
+ drivers/media/dvb-core/dvb_frontend.c | 13 +-
+ drivers/media/dvb-frontends/bcm3510_priv.h | 6 +-
+ drivers/media/dvb-frontends/cx24110.h | 8 +-
+ drivers/media/dvb-frontends/stv6110x_priv.h | 8 +-
+ drivers/media/dvb-frontends/tda8083.h | 8 +-
+ drivers/media/dvb-frontends/zl10036.c | 2 +-
+ drivers/media/dvb-frontends/zl10036.h | 2 +-
+ drivers/media/i2c/msp3400-driver.c | 20 +-
+ drivers/media/i2c/msp3400-driver.h | 2 +-
+ drivers/media/i2c/st-vgxy61.c | 2 +-
+ drivers/media/pci/bt8xx/bttv-gpio.c | 2 +-
+ drivers/media/pci/bt8xx/bttvp.h | 2 +-
+ drivers/media/pci/sta2x11/sta2x11_vip.c | 9 +-
+ drivers/media/platform/amphion/vdec.c | 4 +-
+ drivers/media/platform/marvell/Kconfig | 2 +
+ .../vcodec/decoder/mtk_vcodec_dec_stateless.c | 14 +-
+ drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c | 16 ++
+ drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.h | 1 +
+ drivers/media/platform/renesas/Kconfig | 16 ++
+ drivers/media/platform/renesas/Makefile | 1 +
+ .../platform/renesas/{rcar-vin => }/rcar-csi2.c | 0
+ drivers/media/platform/renesas/rcar-vin/Kconfig | 16 --
+ drivers/media/platform/renesas/rcar-vin/Makefile | 1 -
+ .../media/platform/renesas/rzg2l-cru/rzg2l-cru.h | 3 -
+ .../media/platform/renesas/rzg2l-cru/rzg2l-csi2.c | 37 ++-
+ .../media/platform/renesas/rzg2l-cru/rzg2l-ip.c | 18 +-
+ .../media/platform/renesas/rzg2l-cru/rzg2l-video.c | 83 +++--
+ drivers/media/platform/sunxi/sun8i-di/sun8i-di.c | 69 ++--
+ .../media/platform/ti/j721e-csi2rx/j721e-csi2rx.c | 5 +-
+ drivers/media/test-drivers/vidtv/vidtv_bridge.c | 26 +-
+ drivers/media/test-drivers/visl/visl-core.c | 15 +-
+ drivers/media/test-drivers/visl/visl-dec.c | 311 +++++++++++++--------
+ drivers/media/test-drivers/visl/visl.h | 1 +
+ drivers/media/tuners/tda18271-fe.c | 1 -
+ drivers/media/usb/go7007/go7007-usb.c | 4 +-
+ drivers/media/usb/pvrusb2/pvrusb2-context.c | 8 +-
+ drivers/media/usb/pvrusb2/pvrusb2-dvb.c | 12 +-
+ drivers/media/usb/pvrusb2/pvrusb2-v4l2.c | 11 +-
+ drivers/media/v4l2-core/v4l2-ctrls-api.c | 2 +-
+ drivers/media/v4l2-core/v4l2-ctrls-core.c | 5 +-
+ drivers/media/v4l2-core/v4l2-ioctl.c | 2 +-
+ drivers/staging/media/imx/imx-media-csc-scaler.c | 1 +
+ drivers/staging/media/imx/imx-media-fim.c | 2 +-
+ drivers/staging/media/starfive/camss/stf-capture.c | 8 +-
+ drivers/staging/media/sunxi/cedrus/cedrus_h265.c | 10 +-
+ include/media/videobuf2-core.h | 13 +-
+ 50 files changed, 444 insertions(+), 375 deletions(-)
+ rename drivers/media/platform/renesas/{rcar-vin => }/rcar-csi2.c (100%)
+Merging pm/linux-next (4cb5c331c4df Merge branch 'acpi-misc' into linux-next)
+$ git merge -m Merge branch 'linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git pm/linux-next
+Auto-merging Documentation/admin-guide/kernel-parameters.txt
+Auto-merging arch/x86/Kconfig
+Auto-merging drivers/pci/pci.c
+Auto-merging kernel/power/swap.c
+Merge made by the 'ort' strategy.
+ Documentation/admin-guide/kernel-parameters.txt | 5 +
+ Documentation/admin-guide/pm/amd-pstate.rst | 59 ++-
+ Documentation/power/energy-model.rst | 183 +++++++-
+ Documentation/power/pci.rst | 2 +-
+ Documentation/power/runtime_pm.rst | 22 +-
+ arch/x86/Kconfig | 5 +-
+ drivers/accel/ivpu/ivpu_pm.c | 2 +-
+ drivers/acpi/acpi_processor.c | 2 +-
+ drivers/acpi/acpi_video.c | 28 +-
+ drivers/acpi/acpi_watchdog.c | 2 +-
+ drivers/acpi/bus.c | 2 +-
+ drivers/acpi/cppc_acpi.c | 13 +
+ drivers/acpi/internal.h | 1 +
+ drivers/acpi/mipi-disco-img.c | 71 +++
+ drivers/acpi/nfit/core.c | 5 +-
+ drivers/acpi/pci_slot.c | 2 +-
+ drivers/acpi/processor_driver.c | 6 +
+ drivers/acpi/processor_idle.c | 2 +
+ drivers/acpi/property.c | 3 +
+ drivers/acpi/resource.c | 28 ++
+ drivers/acpi/scan.c | 1 +
+ drivers/acpi/utils.c | 2 +-
+ drivers/acpi/x86/s2idle.c | 37 +-
+ drivers/base/power/main.c | 269 +++------
+ drivers/base/power/runtime.c | 35 +-
+ drivers/cpufreq/amd-pstate.c | 185 +++++++-
+ drivers/cpufreq/cpufreq.c | 4 +-
+ drivers/cpufreq/intel_pstate.c | 3 -
+ drivers/cpuidle/driver.c | 3 +-
+ drivers/cpuidle/governors/haltpoll.c | 9 +-
+ drivers/gpu/drm/i915/intel_runtime_pm.c | 5 +-
+ drivers/gpu/drm/xe/xe_pm.c | 2 +-
+ drivers/media/i2c/ccs/ccs-core.c | 2 +-
+ drivers/media/i2c/ov64a40.c | 2 +-
+ drivers/media/i2c/thp7312.c | 2 +-
+ drivers/net/ipa/ipa_smp2p.c | 2 +-
+ drivers/net/wireless/intel/iwlwifi/mvm/mvm.h | 2 -
+ drivers/net/wireless/intel/iwlwifi/mvm/tt.c | 68 ++-
+ drivers/pci/pci.c | 2 +-
+ drivers/powercap/dtpm_cpu.c | 43 +-
+ drivers/powercap/dtpm_devfreq.c | 34 +-
+ drivers/powercap/intel_rapl_common.c | 36 +-
+ drivers/powercap/intel_rapl_msr.c | 8 +-
+ drivers/powercap/intel_rapl_tpmi.c | 15 +
+ drivers/thermal/cpufreq_cooling.c | 45 +-
+ drivers/thermal/devfreq_cooling.c | 51 ++-
+ drivers/thermal/gov_bang_bang.c | 2 +-
+ drivers/thermal/gov_fair_share.c | 16 +-
+ drivers/thermal/gov_power_allocator.c | 2 +
+ .../int340x_thermal/processor_thermal_device.c | 8 +-
+ .../intel/int340x_thermal/processor_thermal_rapl.c | 8 +-
+ drivers/thermal/intel/intel_tcc.c | 12 +-
+ drivers/thermal/intel/x86_pkg_temp_thermal.c | 8 +-
+ drivers/thermal/thermal_sysfs.c | 2 +-
+ include/acpi/acpi_bus.h | 2 +-
+ include/acpi/cppc_acpi.h | 5 +
+ include/linux/acpi.h | 1 +
+ include/linux/amd-pstate.h | 10 +
+ include/linux/cpufreq.h | 1 +
+ include/linux/energy_model.h | 166 ++++---
+ include/linux/intel_rapl.h | 6 +
+ include/linux/intel_tcc.h | 2 +-
+ include/linux/pm.h | 30 +-
+ include/linux/pm_runtime.h | 30 +-
+ include/linux/suspend.h | 74 +--
+ include/linux/thermal.h | 4 +-
+ kernel/power/Kconfig | 26 +-
+ kernel/power/energy_model.c | 494 +++++++++++++++++----
+ kernel/power/hibernate.c | 56 ++-
+ kernel/power/main.c | 182 +++++---
+ kernel/power/power.h | 21 +
+ kernel/power/suspend.c | 8 +-
+ kernel/power/swap.c | 189 +++---
+ sound/hda/hdac_device.c | 2 +-
+ 74 files changed, 1952 insertions(+), 725 deletions(-)
+Merging cpufreq-arm/cpufreq/arm/linux-next (3093fa33539b cpufreq: qcom-hw: add CONFIG_COMMON_CLK dependency)
+$ git merge -m Merge branch 'cpufreq/arm/linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git cpufreq-arm/cpufreq/arm/linux-next
+Merge made by the 'ort' strategy.
+ Documentation/power/opp.rst | 2 +-
+ Documentation/translations/zh_CN/power/opp.rst | 2 +-
+ drivers/cpufreq/Kconfig.arm | 1 +
+ drivers/cpufreq/brcmstb-avs-cpufreq.c | 2 ++
+ drivers/cpufreq/cpufreq-dt-platdev.c | 1 +
+ drivers/cpufreq/imx6q-cpufreq.c | 43 +++++++++-----------------
+ drivers/cpufreq/mediatek-cpufreq-hw.c | 19 +++++++++++-
+ 7 files changed, 38 insertions(+), 32 deletions(-)
+Merging cpupower/cpupower (babb46746cc5 Fix cpupower-frequency-info.1 man page typo)
+$ git merge -m Merge branch 'cpupower' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux.git cpupower/cpupower
+Merge made by the 'ort' strategy.
+ tools/power/cpupower/man/cpupower-frequency-info.1 | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+Merging devfreq/devfreq-next (b401b621758e Linux 6.8-rc5)
+$ git merge -m Merge branch 'devfreq-next' of git://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/linux.git devfreq/devfreq-next
+Already up to date.
+Merging pmdomain/next (713240877a26 pmdomain: renesas: Adjust the waiting time to cover the worst case)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/linux-pm.git pmdomain/next
+Auto-merging drivers/media/platform/qcom/venus/core.h
+Merge made by the 'ort' strategy.
+ .../devicetree/bindings/power/qcom,rpmpd.yaml | 2 +
+ .../bindings/soc/imx/fsl,imx8mp-hdmi-blk-ctrl.yaml | 22 +--
+ drivers/base/power/common.c | 134 +++++++++++
+ drivers/media/platform/qcom/venus/core.c | 12 +-
+ drivers/media/platform/qcom/venus/core.h | 7 +-
+ drivers/media/platform/qcom/venus/pm_helpers.c | 48 +++---
+ drivers/pmdomain/core.c | 149 +++++++++--------
+ drivers/pmdomain/imx/imx8m-blk-ctrl.c | 9 +-
+ drivers/pmdomain/imx/imx8mp-blk-ctrl.c | 19 +--
+ drivers/pmdomain/imx/scu-pd.c | 2 +-
+ drivers/pmdomain/mediatek/mtk-scpsys.c | 4 +-
+ drivers/pmdomain/qcom/rpmpd.c | 96 ++++++++++-
+ drivers/pmdomain/renesas/Kconfig | 4 +
+ drivers/pmdomain/renesas/Makefile | 1 +
+ drivers/pmdomain/renesas/r8a779a0-sysc.c | 12 --
+ drivers/pmdomain/renesas/r8a779f0-sysc.c | 12 --
+ drivers/pmdomain/renesas/r8a779g0-sysc.c | 12 --
+ drivers/pmdomain/renesas/r8a779h0-sysc.c | 54 +++++++
+ drivers/pmdomain/renesas/rcar-gen4-sysc.c | 3 +
+ drivers/pmdomain/renesas/rcar-gen4-sysc.h | 1 +
+ drivers/pmdomain/renesas/rcar-sysc.c | 4 +-
+ drivers/pmdomain/tegra/powergate-bpmp.c | 2 +-
+ drivers/pmdomain/ti/omap_prm.c | 2 +
+ drivers/pmdomain/ti/ti_sci_pm_domains.c | 2 +-
+ drivers/pmdomain/xilinx/zynqmp-pm-domains.c | 2 +-
+ drivers/remoteproc/imx_dsp_rproc.c | 82 ++---------
+ drivers/remoteproc/imx_rproc.c | 73 ++--------
+ drivers/remoteproc/qcom_q6v5_adsp.c | 160 ++++++++++-----------
+ include/dt-bindings/power/qcom-rpmpd.h | 7 +
+ include/linux/pm_domain.h | 60 ++++++--
+ 30 files changed, 591 insertions(+), 406 deletions(-)
+ create mode 100644 drivers/pmdomain/renesas/r8a779h0-sysc.c
+Merging opp/opp/linux-next (ace4b31b297d cpufreq: Move dev_pm_opp_{init|free}_cpufreq_table() to pm_opp.h)
+$ git merge -m Merge branch 'opp/linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git opp/opp/linux-next
+Auto-merging include/linux/cpufreq.h
+Merge made by the 'ort' strategy.
+ include/linux/cpufreq.h | 20 --------------------
+ include/linux/pm_opp.h | 16 ++++++++++++++++
+ 2 files changed, 16 insertions(+), 20 deletions(-)
+Merging thermal/thermal/linux-next (5314b1543787 thermal/drivers/exynos: Use set_trips ops)
+$ git merge -m Merge branch 'thermal/linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/thermal/linux.git thermal/thermal/linux-next
+Already up to date.
+Merging dlm/next (5beebc1dda47 dlm: update format header reflect current format)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/teigland/linux-dlm.git dlm/next
+Already up to date.
+Merging rdma/for-next (aafe4cc50969 RDMA/rxe: Remove unused 'iova' parameter from rxe_mr_init_user)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git rdma/for-next
+Auto-merging drivers/infiniband/hw/irdma/verbs.c
+Auto-merging drivers/infiniband/ulp/srpt/ib_srpt.c
+Merge made by the 'ort' strategy.
+ drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 2 -
+ drivers/infiniband/hw/efa/efa.h | 1 +
+ drivers/infiniband/hw/efa/efa_main.c | 32 ++-
+ drivers/infiniband/hw/hfi1/tid_rdma.c | 25 +-
+ drivers/infiniband/hw/hns/hns_roce_cq.c | 11 +-
+ drivers/infiniband/hw/hns/hns_roce_device.h | 16 +-
+ drivers/infiniband/hw/hns/hns_roce_hem.c | 95 ++-----
+ drivers/infiniband/hw/hns/hns_roce_hem.h | 56 +---
+ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 111 ++++----
+ drivers/infiniband/hw/hns/hns_roce_mr.c | 341 ++++++++++++++++++-------
+ drivers/infiniband/hw/irdma/verbs.c | 3 +-
+ drivers/infiniband/hw/mana/cq.c | 25 +-
+ drivers/infiniband/hw/mana/main.c | 40 +--
+ drivers/infiniband/hw/mana/mana_ib.h | 20 +-
+ drivers/infiniband/hw/mana/mr.c | 13 +-
+ drivers/infiniband/hw/mana/qp.c | 88 ++-----
+ drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 -
+ drivers/infiniband/sw/rxe/rxe.c | 6 +-
+ drivers/infiniband/sw/rxe/rxe.h | 6 +-
+ drivers/infiniband/sw/rxe/rxe_comp.c | 4 +-
+ drivers/infiniband/sw/rxe/rxe_cq.c | 4 +-
+ drivers/infiniband/sw/rxe/rxe_loc.h | 2 +-
+ drivers/infiniband/sw/rxe/rxe_mr.c | 18 +-
+ drivers/infiniband/sw/rxe/rxe_mw.c | 2 +-
+ drivers/infiniband/sw/rxe/rxe_qp.c | 8 +-
+ drivers/infiniband/sw/rxe/rxe_resp.c | 12 +-
+ drivers/infiniband/sw/rxe/rxe_task.c | 4 +-
+ drivers/infiniband/sw/rxe/rxe_verbs.c | 218 ++++++++--------
+ drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 3 +-
+ drivers/infiniband/ulp/srpt/ib_srpt.c | 3 +-
+ 30 files changed, 597 insertions(+), 573 deletions(-)
+Merging net-next/main (4934446297c2 Merge tag 'linux-can-next-for-6.9-20240220' of git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next)
+$ git merge -m Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git net-next/main
+Auto-merging .mailmap
+Auto-merging Documentation/userspace-api/ioctl/ioctl-number.rst
+Auto-merging MAINTAINERS
+Auto-merging drivers/media/rc/bpf-lirc.c
+Auto-merging drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c
+Auto-merging drivers/net/ethernet/broadcom/tg3.c
+Auto-merging drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+Auto-merging fs/eventpoll.c
+Auto-merging include/linux/mlx5/mlx5_ifc.h
+Auto-merging include/net/tcp.h
+Auto-merging kernel/bpf/helpers.c
+Auto-merging kernel/bpf/verifier.c
+Auto-merging net/ipv6/addrconf.c
+Auto-merging net/ipv6/ip6_gre.c
+Auto-merging net/iucv/iucv.c
+Auto-merging net/mptcp/pm_netlink.c
+Auto-merging net/mptcp/protocol.c
+Auto-merging net/mptcp/protocol.h
+Auto-merging net/mptcp/subflow.c
+Auto-merging net/sched/act_mirred.c
+Auto-merging security/security.c
+Auto-merging tools/testing/selftests/drivers/net/bonding/bond_options.sh
+Merge made by the 'ort' strategy.
+ .mailmap | 1 + + .../bpf/standardization/instruction-set.rst | 80 +- + Documentation/bpf/verifier.rst | 2 +- + Documentation/dev-tools/kselftest.rst | 12 + + Documentation/devicetree/bindings/leds/common.yaml | 12 + + .../devicetree/bindings/leds/leds-bcm63138.yaml | 4 - + .../devicetree/bindings/leds/leds-bcm6328.yaml | 4 - + .../devicetree/bindings/leds/leds-bcm6358.txt | 2 - + .../bindings/leds/leds-pwm-multicolor.yaml | 4 - + .../devicetree/bindings/leds/leds-pwm.yaml | 5 - + .../devicetree/bindings/net/can/tcan4x5x.txt | 3 + + .../devicetree/bindings/net/can/xilinx,can.yaml | 5 + + .../devicetree/bindings/net/dsa/ar9331.txt | 147 -- + .../devicetree/bindings/net/dsa/microchip,ksz.yaml | 1 + + .../devicetree/bindings/net/dsa/qca,ar9331.yaml | 161 ++ + .../bindings/net/ethernet-phy-package.yaml | 52 + + Documentation/devicetree/bindings/net/fsl,fec.yaml | 3 + + .../devicetree/bindings/net/nfc/ti,trf7970a.yaml | 2 +- + .../devicetree/bindings/net/qca,qca808x.yaml | 54 + + .../devicetree/bindings/net/qcom,ethqos.yaml | 9 +- + .../devicetree/bindings/net/qcom,ipa.yaml | 2 +- + .../devicetree/bindings/net/qcom,ipq4019-mdio.yaml | 15 + + .../devicetree/bindings/net/qcom,qca807x.yaml | 184 ++ + .../devicetree/bindings/net/snps,dwmac.yaml | 17 +- + .../bindings/net/starfive,jh7110-dwmac.yaml | 72 +- + .../devicetree/bindings/net/ti,cpsw-switch.yaml | 5 +- + .../devicetree/bindings/net/ti,dp83822.yaml | 18 + + .../bindings/net/ti,k3-am654-cpsw-nuss.yaml | 5 +- + .../devicetree/bindings/net/ti,k3-am654-cpts.yaml | 5 +- + Documentation/netlink/netlink-raw.yaml | 15 +- + Documentation/netlink/specs/dpll.yaml | 39 + + Documentation/netlink/specs/tc.yaml | 2135 ++++++++++++++++- + Documentation/networking/bonding.rst | 12 + + Documentation/networking/can.rst | 34 +- + .../device_drivers/ethernet/amazon/ena.rst | 6 + + .../networking/device_drivers/ethernet/index.rst | 1 + + .../ethernet/marvell/octeon_ep_vf.rst | 24 + + .../networking/device_drivers/wwan/t7xx.rst | 46 + + Documentation/networking/devlink/mlx5.rst | 9 +- + Documentation/networking/ip-sysctl.rst | 14 +- + Documentation/networking/netconsole.rst | 66 + + Documentation/networking/netdevices.rst | 4 +- + Documentation/networking/xfrm_device.rst | 4 +- + Documentation/userspace-api/ioctl/ioctl-number.rst | 1 + + .../userspace-api/netlink/netlink-raw.rst | 42 + + MAINTAINERS | 30 +- + arch/arm64/net/bpf_jit_comp.c | 5 + + arch/x86/net/bpf_jit_comp.c | 5 + + drivers/dpll/dpll_core.c | 5 +- + drivers/dpll/dpll_netlink.c | 9 +- + drivers/media/rc/bpf-lirc.c | 2 +- + drivers/net/arcnet/arcnet.c | 1 + + drivers/net/bareudp.c | 13 +- + drivers/net/bonding/bond_3ad.c | 157 +- + drivers/net/bonding/bond_main.c | 38 +- + drivers/net/bonding/bond_netlink.c | 16 + + drivers/net/bonding/bond_options.c | 28 +- + drivers/net/can/Kconfig | 2 + + drivers/net/can/Makefile | 1 + + drivers/net/can/esd/Kconfig | 12 + + drivers/net/can/esd/Makefile | 7 + + drivers/net/can/esd/esd_402_pci-core.c | 514 +++++ + drivers/net/can/esd/esdacc.c | 764 ++++++ + drivers/net/can/esd/esdacc.h | 356 +++ + drivers/net/can/kvaser_pciefd.c | 55 + + drivers/net/can/m_can/m_can.c | 579 +++-- + drivers/net/can/m_can/m_can.h | 35 +- + drivers/net/can/m_can/m_can_pci.c | 1 + + drivers/net/can/m_can/m_can_platform.c | 5 +- + drivers/net/can/m_can/tcan4x5x-core.c | 33 +- + drivers/net/can/softing/softing_fw.c | 2 +- + drivers/net/can/xilinx_can.c | 169 +- + drivers/net/dsa/Kconfig | 2 +- + drivers/net/dsa/b53/b53_common.c | 42 +- + drivers/net/dsa/b53/b53_priv.h | 7 +- + 
drivers/net/dsa/bcm_sf2.c | 2 +- + drivers/net/dsa/microchip/ksz8795.c | 410 ++-- + drivers/net/dsa/microchip/ksz8795_reg.h | 1 + + drivers/net/dsa/microchip/ksz9477_i2c.c | 4 + + drivers/net/dsa/microchip/ksz_common.c | 47 +- + drivers/net/dsa/microchip/ksz_common.h | 1 + + drivers/net/dsa/microchip/ksz_spi.c | 5 + + drivers/net/dsa/mt7530-mdio.c | 7 +- + drivers/net/dsa/mt7530.c | 321 ++- + drivers/net/dsa/mt7530.h | 19 +- + drivers/net/dsa/mv88e6xxx/chip.c | 6 +- + drivers/net/dsa/qca/qca8k-8xxx.c | 19 +- + drivers/net/dsa/qca/qca8k-common.c | 4 +- + drivers/net/dsa/qca/qca8k.h | 4 +- + drivers/net/dsa/realtek/Kconfig | 20 +- + drivers/net/dsa/realtek/Makefile | 13 +- + drivers/net/dsa/realtek/realtek-mdio.c | 205 +- + drivers/net/dsa/realtek/realtek-mdio.h | 48 + + drivers/net/dsa/realtek/realtek-smi.c | 279 +-- + drivers/net/dsa/realtek/realtek-smi.h | 48 + + drivers/net/dsa/realtek/realtek.h | 12 +- + drivers/net/dsa/realtek/rtl8365mb.c | 132 +- + drivers/net/dsa/realtek/rtl8366-core.c | 22 +- + drivers/net/dsa/realtek/rtl8366rb.c | 119 +- + drivers/net/dsa/realtek/rtl83xx.c | 303 +++ + drivers/net/dsa/realtek/rtl83xx.h | 22 + + drivers/net/dummy.c | 1 + + drivers/net/ethernet/Kconfig | 3 - + drivers/net/ethernet/amazon/ena/ena_com.c | 323 +-- + drivers/net/ethernet/amazon/ena/ena_com.h | 7 +- + drivers/net/ethernet/amazon/ena/ena_eth_com.c | 49 +- + drivers/net/ethernet/amazon/ena/ena_eth_com.h | 39 +- + drivers/net/ethernet/amazon/ena/ena_netdev.c | 181 +- + drivers/net/ethernet/amazon/ena/ena_regs_defs.h | 1 + + drivers/net/ethernet/amazon/ena/ena_xdp.c | 1 - + drivers/net/ethernet/amd/pds_core/adminq.c | 10 +- + drivers/net/ethernet/amd/pds_core/auxbus.c | 18 +- + drivers/net/ethernet/amd/pds_core/core.c | 95 +- + drivers/net/ethernet/amd/pds_core/core.h | 4 +- + drivers/net/ethernet/amd/pds_core/debugfs.c | 8 +- + drivers/net/ethernet/amd/pds_core/dev.c | 22 +- + drivers/net/ethernet/amd/pds_core/main.c | 47 +- + .../net/ethernet/aquantia/atlantic/aq_ethtool.c | 25 +- + drivers/net/ethernet/broadcom/asp2/bcmasp.h | 2 +- + .../net/ethernet/broadcom/asp2/bcmasp_ethtool.c | 12 +- + drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c | 5 +- + drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 9 +- + .../net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c | 50 +- + drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c | 14 +- + drivers/net/ethernet/broadcom/bnxt/bnxt.c | 373 ++- + drivers/net/ethernet/broadcom/bnxt/bnxt.h | 54 +- + drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 462 ++-- + drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.h | 6 +- + drivers/net/ethernet/broadcom/genet/bcmgenet.c | 16 +- + drivers/net/ethernet/broadcom/genet/bcmgenet.h | 2 +- + drivers/net/ethernet/broadcom/genet/bcmmii.c | 5 +- + drivers/net/ethernet/broadcom/tg3.c | 37 +- + drivers/net/ethernet/broadcom/tg3.h | 2 +- + drivers/net/ethernet/cisco/enic/enic_main.c | 2 +- + drivers/net/ethernet/ec_bhf.c | 1 + + drivers/net/ethernet/engleder/tsnep_main.c | 36 +- + drivers/net/ethernet/freescale/enetc/enetc.c | 4 +- + drivers/net/ethernet/freescale/fec.h | 2 +- + drivers/net/ethernet/freescale/fec_main.c | 66 +- + drivers/net/ethernet/freescale/gianfar.c | 4 +- + drivers/net/ethernet/google/gve/gve.h | 144 +- + drivers/net/ethernet/google/gve/gve_dqo.h | 18 +- + drivers/net/ethernet/google/gve/gve_main.c | 862 ++++--- + drivers/net/ethernet/google/gve/gve_rx.c | 135 +- + drivers/net/ethernet/google/gve/gve_rx_dqo.c | 91 +- + drivers/net/ethernet/google/gve/gve_tx.c | 128 +- + drivers/net/ethernet/google/gve/gve_tx_dqo.c 
| 108 +- + drivers/net/ethernet/google/gve/gve_utils.c | 31 + + drivers/net/ethernet/google/gve/gve_utils.h | 5 + + drivers/net/ethernet/i825xx/sun3_82586.c | 2 +- + drivers/net/ethernet/intel/Kconfig | 8 + + drivers/net/ethernet/intel/e1000e/ethtool.c | 16 +- + drivers/net/ethernet/intel/i40e/i40e.h | 93 +- + drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c | 10 +- + drivers/net/ethernet/intel/i40e/i40e_debugfs.c | 97 +- + drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 16 +- + drivers/net/ethernet/intel/i40e/i40e_main.c | 563 ++--- + drivers/net/ethernet/intel/ice/ice.h | 5 +- + drivers/net/ethernet/intel/ice/ice_base.c | 134 +- + drivers/net/ethernet/intel/ice/ice_base.h | 10 +- + drivers/net/ethernet/intel/ice/ice_common.c | 37 + + drivers/net/ethernet/intel/ice/ice_common.h | 2 + + drivers/net/ethernet/intel/ice/ice_controlq.c | 2 +- + drivers/net/ethernet/intel/ice/ice_ddp.c | 4 + + drivers/net/ethernet/intel/ice/ice_debugfs.c | 13 +- + drivers/net/ethernet/intel/ice/ice_devids.h | 8 + + drivers/net/ethernet/intel/ice/ice_devlink.c | 68 +- + drivers/net/ethernet/intel/ice/ice_dpll.c | 2 + + drivers/net/ethernet/intel/ice/ice_ethtool.c | 3 +- + drivers/net/ethernet/intel/ice/ice_fwlog.c | 2 + + drivers/net/ethernet/intel/ice/ice_lib.c | 129 -- + drivers/net/ethernet/intel/ice/ice_lib.h | 10 - + drivers/net/ethernet/intel/ice/ice_main.c | 207 +- + drivers/net/ethernet/intel/ice/ice_ptp.c | 233 +- + drivers/net/ethernet/intel/ice/ice_ptp.h | 34 +- + drivers/net/ethernet/intel/ice/ice_txrx_lib.c | 8 +- + drivers/net/ethernet/intel/ice/ice_type.h | 1 + + drivers/net/ethernet/intel/ice/ice_xsk.c | 22 +- + drivers/net/ethernet/intel/igb/igb_ethtool.c | 28 +- + drivers/net/ethernet/intel/igc/Makefile | 1 + + drivers/net/ethernet/intel/igc/igc.h | 9 +- + drivers/net/ethernet/intel/igc/igc_ethtool.c | 24 +- + drivers/net/ethernet/intel/igc/igc_leds.c | 280 +++ + drivers/net/ethernet/intel/igc/igc_main.c | 27 +- + drivers/net/ethernet/intel/igc/igc_regs.h | 1 + + drivers/net/ethernet/intel/ixgbe/ixgbe.h | 16 +- + drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c | 72 +- + drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c | 151 +- + drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 266 +-- + drivers/net/ethernet/intel/ixgbe/ixgbe_common.h | 112 +- + drivers/net/ethernet/intel/ixgbe/ixgbe_dcb.c | 12 +- + drivers/net/ethernet/intel/ixgbe/ixgbe_dcb.h | 10 +- + drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82598.c | 26 +- + drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82598.h | 24 +- + drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82599.c | 12 +- + drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82599.h | 29 +- + drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 38 +- + drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 4 +- + drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.c | 46 +- + drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h | 10 +- + drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c | 212 +- + drivers/net/ethernet/intel/ixgbe/ixgbe_phy.h | 52 +- + drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 8 +- + drivers/net/ethernet/intel/ixgbe/ixgbe_type.h | 186 +- + drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c | 66 +- + drivers/net/ethernet/intel/ixgbe/ixgbe_x540.h | 18 +- + drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c | 300 +-- + drivers/net/ethernet/marvell/Kconfig | 1 + + drivers/net/ethernet/marvell/Makefile | 1 + + drivers/net/ethernet/marvell/mvneta.c | 4 +- + drivers/net/ethernet/marvell/octeon_ep_vf/Kconfig | 19 + + drivers/net/ethernet/marvell/octeon_ep_vf/Makefile | 10 + + .../ethernet/marvell/octeon_ep_vf/octep_vf_cn9k.c 
| 489 ++++ + .../ethernet/marvell/octeon_ep_vf/octep_vf_cnxk.c | 500 ++++ + .../marvell/octeon_ep_vf/octep_vf_config.h | 160 ++ + .../marvell/octeon_ep_vf/octep_vf_ethtool.c | 273 +++ + .../ethernet/marvell/octeon_ep_vf/octep_vf_main.c | 1231 ++++++++++ + .../ethernet/marvell/octeon_ep_vf/octep_vf_main.h | 334 +++ + .../ethernet/marvell/octeon_ep_vf/octep_vf_mbox.c | 430 ++++ + .../ethernet/marvell/octeon_ep_vf/octep_vf_mbox.h | 166 ++ + .../marvell/octeon_ep_vf/octep_vf_regs_cn9k.h | 154 ++ + .../marvell/octeon_ep_vf/octep_vf_regs_cnxk.h | 162 ++ + .../ethernet/marvell/octeon_ep_vf/octep_vf_rx.c | 510 ++++ + .../ethernet/marvell/octeon_ep_vf/octep_vf_rx.h | 224 ++ + .../ethernet/marvell/octeon_ep_vf/octep_vf_tx.c | 330 +++ + .../ethernet/marvell/octeon_ep_vf/octep_vf_tx.h | 276 +++ + drivers/net/ethernet/marvell/octeontx2/af/mbox.h | 3 + + drivers/net/ethernet/marvell/octeontx2/af/npc.h | 15 +- + .../ethernet/marvell/octeontx2/af/npc_profile.h | 621 ++++- + drivers/net/ethernet/marvell/octeontx2/af/rvu.c | 14 +- + drivers/net/ethernet/marvell/octeontx2/af/rvu.h | 31 +- + .../net/ethernet/marvell/octeontx2/af/rvu_nix.c | 188 +- + .../net/ethernet/marvell/octeontx2/af/rvu_npc.c | 8 +- + .../net/ethernet/marvell/octeontx2/af/rvu_reg.h | 3 + + .../net/ethernet/marvell/octeontx2/af/rvu_sdp.c | 14 +- + drivers/net/ethernet/mellanox/mlx4/cmd.c | 7 +- + drivers/net/ethernet/mellanox/mlx4/cq.c | 4 +- + drivers/net/ethernet/mellanox/mlx4/en_clock.c | 4 +- + drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 5 +- + drivers/net/ethernet/mellanox/mlx4/en_rx.c | 2 +- + drivers/net/ethernet/mellanox/mlx4/en_tx.c | 2 +- + drivers/net/ethernet/mellanox/mlx4/eq.c | 2 +- + drivers/net/ethernet/mellanox/mlx4/fw_qos.h | 8 +- + drivers/net/ethernet/mellanox/mlx4/main.c | 4 +- + drivers/net/ethernet/mellanox/mlx4/mlx4_stats.h | 2 +- + drivers/net/ethernet/mellanox/mlx4/port.c | 2 +- + drivers/net/ethernet/mellanox/mlx5/core/dev.c | 2 +- + drivers/net/ethernet/mellanox/mlx5/core/dpll.c | 32 +- + .../net/ethernet/mellanox/mlx5/core/en/params.c | 24 +- + drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c | 5 +- + .../ethernet/mellanox/mlx5/core/en_accel/ipsec.c | 26 +- + .../ethernet/mellanox/mlx5/core/en_accel/ipsec.h | 1 - + .../mellanox/mlx5/core/en_accel/ipsec_rxtx.c | 25 +- + .../mellanox/mlx5/core/en_accel/ipsec_rxtx.h | 1 - + .../mellanox/mlx5/core/en_accel/ipsec_stats.c | 1 - + drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 7 + + drivers/net/ethernet/mellanox/mlx5/core/fw.c | 6 +- + drivers/net/ethernet/mellanox/mlx5/core/health.c | 45 +- + drivers/net/ethernet/mellanox/mlx5/core/main.c | 38 +- + .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 7 - + .../net/ethernet/mellanox/mlx5/core/sf/dev/dev.c | 9 +- + .../ethernet/mellanox/mlx5/core/sf/dev/driver.c | 21 +- + .../ethernet/mellanox/mlx5/core/steering/dr_dbg.c | 734 ++++-- + .../ethernet/mellanox/mlx5/core/steering/dr_dbg.h | 20 + + .../mellanox/mlxsw/core_acl_flex_actions.c | 16 +- + .../ethernet/mellanox/mlxsw/core_acl_flex_keys.c | 9 +- + drivers/net/ethernet/mellanox/mlxsw/minimal.c | 1 - + drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 160 +- + drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 15 +- + drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c | 11 +- + .../ethernet/mellanox/mlxsw/spectrum_acl_tcam.c | 17 +- + .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 15 +- + .../ethernet/mellanox/mlxsw/spectrum_switchdev.c | 8 +- + drivers/net/ethernet/microchip/encx24j600-regmap.c | 5 +- + drivers/net/ethernet/microchip/lan743x_ethtool.c | 4 
+- + drivers/net/ethernet/microchip/lan743x_ptp.c | 4 +- + .../microchip/lan966x/lan966x_vcap_debugfs.c | 2 + + drivers/net/ethernet/microsoft/mana/gdma_main.c | 86 +- + drivers/net/ethernet/mscc/ocelot.c | 1 + + drivers/net/ethernet/nvidia/forcedeth.c | 4 +- + drivers/net/ethernet/pensando/ionic/ionic_dev.h | 11 + + .../net/ethernet/pensando/ionic/ionic_ethtool.c | 5 + + drivers/net/ethernet/pensando/ionic/ionic_lif.c | 193 +- + drivers/net/ethernet/pensando/ionic/ionic_lif.h | 13 + + drivers/net/ethernet/pensando/ionic/ionic_stats.c | 18 + + drivers/net/ethernet/pensando/ionic/ionic_txrx.c | 460 +++- + drivers/net/ethernet/pensando/ionic/ionic_txrx.h | 1 + + .../net/ethernet/qlogic/netxen/netxen_nic_init.c | 2 - + drivers/net/ethernet/qlogic/qed/qed_rdma.c | 2 - + drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 32 +- + drivers/net/ethernet/qualcomm/emac/emac.c | 1 + + drivers/net/ethernet/qualcomm/qca_7k.c | 17 +- + drivers/net/ethernet/qualcomm/qca_7k.h | 16 +- + drivers/net/ethernet/qualcomm/qca_7k_common.c | 17 +- + drivers/net/ethernet/qualcomm/qca_7k_common.h | 29 +- + drivers/net/ethernet/qualcomm/qca_debug.c | 21 +- + drivers/net/ethernet/qualcomm/qca_debug.h | 15 +- + drivers/net/ethernet/qualcomm/qca_spi.c | 71 +- + drivers/net/ethernet/qualcomm/qca_spi.h | 22 +- + drivers/net/ethernet/qualcomm/qca_uart.c | 17 +- + drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 1 + + drivers/net/ethernet/realtek/r8169.h | 4 + + drivers/net/ethernet/realtek/r8169_leds.c | 145 +- + drivers/net/ethernet/realtek/r8169_main.c | 271 ++- + drivers/net/ethernet/realtek/r8169_phy_config.c | 7 + + drivers/net/ethernet/renesas/Kconfig | 1 + + drivers/net/ethernet/renesas/ravb.h | 40 +- + drivers/net/ethernet/renesas/ravb_main.c | 993 ++++---- + drivers/net/ethernet/samsung/sxgbe/sxgbe_common.h | 1 - + drivers/net/ethernet/samsung/sxgbe/sxgbe_ethtool.c | 6 +- + drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c | 1 - + drivers/net/ethernet/sfc/efx_common.c | 2 +- + drivers/net/ethernet/sfc/falcon/efx.c | 2 +- + drivers/net/ethernet/sfc/siena/efx_common.c | 2 +- + drivers/net/ethernet/smsc/smc91x.c | 1 + + drivers/net/ethernet/smsc/smsc911x.c | 1 + + drivers/net/ethernet/smsc/smsc9420.c | 1 + + drivers/net/ethernet/stmicro/stmmac/Kconfig | 6 +- + drivers/net/ethernet/stmicro/stmmac/common.h | 3 + + .../ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c | 6 +- + .../net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 1 + + .../net/ethernet/stmicro/stmmac/dwmac-starfive.c | 32 +- + drivers/net/ethernet/stmicro/stmmac/stmmac.h | 3 + + drivers/net/ethernet/stmicro/stmmac/stmmac_est.c | 6 + + .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 6 +- + drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 69 +- + .../net/ethernet/stmicro/stmmac/stmmac_platform.c | 8 + + drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c | 87 +- + drivers/net/ethernet/ti/am65-cpsw-ethtool.c | 4 +- + drivers/net/ethernet/ti/cpsw-common.c | 1 + + drivers/net/ethernet/ti/cpsw_ethtool.c | 4 +- + drivers/net/ethernet/ti/cpsw_priv.h | 4 +- + drivers/net/ethernet/ti/icssg/icssg_ethtool.c | 4 +- + drivers/net/ethernet/ti/icssg/icssg_prueth.c | 4 - + drivers/net/ethernet/wangxun/libwx/wx_hw.c | 2 - + drivers/net/ethernet/wangxun/libwx/wx_lib.c | 20 +- + drivers/net/ethernet/wangxun/libwx/wx_type.h | 1 - + drivers/net/ethernet/wangxun/txgbe/Makefile | 1 + + drivers/net/ethernet/wangxun/txgbe/txgbe_irq.c | 269 +++ + drivers/net/ethernet/wangxun/txgbe/txgbe_irq.h | 7 + + drivers/net/ethernet/wangxun/txgbe/txgbe_main.c | 140 +- + 
drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c | 59 +- + drivers/net/ethernet/wangxun/txgbe/txgbe_phy.h | 2 + + drivers/net/ethernet/wangxun/txgbe/txgbe_type.h | 17 + + drivers/net/ethernet/wiznet/w5300.c | 3 +- + drivers/net/ethernet/xilinx/xilinx_emaclite.c | 3 +- + drivers/net/ethernet/xircom/xirc2ps_cs.c | 4 +- + drivers/net/geneve.c | 24 +- + drivers/net/gtp.c | 20 +- + drivers/net/ipa/ipa_modem.c | 96 +- + drivers/net/ipa/ipa_power.c | 71 - + drivers/net/ipa/ipa_power.h | 18 - + drivers/net/loopback.c | 1 + + drivers/net/mdio/mdio-ipq4019.c | 109 +- + drivers/net/mdio/of_mdio.c | 79 +- + drivers/net/netconsole.c | 353 ++- + drivers/net/netdevsim/bus.c | 2 +- + drivers/net/pcs/pcs-lynx.c | 1 + + drivers/net/pcs/pcs-mtk-lynxi.c | 1 + + drivers/net/pcs/pcs-xpcs.c | 1 + + drivers/net/phy/Kconfig | 7 +- + drivers/net/phy/Makefile | 2 +- + drivers/net/phy/adin1100.c | 55 + + drivers/net/phy/aquantia/aquantia_main.c | 103 + + drivers/net/phy/at803x.c | 2432 -------------------- + drivers/net/phy/broadcom.c | 3 +- + drivers/net/phy/dp83822.c | 130 +- + drivers/net/phy/dp83867.c | 22 + + drivers/net/phy/marvell.c | 2 +- + drivers/net/phy/mdio_bus.c | 46 +- + drivers/net/phy/micrel.c | 71 +- + drivers/net/phy/phy-c45.c | 113 +- + drivers/net/phy/phy.c | 11 +- + drivers/net/phy/phy_device.c | 178 +- + drivers/net/phy/phylink.c | 8 +- + drivers/net/phy/qcom/Kconfig | 30 + + drivers/net/phy/qcom/Makefile | 6 + + drivers/net/phy/qcom/at803x.c | 1106 +++++++++ + drivers/net/phy/qcom/qca807x.c | 849 +++++++ + drivers/net/phy/qcom/qca808x.c | 644 ++++++ + drivers/net/phy/qcom/qca83xx.c | 275 +++ + drivers/net/phy/qcom/qcom-phy-lib.c | 676 ++++++ + drivers/net/phy/qcom/qcom.h | 243 ++ + drivers/net/phy/realtek.c | 44 +- + drivers/net/phy/xilinx_gmii2rgmii.c | 2 +- + drivers/net/tun.c | 21 +- + drivers/net/usb/ax88179_178a.c | 20 +- + drivers/net/usb/lan78xx.c | 4 +- + drivers/net/usb/r8152.c | 28 +- + drivers/net/veth.c | 75 +- + drivers/net/vxlan/vxlan_core.c | 61 +- + drivers/net/wireless/broadcom/b43/b43.h | 16 + + drivers/net/wireless/broadcom/b43/dma.c | 4 +- + drivers/net/wireless/broadcom/b43/main.c | 16 +- + drivers/net/wireless/broadcom/b43/pio.c | 6 +- + .../broadcom/brcm80211/brcmfmac/bca/core.c | 30 +- + .../broadcom/brcm80211/brcmfmac/cfg80211.c | 64 +- + .../broadcom/brcm80211/brcmfmac/cfg80211.h | 2 + + .../wireless/broadcom/brcm80211/brcmfmac/common.c | 18 +- + .../wireless/broadcom/brcm80211/brcmfmac/core.c | 12 +- + .../wireless/broadcom/brcm80211/brcmfmac/core.h | 2 +- + .../broadcom/brcm80211/brcmfmac/cyw/core.c | 50 +- + .../wireless/broadcom/brcm80211/brcmfmac/feature.c | 11 +- + .../wireless/broadcom/brcm80211/brcmfmac/fweh.c | 154 +- + .../wireless/broadcom/brcm80211/brcmfmac/fweh.h | 60 +- + .../wireless/broadcom/brcm80211/brcmfmac/fwil.c | 116 +- + .../wireless/broadcom/brcm80211/brcmfmac/fwil.h | 125 +- + .../broadcom/brcm80211/brcmfmac/fwil_types.h | 2 +- + .../wireless/broadcom/brcm80211/brcmfmac/fwvid.c | 13 +- + .../wireless/broadcom/brcm80211/brcmfmac/fwvid.h | 48 +- + .../broadcom/brcm80211/brcmfmac/wcc/core.c | 31 +- + .../broadcom/brcm80211/brcmsmac/phy/phy_cmn.c | 3 +- + .../broadcom/brcm80211/brcmsmac/phy/phy_int.h | 2 +- + .../broadcom/brcm80211/brcmsmac/phy/phy_n.c | 11 +- + drivers/net/wireless/intel/iwlegacy/common.c | 4 +- + drivers/net/wireless/marvell/mwifiex/cfg80211.c | 2 +- + drivers/net/wireless/marvell/mwifiex/debugfs.c | 3 - + drivers/net/wireless/marvell/mwifiex/wmm.c | 2 +- + drivers/net/wireless/microchip/wilc1000/cfg80211.c | 12 +- + 
drivers/net/wireless/microchip/wilc1000/hif.c | 40 +- + drivers/net/wireless/microchip/wilc1000/netdev.c | 12 +- + drivers/net/wireless/microchip/wilc1000/wlan.c | 35 +- + drivers/net/wireless/microchip/wilc1000/wlan.h | 6 + + drivers/net/wireless/ralink/rt2x00/rt2x00crypto.c | 5 +- + drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu.h | 20 +- + .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8188e.c | 3 +- + .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8188f.c | 2 + + .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192c.c | 1 + + .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192e.c | 1 + + .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192f.c | 33 +- + .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8710b.c | 1 + + .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8723a.c | 1 + + .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8723b.c | 1 + + .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c | 409 +++- + .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_regs.h | 15 + + drivers/net/wireless/realtek/rtlwifi/efuse.c | 36 +- + drivers/net/wireless/realtek/rtlwifi/efuse.h | 4 +- + drivers/net/wireless/realtek/rtlwifi/pci.c | 12 +- + .../net/wireless/realtek/rtlwifi/rtl8192ce/trx.c | 4 - + .../net/wireless/realtek/rtlwifi/rtl8192cu/sw.c | 6 +- + .../net/wireless/realtek/rtlwifi/rtl8192cu/trx.c | 3 - + .../net/wireless/realtek/rtlwifi/rtl8192de/trx.c | 5 +- + .../net/wireless/realtek/rtlwifi/rtl8723ae/trx.c | 6 +- + drivers/net/wireless/realtek/rtlwifi/usb.c | 164 +- + drivers/net/wireless/realtek/rtlwifi/wifi.h | 38 +- + drivers/net/wireless/realtek/rtw88/debug.c | 44 +- + drivers/net/wireless/realtek/rtw88/pci.c | 4 + + drivers/net/wireless/realtek/rtw88/reg.h | 3 + + drivers/net/wireless/realtek/rtw89/cam.c | 61 + + drivers/net/wireless/realtek/rtw89/cam.h | 109 + + drivers/net/wireless/realtek/rtw89/chan.c | 2 +- + drivers/net/wireless/realtek/rtw89/core.c | 344 ++- + drivers/net/wireless/realtek/rtw89/core.h | 136 +- + drivers/net/wireless/realtek/rtw89/fw.c | 944 +++++++- + drivers/net/wireless/realtek/rtw89/fw.h | 810 ++++--- + drivers/net/wireless/realtek/rtw89/mac.c | 96 +- + drivers/net/wireless/realtek/rtw89/mac.h | 5 +- + drivers/net/wireless/realtek/rtw89/mac80211.c | 18 +- + drivers/net/wireless/realtek/rtw89/mac_be.c | 4 +- + drivers/net/wireless/realtek/rtw89/pci.c | 69 +- + drivers/net/wireless/realtek/rtw89/pci.h | 1 + + drivers/net/wireless/realtek/rtw89/phy.c | 46 +- + drivers/net/wireless/realtek/rtw89/phy.h | 72 + + drivers/net/wireless/realtek/rtw89/phy_be.c | 312 +++ + drivers/net/wireless/realtek/rtw89/reg.h | 278 ++- + drivers/net/wireless/realtek/rtw89/rtw8851b.c | 15 +- + .../net/wireless/realtek/rtw89/rtw8851b_table.c | 72 +- + drivers/net/wireless/realtek/rtw89/rtw8852a.c | 11 +- + drivers/net/wireless/realtek/rtw89/rtw8852b.c | 15 +- + .../net/wireless/realtek/rtw89/rtw8852b_table.c | 142 +- + drivers/net/wireless/realtek/rtw89/rtw8852c.c | 14 +- + drivers/net/wireless/realtek/rtw89/rtw8922a.c | 705 +++++- + drivers/net/wireless/realtek/rtw89/wow.c | 2 +- + drivers/net/wwan/t7xx/t7xx_hif_cldma.c | 47 +- + drivers/net/wwan/t7xx/t7xx_hif_cldma.h | 18 +- + drivers/net/wwan/t7xx/t7xx_modem_ops.c | 14 +- + drivers/net/wwan/t7xx/t7xx_modem_ops.h | 1 + + drivers/net/wwan/t7xx/t7xx_pci.c | 103 +- + drivers/net/wwan/t7xx/t7xx_pci.h | 14 +- + drivers/net/wwan/t7xx/t7xx_port.h | 4 + + drivers/net/wwan/t7xx/t7xx_port_proxy.c | 112 +- + drivers/net/wwan/t7xx/t7xx_port_proxy.h | 10 + + drivers/net/wwan/t7xx/t7xx_port_wwan.c | 115 +- + drivers/net/wwan/t7xx/t7xx_reg.h | 24 +- + drivers/net/wwan/t7xx/t7xx_state_monitor.c | 
134 +- + drivers/net/wwan/t7xx/t7xx_state_monitor.h | 1 + + drivers/net/wwan/wwan_core.c | 4 + + drivers/ptp/Kconfig | 12 + + drivers/ptp/Makefile | 1 + + drivers/ptp/ptp_clock.c | 16 +- + drivers/ptp/ptp_fc3.c | 1016 ++++++++ + drivers/ptp/ptp_fc3.h | 45 + + drivers/ptp/ptp_ocp.c | 311 ++- + drivers/ptp/ptp_sysfs.c | 13 +- + fs/eventpoll.c | 131 +- + include/linux/bitmap.h | 12 + + include/linux/bpf.h | 142 +- + include/linux/bpf_verifier.h | 3 +- + include/linux/btf.h | 13 + + include/linux/cpumask.h | 16 + + include/linux/dpll.h | 1 + + include/linux/ethtool.h | 17 +- + include/linux/filter.h | 3 +- + include/linux/if_tun.h | 16 +- + include/linux/inet_diag.h | 1 + + include/linux/ipv6.h | 1 + + include/linux/lsm_hook_defs.h | 15 +- + include/linux/mdio.h | 55 + + include/linux/mfd/idtRC38xxx_reg.h | 273 +++ + include/linux/mlx5/mlx5_ifc.h | 9 + + include/linux/netdevice.h | 33 +- + include/linux/phy.h | 45 +- + include/linux/phylink.h | 4 +- + include/linux/platform_data/microchip-ksz.h | 1 + + include/linux/ptp_clock_kernel.h | 3 + + include/linux/rtnetlink.h | 3 + + include/linux/security.h | 43 +- + include/linux/skbuff.h | 110 +- + include/linux/sock_diag.h | 10 +- + include/linux/stmmac.h | 1 + + include/linux/wwan.h | 2 + + include/net/act_api.h | 2 + + include/net/addrconf.h | 5 +- + include/net/af_unix.h | 22 +- + include/net/bond_3ad.h | 2 + + include/net/bond_options.h | 1 + + include/net/bonding.h | 23 + + include/net/busy_poll.h | 4 + + include/net/dsa.h | 4 +- + include/net/dst.h | 1 - + include/net/genetlink.h | 4 + + include/net/ip6_fib.h | 52 +- + include/net/ip6_route.h | 3 +- + include/net/ip_tunnels.h | 3 +- + include/net/net_namespace.h | 5 +- + include/net/netfilter/nf_tables.h | 6 + + include/net/netlabel.h | 7 +- + include/net/netmem.h | 41 + + include/net/nexthop.h | 1 + + include/net/page_pool/types.h | 13 +- + include/net/pkt_cls.h | 2 + + include/net/pkt_sched.h | 2 + + include/net/request_sock.h | 39 + + include/net/route.h | 7 +- + include/net/scm.h | 1 + + include/net/sctp/structs.h | 5 +- + include/net/sock.h | 137 +- + include/net/tcp.h | 45 + + include/net/xfrm.h | 14 +- + include/uapi/linux/bpf.h | 78 +- + include/uapi/linux/can.h | 9 +- + include/uapi/linux/can/isotp.h | 1 + + include/uapi/linux/can/raw.h | 16 + + include/uapi/linux/dpll.h | 30 + + include/uapi/linux/eventpoll.h | 13 + + include/uapi/linux/if_link.h | 1 + + include/uapi/linux/mdio.h | 2 + + include/uapi/linux/netfilter/nf_tables.h | 6 +- + include/uapi/linux/ptp_clock.h | 13 +- + include/uapi/linux/tc_act/tc_pedit.h | 2 +- + kernel/bpf/Makefile | 2 +- + kernel/bpf/arraymap.c | 2 +- + kernel/bpf/bpf_lsm.c | 15 +- + kernel/bpf/bpf_struct_ops.c | 447 ++-- + kernel/bpf/bpf_struct_ops_types.h | 12 - + kernel/bpf/btf.c | 276 ++- + kernel/bpf/cgroup.c | 6 +- + kernel/bpf/core.c | 13 +- + kernel/bpf/helpers.c | 7 +- + kernel/bpf/inode.c | 276 ++- + kernel/bpf/syscall.c | 234 +- + kernel/bpf/token.c | 278 +++ + kernel/bpf/verifier.c | 148 +- + kernel/configs/debug.config | 6 + + kernel/trace/bpf_trace.c | 17 +- + lib/bitmap.c | 7 + + lib/test_blackhole_dev.c | 3 +- + net/8021q/vlan_dev.c | 24 +- + net/8021q/vlanproc.c | 46 +- + net/Kconfig | 1 + + net/Makefile | 2 +- + net/batman-adv/distributed-arp-table.c | 3 +- + net/batman-adv/main.c | 14 +- + net/batman-adv/main.h | 2 +- + net/batman-adv/netlink.c | 1 - + net/bpf/bpf_dummy_struct_ops.c | 22 +- + net/bridge/br.c | 15 +- + net/bridge/br_device.c | 9 +- + net/bridge/br_fdb.c | 5 +- + net/bridge/br_netlink.c | 3 +- + net/bridge/br_vlan.c | 4 
+- + net/bridge/netfilter/Kconfig | 7 + + net/bridge/netfilter/Makefile | 2 +- + net/can/af_can.c | 2 + + net/can/bcm.c | 69 +- + net/can/isotp.c | 5 +- + net/can/raw.c | 104 +- + net/core/dev.c | 305 ++- + net/core/dev.h | 4 + + net/core/dst.c | 6 +- + net/core/filter.c | 155 +- + net/core/link_watch.c | 13 +- + net/core/net-procfs.c | 48 +- + net/core/net-sysfs.c | 74 +- + net/core/net_namespace.c | 33 +- + net/core/page_pool.c | 55 +- + net/core/rtnetlink.c | 97 +- + net/core/scm.c | 5 + + net/core/skbuff.c | 131 +- + net/core/sock.c | 76 +- + net/core/sock_diag.c | 120 +- + net/core/xdp.c | 6 +- + net/dccp/ackvec.c | 8 +- + net/dccp/diag.c | 1 + + net/dsa/dsa.c | 7 +- + net/dsa/tag_sja1105.c | 4 +- + net/dsa/user.c | 16 +- + net/ethtool/common.c | 5 + + net/ethtool/common.h | 1 + + net/ethtool/eee.c | 75 +- + net/ethtool/ioctl.c | 69 +- + net/ethtool/netlink.c | 14 +- + net/hsr/hsr_device.c | 28 +- + net/ieee802154/6lowpan/core.c | 1 + + net/ieee802154/socket.c | 1 + + net/ipv4/af_inet.c | 2 +- + net/ipv4/bpf_tcp_ca.c | 22 +- + net/ipv4/datagram.c | 2 +- + net/ipv4/fib_trie.c | 2 +- + net/ipv4/inet_connection_sock.c | 2 +- + net/ipv4/inet_diag.c | 101 +- + net/ipv4/inetpeer.c | 5 +- + net/ipv4/ip_gre.c | 24 +- + net/ipv4/ip_output.c | 2 +- + net/ipv4/ip_tunnel.c | 38 +- + net/ipv4/ip_vti.c | 8 +- + net/ipv4/ipip.c | 8 +- + net/ipv4/netfilter/Kconfig | 43 +- + net/ipv4/netfilter/Makefile | 2 +- + net/ipv4/nexthop.c | 38 +- + net/ipv4/raw_diag.c | 1 + + net/ipv4/syncookies.c | 40 +- + net/ipv4/tcp_ao.c | 2 +- + net/ipv4/tcp_diag.c | 1 + + net/ipv4/tcp_input.c | 25 +- + net/ipv4/udp_diag.c | 2 + + net/ipv6/addrconf.c | 113 +- + net/ipv6/ip6_fib.c | 79 +- + net/ipv6/ip6_gre.c | 14 +- + net/ipv6/ip6_tunnel.c | 13 +- + net/ipv6/ip6_vti.c | 13 +- + net/ipv6/mcast.c | 1 - + net/ipv6/ndisc.c | 13 +- + net/ipv6/netfilter/Kconfig | 20 +- + net/ipv6/netfilter/Makefile | 2 +- + net/ipv6/route.c | 27 +- + net/ipv6/sit.c | 14 +- + net/ipv6/syncookies.c | 13 +- + net/iucv/iucv.c | 15 +- + net/kcm/kcmsock.c | 7 +- + net/l2tp/l2tp_ip.c | 2 +- + net/mptcp/mptcp_diag.c | 1 + + net/mptcp/options.c | 20 +- + net/mptcp/pm.c | 2 +- + net/mptcp/pm_netlink.c | 10 +- + net/mptcp/protocol.c | 52 +- + net/mptcp/protocol.h | 8 +- + net/mptcp/sockopt.c | 2 +- + net/mptcp/subflow.c | 10 +- + net/netfilter/Kconfig | 12 +- + net/netfilter/ipvs/ip_vs_conn.c | 4 +- + net/netfilter/nf_bpf_link.c | 2 +- + net/netfilter/nf_conncount.c | 8 +- + net/netfilter/nf_conntrack_core.c | 2 +- + net/netfilter/nf_tables_api.c | 35 +- + net/netlabel/netlabel_kapi.c | 8 +- + net/netlink/diag.c | 1 + + net/netlink/genetlink.c | 30 + + net/nfc/hci/llc.c | 20 +- + net/packet/diag.c | 1 + + net/rds/connection.c | 4 +- + net/sched/Kconfig | 10 - + net/sched/act_api.c | 2 +- + net/sched/act_bpf.c | 1 + + net/sched/act_connmark.c | 1 + + net/sched/act_csum.c | 1 + + net/sched/act_ct.c | 1 + + net/sched/act_ctinfo.c | 1 + + net/sched/act_gact.c | 1 + + net/sched/act_gate.c | 1 + + net/sched/act_ife.c | 1 + + net/sched/act_mirred.c | 1 + + net/sched/act_mpls.c | 1 + + net/sched/act_nat.c | 1 + + net/sched/act_pedit.c | 3 +- + net/sched/act_police.c | 1 + + net/sched/act_sample.c | 1 + + net/sched/act_simple.c | 1 + + net/sched/act_skbedit.c | 1 + + net/sched/act_skbmod.c | 1 + + net/sched/act_tunnel_key.c | 1 + + net/sched/act_vlan.c | 1 + + net/sched/cls_api.c | 2 +- + net/sched/cls_basic.c | 1 + + net/sched/cls_bpf.c | 1 + + net/sched/cls_cgroup.c | 1 + + net/sched/cls_flow.c | 1 + + net/sched/cls_flower.c | 1 + + net/sched/cls_fw.c | 1 + + 
net/sched/cls_matchall.c | 1 + + net/sched/cls_route.c | 1 + + net/sched/cls_u32.c | 1 + + net/sched/sch_api.c | 4 +- + net/sched/sch_cake.c | 1 + + net/sched/sch_cbs.c | 1 + + net/sched/sch_choke.c | 1 + + net/sched/sch_codel.c | 33 +- + net/sched/sch_drr.c | 1 + + net/sched/sch_etf.c | 1 + + net/sched/sch_ets.c | 1 + + net/sched/sch_fq.c | 1 + + net/sched/sch_fq_codel.c | 1 + + net/sched/sch_gred.c | 1 + + net/sched/sch_hfsc.c | 1 + + net/sched/sch_hhf.c | 1 + + net/sched/sch_htb.c | 1 + + net/sched/sch_ingress.c | 3 +- + net/sched/sch_mqprio.c | 1 + + net/sched/sch_multiq.c | 1 + + net/sched/sch_netem.c | 1 + + net/sched/sch_pie.c | 1 + + net/sched/sch_plug.c | 1 + + net/sched/sch_prio.c | 1 + + net/sched/sch_qfq.c | 1 + + net/sched/sch_red.c | 1 + + net/sched/sch_sfb.c | 1 + + net/sched/sch_sfq.c | 1 + + net/sched/sch_skbprio.c | 1 + + net/sched/sch_taprio.c | 73 +- + net/sched/sch_tbf.c | 1 + + net/sctp/diag.c | 1 + + net/sctp/protocol.c | 10 +- + net/smc/af_smc.c | 22 +- + net/smc/smc.h | 4 +- + net/smc/smc_clc.c | 6 +- + net/smc/smc_clc.h | 2 +- + net/smc/smc_core.c | 4 +- + net/smc/smc_diag.c | 1 + + net/smc/smc_ism.h | 10 +- + net/tipc/Kconfig | 7 +- + net/tipc/Makefile | 4 +- + net/tipc/bearer.c | 15 +- + net/tipc/diag.c | 1 + + net/tipc/node.c | 2 - + net/tipc/socket.c | 1 - + net/unix/Kconfig | 5 - + net/unix/Makefile | 2 - + net/unix/af_unix.c | 73 +- + net/unix/diag.c | 1 + + net/unix/garbage.c | 200 +- + net/unix/scm.c | 159 -- + net/unix/scm.h | 10 - + net/vmw_vsock/diag.c | 1 + + net/xdp/xsk_diag.c | 1 + + net/xfrm/xfrm_interface_core.c | 14 +- + net/xfrm/xfrm_proc.c | 1 + + net/xfrm/xfrm_state.c | 17 +- + net/xfrm/xfrm_user.c | 2 +- + rust/kernel/net/phy.rs | 24 +- + security/security.c | 101 +- + security/selinux/hooks.c | 47 +- + tools/bpf/bpftool/link.c | 96 +- + tools/bpf/bpftool/prog.c | 2 +- + tools/include/uapi/linux/bpf.h | 79 +- + tools/include/uapi/linux/if_link.h | 1 + + tools/lib/bpf/Build | 2 +- + tools/lib/bpf/bpf.c | 42 +- + tools/lib/bpf/bpf.h | 38 +- + tools/lib/bpf/bpf_core_read.h | 2 +- + tools/lib/bpf/btf.c | 10 +- + tools/lib/bpf/elf.c | 2 - + tools/lib/bpf/features.c | 503 ++++ + tools/lib/bpf/libbpf.c | 604 ++--- + tools/lib/bpf/libbpf.h | 21 +- + tools/lib/bpf/libbpf.map | 1 + + tools/lib/bpf/libbpf_internal.h | 50 +- + tools/lib/bpf/libbpf_probes.c | 12 +- + tools/lib/bpf/str_error.h | 3 + + tools/net/ynl/Makefile.deps | 2 + + tools/net/ynl/cli.py | 22 +- + tools/net/ynl/generated/Makefile | 5 +- + tools/net/ynl/lib/nlspec.py | 9 +- + tools/net/ynl/lib/ynl.py | 209 +- + tools/net/ynl/samples/.gitignore | 1 + + tools/net/ynl/samples/ovs.c | 60 + + tools/net/ynl/ynl-gen-rst.py | 9 +- + tools/testing/selftests/Makefile | 7 +- + tools/testing/selftests/bpf/README.rst | 32 +- + tools/testing/selftests/bpf/bpf_experimental.h | 21 +- + tools/testing/selftests/bpf/bpf_kfuncs.h | 10 + + .../selftests/bpf/bpf_testmod/bpf_testmod.c | 75 + + .../selftests/bpf/bpf_testmod/bpf_testmod.h | 5 + + tools/testing/selftests/bpf/config | 1 + + .../selftests/bpf/prog_tests/bpf_verif_scale.c | 2 +- + .../testing/selftests/bpf/prog_tests/ctx_rewrite.c | 44 - + .../selftests/bpf/prog_tests/fill_link_info.c | 114 +- + .../selftests/bpf/prog_tests/kptr_xchg_inline.c | 51 + + .../selftests/bpf/prog_tests/libbpf_probes.c | 4 + + .../testing/selftests/bpf/prog_tests/libbpf_str.c | 6 + + .../testing/selftests/bpf/prog_tests/reg_bounds.c | 2 +- + .../testing/selftests/bpf/prog_tests/tc_redirect.c | 90 +- + .../bpf/prog_tests/tcp_custom_syncookie.c | 150 ++ + 
.../bpf/prog_tests/test_struct_ops_module.c | 75 + + tools/testing/selftests/bpf/prog_tests/token.c | 1052 +++++++++ + tools/testing/selftests/bpf/prog_tests/xdpwall.c | 2 +- + tools/testing/selftests/bpf/progs/bpf_misc.h | 2 +- + .../testing/selftests/bpf/progs/bpf_tracing_net.h | 16 + + tools/testing/selftests/bpf/progs/iters.c | 4 +- + .../testing/selftests/bpf/progs/kptr_xchg_inline.c | 48 + + tools/testing/selftests/bpf/progs/priv_map.c | 13 + + tools/testing/selftests/bpf/progs/priv_prog.c | 13 + + .../selftests/bpf/progs/struct_ops_module.c | 30 + + .../selftests/bpf/progs/test_core_reloc_type_id.c | 2 +- + .../selftests/bpf/progs/test_fill_link_info.c | 6 + + .../testing/selftests/bpf/progs/test_map_in_map.c | 26 + + tools/testing/selftests/bpf/progs/test_siphash.h | 64 + + .../bpf/progs/test_tcp_custom_syncookie.c | 572 +++++ + .../bpf/progs/test_tcp_custom_syncookie.h | 140 ++ + .../testing/selftests/bpf/progs/test_tcpbpf_kern.c | 2 +- + .../testing/selftests/bpf/progs/test_xdp_dynptr.c | 10 +- + tools/testing/selftests/bpf/progs/token_lsm.c | 32 + + .../bpf/progs/verifier_direct_packet_access.c | 2 +- + .../testing/selftests/bpf/progs/verifier_loops1.c | 24 + + .../selftests/bpf/progs/verifier_spill_fill.c | 229 +- + tools/testing/selftests/bpf/test_loader.c | 4 +- + tools/testing/selftests/bpf/test_maps.c | 6 +- + tools/testing/selftests/bpf/test_progs.c | 18 - + tools/testing/selftests/bpf/test_sock_addr.c | 3 +- + tools/testing/selftests/bpf/test_verifier.c | 60 +- + tools/testing/selftests/bpf/testing_helpers.c | 92 +- + tools/testing/selftests/bpf/testing_helpers.h | 8 + + .../selftests/bpf/verifier/bpf_loop_inline.c | 6 + + tools/testing/selftests/bpf/verifier/precise.c | 6 +- + .../testing/selftests/drivers/net/bonding/Makefile | 7 +- + .../drivers/net/bonding/bond-break-lacpdu-tx.sh | 19 +- + .../drivers/net/bonding/bond-eth-type-change.sh | 2 +- + .../drivers/net/bonding/bond-lladdr-target.sh | 21 +- + .../selftests/drivers/net/bonding/bond_options.sh | 38 +- + .../drivers/net/bonding/bond_topo_2d1c.sh | 8 +- + .../drivers/net/bonding/dev_addr_lists.sh | 2 +- + .../selftests/drivers/net/bonding/lag_lib.sh | 7 +- + .../drivers/net/bonding/mode-1-recovery-updelay.sh | 2 +- + .../drivers/net/bonding/mode-2-recovery-updelay.sh | 2 +- + .../drivers/net/bonding/net_forwarding_lib.sh | 1 - + tools/testing/selftests/drivers/net/dsa/Makefile | 18 +- + .../drivers/net/dsa/bridge_locked_port.sh | 2 +- + .../selftests/drivers/net/dsa/bridge_mdb.sh | 2 +- + .../selftests/drivers/net/dsa/bridge_mld.sh | 2 +- + .../selftests/drivers/net/dsa/bridge_vlan_aware.sh | 2 +- + .../selftests/drivers/net/dsa/bridge_vlan_mcast.sh | 2 +- + .../drivers/net/dsa/bridge_vlan_unaware.sh | 2 +- + tools/testing/selftests/drivers/net/dsa/lib.sh | 1 - + .../selftests/drivers/net/dsa/local_termination.sh | 2 +- + .../selftests/drivers/net/dsa/no_forwarding.sh | 2 +- + .../drivers/net/dsa/run_net_forwarding_test.sh | 9 + + .../selftests/drivers/net/dsa/tc_actions.sh | 2 +- + .../testing/selftests/drivers/net/dsa/tc_common.sh | 1 - + .../drivers/net/dsa/test_bridge_fdb_stress.sh | 2 +- + .../selftests/drivers/net/netdevsim/Makefile | 17 + + .../drivers/net/netdevsim/udp_tunnel_nic.sh | 40 +- + tools/testing/selftests/drivers/net/team/Makefile | 7 +- + .../selftests/drivers/net/team/dev_addr_lists.sh | 4 +- + .../testing/selftests/drivers/net/team/lag_lib.sh | 1 - + .../drivers/net/team/net_forwarding_lib.sh | 1 - + tools/testing/selftests/lib.mk | 19 + + tools/testing/selftests/net/Makefile | 2 + + 
tools/testing/selftests/net/fcnal-test.sh | 34 +- + tools/testing/selftests/net/fib_tests.sh | 151 +- + tools/testing/selftests/net/forwarding/Makefile | 3 + + tools/testing/selftests/net/forwarding/config | 35 + + tools/testing/selftests/net/forwarding/lib.sh | 32 +- + .../selftests/net/forwarding/mirror_gre_lib.sh | 2 +- + .../net/forwarding/mirror_gre_topo_lib.sh | 2 +- + tools/testing/selftests/net/fq_band_pktlimit.sh | 14 +- + .../selftests/net/openvswitch/openvswitch.sh | 62 + + tools/testing/selftests/net/so_txtime.c | 7 +- + tools/testing/selftests/net/txtimestamp.c | 3 +- + tools/testing/selftests/net/txtimestamp.sh | 12 +- + tools/testing/selftests/net/udpgso.c | 134 +- + tools/testing/selftests/net/udpgso.sh | 49 +- + tools/testing/selftests/tc-testing/config | 1 + + .../tc-testing/tc-tests/actions/mirred.json | 396 ++++ + .../selftests/tc-testing/tc-tests/qdiscs/fq.json | 2 +- + .../tc-testing/tc-tests/qdiscs/taprio.json | 2 + + tools/testing/selftests/tc-testing/tdc.py | 2 +- + tools/testing/selftests/tc-testing/tdc.sh | 3 +- + tools/testing/vsock/util.c | 17 +- + tools/testing/vsock/util.h | 4 + + tools/testing/vsock/vsock_diag_test.c | 23 +- + tools/testing/vsock/vsock_test.c | 102 +- + tools/testing/vsock/vsock_test_zerocopy.c | 12 +- + tools/testing/vsock/vsock_uring_test.c | 17 +- + 902 files changed, 42161 insertions(+), 13556 deletions(-) + delete mode 100644 Documentation/devicetree/bindings/net/dsa/ar9331.txt + create mode 100644 Documentation/devicetree/bindings/net/dsa/qca,ar9331.yaml + create mode 100644 Documentation/devicetree/bindings/net/ethernet-phy-package.yaml + create mode 100644 Documentation/devicetree/bindings/net/qca,qca808x.yaml + create mode 100644 Documentation/devicetree/bindings/net/qcom,qca807x.yaml + create mode 100644 Documentation/networking/device_drivers/ethernet/marvell/octeon_ep_vf.rst + create mode 100644 drivers/net/can/esd/Kconfig + create mode 100644 drivers/net/can/esd/Makefile + create mode 100644 drivers/net/can/esd/esd_402_pci-core.c + create mode 100644 drivers/net/can/esd/esdacc.c + create mode 100644 drivers/net/can/esd/esdacc.h + create mode 100644 drivers/net/dsa/realtek/realtek-mdio.h + create mode 100644 drivers/net/dsa/realtek/realtek-smi.h + create mode 100644 drivers/net/dsa/realtek/rtl83xx.c + create mode 100644 drivers/net/dsa/realtek/rtl83xx.h + create mode 100644 drivers/net/ethernet/intel/igc/igc_leds.c + create mode 100644 drivers/net/ethernet/marvell/octeon_ep_vf/Kconfig + create mode 100644 drivers/net/ethernet/marvell/octeon_ep_vf/Makefile + create mode 100644 drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_cn9k.c + create mode 100644 drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_cnxk.c + create mode 100644 drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_config.h + create mode 100644 drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_ethtool.c + create mode 100644 drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c + create mode 100644 drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.h + create mode 100644 drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_mbox.c + create mode 100644 drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_mbox.h + create mode 100644 drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_regs_cn9k.h + create mode 100644 drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_regs_cnxk.h + create mode 100644 drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_rx.c + create mode 100644 drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_rx.h + create mode 100644 
drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_tx.c + create mode 100644 drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_tx.h + create mode 100644 drivers/net/ethernet/wangxun/txgbe/txgbe_irq.c + create mode 100644 drivers/net/ethernet/wangxun/txgbe/txgbe_irq.h + delete mode 100644 drivers/net/phy/at803x.c + create mode 100644 drivers/net/phy/qcom/Kconfig + create mode 100644 drivers/net/phy/qcom/Makefile + create mode 100644 drivers/net/phy/qcom/at803x.c + create mode 100644 drivers/net/phy/qcom/qca807x.c + create mode 100644 drivers/net/phy/qcom/qca808x.c + create mode 100644 drivers/net/phy/qcom/qca83xx.c + create mode 100644 drivers/net/phy/qcom/qcom-phy-lib.c + create mode 100644 drivers/net/phy/qcom/qcom.h + create mode 100644 drivers/ptp/ptp_fc3.c + create mode 100644 drivers/ptp/ptp_fc3.h + create mode 100644 include/linux/mfd/idtRC38xxx_reg.h + create mode 100644 include/net/netmem.h + delete mode 100644 kernel/bpf/bpf_struct_ops_types.h + create mode 100644 kernel/bpf/token.c + delete mode 100644 net/unix/scm.c + delete mode 100644 net/unix/scm.h + create mode 100644 tools/lib/bpf/features.c + create mode 100644 tools/net/ynl/samples/ovs.c + create mode 100644 tools/testing/selftests/bpf/prog_tests/kptr_xchg_inline.c + create mode 100644 tools/testing/selftests/bpf/prog_tests/tcp_custom_syncookie.c + create mode 100644 tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c + create mode 100644 tools/testing/selftests/bpf/prog_tests/token.c + create mode 100644 tools/testing/selftests/bpf/progs/kptr_xchg_inline.c + create mode 100644 tools/testing/selftests/bpf/progs/priv_map.c + create mode 100644 tools/testing/selftests/bpf/progs/priv_prog.c + create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_module.c + create mode 100644 tools/testing/selftests/bpf/progs/test_siphash.h + create mode 100644 tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c + create mode 100644 tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.h + create mode 100644 tools/testing/selftests/bpf/progs/token_lsm.c + delete mode 120000 tools/testing/selftests/drivers/net/bonding/net_forwarding_lib.sh + delete mode 120000 tools/testing/selftests/drivers/net/dsa/lib.sh + create mode 100755 tools/testing/selftests/drivers/net/dsa/run_net_forwarding_test.sh + delete mode 120000 tools/testing/selftests/drivers/net/dsa/tc_common.sh + create mode 100644 tools/testing/selftests/drivers/net/netdevsim/Makefile + delete mode 120000 tools/testing/selftests/drivers/net/team/lag_lib.sh + delete mode 120000 tools/testing/selftests/drivers/net/team/net_forwarding_lib.sh +Merging bpf-next/for-next (7648f0c91eaa selftests/bpf: Remove empty TEST_CUSTOM_PROGS) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git bpf-next/for-next +Auto-merging drivers/hid/bpf/hid_bpf_dispatch.c +Auto-merging init/Kconfig +Auto-merging kernel/bpf/helpers.c +Auto-merging kernel/bpf/verifier.c +Auto-merging net/core/xdp.c +Auto-merging net/xdp/xsk.c +Auto-merging scripts/bpf_doc.py +Merge made by the 'ort' strategy. 
+ Documentation/bpf/kfuncs.rst | 8 +- + .../bpf/standardization/instruction-set.rst | 80 +++-- + Documentation/networking/af_xdp.rst | 31 +- + arch/riscv/net/bpf_jit.h | 134 +++++++++ + arch/riscv/net/bpf_jit_comp64.c | 215 +++++--------- + drivers/hid/bpf/hid_bpf_dispatch.c | 8 +- + fs/verity/measure.c | 4 +- + include/linux/bpf-cgroup.h | 3 +- + include/linux/bpf.h | 23 +- + include/linux/bpf_local_storage.h | 30 +- + include/linux/bpf_verifier.h | 10 + + include/linux/btf.h | 23 +- + include/linux/btf_ids.h | 21 +- + include/linux/filter.h | 21 +- + init/Kconfig | 5 - + kernel/bpf/Kconfig | 1 + + kernel/bpf/bpf_local_storage.c | 52 +--- + kernel/bpf/bpf_lsm.c | 6 +- + kernel/bpf/bpf_struct_ops.c | 222 ++++++++++++-- + kernel/bpf/btf.c | 275 ++++++++++++----- + kernel/bpf/cgroup.c | 3 - + kernel/bpf/cpumask.c | 4 +- + kernel/bpf/helpers.c | 16 +- + kernel/bpf/log.c | 62 +++- + kernel/bpf/map_iter.c | 4 +- + kernel/bpf/token.c | 16 +- + kernel/bpf/verifier.c | 206 +++++++++---- + kernel/cgroup/rstat.c | 4 +- + kernel/events/core.c | 6 +- + kernel/trace/bpf_trace.c | 8 +- + net/bpf/test_run.c | 8 +- + net/core/filter.c | 20 +- + net/core/xdp.c | 4 +- + net/ipv4/bpf_tcp_ca.c | 4 +- + net/ipv4/fou_bpf.c | 4 +- + net/ipv4/tcp_bbr.c | 4 +- + net/ipv4/tcp_cubic.c | 4 +- + net/ipv4/tcp_dctcp.c | 4 +- + net/netfilter/nf_conntrack_bpf.c | 4 +- + net/netfilter/nf_nat_bpf.c | 4 +- + net/xdp/xsk.c | 5 +- + net/xfrm/xfrm_interface_bpf.c | 4 +- + net/xfrm/xfrm_state_bpf.c | 4 +- + scripts/bpf_doc.py | 2 +- + tools/bpf/bpftool/gen.c | 9 +- + tools/bpf/resolve_btfids/main.c | 70 ++++- + tools/include/linux/btf_ids.h | 9 + + tools/lib/bpf/bpf.h | 24 +- + tools/lib/bpf/bpf_core_read.h | 58 +++- + tools/lib/bpf/bpf_helpers.h | 2 + + tools/lib/bpf/btf.c | 33 ++- + tools/lib/bpf/features.c | 58 ++++ + tools/lib/bpf/libbpf.c | 99 ++----- + tools/lib/bpf/libbpf.map | 5 +- + tools/lib/bpf/libbpf_internal.h | 16 + + tools/lib/bpf/linker.c | 2 +- + tools/lib/bpf/netlink.c | 4 +- + tools/testing/selftests/bpf/Makefile | 36 ++- + tools/testing/selftests/bpf/bench.c | 12 +- + tools/testing/selftests/bpf/bpf_kfuncs.h | 20 +- + .../selftests/bpf/bpf_testmod/bpf_testmod.c | 25 +- + .../selftests/bpf/bpf_testmod/bpf_testmod.h | 6 +- + tools/testing/selftests/bpf/prog_tests/cpumask.c | 6 +- + .../selftests/bpf/prog_tests/decap_sanity.c | 2 +- + .../testing/selftests/bpf/prog_tests/fib_lookup.c | 2 +- + .../selftests/bpf/prog_tests/ip_check_defrag.c | 4 +- + .../selftests/bpf/prog_tests/kptr_xchg_inline.c | 3 +- + tools/testing/selftests/bpf/prog_tests/log_fixup.c | 4 +- + .../testing/selftests/bpf/prog_tests/lwt_helpers.h | 2 - + .../selftests/bpf/prog_tests/lwt_redirect.c | 4 +- + .../testing/selftests/bpf/prog_tests/lwt_reroute.c | 3 +- + tools/testing/selftests/bpf/prog_tests/mptcp.c | 2 +- + .../selftests/bpf/prog_tests/rcu_read_lock.c | 6 + + .../selftests/bpf/prog_tests/sock_destroy.c | 2 +- + .../selftests/bpf/prog_tests/sock_iter_batch.c | 4 +- + tools/testing/selftests/bpf/prog_tests/spin_lock.c | 2 + + .../selftests/bpf/prog_tests/task_local_storage.c | 6 - + .../bpf/prog_tests/test_struct_ops_maybe_null.c | 46 +++ + .../testing/selftests/bpf/prog_tests/test_tunnel.c | 18 +- + .../selftests/bpf/prog_tests/tracing_failure.c | 37 +++ + tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + + .../selftests/bpf/progs/async_stack_depth.c | 4 +- + tools/testing/selftests/bpf/progs/bpf_compiler.h | 33 +++ + .../selftests/bpf/progs/cgrp_ls_recursion.c | 26 -- + .../selftests/bpf/progs/connect_unix_prog.c | 3 +- + 
tools/testing/selftests/bpf/progs/cpumask_common.h | 55 ++-- + .../selftests/bpf/progs/getpeername_unix_prog.c | 3 +- + .../selftests/bpf/progs/getsockname_unix_prog.c | 3 +- + tools/testing/selftests/bpf/progs/iters.c | 5 +- + tools/testing/selftests/bpf/progs/loop4.c | 4 +- + tools/testing/selftests/bpf/progs/profiler.inc.h | 17 +- + tools/testing/selftests/bpf/progs/pyperf.h | 7 +- + tools/testing/selftests/bpf/progs/rcu_read_lock.c | 120 ++++++++ + .../selftests/bpf/progs/recvmsg_unix_prog.c | 3 +- + .../selftests/bpf/progs/sendmsg_unix_prog.c | 3 +- + .../selftests/bpf/progs/sk_storage_omem_uncharge.c | 4 +- + .../testing/selftests/bpf/progs/sock_iter_batch.c | 4 +- + tools/testing/selftests/bpf/progs/strobemeta.h | 18 +- + .../selftests/bpf/progs/struct_ops_maybe_null.c | 29 ++ + .../bpf/progs/struct_ops_maybe_null_fail.c | 24 ++ + .../selftests/bpf/progs/struct_ops_module.c | 3 +- + .../selftests/bpf/progs/task_ls_recursion.c | 17 -- + .../selftests/bpf/progs/test_cls_redirect.c | 7 +- + .../selftests/bpf/progs/test_cls_redirect_dynptr.c | 2 + + .../selftests/bpf/progs/test_global_func1.c | 8 +- + .../bpf/progs/test_global_func_ctx_args.c | 19 ++ + .../selftests/bpf/progs/test_lwt_seg6local.c | 6 +- + .../selftests/bpf/progs/test_ptr_untrusted.c | 6 +- + tools/testing/selftests/bpf/progs/test_seg6_loop.c | 4 +- + tools/testing/selftests/bpf/progs/test_skb_ctx.c | 4 +- + tools/testing/selftests/bpf/progs/test_spin_lock.c | 65 +++++ + .../selftests/bpf/progs/test_spin_lock_fail.c | 44 +++ + .../selftests/bpf/progs/test_sysctl_loop1.c | 6 +- + .../selftests/bpf/progs/test_sysctl_loop2.c | 6 +- + .../testing/selftests/bpf/progs/test_sysctl_prog.c | 6 +- + tools/testing/selftests/bpf/progs/test_tc_tunnel.c | 5 +- + tools/testing/selftests/bpf/progs/test_xdp.c | 3 +- + tools/testing/selftests/bpf/progs/test_xdp_loop.c | 3 +- + .../selftests/bpf/progs/test_xdp_noinline.c | 5 +- + .../testing/selftests/bpf/progs/tracing_failure.c | 20 ++ + tools/testing/selftests/bpf/progs/type_cast.c | 13 +- + .../selftests/bpf/progs/verifier_global_ptr_args.c | 182 ++++++++++++ + .../selftests/bpf/progs/verifier_global_subprogs.c | 29 ++ + .../selftests/bpf/progs/verifier_spill_fill.c | 324 ++++++++++++++++++++- + .../selftests/bpf/progs/verifier_spin_lock.c | 2 +- + .../selftests/bpf/progs/xdp_synproxy_kern.c | 6 +- + tools/testing/selftests/bpf/progs/xdping_kern.c | 3 +- + tools/testing/selftests/bpf/test_progs.h | 7 +- + tools/testing/selftests/bpf/trace_helpers.c | 2 +- + 129 files changed, 2576 insertions(+), 815 deletions(-) + create mode 100644 tools/testing/selftests/bpf/prog_tests/test_struct_ops_maybe_null.c + create mode 100644 tools/testing/selftests/bpf/prog_tests/tracing_failure.c + create mode 100644 tools/testing/selftests/bpf/progs/bpf_compiler.h + create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_maybe_null.c + create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_maybe_null_fail.c + create mode 100644 tools/testing/selftests/bpf/progs/tracing_failure.c + create mode 100644 tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c +Merging ipsec-next/master (1476de6d2b57 xfrm: Simplify the allocation of slab caches in xfrm_policy_init) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git ipsec-next/master +Auto-merging net/ipv6/xfrm6_tunnel.c +Auto-merging net/xfrm/xfrm_policy.c +Merge made by the 'ort' strategy. 
+ net/ipv6/xfrm6_tunnel.c | 5 +- + net/xfrm/xfrm_policy.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++-- + 2 files changed, 142 insertions(+), 10 deletions(-) +Merging mlx5-next/mlx5-next (d727d27db536 RDMA/mlx5: Expose register c0 for RDMA device) +$ git merge -m Merge branch 'mlx5-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git mlx5-next/mlx5-next +Already up to date. +Merging netfilter-next/main (219eee9c0d16 net: skbuff: add overflow debug check to pull/push helpers) +$ git merge -m Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next.git netfilter-next/main +Already up to date. +Merging ipvs-next/main (f77581bfda24 Merge branch 'add-multi-buff-support-for-xdp-running-in-generic-mode') +$ git merge -m Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs-next.git ipvs-next/main +Already up to date. +Merging bluetooth/master (25956d989c60 Bluetooth: hci_sync: Fix UAF on create_le_conn_complete) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git bluetooth/master +Merge made by the 'ort' strategy. + drivers/bluetooth/btbcm.c | 12 +- + drivers/bluetooth/btintel.c | 2 +- + drivers/bluetooth/btnxpuart.c | 24 +- + drivers/bluetooth/btqca.c | 2 +- + drivers/bluetooth/btrtl.c | 14 + + drivers/bluetooth/btusb.c | 8 + + drivers/bluetooth/hci_bcm4377.c | 3 +- + drivers/bluetooth/hci_qca.c | 22 +- + include/net/bluetooth/hci.h | 6 +- + include/net/bluetooth/hci_core.h | 25 +- + include/net/bluetooth/hci_sync.h | 22 +- + include/net/bluetooth/l2cap.h | 44 +- + net/bluetooth/6lowpan.c | 4 +- + net/bluetooth/Kconfig | 8 - + net/bluetooth/Makefile | 1 - + net/bluetooth/a2mp.c | 1054 ------------------------------------ + net/bluetooth/a2mp.h | 154 ------ + net/bluetooth/amp.c | 590 --------------------- + net/bluetooth/amp.h | 60 --- + net/bluetooth/bnep/core.c | 2 +- + net/bluetooth/hci_conn.c | 168 +----- + net/bluetooth/hci_core.c | 175 ++++-- + net/bluetooth/hci_event.c | 246 ++------- + net/bluetooth/hci_request.c | 2 +- + net/bluetooth/hci_sock.c | 4 +- + net/bluetooth/hci_sync.c | 334 ++++++++++-- + net/bluetooth/l2cap_core.c | 1087 +------------------------------------- + net/bluetooth/l2cap_sock.c | 21 +- + net/bluetooth/mgmt.c | 116 +--- + net/bluetooth/rfcomm/core.c | 2 +- + net/bluetooth/sco.c | 3 +- + 31 files changed, 674 insertions(+), 3541 deletions(-) + delete mode 100644 net/bluetooth/a2mp.c + delete mode 100644 net/bluetooth/a2mp.h + delete mode 100644 net/bluetooth/amp.c + delete mode 100644 net/bluetooth/amp.h +Merging wireless-next/for-next (dd66185c23f7 wifi: wilc1000: add missing read critical sections around vif list traversal) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git wireless-next/for-next +Auto-merging drivers/net/wireless/intel/iwlwifi/mvm/mvm.h +CONFLICT (content): Merge conflict in drivers/net/wireless/intel/iwlwifi/mvm/mvm.h +Auto-merging net/wireless/nl80211.c +Resolved 'drivers/net/wireless/intel/iwlwifi/mvm/mvm.h' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +[master ad7c35747e14] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git +$ git diff -M --stat --summary HEAD^.. 
+ drivers/bcma/main.c | 2 +- + drivers/net/wireless/admtek/adm8211.c | 4 + + drivers/net/wireless/ath/ar5523/ar5523.c | 4 + + drivers/net/wireless/ath/ath10k/core.c | 4 +- + drivers/net/wireless/ath/ath10k/coredump.h | 8 +- + drivers/net/wireless/ath/ath10k/htt.c | 3 +- + drivers/net/wireless/ath/ath10k/htt.h | 12 +- + drivers/net/wireless/ath/ath10k/mac.c | 10 +- + drivers/net/wireless/ath/ath10k/pci.c | 10 +- + drivers/net/wireless/ath/ath10k/wmi-tlv.c | 11 +- + drivers/net/wireless/ath/ath10k/wmi-tlv.h | 4 +- + drivers/net/wireless/ath/ath10k/wmi.c | 24 +- + drivers/net/wireless/ath/ath10k/wmi.h | 62 +- + drivers/net/wireless/ath/ath11k/core.h | 41 + + drivers/net/wireless/ath/ath11k/dp.c | 20 +- + drivers/net/wireless/ath/ath11k/dp_tx.c | 6 +- + drivers/net/wireless/ath/ath11k/hal.c | 19 +- + drivers/net/wireless/ath/ath11k/hal.h | 3 +- + drivers/net/wireless/ath/ath11k/hal_rx.c | 4 +- + drivers/net/wireless/ath/ath11k/mac.c | 1088 ++++++-- + drivers/net/wireless/ath/ath11k/mac.h | 5 +- + drivers/net/wireless/ath/ath11k/mhi.c | 4 +- + drivers/net/wireless/ath/ath11k/pci.c | 19 +- + drivers/net/wireless/ath/ath11k/pci.h | 3 +- + drivers/net/wireless/ath/ath11k/reg.c | 267 +- + drivers/net/wireless/ath/ath11k/reg.h | 11 +- + drivers/net/wireless/ath/ath11k/testmode.c | 2 +- + drivers/net/wireless/ath/ath11k/wmi.c | 298 +- + drivers/net/wireless/ath/ath11k/wmi.h | 65 +- + drivers/net/wireless/ath/ath12k/core.c | 217 +- + drivers/net/wireless/ath/ath12k/core.h | 57 +- + drivers/net/wireless/ath/ath12k/dp.h | 3 +- + drivers/net/wireless/ath/ath12k/dp_mon.c | 4 +- + drivers/net/wireless/ath/ath12k/dp_rx.c | 8 +- + drivers/net/wireless/ath/ath12k/dp_tx.c | 10 +- + drivers/net/wireless/ath/ath12k/hal_desc.h | 20 +- + drivers/net/wireless/ath/ath12k/hal_rx.c | 15 +- + drivers/net/wireless/ath/ath12k/hw.c | 9 + + drivers/net/wireless/ath/ath12k/hw.h | 23 +- + drivers/net/wireless/ath/ath12k/mac.c | 868 ++++-- + drivers/net/wireless/ath/ath12k/mac.h | 4 +- + drivers/net/wireless/ath/ath12k/pci.c | 10 + + drivers/net/wireless/ath/ath12k/qmi.c | 377 ++- + drivers/net/wireless/ath/ath12k/qmi.h | 34 +- + drivers/net/wireless/ath/ath12k/reg.c | 13 +- + drivers/net/wireless/ath/ath12k/trace.h | 29 +- + drivers/net/wireless/ath/ath12k/wmi.c | 97 +- + drivers/net/wireless/ath/ath5k/mac80211-ops.c | 4 + + drivers/net/wireless/ath/ath6kl/cfg80211.c | 2 +- + drivers/net/wireless/ath/ath9k/ahb.c | 6 +- + drivers/net/wireless/ath/ath9k/antenna.c | 2 +- + drivers/net/wireless/ath/ath9k/ar9003_phy.h | 9 - + drivers/net/wireless/ath/ath9k/beacon.c | 2 +- + drivers/net/wireless/ath/ath9k/htc_drv_beacon.c | 2 +- + drivers/net/wireless/ath/ath9k/htc_drv_main.c | 4 + + drivers/net/wireless/ath/ath9k/main.c | 4 + + drivers/net/wireless/ath/ath9k/reg_aic.h | 4 - + drivers/net/wireless/ath/carl9170/main.c | 4 + + drivers/net/wireless/ath/wcn36xx/main.c | 4 + + drivers/net/wireless/atmel/at76c50x-usb.c | 4 + + drivers/net/wireless/broadcom/b43/main.c | 4 + + drivers/net/wireless/broadcom/b43legacy/main.c | 4 + + .../broadcom/brcm80211/brcmfmac/cfg80211.c | 2 +- + .../wireless/broadcom/brcm80211/brcmfmac/fwil.c | 2 +- + .../net/wireless/broadcom/brcm80211/brcmsmac/led.c | 1 - + .../broadcom/brcm80211/brcmsmac/mac80211_if.c | 4 + + .../broadcom/brcm80211/brcmsmac/phy/phy_cmn.c | 3 +- + .../broadcom/brcm80211/brcmsmac/phy_shim.c | 5 +- + .../broadcom/brcm80211/brcmsmac/phy_shim.h | 2 +- + drivers/net/wireless/intel/iwlegacy/3945-mac.c | 4 + + drivers/net/wireless/intel/iwlegacy/4965-mac.c | 4 + + 
drivers/net/wireless/intel/iwlwifi/Kconfig | 9 + + drivers/net/wireless/intel/iwlwifi/Makefile | 3 + + drivers/net/wireless/intel/iwlwifi/cfg/ax210.c | 4 +- + drivers/net/wireless/intel/iwlwifi/cfg/bz.c | 9 +- + drivers/net/wireless/intel/iwlwifi/cfg/sc.c | 40 +- + drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c | 4 + + drivers/net/wireless/intel/iwlwifi/fw/acpi.c | 623 +---- + drivers/net/wireless/intel/iwlwifi/fw/acpi.h | 220 +- + drivers/net/wireless/intel/iwlwifi/fw/api/coex.h | 14 +- + drivers/net/wireless/intel/iwlwifi/fw/api/d3.h | 2 +- + .../net/wireless/intel/iwlwifi/fw/api/datapath.h | 2 +- + .../net/wireless/intel/iwlwifi/fw/api/dbg-tlv.h | 8 +- + drivers/net/wireless/intel/iwlwifi/fw/api/debug.h | 2 +- + .../net/wireless/intel/iwlwifi/fw/api/location.h | 1 + + .../net/wireless/intel/iwlwifi/fw/api/mac-cfg.h | 23 +- + drivers/net/wireless/intel/iwlwifi/fw/api/mac.h | 10 +- + .../net/wireless/intel/iwlwifi/fw/api/nvm-reg.h | 28 +- + .../net/wireless/intel/iwlwifi/fw/api/phy-ctxt.h | 16 +- + drivers/net/wireless/intel/iwlwifi/fw/api/power.h | 40 +- + drivers/net/wireless/intel/iwlwifi/fw/api/sta.h | 4 +- + drivers/net/wireless/intel/iwlwifi/fw/api/tx.h | 4 + + drivers/net/wireless/intel/iwlwifi/fw/dbg.c | 27 +- + drivers/net/wireless/intel/iwlwifi/fw/dbg.h | 2 - + drivers/net/wireless/intel/iwlwifi/fw/error-dump.h | 23 +- + drivers/net/wireless/intel/iwlwifi/fw/file.h | 27 +- + drivers/net/wireless/intel/iwlwifi/fw/pnvm.c | 49 +- + drivers/net/wireless/intel/iwlwifi/fw/regulatory.c | 500 ++++ + drivers/net/wireless/intel/iwlwifi/fw/regulatory.h | 199 ++ + drivers/net/wireless/intel/iwlwifi/fw/runtime.h | 22 +- + drivers/net/wireless/intel/iwlwifi/fw/uefi.c | 427 ++- + drivers/net/wireless/intel/iwlwifi/fw/uefi.h | 210 +- + drivers/net/wireless/intel/iwlwifi/iwl-config.h | 22 +- + drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c | 67 +- + drivers/net/wireless/intel/iwlwifi/iwl-drv.c | 28 +- + drivers/net/wireless/intel/iwlwifi/iwl-drv.h | 10 +- + .../net/wireless/intel/iwlwifi/iwl-eeprom-parse.c | 2 +- + drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c | 75 +- + drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.h | 2 +- + drivers/net/wireless/intel/iwlwifi/iwl-op-mode.h | 4 +- + drivers/net/wireless/intel/iwlwifi/iwl-prph.h | 9 +- + drivers/net/wireless/intel/iwlwifi/iwl-trans.h | 69 +- + drivers/net/wireless/intel/iwlwifi/mvm/coex.c | 132 + + drivers/net/wireless/intel/iwlwifi/mvm/constants.h | 3 + + drivers/net/wireless/intel/iwlwifi/mvm/d3.c | 133 +- + .../net/wireless/intel/iwlwifi/mvm/debugfs-vif.c | 54 +- + drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c | 10 +- + .../net/wireless/intel/iwlwifi/mvm/ftm-initiator.c | 9 +- + .../net/wireless/intel/iwlwifi/mvm/ftm-responder.c | 17 +- + drivers/net/wireless/intel/iwlwifi/mvm/fw.c | 346 +-- + drivers/net/wireless/intel/iwlwifi/mvm/link.c | 26 +- + drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c | 36 +- + drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 212 +- + drivers/net/wireless/intel/iwlwifi/mvm/mld-key.c | 36 +- + drivers/net/wireless/intel/iwlwifi/mvm/mld-mac.c | 11 +- + .../net/wireless/intel/iwlwifi/mvm/mld-mac80211.c | 162 +- + drivers/net/wireless/intel/iwlwifi/mvm/mvm.h | 81 +- + drivers/net/wireless/intel/iwlwifi/mvm/nvm.c | 2 +- + drivers/net/wireless/intel/iwlwifi/mvm/ops.c | 100 +- + drivers/net/wireless/intel/iwlwifi/mvm/phy-ctxt.c | 16 +- + drivers/net/wireless/intel/iwlwifi/mvm/power.c | 29 +- + drivers/net/wireless/intel/iwlwifi/mvm/rs-fw.c | 16 +- + drivers/net/wireless/intel/iwlwifi/mvm/rs.c | 2 + + 
drivers/net/wireless/intel/iwlwifi/mvm/rx.c | 13 +- + drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c | 51 +- + drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 8 +- + drivers/net/wireless/intel/iwlwifi/mvm/sf.c | 5 +- + drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 34 +- + drivers/net/wireless/intel/iwlwifi/mvm/sta.h | 3 +- + .../net/wireless/intel/iwlwifi/mvm/time-event.c | 190 +- + drivers/net/wireless/intel/iwlwifi/mvm/tx.c | 79 +- + drivers/net/wireless/intel/iwlwifi/mvm/utils.c | 2 + + .../wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c | 2 +- + .../net/wireless/intel/iwlwifi/pcie/ctxt-info.c | 4 +- + drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 49 +- + drivers/net/wireless/intel/iwlwifi/pcie/trans.c | 11 +- + drivers/net/wireless/intel/iwlwifi/queue/tx.c | 16 +- + drivers/net/wireless/intel/iwlwifi/tests/Makefile | 7 + + drivers/net/wireless/intel/iwlwifi/tests/devinfo.c | 54 + + drivers/net/wireless/intel/iwlwifi/tests/module.c | 10 + + drivers/net/wireless/intersil/p54/main.c | 4 + + drivers/net/wireless/marvell/libertas/cmd.c | 13 +- + drivers/net/wireless/marvell/libertas_tf/main.c | 4 + + drivers/net/wireless/marvell/mwifiex/11h.c | 2 +- + drivers/net/wireless/marvell/mwifiex/11n.c | 12 +- + drivers/net/wireless/marvell/mwifiex/debugfs.c | 19 +- + drivers/net/wireless/marvell/mwifiex/fw.h | 2 +- + drivers/net/wireless/marvell/mwifiex/main.h | 2 - + drivers/net/wireless/marvell/mwifiex/scan.c | 14 +- + drivers/net/wireless/marvell/mwl8k.c | 4 + + drivers/net/wireless/mediatek/mt76/mac80211.c | 2 +- + drivers/net/wireless/mediatek/mt76/mt7603/main.c | 4 + + drivers/net/wireless/mediatek/mt76/mt7615/mcu.c | 2 +- + drivers/net/wireless/mediatek/mt76/mt76x0/pci.c | 4 + + drivers/net/wireless/mediatek/mt76/mt76x0/usb.c | 4 + + .../net/wireless/mediatek/mt76/mt76x2/pci_main.c | 4 + + .../net/wireless/mediatek/mt76/mt76x2/usb_main.c | 4 + + drivers/net/wireless/mediatek/mt76/mt7915/mcu.c | 6 +- + drivers/net/wireless/mediatek/mt76/mt792x_core.c | 7 +- + drivers/net/wireless/mediatek/mt76/mt7996/main.c | 4 + + drivers/net/wireless/mediatek/mt76/mt7996/mcu.c | 2 +- + drivers/net/wireless/mediatek/mt7601u/main.c | 4 + + drivers/net/wireless/microchip/wilc1000/cfg80211.c | 2 +- + drivers/net/wireless/microchip/wilc1000/hif.c | 70 +- + drivers/net/wireless/microchip/wilc1000/netdev.c | 83 +- + drivers/net/wireless/microchip/wilc1000/netdev.h | 6 + + drivers/net/wireless/microchip/wilc1000/spi.c | 75 +- + drivers/net/wireless/microchip/wilc1000/wlan.c | 7 +- + drivers/net/wireless/microchip/wilc1000/wlan.h | 5 + + drivers/net/wireless/purelifi/plfxlc/mac.c | 5 +- + drivers/net/wireless/quantenna/qtnfmac/event.c | 2 +- + drivers/net/wireless/ralink/rt2x00/rt2400pci.c | 4 + + drivers/net/wireless/ralink/rt2x00/rt2500pci.c | 4 + + drivers/net/wireless/ralink/rt2x00/rt2500usb.c | 4 + + drivers/net/wireless/ralink/rt2x00/rt2800pci.c | 4 + + drivers/net/wireless/ralink/rt2x00/rt2800soc.c | 4 + + drivers/net/wireless/ralink/rt2x00/rt2800usb.c | 4 + + drivers/net/wireless/ralink/rt2x00/rt61pci.c | 4 + + drivers/net/wireless/ralink/rt2x00/rt73usb.c | 4 + + drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c | 4 + + drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c | 4 + + drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu.h | 8 +- + .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c | 209 +- + drivers/net/wireless/realtek/rtlwifi/core.c | 4 + + .../net/wireless/realtek/rtlwifi/rtl8192cu/def.h | 2 +- + .../net/wireless/realtek/rtlwifi/rtl8192cu/mac.c | 3 +- + .../net/wireless/realtek/rtlwifi/rtl8192cu/sw.c | 
1 - + .../net/wireless/realtek/rtlwifi/rtl8192cu/trx.c | 77 +- + .../net/wireless/realtek/rtlwifi/rtl8192cu/trx.h | 1 - + drivers/net/wireless/realtek/rtlwifi/usb.c | 31 +- + drivers/net/wireless/realtek/rtlwifi/usb.h | 2 + + drivers/net/wireless/realtek/rtlwifi/wifi.h | 1 - + drivers/net/wireless/realtek/rtw88/mac80211.c | 4 + + drivers/net/wireless/realtek/rtw89/chan.c | 646 ++++- + drivers/net/wireless/realtek/rtw89/chan.h | 5 + + drivers/net/wireless/realtek/rtw89/coex.h | 1 + + drivers/net/wireless/realtek/rtw89/core.c | 40 +- + drivers/net/wireless/realtek/rtw89/core.h | 127 +- + drivers/net/wireless/realtek/rtw89/efuse.h | 1 + + drivers/net/wireless/realtek/rtw89/efuse_be.c | 142 + + drivers/net/wireless/realtek/rtw89/fw.c | 1438 +++++++++- + drivers/net/wireless/realtek/rtw89/fw.h | 666 ++++- + drivers/net/wireless/realtek/rtw89/mac.c | 200 +- + drivers/net/wireless/realtek/rtw89/mac.h | 54 +- + drivers/net/wireless/realtek/rtw89/mac80211.c | 1 + + drivers/net/wireless/realtek/rtw89/mac_be.c | 193 +- + drivers/net/wireless/realtek/rtw89/pci.c | 67 +- + drivers/net/wireless/realtek/rtw89/pci.h | 6 +- + drivers/net/wireless/realtek/rtw89/pci_be.c | 5 + + drivers/net/wireless/realtek/rtw89/phy.c | 1061 ++++++- + drivers/net/wireless/realtek/rtw89/phy.h | 41 + + drivers/net/wireless/realtek/rtw89/phy_be.c | 19 + + drivers/net/wireless/realtek/rtw89/ps.c | 10 +- + drivers/net/wireless/realtek/rtw89/reg.h | 212 +- + drivers/net/wireless/realtek/rtw89/rtw8851b.c | 2 + + drivers/net/wireless/realtek/rtw89/rtw8851be.c | 2 + + drivers/net/wireless/realtek/rtw89/rtw8852a.c | 2 + + drivers/net/wireless/realtek/rtw89/rtw8852ae.c | 1 + + drivers/net/wireless/realtek/rtw89/rtw8852b.c | 2 + + drivers/net/wireless/realtek/rtw89/rtw8852be.c | 1 + + drivers/net/wireless/realtek/rtw89/rtw8852c.c | 2 + + drivers/net/wireless/realtek/rtw89/rtw8852ce.c | 1 + + drivers/net/wireless/realtek/rtw89/rtw8922a.c | 999 ++++++- + drivers/net/wireless/realtek/rtw89/rtw8922a_rfk.c | 378 +++ + drivers/net/wireless/realtek/rtw89/rtw8922a_rfk.h | 18 + + drivers/net/wireless/realtek/rtw89/rtw8922ae.c | 1 + + drivers/net/wireless/rsi/rsi_91x_mac80211.c | 8 +- + drivers/net/wireless/rsi/rsi_91x_usb.c | 12 +- + drivers/net/wireless/silabs/wfx/sta.c | 19 +- + drivers/net/wireless/st/cw1200/cw1200_sdio.c | 42 +- + drivers/net/wireless/st/cw1200/cw1200_spi.c | 79 +- + drivers/net/wireless/st/cw1200/main.c | 4 + + drivers/net/wireless/ti/wl1251/main.c | 4 + + drivers/net/wireless/ti/wlcore/event.c | 2 +- + drivers/net/wireless/ti/wlcore/main.c | 6 +- + drivers/net/wireless/ti/wlcore/sdio.c | 1 - + drivers/net/wireless/virtual/mac80211_hwsim.c | 101 +- + drivers/net/wireless/virtual/mac80211_hwsim.h | 5 +- + drivers/net/wireless/zydas/zd1211rw/zd_def.h | 2 +- + drivers/net/wireless/zydas/zd1211rw/zd_mac.c | 4 + + drivers/ssb/main.c | 2 +- + drivers/staging/vt6655/device_main.c | 6 +- + drivers/staging/vt6656/main_usb.c | 6 +- + include/linux/ieee80211.h | 120 +- + include/linux/platform_data/brcmfmac.h | 2 +- + include/linux/platform_data/net-cw1200.h | 4 - + include/net/cfg80211.h | 100 +- + include/net/mac80211.h | 118 +- + include/uapi/linux/nl80211.h | 71 +- + net/mac80211/Makefile | 2 +- + net/mac80211/agg-tx.c | 2 +- + net/mac80211/cfg.c | 232 +- + net/mac80211/chan.c | 707 +++-- + net/mac80211/debug.h | 18 +- + net/mac80211/driver-ops.h | 19 + + net/mac80211/ht.c | 6 +- + net/mac80211/ibss.c | 55 +- + net/mac80211/ieee80211_i.h | 182 +- + net/mac80211/iface.c | 30 +- + net/mac80211/key.c | 4 + + net/mac80211/link.c | 13 
+- + net/mac80211/main.c | 231 +- + net/mac80211/mesh.c | 162 +- + net/mac80211/mesh.h | 3 +- + net/mac80211/mesh_plink.c | 28 +- + net/mac80211/mlme.c | 2952 +++++++++++--------- + net/mac80211/ocb.c | 5 +- + net/mac80211/offchannel.c | 21 +- + net/mac80211/parse.c | 926 ++++++ + net/mac80211/rate.c | 12 +- + net/mac80211/rx.c | 53 +- + net/mac80211/scan.c | 42 +- + net/mac80211/spectmgmt.c | 337 ++- + net/mac80211/sta_info.c | 21 +- + net/mac80211/sta_info.h | 14 +- + net/mac80211/tdls.c | 67 +- + net/mac80211/tests/elems.c | 1 + + net/mac80211/trace.h | 119 +- + net/mac80211/trace_msg.h | 2 - + net/mac80211/tx.c | 46 +- + net/mac80211/util.c | 1803 ++++-------- + net/mac80211/vht.c | 6 +- + net/mac80211/wpa.c | 33 +- + net/wireless/chan.c | 397 +-- + net/wireless/core.h | 52 +- + net/wireless/mlme.c | 146 +- + net/wireless/nl80211.c | 233 +- + net/wireless/reg.c | 10 +- + net/wireless/scan.c | 192 +- + net/wireless/sme.c | 3 +- + net/wireless/tests/Makefile | 2 +- + net/wireless/tests/chan.c | 228 ++ + net/wireless/tests/scan.c | 277 +- + net/wireless/trace.h | 52 +- + net/wireless/util.c | 76 + + tools/testing/kunit/configs/all_tests.config | 5 + + 306 files changed, 18996 insertions(+), 7362 deletions(-) + create mode 100644 drivers/net/wireless/intel/iwlwifi/fw/regulatory.c + create mode 100644 drivers/net/wireless/intel/iwlwifi/fw/regulatory.h + create mode 100644 drivers/net/wireless/intel/iwlwifi/tests/Makefile + create mode 100644 drivers/net/wireless/intel/iwlwifi/tests/devinfo.c + create mode 100644 drivers/net/wireless/intel/iwlwifi/tests/module.c + create mode 100644 drivers/net/wireless/realtek/rtw89/rtw8922a_rfk.c + create mode 100644 drivers/net/wireless/realtek/rtw89/rtw8922a_rfk.h + create mode 100644 net/mac80211/parse.c + create mode 100644 net/wireless/tests/chan.c +Merging wpan-next/master (2373699560a7 mac802154: Avoid new associations while disassociating) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan-next.git wpan-next/master +Already up to date. +Merging wpan-staging/staging (2373699560a7 mac802154: Avoid new associations while disassociating) +$ git merge -m Merge branch 'staging' of git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan-next.git wpan-staging/staging +Already up to date. +Merging mtd/mtd/next (18af7e357033 mtd: flashchip: explicitly include ) +$ git merge -m Merge branch 'mtd/next' of git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git mtd/mtd/next +Merge made by the 'ort' strategy. + Documentation/devicetree/bindings/mtd/mtd.yaml | 2 ++ + drivers/mtd/ssfdc.c | 7 ++++--- + include/linux/mtd/flashchip.h | 1 + + 3 files changed, 7 insertions(+), 3 deletions(-) +Merging nand/nand/next (4bd14b2fd8a8 mtd: spinand: esmt: Extend IDs to 5 bytes) +$ git merge -m Merge branch 'nand/next' of git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git nand/nand/next +Merge made by the 'ort' strategy. + drivers/mtd/nand/raw/fsl_elbc_nand.c | 3 ++- + drivers/mtd/nand/spi/esmt.c | 9 ++++++--- + drivers/mtd/nand/spi/winbond.c | 12 ++++++++++++ + include/linux/mtd/spinand.h | 2 +- + 4 files changed, 21 insertions(+), 5 deletions(-) +Merging spi-nor/spi-nor/next (3c0e1dfa703c MAINTAINERS: change my mail to the kernel.org one) +$ git merge -m Merge branch 'spi-nor/next' of git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git spi-nor/spi-nor/next +Already up to date. 
+Merging crypto/master (7d42e097607c crypto: qat - resolve race condition during AER recovery) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git crypto/master +Auto-merging MAINTAINERS +Auto-merging drivers/crypto/virtio/virtio_crypto_akcipher_algs.c +Merge made by the 'ort' strategy. + Documentation/ABI/testing/debugfs-driver-qat | 26 ++ + Documentation/ABI/testing/debugfs-hisi-hpre | 22 ++ + Documentation/ABI/testing/debugfs-hisi-sec | 22 ++ + Documentation/ABI/testing/debugfs-hisi-zip | 22 ++ + Documentation/ABI/testing/sysfs-driver-qat | 20 ++ + .../bindings/crypto/qcom,inline-crypto-engine.yaml | 1 + + .../devicetree/bindings/crypto/qcom-qce.yaml | 1 + + MAINTAINERS | 18 +- + arch/arm64/crypto/Kconfig | 1 + + arch/arm64/crypto/aes-ce-ccm-core.S | 265 ++++++++------------- + arch/arm64/crypto/aes-ce-ccm-glue.c | 154 ++++++++---- + arch/arm64/crypto/aes-glue.c | 1 + + arch/powerpc/crypto/Kconfig | 20 ++ + arch/powerpc/crypto/Makefile | 20 +- + {drivers/crypto/vmx => arch/powerpc/crypto}/aes.c | 0 + .../crypto/vmx => arch/powerpc/crypto}/aes_cbc.c | 0 + .../crypto/vmx => arch/powerpc/crypto}/aes_ctr.c | 0 + .../crypto/vmx => arch/powerpc/crypto}/aes_xts.c | 0 + .../crypto/vmx => arch/powerpc/crypto}/aesp8-ppc.h | 0 + .../vmx => arch/powerpc/crypto}/aesp8-ppc.pl | 0 + .../crypto/vmx => arch/powerpc/crypto}/ghash.c | 0 + .../vmx => arch/powerpc/crypto}/ghashp8-ppc.pl | 0 + {drivers/crypto/vmx => arch/powerpc/crypto}/vmx.c | 0 + crypto/ahash.c | 21 +- + crypto/asymmetric_keys/verify_pefile.c | 4 +- + crypto/pcbc.c | 4 +- + crypto/rsa.c | 36 ++- + crypto/testmgr.c | 8 - + drivers/crypto/Kconfig | 14 +- + drivers/crypto/Makefile | 2 +- + drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c | 2 +- + drivers/crypto/ccp/psp-dev.c | 11 +- + drivers/crypto/hisilicon/debugfs.c | 58 +++++ + drivers/crypto/hisilicon/hpre/hpre_main.c | 2 +- + drivers/crypto/hisilicon/qm.c | 184 +++++++++----- + drivers/crypto/hisilicon/sec2/sec_crypto.c | 33 +-- + drivers/crypto/hisilicon/sec2/sec_main.c | 7 +- + drivers/crypto/hisilicon/zip/zip_main.c | 2 +- + drivers/crypto/intel/iaa/iaa_crypto.h | 25 -- + drivers/crypto/intel/iaa/iaa_crypto_comp_fixed.c | 1 - + drivers/crypto/intel/iaa/iaa_crypto_main.c | 108 +-------- + drivers/crypto/intel/iaa/iaa_crypto_stats.c | 2 - + drivers/crypto/intel/qat/Kconfig | 14 ++ + drivers/crypto/intel/qat/qat_common/Makefile | 2 + + .../intel/qat/qat_common/adf_accel_devices.h | 2 + + drivers/crypto/intel/qat/qat_common/adf_aer.c | 138 ++++++++++- + .../crypto/intel/qat/qat_common/adf_cfg_strings.h | 1 + + .../crypto/intel/qat/qat_common/adf_common_drv.h | 10 + + .../crypto/intel/qat/qat_common/adf_gen4_hw_data.c | 3 + + .../crypto/intel/qat/qat_common/adf_heartbeat.c | 20 +- + .../crypto/intel/qat/qat_common/adf_heartbeat.h | 21 ++ + .../intel/qat/qat_common/adf_heartbeat_dbgfs.c | 52 ++++ + .../intel/qat/qat_common/adf_heartbeat_inject.c | 76 ++++++ + .../crypto/intel/qat/qat_common/adf_hw_arbiter.c | 25 ++ + drivers/crypto/intel/qat/qat_common/adf_init.c | 12 + + drivers/crypto/intel/qat/qat_common/adf_isr.c | 9 +- + drivers/crypto/intel/qat/qat_common/adf_pfvf_msg.h | 7 +- + .../crypto/intel/qat/qat_common/adf_pfvf_pf_msg.c | 64 ++++- + .../crypto/intel/qat/qat_common/adf_pfvf_pf_msg.h | 21 ++ + .../intel/qat/qat_common/adf_pfvf_pf_proto.c | 8 + + .../intel/qat/qat_common/adf_pfvf_vf_proto.c | 6 + + drivers/crypto/intel/qat/qat_common/adf_rl.c | 20 +- + drivers/crypto/intel/qat/qat_common/adf_sriov.c | 38 ++- + 
drivers/crypto/intel/qat/qat_common/adf_sysfs.c | 37 +++ + .../crypto/virtio/virtio_crypto_akcipher_algs.c | 12 +- + drivers/crypto/virtio/virtio_crypto_core.c | 2 - + drivers/crypto/vmx/.gitignore | 3 - + drivers/crypto/vmx/Kconfig | 14 -- + drivers/crypto/vmx/Makefile | 23 -- + drivers/crypto/vmx/ppc-xlate.pl | 231 ------------------ + drivers/crypto/xilinx/zynqmp-aes-gcm.c | 3 + + include/crypto/internal/hash.h | 2 - + include/crypto/public_key.h | 1 + + include/linux/hisi_acc_qm.h | 10 +- + 74 files changed, 1213 insertions(+), 791 deletions(-) + rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes.c (100%) + rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes_cbc.c (100%) + rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes_ctr.c (100%) + rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes_xts.c (100%) + rename {drivers/crypto/vmx => arch/powerpc/crypto}/aesp8-ppc.h (100%) + rename {drivers/crypto/vmx => arch/powerpc/crypto}/aesp8-ppc.pl (100%) + rename {drivers/crypto/vmx => arch/powerpc/crypto}/ghash.c (100%) + rename {drivers/crypto/vmx => arch/powerpc/crypto}/ghashp8-ppc.pl (100%) + rename {drivers/crypto/vmx => arch/powerpc/crypto}/vmx.c (100%) + create mode 100644 drivers/crypto/intel/qat/qat_common/adf_heartbeat_inject.c + delete mode 100644 drivers/crypto/vmx/.gitignore + delete mode 100644 drivers/crypto/vmx/Kconfig + delete mode 100644 drivers/crypto/vmx/Makefile + delete mode 100644 drivers/crypto/vmx/ppc-xlate.pl +Merging drm/drm-next (9ac4beb7578a Merge tag 'drm-misc-next-2024-02-15' of git://anongit.freedesktop.org/drm/drm-misc into drm-next) +$ git merge -m Merge branch 'drm-next' of git://git.freedesktop.org/git/drm/drm.git drm/drm-next +Auto-merging MAINTAINERS +Auto-merging drivers/gpu/drm/amd/amdgpu/amdgpu.h +Auto-merging drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +Auto-merging drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +Auto-merging drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +Auto-merging drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +Auto-merging drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +Auto-merging drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +Auto-merging drivers/gpu/drm/amd/amdkfd/kfd_topology.c +Auto-merging drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +Auto-merging drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c +Auto-merging drivers/gpu/drm/amd/display/dc/core/dc.c +Auto-merging drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp_cm.c +Auto-merging drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c +Auto-merging drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c +Auto-merging drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c +Auto-merging drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.h +Auto-merging drivers/gpu/drm/amd/display/dc/hwss/dcn21/dcn21_hwseq.c +CONFLICT (content): Merge conflict in drivers/gpu/drm/amd/display/dc/hwss/dcn21/dcn21_hwseq.c +Auto-merging drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_init.c +Auto-merging drivers/gpu/drm/amd/display/dc/hwss/hw_sequencer.h +Auto-merging drivers/gpu/drm/amd/display/dc/hwss/hw_sequencer_private.h +CONFLICT (content): Merge conflict in drivers/gpu/drm/amd/display/dc/hwss/hw_sequencer_private.h +Auto-merging drivers/gpu/drm/amd/display/dc/inc/resource.h +Auto-merging drivers/gpu/drm/amd/display/dc/link/link_validation.c +Auto-merging drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training.c +Auto-merging drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c +CONFLICT (content): Merge conflict in drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c 
+Auto-merging drivers/gpu/drm/drm_crtc.c +Auto-merging drivers/gpu/drm/drm_gem_vram_helper.c +Auto-merging drivers/gpu/drm/i915/display/intel_dp.c +Auto-merging drivers/gpu/drm/meson/meson_encoder_hdmi.c +Auto-merging drivers/gpu/drm/nouveau/nouveau_svm.c +Auto-merging drivers/gpu/drm/scheduler/sched_main.c +Auto-merging drivers/gpu/drm/xe/xe_gt.c +Resolved 'drivers/gpu/drm/amd/display/dc/hwss/dcn21/dcn21_hwseq.c' using previous resolution. +Resolved 'drivers/gpu/drm/amd/display/dc/hwss/hw_sequencer_private.h' using previous resolution. +Resolved 'drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +[master 4517ecaaca37] Merge branch 'drm-next' of git://git.freedesktop.org/git/drm/drm.git +$ git diff -M --stat --summary HEAD^.. + .../display/bridge/fsl,imx8mp-hdmi-tx.yaml | 102 +++ + .../bindings/display/bridge/ti,sn65dsi86.yaml | 2 +- + .../bindings/display/imx/fsl,imx8mp-hdmi-pvi.yaml | 84 ++ + .../display/panel/boe,th101mb31ig002-28a.yaml | 58 ++ + .../bindings/display/panel/novatek,nt35510.yaml | 4 +- + .../bindings/display/panel/novatek,nt36672e.yaml | 66 ++ + .../bindings/display/panel/panel-lvds.yaml | 2 + + .../bindings/display/panel/panel-simple.yaml | 2 + + .../display/panel/rocktech,jh057n00900.yaml | 3 + + .../bindings/display/panel/visionox,rm69299.yaml | 3 +- + .../display/rockchip/rockchip,dw-hdmi.yaml | 33 +- + .../bindings/display/solomon,ssd1307fb.yaml | 20 +- + .../bindings/display/solomon,ssd132x.yaml | 12 +- + .../bindings/display/solomon,ssd133x.yaml | 45 ++ + Documentation/gpu/amdgpu/dgpu-asic-info-table.csv | 2 + + Documentation/gpu/amdgpu/display/dcn-blocks.rst | 78 ++ + .../gpu/amdgpu/display/display-contributing.rst | 168 ++++ + .../gpu/amdgpu/display/display-manager.rst | 3 - + Documentation/gpu/amdgpu/display/index.rst | 78 +- + Documentation/gpu/drm-internals.rst | 12 - + Documentation/gpu/introduction.rst | 2 + + Documentation/gpu/rfc/index.rst | 4 - + Documentation/gpu/rfc/xe.rst | 234 ------ + Documentation/gpu/todo.rst | 23 + + MAINTAINERS | 2 +- + arch/powerpc/platforms/ps3/Kconfig | 1 + + drivers/accel/qaic/mhi_controller.c | 4 +- + drivers/accel/qaic/qaic.h | 3 +- + drivers/accel/qaic/qaic_data.c | 59 +- + drivers/accel/qaic/qaic_drv.c | 144 ++-- + drivers/dma-buf/dma-fence.c | 8 +- + drivers/dma-buf/dma-resv.c | 4 +- + drivers/firmware/Kconfig | 1 + + drivers/firmware/sysfb.c | 51 +- + drivers/firmware/sysfb_simplefb.c | 5 +- + drivers/gpu/drm/Kconfig | 19 +- + drivers/gpu/drm/amd/amdgpu/Makefile | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu.h | 14 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 879 +++++++++++++++++++++ + drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 202 +++++ + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 5 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 12 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 39 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c | 24 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c | 3 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.h | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 6 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 12 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 4 + + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 6 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 35 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 7 +- + 
drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 55 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 4 + + drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 7 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 17 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 33 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h | 1 + + drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 8 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 6 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 186 +++-- + drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 9 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 689 ++++++++++++---- + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 60 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c | 70 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h | 9 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 11 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 155 +++- + drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 10 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_umr.h | 4 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 83 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 7 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c | 3 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 69 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 18 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 81 +- + drivers/gpu/drm/amd/amdgpu/atom.c | 41 +- + drivers/gpu/drm/amd/amdgpu/atom.h | 2 +- + drivers/gpu/drm/amd/amdgpu/atombios_crtc.c | 28 +- + drivers/gpu/drm/amd/amdgpu/atombios_dp.c | 4 +- + drivers/gpu/drm/amd/amdgpu/atombios_encoders.c | 16 +- + drivers/gpu/drm/amd/amdgpu/atombios_i2c.c | 4 +- + drivers/gpu/drm/amd/amdgpu/clearstate_gfx9.h | 27 +- + drivers/gpu/drm/amd/amdgpu/clearstate_si.h | 24 +- + drivers/gpu/drm/amd/amdgpu/dce_v10_0.c | 2 + + drivers/gpu/drm/amd/amdgpu/dce_v11_0.c | 2 + + drivers/gpu/drm/amd/amdgpu/dce_v6_0.c | 22 +- + drivers/gpu/drm/amd/amdgpu/dce_v8_0.c | 22 +- + drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 2 +- + drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 16 +- + drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 2 +- + drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 4 +- + drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 4 +- + drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 2 +- + drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c | 5 +- + drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 92 ++- + drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +- + drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 9 +- + drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c | 5 +- + drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c | 5 +- + drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 5 +- + drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 12 +- + drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c | 10 +- + drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 87 ++ + drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 3 +- + drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 29 +- + drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h | 1 + + drivers/gpu/drm/amd/amdgpu/navi10_ih.c | 3 +- + drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 99 +-- + drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c | 15 +- + drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 72 ++ + drivers/gpu/drm/amd/amdgpu/ta_ras_if.h | 36 + + drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 252 ++++-- + drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 3 + + drivers/gpu/drm/amd/amdgpu/umc_v6_0.c | 2 +- + drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 93 ++- + drivers/gpu/drm/amd/amdkfd/kfd_crat.h | 1 + + drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 4 +- + drivers/gpu/drm/amd/amdkfd/kfd_events.c | 6 +- + 
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 7 +- + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c | 7 +- + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 7 +- + drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 10 +- + drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 3 + + drivers/gpu/drm/amd/display/TODO | 110 --- + drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 139 +++- + drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 1 + + .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c | 72 +- + .../drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c | 55 +- + .../drm/amd/display/amdgpu_dm/amdgpu_dm_replay.c | 119 +-- + .../drm/amd/display/amdgpu_dm/amdgpu_dm_replay.h | 4 +- + drivers/gpu/drm/amd/display/dc/basics/conversion.c | 34 + + drivers/gpu/drm/amd/display/dc/basics/conversion.h | 4 + + .../gpu/drm/amd/display/dc/bios/command_table.c | 2 +- + .../gpu/drm/amd/display/dc/bios/command_table2.c | 2 +- + .../drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c | 2 - + .../dc/clk_mgr/dcn21/rn_clk_mgr_vbios_smu.c | 4 - + .../drm/amd/display/dc/clk_mgr/dcn301/dcn301_smu.c | 4 - + .../drm/amd/display/dc/clk_mgr/dcn31/dcn31_smu.c | 4 - + .../drm/amd/display/dc/clk_mgr/dcn314/dcn314_smu.c | 6 - + .../drm/amd/display/dc/clk_mgr/dcn315/dcn315_smu.c | 4 - + .../drm/amd/display/dc/clk_mgr/dcn316/dcn316_smu.c | 4 - + .../amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c | 42 +- + .../dc/clk_mgr/dcn32/dcn32_clk_mgr_smu_msg.h | 3 +- + .../amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c | 40 +- + .../drm/amd/display/dc/clk_mgr/dcn35/dcn35_smu.c | 15 + + drivers/gpu/drm/amd/display/dc/core/dc.c | 68 +- + drivers/gpu/drm/amd/display/dc/core/dc_resource.c | 54 +- + drivers/gpu/drm/amd/display/dc/core/dc_stream.c | 18 + + drivers/gpu/drm/amd/display/dc/core/dc_surface.c | 2 + + drivers/gpu/drm/amd/display/dc/dc.h | 10 +- + drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c | 31 +- + drivers/gpu/drm/amd/display/dc/dc_hw_types.h | 3 +- + drivers/gpu/drm/amd/display/dc/dce/dce_audio.c | 299 ++++++- + drivers/gpu/drm/amd/display/dc/dce/dce_audio.h | 3 +- + drivers/gpu/drm/amd/display/dc/dce/dmub_replay.c | 4 +- + .../gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c | 20 + + .../gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.h | 4 +- + drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.c | 3 +- + drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.h | 3 + + .../gpu/drm/amd/display/dc/dcn10/dcn10_dpp_cm.c | 70 +- + drivers/gpu/drm/amd/display/dc/dcn10/dcn10_opp.c | 7 + + drivers/gpu/drm/amd/display/dc/dcn20/dcn20_dpp.c | 31 +- + drivers/gpu/drm/amd/display/dc/dcn20/dcn20_dpp.h | 3 + + .../gpu/drm/amd/display/dc/dcn20/dcn20_dpp_cm.c | 55 ++ + drivers/gpu/drm/amd/display/dc/dcn20/dcn20_mpc.c | 24 +- + drivers/gpu/drm/amd/display/dc/dcn201/dcn201_dpp.c | 1 + + drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.c | 38 +- + drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.h | 2 + + .../gpu/drm/amd/display/dc/dcn30/dcn30_dpp_cm.c | 54 ++ + drivers/gpu/drm/amd/display/dc/dcn30/dcn30_mpc.c | 106 ++- + drivers/gpu/drm/amd/display/dc/dcn30/dcn30_mpc.h | 4 + + drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dpp.c | 1 + + drivers/gpu/drm/amd/display/dc/dm_cp_psp.h | 3 + + .../amd/display/dc/dml/dcn30/display_mode_vba_30.c | 16 +- + .../gpu/drm/amd/display/dc/dml/dcn303/dcn303_fpu.c | 11 + + .../gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c | 4 +- + .../amd/display/dc/dml2/dml2_dc_resource_mgmt.c | 41 +- + drivers/gpu/drm/amd/display/dc/dml2/dml2_utils.c | 2 +- + drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c | 5 + + .../drm/amd/display/dc/hwss/dce110/dce110_hwseq.c | 56 +- + 
.../drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c | 97 ++- + .../drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c | 114 ++- + .../drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.h | 2 + + .../drm/amd/display/dc/hwss/dcn30/dcn30_hwseq.c | 167 +++- + .../drm/amd/display/dc/hwss/dcn30/dcn30_hwseq.h | 6 +- + .../gpu/drm/amd/display/dc/hwss/dcn30/dcn30_init.c | 2 +- + .../drm/amd/display/dc/hwss/dcn31/dcn31_hwseq.c | 20 +- + .../drm/amd/display/dc/hwss/dcn31/dcn31_hwseq.h | 4 + + .../gpu/drm/amd/display/dc/hwss/dcn31/dcn31_init.c | 2 +- + .../drm/amd/display/dc/hwss/dcn314/dcn314_init.c | 2 +- + .../gpu/drm/amd/display/dc/hwss/dcn32/dcn32_init.c | 2 +- + .../drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c | 21 +- + .../drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.h | 3 + + .../gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c | 2 +- + .../drm/amd/display/dc/hwss/dcn351/dcn351_init.c | 2 +- + drivers/gpu/drm/amd/display/dc/hwss/hw_sequencer.h | 2 + + .../drm/amd/display/dc/hwss/hw_sequencer_private.h | 2 - + drivers/gpu/drm/amd/display/dc/inc/core_types.h | 2 + + drivers/gpu/drm/amd/display/dc/inc/hw/audio.h | 3 +- + .../drm/amd/display/dc/inc/hw/clk_mgr_internal.h | 6 + + drivers/gpu/drm/amd/display/dc/inc/hw/dchubbub.h | 6 + + drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h | 39 + + drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h | 15 +- + drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h | 253 ++++-- + drivers/gpu/drm/amd/display/dc/inc/hw/opp.h | 16 + + .../drm/amd/display/dc/inc/hw/timing_generator.h | 2 - + drivers/gpu/drm/amd/display/dc/inc/resource.h | 4 + + .../drm/amd/display/dc/link/hwss/link_hwss_dio.h | 10 + + .../gpu/drm/amd/display/dc/link/link_detection.c | 18 + + drivers/gpu/drm/amd/display/dc/link/link_dpms.c | 58 ++ + .../gpu/drm/amd/display/dc/link/link_validation.c | 2 - + .../display/dc/link/protocols/link_dp_training.c | 5 +- + .../link_dp_training_fixed_vs_pe_retimer.c | 372 +-------- + .../link_dp_training_fixed_vs_pe_retimer.h | 5 - + .../dc/link/protocols/link_edp_panel_control.c | 2 +- + .../amd/display/dc/resource/dcn30/dcn30_resource.c | 11 + + .../amd/display/dc/resource/dcn31/dcn31_resource.c | 2 - + .../display/dc/resource/dcn321/dcn321_resource.c | 1 + + .../amd/display/dc/resource/dcn35/dcn35_resource.c | 4 +- + drivers/gpu/drm/amd/display/dmub/dmub_srv.h | 16 +- + drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h | 30 +- + drivers/gpu/drm/amd/display/dmub/src/dmub_dcn32.c | 2 - + drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c | 121 ++- + drivers/gpu/drm/amd/display/include/audio_types.h | 15 + + drivers/gpu/drm/amd/include/amd_shared.h | 1 + + drivers/gpu/drm/amd/include/arct_ip_offset.h | 6 +- + .../amd/include/asic_reg/dcn/dcn_3_1_6_offset.h | 4 + + .../amd/include/asic_reg/dcn/dcn_3_1_6_sh_mask.h | 10 + + .../amd/include/asic_reg/dcn/dcn_3_5_0_offset.h | 24 + + .../amd/include/asic_reg/dcn/dcn_3_5_0_sh_mask.h | 65 ++ + drivers/gpu/drm/amd/include/atom-bits.h | 2 +- + drivers/gpu/drm/amd/include/beige_goby_ip_offset.h | 6 +- + drivers/gpu/drm/amd/include/cgs_common.h | 23 +- + .../gpu/drm/amd/include/cyan_skillfish_ip_offset.h | 6 +- + .../drm/amd/include/dimgrey_cavefish_ip_offset.h | 6 +- + drivers/gpu/drm/amd/include/dm_pp_interface.h | 9 +- + drivers/gpu/drm/amd/include/kgd_pp_interface.h | 6 +- + drivers/gpu/drm/amd/include/navi12_ip_offset.h | 6 +- + drivers/gpu/drm/amd/include/navi14_ip_offset.h | 6 +- + drivers/gpu/drm/amd/include/pptable.h | 6 +- + drivers/gpu/drm/amd/include/renoir_ip_offset.h | 6 +- + .../gpu/drm/amd/include/sienna_cichlid_ip_offset.h | 6 +- + 
drivers/gpu/drm/amd/include/v10_structs.h | 3 +- + drivers/gpu/drm/amd/include/vangogh_ip_offset.h | 6 +- + drivers/gpu/drm/amd/include/vega10_ip_offset.h | 6 +- + drivers/gpu/drm/amd/include/vega20_ip_offset.h | 78 +- + .../gpu/drm/amd/pm/powerplay/hwmgr/ppatomctrl.c | 42 +- + .../gpu/drm/amd/pm/powerplay/hwmgr/ppatomfwctrl.c | 4 +- + drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c | 16 +- + drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c | 2 +- + drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 18 +- + .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 164 +++- + drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c | 2 +- + drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 9 +- + drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h | 10 + + drivers/gpu/drm/bridge/adv7511/adv7511_drv.c | 47 +- + drivers/gpu/drm/bridge/analogix/anx7625.c | 30 +- + .../gpu/drm/bridge/cadence/cdns-mhdp8546-core.c | 28 +- + drivers/gpu/drm/bridge/chrontel-ch7033.c | 12 +- + drivers/gpu/drm/bridge/display-connector.c | 8 +- + drivers/gpu/drm/bridge/imx/Kconfig | 18 + + drivers/gpu/drm/bridge/imx/Makefile | 2 + + drivers/gpu/drm/bridge/imx/imx8mp-hdmi-pvi.c | 207 +++++ + drivers/gpu/drm/bridge/imx/imx8mp-hdmi-tx.c | 154 ++++ + drivers/gpu/drm/bridge/ite-it6505.c | 21 +- + drivers/gpu/drm/bridge/ite-it66121.c | 16 +- + drivers/gpu/drm/bridge/lontium-lt8912b.c | 20 +- + drivers/gpu/drm/bridge/lontium-lt9611.c | 9 +- + drivers/gpu/drm/bridge/lontium-lt9611uxc.c | 19 +- + .../drm/bridge/megachips-stdpxxxx-ge-b850v3-fw.c | 18 +- + drivers/gpu/drm/bridge/nxp-ptn3460.c | 22 +- + drivers/gpu/drm/bridge/samsung-dsim.c | 18 +- + drivers/gpu/drm/bridge/sii902x.c | 38 +- + drivers/gpu/drm/bridge/simple-bridge.c | 17 +- + drivers/gpu/drm/bridge/synopsys/dw-hdmi.c | 44 +- + drivers/gpu/drm/bridge/tc358767.c | 193 +++-- + drivers/gpu/drm/bridge/ti-sn65dsi86.c | 8 +- + drivers/gpu/drm/bridge/ti-tfp410.c | 18 +- + drivers/gpu/drm/ci/build.sh | 1 + + drivers/gpu/drm/ci/gitlab-ci.yml | 14 +- + drivers/gpu/drm/ci/test.yml | 31 +- + drivers/gpu/drm/ci/testlist.txt | 49 ++ + drivers/gpu/drm/ci/xfails/msm-apq8016-fails.txt | 3 +- + drivers/gpu/drm/ci/xfails/msm-sc7180-fails.txt | 30 - + drivers/gpu/drm/ci/xfails/msm-sc7180-flakes.txt | 17 - + drivers/gpu/drm/ci/xfails/msm-sc7180-skips.txt | 7 - + .../xfails/msm-sc7180-trogdor-kingoftown-fails.txt | 17 + + .../msm-sc7180-trogdor-lazor-limozeen-fails.txt | 17 + + drivers/gpu/drm/ci/xfails/msm-sdm845-fails.txt | 5 +- + drivers/gpu/drm/ci/xfails/msm-sdm845-flakes.txt | 28 +- + drivers/gpu/drm/ci/xfails/msm-sdm845-skips.txt | 7 +- + drivers/gpu/drm/display/drm_dp_aux_bus.c | 2 +- + drivers/gpu/drm/display/drm_dp_helper.c | 17 +- + drivers/gpu/drm/display/drm_dp_mst_topology.c | 23 +- + drivers/gpu/drm/drm_bridge.c | 17 +- + drivers/gpu/drm/drm_bridge_connector.c | 16 +- + drivers/gpu/drm/drm_crtc.c | 8 +- + drivers/gpu/drm/drm_debugfs.c | 4 - + drivers/gpu/drm/drm_edid.c | 25 +- + drivers/gpu/drm/drm_exec.c | 2 +- + drivers/gpu/drm/drm_gem_vram_helper.c | 2 - + drivers/gpu/drm/drm_ioc32.c | 4 +- + drivers/gpu/drm/drm_managed.c | 39 + + drivers/gpu/drm/drm_mipi_dsi.c | 2 +- + drivers/gpu/drm/drm_mode_config.c | 2 +- + drivers/gpu/drm/drm_modes.c | 22 + + drivers/gpu/drm/drm_modeset_lock.c | 2 +- + drivers/gpu/drm/drm_panel_orientation_quirks.c | 12 + + drivers/gpu/drm/drm_print.c | 29 +- + drivers/gpu/drm/drm_probe_helper.c | 36 - + drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.h | 1 - + drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_vdac.c | 1 + + drivers/gpu/drm/i915/display/i9xx_plane.c | 30 + + 
drivers/gpu/drm/i915/display/i9xx_plane.h | 7 + + drivers/gpu/drm/i915/display/intel_atomic_plane.c | 6 +- + drivers/gpu/drm/i915/display/intel_backlight.c | 2 +- + drivers/gpu/drm/i915/display/intel_bios.c | 36 +- + drivers/gpu/drm/i915/display/intel_bios.h | 5 +- + drivers/gpu/drm/i915/display/intel_cdclk.c | 361 +++++---- + drivers/gpu/drm/i915/display/intel_crt.c | 5 + + drivers/gpu/drm/i915/display/intel_crtc.c | 128 +-- + .../gpu/drm/i915/display/intel_crtc_state_dump.c | 5 +- + drivers/gpu/drm/i915/display/intel_cursor.c | 63 +- + drivers/gpu/drm/i915/display/intel_cx0_phy.c | 231 +++--- + drivers/gpu/drm/i915/display/intel_cx0_phy_regs.h | 63 +- + drivers/gpu/drm/i915/display/intel_ddi.c | 67 +- + drivers/gpu/drm/i915/display/intel_display.c | 105 ++- + drivers/gpu/drm/i915/display/intel_display_core.h | 18 +- + .../gpu/drm/i915/display/intel_display_debugfs.c | 26 +- + .../i915/display/intel_display_debugfs_params.c | 1 + + .../gpu/drm/i915/display/intel_display_device.c | 2 +- + .../gpu/drm/i915/display/intel_display_driver.c | 168 +++- + .../gpu/drm/i915/display/intel_display_driver.h | 6 + + drivers/gpu/drm/i915/display/intel_display_irq.c | 10 +- + drivers/gpu/drm/i915/display/intel_display_types.h | 38 +- + drivers/gpu/drm/i915/display/intel_dmc.c | 2 +- + drivers/gpu/drm/i915/display/intel_dp.c | 192 ++--- + drivers/gpu/drm/i915/display/intel_dp.h | 10 +- + drivers/gpu/drm/i915/display/intel_dp_aux.c | 29 +- + drivers/gpu/drm/i915/display/intel_dp_mst.c | 4 + + drivers/gpu/drm/i915/display/intel_dpll_mgr.c | 83 +- + drivers/gpu/drm/i915/display/intel_dpll_mgr.h | 18 +- + drivers/gpu/drm/i915/display/intel_dsb.c | 4 + + drivers/gpu/drm/i915/display/intel_dvo.c | 5 + + drivers/gpu/drm/i915/display/intel_fbc.c | 13 +- + drivers/gpu/drm/i915/display/intel_fbdev_fb.c | 5 +- + drivers/gpu/drm/i915/display/intel_global_state.c | 137 +++- + drivers/gpu/drm/i915/display/intel_global_state.h | 9 +- + drivers/gpu/drm/i915/display/intel_gmbus.c | 5 +- + drivers/gpu/drm/i915/display/intel_hdcp.c | 78 +- + drivers/gpu/drm/i915/display/intel_hdcp_gsc.c | 2 +- + drivers/gpu/drm/i915/display/intel_hdcp_regs.h | 28 +- + drivers/gpu/drm/i915/display/intel_hdmi.c | 16 +- + drivers/gpu/drm/i915/display/intel_hotplug.c | 165 +++- + drivers/gpu/drm/i915/display/intel_hotplug.h | 4 + + drivers/gpu/drm/i915/display/intel_hotplug_irq.c | 6 +- + drivers/gpu/drm/i915/display/intel_opregion.c | 176 +++-- + drivers/gpu/drm/i915/display/intel_opregion.h | 47 +- + drivers/gpu/drm/i915/display/intel_panel.c | 4 + + drivers/gpu/drm/i915/display/intel_plane_initial.c | 255 ++++-- + drivers/gpu/drm/i915/display/intel_plane_initial.h | 4 +- + drivers/gpu/drm/i915/display/intel_pps.c | 2 +- + drivers/gpu/drm/i915/display/intel_psr.c | 202 +++-- + drivers/gpu/drm/i915/display/intel_psr.h | 6 - + drivers/gpu/drm/i915/display/intel_psr_regs.h | 63 ++ + drivers/gpu/drm/i915/display/intel_sdvo.c | 6 + + drivers/gpu/drm/i915/display/intel_tc.c | 40 +- + drivers/gpu/drm/i915/display/intel_tc.h | 2 +- + drivers/gpu/drm/i915/display/intel_tv.c | 7 +- + drivers/gpu/drm/i915/display/intel_vblank.c | 130 +++ + drivers/gpu/drm/i915/display/intel_vblank.h | 12 + + drivers/gpu/drm/i915/display/skl_universal_plane.c | 28 + + drivers/gpu/drm/i915/display/skl_universal_plane.h | 2 + + drivers/gpu/drm/i915/display/skl_watermark.c | 54 +- + drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 8 - + drivers/gpu/drm/i915/gem/i915_gem_pm.c | 10 + + drivers/gpu/drm/i915/gem/i915_gem_region.c | 2 +- + 
drivers/gpu/drm/i915/gem/i915_gem_stolen.c | 25 +- + drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 50 +- + drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 42 - + drivers/gpu/drm/i915/gem/i915_gem_userptr.h | 14 - + drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c | 18 +- + drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 4 +- + drivers/gpu/drm/i915/gt/intel_engine_cs.c | 3 +- + drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c | 6 +- + drivers/gpu/drm/i915/gt/intel_ggtt.c | 10 +- + drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c | 18 +- + drivers/gpu/drm/i915/gt/intel_gtt.c | 3 +- + drivers/gpu/drm/i915/gt/intel_mocs.c | 2 +- + drivers/gpu/drm/i915/gt/intel_rc6.c | 2 +- + drivers/gpu/drm/i915/gt/intel_region_lmem.c | 14 +- + drivers/gpu/drm/i915/gt/intel_reset.c | 3 +- + drivers/gpu/drm/i915/gt/intel_workarounds.c | 30 +- + drivers/gpu/drm/i915/gt/selftest_context.c | 3 +- + .../gpu/drm/i915/gt/selftest_engine_heartbeat.c | 10 +- + drivers/gpu/drm/i915/gt/selftest_rc6.c | 4 +- + drivers/gpu/drm/i915/gt/selftest_tlb.c | 4 +- + drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c | 21 +- + drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c | 10 +- + drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 126 ++- + drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h | 2 + + drivers/gpu/drm/i915/gt/uc/intel_huc.c | 64 +- + drivers/gpu/drm/i915/gt/uc/intel_uc.c | 4 +- + drivers/gpu/drm/i915/gvt/kvmgt.c | 2 +- + drivers/gpu/drm/i915/i915_debugfs.c | 2 +- + drivers/gpu/drm/i915/i915_driver.c | 28 +- + drivers/gpu/drm/i915/i915_drv.h | 8 - + drivers/gpu/drm/i915/i915_gem.c | 5 - + drivers/gpu/drm/i915/i915_gpu_error.c | 2 +- + drivers/gpu/drm/i915/i915_perf.c | 2 +- + drivers/gpu/drm/i915/i915_query.c | 35 +- + drivers/gpu/drm/i915/i915_reg.h | 18 +- + drivers/gpu/drm/i915/i915_syncmap.c | 19 +- + drivers/gpu/drm/i915/i915_utils.c | 17 + + drivers/gpu/drm/i915/i915_utils.h | 2 + + drivers/gpu/drm/i915/intel_memory_region.c | 33 +- + drivers/gpu/drm/i915/intel_memory_region.h | 3 +- + drivers/gpu/drm/i915/intel_region_ttm.c | 8 +- + drivers/gpu/drm/i915/intel_uncore.c | 5 +- + drivers/gpu/drm/i915/selftests/i915_active.c | 8 +- + .../gpu/drm/i915/selftests/intel_memory_region.c | 4 +- + drivers/gpu/drm/i915/soc/intel_pch.c | 16 +- + drivers/gpu/drm/i915/soc/intel_pch.h | 6 +- + drivers/gpu/drm/imx/dcss/dcss-blkctl.c | 13 +- + drivers/gpu/drm/imx/dcss/dcss-ctxld.c | 14 +- + drivers/gpu/drm/imx/dcss/dcss-dev.c | 19 +- + drivers/gpu/drm/imx/dcss/dcss-dev.h | 1 - + drivers/gpu/drm/imx/dcss/dcss-dpr.c | 21 +- + drivers/gpu/drm/imx/dcss/dcss-drv.c | 12 +- + drivers/gpu/drm/imx/dcss/dcss-dtg.c | 26 +- + drivers/gpu/drm/imx/dcss/dcss-scaler.c | 21 +- + drivers/gpu/drm/imx/dcss/dcss-ss.c | 12 +- + drivers/gpu/drm/imx/ipuv3/imx-ldb.c | 2 +- + drivers/gpu/drm/ingenic/Kconfig | 1 - + drivers/gpu/drm/lima/lima_ctx.c | 2 +- + drivers/gpu/drm/lima/lima_ctx.h | 1 - + drivers/gpu/drm/lima/lima_gem.c | 23 +- + drivers/gpu/drm/lima/lima_gp.c | 39 +- + drivers/gpu/drm/lima/lima_l2_cache.c | 6 +- + drivers/gpu/drm/lima/lima_mmu.c | 18 +- + drivers/gpu/drm/lima/lima_pmu.c | 3 +- + drivers/gpu/drm/lima/lima_pp.c | 37 +- + drivers/gpu/drm/lima/lima_sched.c | 38 +- + drivers/gpu/drm/lima/lima_sched.h | 3 +- + drivers/gpu/drm/loongson/lsdc_drv.c | 2 +- + drivers/gpu/drm/loongson/lsdc_ttm.c | 2 - + drivers/gpu/drm/mcde/Kconfig | 1 - + drivers/gpu/drm/mediatek/mtk_dp.c | 31 +- + drivers/gpu/drm/mediatek/mtk_hdmi.c | 26 +- + drivers/gpu/drm/meson/meson_encoder_hdmi.c | 20 +- + drivers/gpu/drm/mgag200/mgag200_drv.c | 9 +- + drivers/gpu/drm/mgag200/mgag200_mode.c | 14 
+- + drivers/gpu/drm/msm/dp/dp_display.c | 1 + + drivers/gpu/drm/msm/hdmi/hdmi_bridge.c | 33 +- + drivers/gpu/drm/nouveau/dispnv04/crtc.c | 4 +- + drivers/gpu/drm/nouveau/dispnv50/head.c | 1 + + drivers/gpu/drm/nouveau/nouveau_bo.c | 59 +- + drivers/gpu/drm/nouveau/nouveau_bo.h | 1 - + drivers/gpu/drm/nouveau/nouveau_connector.h | 2 +- + drivers/gpu/drm/nouveau/nouveau_ioc32.c | 4 +- + drivers/gpu/drm/nouveau/nouveau_svm.c | 10 +- + drivers/gpu/drm/nouveau/nvif/outp.c | 3 +- + drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c | 2 +- + drivers/gpu/drm/nouveau/nvkm/subdev/acr/lsfw.c | 3 +- + drivers/gpu/drm/nouveau/nvkm/subdev/bios/init.c | 136 ++-- + drivers/gpu/drm/nouveau/nvkm/subdev/volt/gk20a.c | 4 +- + drivers/gpu/drm/omapdrm/dss/hdmi4.c | 22 +- + drivers/gpu/drm/omapdrm/dss/hdmi5.c | 12 +- + drivers/gpu/drm/panel/Kconfig | 223 +++--- + drivers/gpu/drm/panel/Makefile | 2 + + drivers/gpu/drm/panel/panel-boe-himax8279d.c | 18 +- + .../gpu/drm/panel/panel-boe-th101mb31ig002-28a.c | 322 ++++++++ + drivers/gpu/drm/panel/panel-edp.c | 97 ++- + drivers/gpu/drm/panel/panel-leadtek-ltk050h3146w.c | 23 +- + drivers/gpu/drm/panel/panel-novatek-nt35510.c | 424 ++++++++-- + drivers/gpu/drm/panel/panel-novatek-nt36523.c | 8 +- + drivers/gpu/drm/panel/panel-novatek-nt36672e.c | 643 +++++++++++++++ + drivers/gpu/drm/panel/panel-simple.c | 61 +- + drivers/gpu/drm/panel/panel-sitronix-st7703.c | 104 +++ + drivers/gpu/drm/panel/panel-visionox-r66451.c | 1 + + drivers/gpu/drm/panel/panel-visionox-vtdr6130.c | 1 + + drivers/gpu/drm/pl111/Kconfig | 1 - + drivers/gpu/drm/qxl/qxl_object.c | 2 - + drivers/gpu/drm/qxl/qxl_ttm.c | 2 - + drivers/gpu/drm/radeon/atom-bits.h | 2 +- + drivers/gpu/drm/radeon/atom.c | 47 +- + drivers/gpu/drm/radeon/atom.h | 4 +- + drivers/gpu/drm/radeon/atombios_crtc.c | 28 +- + drivers/gpu/drm/radeon/atombios_dp.c | 4 +- + drivers/gpu/drm/radeon/atombios_encoders.c | 38 +- + drivers/gpu/drm/radeon/atombios_i2c.c | 2 +- + drivers/gpu/drm/radeon/btc_dpm.c | 90 +-- + drivers/gpu/drm/radeon/ci_dpm.c | 31 +- + drivers/gpu/drm/radeon/ci_dpm.h | 6 +- + drivers/gpu/drm/radeon/clearstate_cayman.h | 9 +- + drivers/gpu/drm/radeon/clearstate_ci.h | 3 +- + drivers/gpu/drm/radeon/evergreen.c | 20 +- + drivers/gpu/drm/radeon/evergreen_cs.c | 4 +- + drivers/gpu/drm/radeon/evergreen_reg.h | 10 +- + drivers/gpu/drm/radeon/evergreen_smc.h | 9 +- + drivers/gpu/drm/radeon/kv_dpm.c | 9 +- + drivers/gpu/drm/radeon/kv_smc.c | 2 +- + drivers/gpu/drm/radeon/ni.c | 31 +- + drivers/gpu/drm/radeon/ni_dpm.c | 3 - + drivers/gpu/drm/radeon/ni_dpm.h | 12 +- + drivers/gpu/drm/radeon/nislands_smc.h | 51 +- + drivers/gpu/drm/radeon/r100.c | 2 +- + drivers/gpu/drm/radeon/r300_reg.h | 2 +- + drivers/gpu/drm/radeon/r600.c | 3 +- + drivers/gpu/drm/radeon/r600_dpm.c | 6 +- + drivers/gpu/drm/radeon/r600_dpm.h | 3 +- + drivers/gpu/drm/radeon/radeon.h | 6 +- + drivers/gpu/drm/radeon/radeon_asic.c | 8 +- + drivers/gpu/drm/radeon/radeon_atombios.c | 44 +- + drivers/gpu/drm/radeon/radeon_atpx_handler.c | 12 +- + drivers/gpu/drm/radeon/radeon_audio.c | 11 +- + drivers/gpu/drm/radeon/radeon_audio.h | 6 +- + drivers/gpu/drm/radeon/radeon_mode.h | 9 +- + drivers/gpu/drm/radeon/radeon_object.c | 2 - + drivers/gpu/drm/radeon/radeon_pm.c | 4 +- + drivers/gpu/drm/radeon/radeon_ttm.c | 8 +- + drivers/gpu/drm/radeon/radeon_uvd.c | 1 - + drivers/gpu/drm/radeon/rs400.c | 4 +- + drivers/gpu/drm/radeon/rs600.c | 3 +- + drivers/gpu/drm/radeon/rv515.c | 3 +- + drivers/gpu/drm/radeon/rv6xx_dpm.h | 3 +- + drivers/gpu/drm/radeon/rv770_dpm.c | 4 +- + 
drivers/gpu/drm/radeon/rv770_smc.h | 27 +- + drivers/gpu/drm/radeon/si.c | 63 +- + drivers/gpu/drm/radeon/si_dpm.c | 132 ++-- + drivers/gpu/drm/radeon/si_dpm.h | 21 +- + drivers/gpu/drm/radeon/smu7.h | 6 +- + drivers/gpu/drm/radeon/smu7_discrete.h | 51 +- + drivers/gpu/drm/radeon/smu7_fusion.h | 42 +- + drivers/gpu/drm/radeon/sumo_dpm.c | 18 +- + drivers/gpu/drm/radeon/trinity_dpm.c | 22 +- + drivers/gpu/drm/radeon/trinity_dpm.h | 3 +- + drivers/gpu/drm/radeon/uvd_v1_0.c | 2 +- + drivers/gpu/drm/rockchip/analogix_dp-rockchip.c | 3 + + drivers/gpu/drm/rockchip/inno_hdmi.c | 575 ++++++++------ + drivers/gpu/drm/rockchip/inno_hdmi.h | 5 - + drivers/gpu/drm/rockchip/rockchip_lvds.c | 3 +- + drivers/gpu/drm/rockchip/rockchip_vop_reg.c | 13 +- + drivers/gpu/drm/rockchip/rockchip_vop_reg.h | 3 + + drivers/gpu/drm/scheduler/sched_main.c | 11 +- + drivers/gpu/drm/solomon/ssd130x-spi.c | 7 + + drivers/gpu/drm/solomon/ssd130x.c | 370 +++++++++ + drivers/gpu/drm/solomon/ssd130x.h | 5 +- + drivers/gpu/drm/tegra/dpaux.c | 14 +- + drivers/gpu/drm/tegra/drm.h | 2 +- + drivers/gpu/drm/tegra/dsi.c | 59 +- + drivers/gpu/drm/tegra/hdmi.c | 21 +- + drivers/gpu/drm/tegra/output.c | 17 +- + drivers/gpu/drm/tegra/rgb.c | 18 +- + drivers/gpu/drm/tegra/sor.c | 1 + + drivers/gpu/drm/tests/drm_managed_test.c | 79 +- + drivers/gpu/drm/tests/drm_mm_test.c | 2 +- + drivers/gpu/drm/tilcdc/tilcdc_drv.c | 19 +- + drivers/gpu/drm/ttm/tests/Makefile | 3 + + drivers/gpu/drm/ttm/tests/ttm_bo_test.c | 622 +++++++++++++++ + drivers/gpu/drm/ttm/tests/ttm_kunit_helpers.c | 48 +- + drivers/gpu/drm/ttm/tests/ttm_kunit_helpers.h | 3 + + drivers/gpu/drm/ttm/tests/ttm_pool_test.c | 3 +- + drivers/gpu/drm/ttm/tests/ttm_resource_test.c | 335 ++++++++ + drivers/gpu/drm/ttm/tests/ttm_tt_test.c | 295 +++++++ + drivers/gpu/drm/ttm/ttm_bo.c | 30 +- + drivers/gpu/drm/ttm/ttm_bo_util.c | 13 +- + drivers/gpu/drm/ttm/ttm_resource.c | 76 +- + drivers/gpu/drm/ttm/ttm_tt.c | 15 + + drivers/gpu/drm/tve200/Kconfig | 1 - + drivers/gpu/drm/v3d/v3d_debugfs.c | 15 + + drivers/gpu/drm/vc4/vc4_hdmi.c | 1 + + drivers/gpu/drm/vc4/vc4_plane.c | 10 +- + drivers/gpu/drm/virtio/virtgpu_submit.c | 6 +- + drivers/gpu/drm/vkms/Kconfig | 15 + + drivers/gpu/drm/vkms/vkms_composer.c | 14 +- + drivers/gpu/drm/vmwgfx/ttm_object.c | 6 +- + drivers/gpu/drm/vmwgfx/ttm_object.h | 3 +- + drivers/gpu/drm/vmwgfx/vmwgfx_bo.c | 35 +- + drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 1 - + drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c | 20 +- + drivers/gpu/drm/vmwgfx/vmwgfx_gmrid_manager.c | 5 +- + drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 300 +++---- + drivers/gpu/drm/vmwgfx/vmwgfx_kms.h | 6 +- + drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c | 5 +- + drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c | 5 +- + drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c | 21 +- + drivers/gpu/drm/vmwgfx/vmwgfx_surface.c | 18 +- + drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 32 - + drivers/gpu/drm/xe/display/xe_plane_initial.c | 67 +- + drivers/gpu/drm/xe/xe_bo.c | 33 +- + drivers/gpu/drm/xe/xe_gt.c | 2 +- + drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 2 +- + drivers/gpu/drm/xe/xe_gt_topology.c | 4 +- + drivers/gpu/drm/xe/xe_guc_submit.c | 3 +- + drivers/gpu/drm/xe/xe_reg_sr.c | 2 +- + drivers/gpu/drm/xlnx/zynqmp_disp.c | 2 +- + drivers/gpu/drm/xlnx/zynqmp_dp.c | 22 +- + drivers/gpu/host1x/bus.c | 2 +- + drivers/gpu/host1x/bus.h | 2 +- + drivers/staging/sm750fb/Kconfig | 1 - + drivers/video/Kconfig | 9 +- + drivers/video/Makefile | 7 +- + drivers/video/cmdline.c | 2 + + drivers/video/fbdev/Kconfig | 35 - + drivers/video/fbdev/core/Kconfig | 2 +- + 
drivers/video/fbdev/core/fbmem.c | 2 - + drivers/video/fbdev/efifb.c | 225 ++---- + drivers/video/fbdev/geode/Kconfig | 3 - + drivers/video/fbdev/simplefb.c | 2 +- + drivers/video/fbdev/vesafb.c | 78 +- + drivers/video/screen_info_generic.c | 146 ++++ + drivers/video/screen_info_pci.c | 136 ++++ + include/drm/display/drm_dp.h | 1 + + include/drm/display/drm_dp_helper.h | 3 +- + include/drm/drm_atomic.h | 70 +- + include/drm/drm_bridge.h | 25 +- + include/drm/drm_edid.h | 46 +- + include/drm/drm_exec.h | 4 +- + include/drm/drm_fixed.h | 2 +- + include/drm/drm_gpuvm.h | 2 +- + include/drm/drm_managed.h | 4 + + include/drm/drm_modes.h | 2 + + include/drm/drm_print.h | 223 +++--- + include/drm/drm_probe_helper.h | 1 - + include/drm/drm_rect.h | 4 +- + include/drm/i915_pciids.h | 3 + + include/drm/ttm/ttm_placement.h | 10 +- + include/drm/ttm/ttm_resource.h | 8 +- + include/drm/ttm/ttm_tt.h | 9 +- + include/linux/fb.h | 7 - + include/linux/iosys-map.h | 2 +- + include/linux/screen_info.h | 126 +++ + include/linux/sysfb.h | 6 +- + include/sound/hdmi-codec.h | 1 - + include/uapi/drm/amdgpu_drm.h | 2 + + include/uapi/drm/i915_drm.h | 12 + + include/uapi/drm/nouveau_drm.h | 56 +- + include/uapi/drm/qaic_accel.h | 13 +- + include/uapi/drm/vmwgfx_drm.h | 6 +- + include/uapi/linux/kfd_ioctl.h | 3 +- + include/uapi/linux/virtio_gpu.h | 2 + + include/video/cmdline.h | 8 +- + 650 files changed, 16404 insertions(+), 6371 deletions(-) + create mode 100644 Documentation/devicetree/bindings/display/bridge/fsl,imx8mp-hdmi-tx.yaml + create mode 100644 Documentation/devicetree/bindings/display/imx/fsl,imx8mp-hdmi-pvi.yaml + create mode 100644 Documentation/devicetree/bindings/display/panel/boe,th101mb31ig002-28a.yaml + create mode 100644 Documentation/devicetree/bindings/display/panel/novatek,nt36672e.yaml + create mode 100644 Documentation/devicetree/bindings/display/solomon,ssd133x.yaml + create mode 100644 Documentation/gpu/amdgpu/display/dcn-blocks.rst + create mode 100644 Documentation/gpu/amdgpu/display/display-contributing.rst + delete mode 100644 Documentation/gpu/rfc/xe.rst + create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c + create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h + delete mode 100644 drivers/gpu/drm/amd/display/TODO + create mode 100644 drivers/gpu/drm/bridge/imx/imx8mp-hdmi-pvi.c + create mode 100644 drivers/gpu/drm/bridge/imx/imx8mp-hdmi-tx.c + delete mode 100644 drivers/gpu/drm/ci/xfails/msm-sc7180-fails.txt + delete mode 100644 drivers/gpu/drm/ci/xfails/msm-sc7180-flakes.txt + delete mode 100644 drivers/gpu/drm/ci/xfails/msm-sc7180-skips.txt + create mode 100644 drivers/gpu/drm/ci/xfails/msm-sc7180-trogdor-kingoftown-fails.txt + create mode 100644 drivers/gpu/drm/ci/xfails/msm-sc7180-trogdor-lazor-limozeen-fails.txt + delete mode 100644 drivers/gpu/drm/i915/gem/i915_gem_userptr.h + create mode 100644 drivers/gpu/drm/panel/panel-boe-th101mb31ig002-28a.c + create mode 100644 drivers/gpu/drm/panel/panel-novatek-nt36672e.c + create mode 100644 drivers/gpu/drm/ttm/tests/ttm_bo_test.c + create mode 100644 drivers/gpu/drm/ttm/tests/ttm_resource_test.c + create mode 100644 drivers/gpu/drm/ttm/tests/ttm_tt_test.c + create mode 100644 drivers/gpu/drm/vkms/Kconfig + create mode 100644 drivers/video/screen_info_generic.c + create mode 100644 drivers/video/screen_info_pci.c +Merging drm-ci/topic/drm-ci (ad6bfe1b66a5 drm: ci: docs: fix build warning - add missing escape) +$ git merge -m Merge branch 'topic/drm-ci' of git://git.freedesktop.org/git/drm/drm.git drm-ci/topic/drm-ci 
+Already up to date. +Merging drm-exynos/for-linux-next (9ac4beb7578a Merge tag 'drm-misc-next-2024-02-15' of git://anongit.freedesktop.org/drm/drm-misc into drm-next) +$ git merge -m Merge branch 'for-linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/daeinki/drm-exynos.git drm-exynos/for-linux-next +Already up to date. +Merging drm-misc/for-linux-next (1f4c6f11a557 drm/ci: mark universal-plane-sanity as failing on SC7180) +$ git merge -m Merge branch 'for-linux-next' of git://anongit.freedesktop.org/drm/drm-misc drm-misc/for-linux-next +Auto-merging Documentation/devicetree/bindings/vendor-prefixes.yaml +Auto-merging drivers/accel/ivpu/ivpu_drv.c +Auto-merging drivers/accel/ivpu/ivpu_fw.c +Auto-merging drivers/accel/ivpu/ivpu_hw_37xx.c +Auto-merging drivers/accel/ivpu/ivpu_hw_40xx.c +Auto-merging drivers/accel/ivpu/ivpu_job.c +Auto-merging drivers/accel/ivpu/ivpu_pm.c +Auto-merging drivers/gpu/drm/meson/meson_encoder_cvbs.c +Auto-merging drivers/gpu/drm/meson/meson_encoder_dsi.c +Auto-merging drivers/gpu/drm/meson/meson_encoder_hdmi.c +Merge made by the 'ort' strategy. + .../bindings/display/panel/himax,hx83112a.yaml | 74 + + .../display/panel/leadtek,ltk500hd1829.yaml | 4 +- + .../bindings/display/panel/panel-lvds.yaml | 2 + + .../bindings/display/panel/panel-simple.yaml | 2 + + .../bindings/display/ti/ti,am65x-dss.yaml | 7 +- + .../devicetree/bindings/vendor-prefixes.yaml | 2 + + Documentation/gpu/drm-usage-stats.rst | 2 +- + drivers/accel/ivpu/ivpu_debugfs.c | 32 +- + drivers/accel/ivpu/ivpu_drv.c | 12 +- + drivers/accel/ivpu/ivpu_drv.h | 7 +- + drivers/accel/ivpu/ivpu_fw.c | 49 +- + drivers/accel/ivpu/ivpu_fw_log.c | 6 +- + drivers/accel/ivpu/ivpu_gem.c | 70 +- + drivers/accel/ivpu/ivpu_gem.h | 6 +- + drivers/accel/ivpu/ivpu_hw_37xx.c | 10 +- + drivers/accel/ivpu/ivpu_hw_40xx.c | 10 +- + drivers/accel/ivpu/ivpu_ipc.c | 12 +- + drivers/accel/ivpu/ivpu_job.c | 22 +- + drivers/accel/ivpu/ivpu_pm.c | 10 +- + drivers/accel/ivpu/vpu_boot_api.h | 46 +- + drivers/accel/ivpu/vpu_jsm_api.h | 32 +- + drivers/char/agp/agp.h | 1 - + drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c | 4 + + drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 11 + + drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 6 + + drivers/gpu/drm/bridge/synopsys/dw-hdmi.c | 1 + + drivers/gpu/drm/ci/testlist.txt | 1888 +++++++++----------- + drivers/gpu/drm/ci/xfails/msm-apq8096-fails.txt | 2 - + .../xfails/msm-sc7180-trogdor-kingoftown-fails.txt | 1 + + .../xfails/msm-sc7180-trogdor-kingoftown-skips.txt | 2 + + .../msm-sc7180-trogdor-lazor-limozeen-fails.txt | 1 + + .../msm-sc7180-trogdor-lazor-limozeen-skips.txt | 2 + + drivers/gpu/drm/drm_file.c | 2 +- + drivers/gpu/drm/i915/i915_drm_client.c | 2 +- + drivers/gpu/drm/meson/meson_drv.c | 6 +- + drivers/gpu/drm/meson/meson_encoder_cvbs.c | 24 +- + drivers/gpu/drm/meson/meson_encoder_cvbs.h | 2 +- + drivers/gpu/drm/meson/meson_encoder_dsi.c | 23 +- + drivers/gpu/drm/meson/meson_encoder_dsi.h | 2 +- + drivers/gpu/drm/meson/meson_encoder_hdmi.c | 15 +- + drivers/gpu/drm/meson/meson_encoder_hdmi.h | 2 +- + drivers/gpu/drm/panel/Kconfig | 10 + + drivers/gpu/drm/panel/Makefile | 1 + + drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c | 2 + + drivers/gpu/drm/panel/panel-edp.c | 19 +- + drivers/gpu/drm/panel/panel-himax-hx83112a.c | 372 ++++ + drivers/gpu/drm/panel/panel-leadtek-ltk500hd1829.c | 265 ++- + drivers/gpu/drm/panel/panel-simple.c | 20 + + drivers/gpu/drm/xe/xe_drm_client.c | 2 +- + drivers/gpu/host1x/cdma.c | 3 +- + include/drm/drm_bridge.h | 2 +- + include/drm/drm_gem.h | 13 + + 52 
files changed, 1857 insertions(+), 1266 deletions(-) + create mode 100644 Documentation/devicetree/bindings/display/panel/himax,hx83112a.yaml + create mode 100644 drivers/gpu/drm/ci/xfails/msm-sc7180-trogdor-kingoftown-skips.txt + create mode 100644 drivers/gpu/drm/ci/xfails/msm-sc7180-trogdor-lazor-limozeen-skips.txt + create mode 100644 drivers/gpu/drm/panel/panel-himax-hx83112a.c +$ git am -3 ../patches/0001-drm-ttm-initialise-places.patch +Applying: drm/ttm: initialise places +$ git reset HEAD^ +Unstaged changes after reset: +M drivers/gpu/drm/i915/gem/i915_gem_ttm.c +$ git add -A . +$ git commit -v -a --amend +[master cfe75650b5fa] Merge branch 'for-linux-next' of git://anongit.freedesktop.org/drm/drm-misc + Date: Wed Feb 21 11:47:42 2024 +1100 +Merging amdgpu/drm-next (31e0a586f338 drm/amdgpu: add MMHUB 3.3.1 support) +$ git merge -m Merge branch 'drm-next' of https://gitlab.freedesktop.org/agd5f/linux amdgpu/drm-next +Auto-merging drivers/gpu/drm/amd/amdgpu/amdgpu.h +Auto-merging drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +Auto-merging drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +Auto-merging drivers/gpu/drm/amd/amdgpu/soc21.c +Auto-merging drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +Auto-merging drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c +Auto-merging drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c +Auto-merging drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +Merge made by the 'ort' strategy. + drivers/gpu/drm/amd/amdgpu/Makefile | 15 +- + drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c | 3 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 27 + + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 11 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c | 46 + + drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h | 36 + + drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 67 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 5 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 3 + + drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c | 6 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 7 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h | 1 + + drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 2 + + drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h | 42 + + drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 15 +- + drivers/gpu/drm/amd/amdgpu/athub_v4_1_0.c | 122 + + drivers/gpu/drm/amd/amdgpu/athub_v4_1_0.h | 30 + + drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 1 + + drivers/gpu/drm/amd/amdgpu/hdp_v7_0.c | 142 + + drivers/gpu/drm/amd/amdgpu/hdp_v7_0.h | 31 + + drivers/gpu/drm/amd/amdgpu/ih_v7_0.c | 767 ++ + drivers/gpu/drm/amd/amdgpu/ih_v7_0.h | 28 + + drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 16 +- + drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.h | 15 + + drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c | 302 +- + drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c | 570 ++ + drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.h | 29 + + drivers/gpu/drm/amd/amdgpu/lsdma_v7_0.c | 121 + + drivers/gpu/drm/amd/amdgpu/lsdma_v7_0.h | 31 + + drivers/gpu/drm/amd/amdgpu/mmhub_v3_3.c | 1 + + drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c | 9 +- + drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h | 1 + + drivers/gpu/drm/amd/amdgpu/psp_v11_0.c | 2 +- + drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 5 +- + drivers/gpu/drm/amd/amdgpu/psp_v14_0.c | 672 ++ + drivers/gpu/drm/amd/amdgpu/psp_v14_0.h | 32 + + drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c | 1 + + drivers/gpu/drm/amd/amdgpu/soc21.c | 2 + + drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c | 1339 ++++ + drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.h | 37 + + drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h | 2 +- + 
.../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm | 17 +- + drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 29 +- + drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h | 25 + + drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 37 +- + drivers/gpu/drm/amd/display/dc/clk_mgr/clk_mgr.c | 1 - + .../amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c | 1 - + .../drm/amd/display/dc/clk_mgr/dcn35/dcn35_smu.c | 12 +- + drivers/gpu/drm/amd/display/dc/core/dc.c | 11 +- + drivers/gpu/drm/amd/display/dc/core/dc_resource.c | 37 - + drivers/gpu/drm/amd/display/dc/dc.h | 15 +- + drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c | 83 +- + .../gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c | 4 + + .../gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c | 8 +- + .../drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c | 9 +- + drivers/gpu/drm/amd/display/dc/inc/core_types.h | 2 - + drivers/gpu/drm/amd/display/dc/inc/resource.h | 4 - + .../amd/display/dc/link/accessories/link_dp_cts.c | 27 +- + .../link/hwss/link_hwss_dio_fixed_vs_pe_retimer.c | 16 +- + .../hwss/link_hwss_hpo_fixed_vs_pe_retimer_dp.c | 51 +- + .../amd/display/dc/link/protocols/link_dp_phy.c | 6 +- + .../dc/link/protocols/link_edp_panel_control.c | 3 +- + .../amd/display/dc/resource/dcn35/dcn35_resource.c | 3 +- + drivers/gpu/drm/amd/display/dmub/dmub_srv.h | 6 +- + drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h | 115 + + drivers/gpu/drm/amd/display/dmub/src/dmub_dcn20.c | 3 +- + drivers/gpu/drm/amd/display/dmub/src/dmub_dcn20.h | 3 +- + drivers/gpu/drm/amd/display/dmub/src/dmub_dcn30.c | 3 +- + drivers/gpu/drm/amd/display/dmub/src/dmub_dcn30.h | 3 +- + drivers/gpu/drm/amd/display/dmub/src/dmub_dcn31.c | 3 +- + drivers/gpu/drm/amd/display/dmub/src/dmub_dcn31.h | 3 +- + drivers/gpu/drm/amd/display/dmub/src/dmub_dcn32.c | 3 +- + drivers/gpu/drm/amd/display/dmub/src/dmub_dcn32.h | 3 +- + drivers/gpu/drm/amd/display/dmub/src/dmub_dcn35.c | 12 +- + drivers/gpu/drm/amd/display/dmub/src/dmub_dcn35.h | 8 +- + drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c | 13 +- + .../drm/amd/display/include/link_service_types.h | 9 + + drivers/gpu/drm/amd/include/amd_shared.h | 1 + + .../include/asic_reg/athub/athub_4_1_0_offset.h | 287 + + .../include/asic_reg/athub/athub_4_1_0_sh_mask.h | 1348 ++++ + .../amd/include/asic_reg/hdp/hdp_7_0_0_offset.h | 219 + + .../amd/include/asic_reg/hdp/hdp_7_0_0_sh_mask.h | 735 ++ + .../include/asic_reg/lsdma/lsdma_7_0_0_offset.h | 388 + + .../include/asic_reg/lsdma/lsdma_7_0_0_sh_mask.h | 1411 ++++ + .../drm/amd/include/asic_reg/mp/mp_14_0_2_offset.h | 468 ++ + .../amd/include/asic_reg/mp/mp_14_0_2_sh_mask.h | 692 ++ + .../amd/include/asic_reg/nbio/nbio_7_11_0_offset.h | 2 + + .../amd/include/asic_reg/oss/osssys_7_0_0_offset.h | 279 + + .../include/asic_reg/oss/osssys_7_0_0_sh_mask.h | 1029 +++ + .../amd/include/asic_reg/vcn/vcn_5_0_0_offset.h | 1672 +++++ + .../amd/include/asic_reg/vcn/vcn_5_0_0_sh_mask.h | 7627 ++++++++++++++++++++ + drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 15 + + drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h | 1 + + drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 40 +- + drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 6 + + .../amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h | 3 +- + drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h | 3 +- + .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 142 +- + drivers/gpu/drm/radeon/ni.c | 2 +- + 100 files changed, 21184 insertions(+), 371 deletions(-) + create mode 100644 drivers/gpu/drm/amd/amdgpu/athub_v4_1_0.c + create mode 100644 drivers/gpu/drm/amd/amdgpu/athub_v4_1_0.h + create mode 100644 
drivers/gpu/drm/amd/amdgpu/hdp_v7_0.c + create mode 100644 drivers/gpu/drm/amd/amdgpu/hdp_v7_0.h + create mode 100644 drivers/gpu/drm/amd/amdgpu/ih_v7_0.c + create mode 100644 drivers/gpu/drm/amd/amdgpu/ih_v7_0.h + create mode 100644 drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c + create mode 100644 drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.h + create mode 100644 drivers/gpu/drm/amd/amdgpu/lsdma_v7_0.c + create mode 100644 drivers/gpu/drm/amd/amdgpu/lsdma_v7_0.h + create mode 100644 drivers/gpu/drm/amd/amdgpu/psp_v14_0.c + create mode 100644 drivers/gpu/drm/amd/amdgpu/psp_v14_0.h + create mode 100644 drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c + create mode 100644 drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.h + create mode 100644 drivers/gpu/drm/amd/include/asic_reg/athub/athub_4_1_0_offset.h + create mode 100644 drivers/gpu/drm/amd/include/asic_reg/athub/athub_4_1_0_sh_mask.h + create mode 100644 drivers/gpu/drm/amd/include/asic_reg/hdp/hdp_7_0_0_offset.h + create mode 100644 drivers/gpu/drm/amd/include/asic_reg/hdp/hdp_7_0_0_sh_mask.h + create mode 100644 drivers/gpu/drm/amd/include/asic_reg/lsdma/lsdma_7_0_0_offset.h + create mode 100644 drivers/gpu/drm/amd/include/asic_reg/lsdma/lsdma_7_0_0_sh_mask.h + create mode 100644 drivers/gpu/drm/amd/include/asic_reg/mp/mp_14_0_2_offset.h + create mode 100644 drivers/gpu/drm/amd/include/asic_reg/mp/mp_14_0_2_sh_mask.h + create mode 100644 drivers/gpu/drm/amd/include/asic_reg/oss/osssys_7_0_0_offset.h + create mode 100644 drivers/gpu/drm/amd/include/asic_reg/oss/osssys_7_0_0_sh_mask.h + create mode 100644 drivers/gpu/drm/amd/include/asic_reg/vcn/vcn_5_0_0_offset.h + create mode 100644 drivers/gpu/drm/amd/include/asic_reg/vcn/vcn_5_0_0_sh_mask.h +Merging drm-intel/for-linux-next (bf7626f19d6f drm/i915/tv: Fix TV mode) +$ git merge -m Merge branch 'for-linux-next' of git://anongit.freedesktop.org/drm-intel drm-intel/for-linux-next +Auto-merging drivers/gpu/drm/i915/display/intel_display.c +Auto-merging drivers/gpu/drm/i915/display/intel_dp.c +Auto-merging drivers/gpu/drm/i915/display/intel_sdvo.c +Auto-merging drivers/gpu/drm/i915/gvt/interrupt.c +Merge made by the 'ort' strategy. 
+ drivers/gpu/drm/i915/display/dvo_ch7017.c | 2 +- + drivers/gpu/drm/i915/display/dvo_ch7xxx.c | 2 +- + drivers/gpu/drm/i915/display/dvo_ivch.c | 2 +- + drivers/gpu/drm/i915/display/dvo_ns2501.c | 6 +- + drivers/gpu/drm/i915/display/dvo_sil164.c | 2 +- + drivers/gpu/drm/i915/display/dvo_tfp410.c | 2 +- + drivers/gpu/drm/i915/display/i9xx_wm.c | 81 +++---- + drivers/gpu/drm/i915/display/intel_bios.c | 73 ++++--- + drivers/gpu/drm/i915/display/intel_bios.h | 3 +- + drivers/gpu/drm/i915/display/intel_cdclk.c | 60 ++++-- + drivers/gpu/drm/i915/display/intel_color.c | 11 +- + drivers/gpu/drm/i915/display/intel_crt.c | 3 + + drivers/gpu/drm/i915/display/intel_cx0_phy.c | 32 +-- + drivers/gpu/drm/i915/display/intel_display.c | 106 +++++----- + .../gpu/drm/i915/display/intel_display_debugfs.c | 47 +++++ + drivers/gpu/drm/i915/display/intel_display_types.h | 3 +- + drivers/gpu/drm/i915/display/intel_dp.c | 4 +- + drivers/gpu/drm/i915/display/intel_dp_hdcp.c | 12 +- + drivers/gpu/drm/i915/display/intel_dp_mst.c | 4 + + drivers/gpu/drm/i915/display/intel_dpll_mgr.c | 103 ++++++++- + drivers/gpu/drm/i915/display/intel_dpll_mgr.h | 3 + + drivers/gpu/drm/i915/display/intel_dsi.h | 4 - + drivers/gpu/drm/i915/display/intel_dvo.c | 5 + + drivers/gpu/drm/i915/display/intel_dvo_dev.h | 25 --- + drivers/gpu/drm/i915/display/intel_fb.c | 7 +- + drivers/gpu/drm/i915/display/intel_global_state.h | 4 +- + drivers/gpu/drm/i915/display/intel_opregion.c | 6 +- + drivers/gpu/drm/i915/display/intel_sdvo.c | 234 ++++++++++++--------- + drivers/gpu/drm/i915/display/intel_tv.c | 10 +- + drivers/gpu/drm/i915/display/skl_universal_plane.c | 5 + + drivers/gpu/drm/i915/gt/uc/intel_guc.h | 2 - + drivers/gpu/drm/i915/gvt/fb_decoder.h | 11 - + drivers/gpu/drm/i915/gvt/gtt.h | 3 - + drivers/gpu/drm/i915/gvt/gvt.h | 5 - + drivers/gpu/drm/i915/gvt/interrupt.c | 1 - + drivers/gpu/drm/i915/gvt/interrupt.h | 2 - + drivers/gpu/drm/i915/gvt/mmio.h | 2 - + drivers/gpu/drm/i915/gvt/scheduler.h | 2 - + drivers/gpu/drm/i915/i915_drm_client.h | 2 - + drivers/gpu/drm/i915/i915_perf_types.h | 1 - + drivers/gpu/drm/i915/i915_request.c | 1 - + drivers/gpu/drm/i915/i915_vma_types.h | 1 - + drivers/gpu/drm/i915/intel_memory_region.h | 2 - + include/drm/i915_pciids.h | 4 +- + 44 files changed, 529 insertions(+), 371 deletions(-) +Merging drm-tegra/for-next (2429b3c529da drm/tegra: Avoid potential 32-bit integer overflow) +$ git merge -m Merge branch 'for-next' of https://gitlab.freedesktop.org/drm/tegra.git drm-tegra/for-next +Already up to date. +Merging drm-msm/msm-next (41c177cf3541 Merge tag 'drm-misc-next-2024-02-08' into msm-next) +$ git merge -m Merge branch 'msm-next' of https://gitlab.freedesktop.org/drm/msm.git drm-msm/msm-next +Merge made by the 'ort' strategy. +Merging drm-msm-lumag/msm-next-lumag (ffa0c87f172b drm/msm/mdp5: drop global_state_lock) +$ git merge -m Merge branch 'msm-next-lumag' of https://gitlab.freedesktop.org/lumag/msm.git drm-msm-lumag/msm-next-lumag +Merge made by the 'ort' strategy. 
+ .../bindings/display/msm/dsi-controller-main.yaml | 2 + + .../devicetree/bindings/display/msm/qcom,mdss.yaml | 1 + + .../bindings/display/msm/qcom,sm8650-mdss.yaml | 4 + + drivers/gpu/drm/msm/Makefile | 2 - + .../gpu/drm/msm/disp/dpu1/catalog/dpu_3_2_sdm660.h | 291 +++++++++++++++++ + .../gpu/drm/msm/disp/dpu1/catalog/dpu_3_3_sdm630.h | 225 +++++++++++++ + drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c | 107 +++--- + drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.h | 7 + + drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys.h | 15 +- + .../gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c | 93 +++--- + .../gpu/drm/msm/disp/dpu1/dpu_encoder_phys_vid.c | 58 ++-- + .../gpu/drm/msm/disp/dpu1/dpu_encoder_phys_wb.c | 88 ++--- + drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 2 + + drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h | 2 + + drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.c | 15 +- + drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.h | 1 + + drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c | 126 ++++++-- + drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h | 1 - + drivers/gpu/drm/msm/disp/dpu1/dpu_rm.c | 154 ++++++++- + drivers/gpu/drm/msm/disp/dpu1/dpu_trace.h | 74 +++-- + drivers/gpu/drm/msm/disp/dpu1/dpu_writeback.c | 61 +++- + drivers/gpu/drm/msm/disp/dpu1/dpu_writeback.h | 3 +- + drivers/gpu/drm/msm/disp/mdp5/mdp5_cmd_encoder.c | 42 --- + drivers/gpu/drm/msm/disp/mdp5/mdp5_encoder.c | 42 --- + drivers/gpu/drm/msm/disp/mdp5/mdp5_irq.c | 2 - + drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c | 71 +--- + drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.h | 10 - + drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c | 12 +- + drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.h | 4 +- + drivers/gpu/drm/msm/dp/dp_aux.c | 9 +- + drivers/gpu/drm/msm/dp/dp_aux.h | 2 + + drivers/gpu/drm/msm/dp/dp_catalog.c | 158 ++++++--- + drivers/gpu/drm/msm/dp/dp_catalog.h | 6 +- + drivers/gpu/drm/msm/dp/dp_ctrl.c | 358 ++++++++++++++------- + drivers/gpu/drm/msm/dp/dp_ctrl.h | 17 +- + drivers/gpu/drm/msm/dp/dp_debug.c | 3 +- + drivers/gpu/drm/msm/dp/dp_display.c | 102 ++---- + drivers/gpu/drm/msm/dp/dp_display.h | 3 +- + drivers/gpu/drm/msm/dp/dp_link.h | 23 -- + drivers/gpu/drm/msm/dp/dp_panel.c | 66 ++++ + drivers/gpu/drm/msm/dp/dp_parser.c | 327 ------------------- + drivers/gpu/drm/msm/dp/dp_parser.h | 155 --------- + drivers/gpu/drm/msm/dp/dp_power.c | 183 ----------- + drivers/gpu/drm/msm/dp/dp_power.h | 95 ------ + drivers/gpu/drm/msm/dsi/dsi.c | 10 +- + drivers/gpu/drm/msm/dsi/dsi.h | 22 +- + drivers/gpu/drm/msm/dsi/dsi_host.c | 51 ++- + drivers/gpu/drm/msm/dsi/dsi_manager.c | 65 +--- + drivers/gpu/drm/msm/msm_drv.c | 33 ++ + drivers/gpu/drm/msm/msm_drv.h | 4 + + drivers/gpu/drm/msm/msm_io_utils.c | 13 + + drivers/gpu/drm/msm/msm_kms.h | 4 - + drivers/gpu/drm/msm/msm_mdss.c | 51 +++ + 53 files changed, 1695 insertions(+), 1580 deletions(-) + create mode 100644 drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_2_sdm660.h + create mode 100644 drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_3_sdm630.h + delete mode 100644 drivers/gpu/drm/msm/dp/dp_parser.c + delete mode 100644 drivers/gpu/drm/msm/dp/dp_parser.h + delete mode 100644 drivers/gpu/drm/msm/dp/dp_power.c + delete mode 100644 drivers/gpu/drm/msm/dp/dp_power.h +Merging etnaviv/etnaviv/next (c9959996a8fc drm/etnaviv: add sensitive state for PE_RT_ADDR_4_PIPE(3, 0|1) address) +$ git merge -m Merge branch 'etnaviv/next' of https://git.pengutronix.de/git/lst/linux etnaviv/etnaviv/next +Auto-merging drivers/gpu/drm/etnaviv/etnaviv_drv.c +Auto-merging drivers/gpu/drm/etnaviv/etnaviv_gpu.c +Merge made by the 'ort' strategy. 
+ drivers/gpu/drm/etnaviv/etnaviv_cmd_parser.c | 1 + + drivers/gpu/drm/etnaviv/etnaviv_drv.c | 93 ++++++++++++++++++---------- + drivers/gpu/drm/etnaviv/etnaviv_gem.c | 12 ++-- + drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 33 +++++++++- + drivers/gpu/drm/etnaviv/etnaviv_gpu.h | 12 ++++ + drivers/gpu/drm/etnaviv/etnaviv_hwdb.c | 34 ++++++++++ + drivers/gpu/drm/etnaviv/etnaviv_mmu.c | 4 +- + drivers/gpu/drm/etnaviv/etnaviv_perfmon.c | 4 +- + include/uapi/drm/etnaviv_drm.h | 5 ++ + 9 files changed, 154 insertions(+), 44 deletions(-) +Merging fbdev/for-next (72fee6b0a3a4 fbdev: Restrict FB_SH_MOBILE_LCDC to SuperH) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/deller/linux-fbdev.git fbdev/for-next +Auto-merging drivers/video/fbdev/Kconfig +Merge made by the 'ort' strategy. + drivers/video/fbdev/Kconfig | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) +Merging regmap/for-next (cfe1cab458f0 Merge remote-tracking branch 'regmap/for-6.9' into regmap-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regmap.git regmap/for-next +Merge made by the 'ort' strategy. + drivers/base/regmap/internal.h | 1 + + drivers/base/regmap/regcache-flat.c | 2 +- + drivers/base/regmap/regcache.c | 4 +++- + drivers/base/regmap/regmap.c | 10 ++++++---- + include/linux/regmap.h | 5 +++++ + 5 files changed, 16 insertions(+), 6 deletions(-) +Merging sound/for-next (52592932405c ALSA: oxfw: add support for Miglia Harmony Audio) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git sound/for-next +Auto-merging MAINTAINERS +Auto-merging sound/firewire/amdtp-stream.c +Auto-merging sound/pci/hda/patch_realtek.c +Merge made by the 'ort' strategy. 
+ .../sound/kernel-api/writing-an-alsa-driver.rst | 6 +- + MAINTAINERS | 7 + + include/sound/ak4531_codec.h | 3 + + include/sound/emux_synth.h | 2 +- + include/sound/sb.h | 3 + + include/uapi/linux/virtio_snd.h | 154 +++++++ + sound/aoa/fabrics/layout.c | 7 +- + sound/aoa/soundbus/core.c | 2 +- + sound/arm/aaci.c | 10 +- + sound/arm/pxa2xx-ac97.c | 7 +- + sound/core/Kconfig | 17 + + sound/core/Makefile | 2 + + sound/core/pcm.c | 6 +- + sound/core/seq/Kconfig | 1 - + sound/core/seq/oss/seq_oss_device.h | 2 +- + sound/core/seq/oss/seq_oss_init.c | 4 +- + sound/core/seq/seq_midi.c | 8 +- + sound/core/seq/seq_virmidi.c | 9 +- + sound/core/seq_device.c | 2 +- + sound/core/sound_kunit.c | 311 ++++++++++++++ + sound/drivers/aloop.c | 9 +- + sound/drivers/dummy.c | 9 +- + sound/drivers/pcsp/pcsp.c | 9 +- + sound/firewire/Kconfig | 2 + + sound/firewire/amdtp-stream.c | 12 +- + sound/firewire/amdtp-stream.h | 4 + + sound/firewire/motu/motu-protocol-v3.c | 9 + + sound/firewire/motu/motu.c | 2 + + sound/firewire/motu/motu.h | 1 + + sound/firewire/oxfw/oxfw-stream.c | 100 +++-- + sound/firewire/oxfw/oxfw.c | 10 +- + sound/firewire/oxfw/oxfw.h | 7 +- + sound/hda/intel-sdw-acpi.c | 7 + + sound/pci/ali5451/ali5451.c | 32 +- + sound/pci/als300.c | 9 +- + sound/pci/als4000.c | 9 +- + sound/pci/atiixp.c | 12 +- + sound/pci/atiixp_modem.c | 11 +- + sound/pci/aw2/aw2-saa7146.h | 5 +- + sound/pci/azt3328.c | 11 +- + sound/pci/cmipci.c | 11 +- + sound/pci/cs4281.c | 13 +- + sound/pci/ctxfi/ctamixer.c | 10 +- + sound/pci/ctxfi/ctamixer.h | 8 +- + sound/pci/ctxfi/ctatc.c | 23 +- + sound/pci/ctxfi/ctdaio.c | 5 +- + sound/pci/ctxfi/ctdaio.h | 4 +- + sound/pci/ctxfi/ctsrc.c | 10 +- + sound/pci/ctxfi/ctsrc.h | 8 +- + sound/pci/echoaudio/echoaudio.c | 21 +- + sound/pci/echoaudio/echoaudio.h | 2 - + sound/pci/ens1370.c | 9 +- + sound/pci/es1938.c | 11 +- + sound/pci/es1968.c | 17 +- + sound/pci/fm801.c | 11 +- + sound/pci/hda/Kconfig | 4 + + sound/pci/hda/Makefile | 2 + + sound/pci/hda/cs35l41_hda_property.c | 90 +++- + sound/pci/hda/hda_component.c | 169 ++++++++ + sound/pci/hda/hda_component.h | 59 +++ + sound/pci/hda/patch_realtek.c | 270 ++++-------- + sound/pci/intel8x0.c | 9 +- + sound/pci/intel8x0m.c | 9 +- + sound/pci/maestro3.c | 29 +- + sound/pci/nm256/nm256.c | 9 +- + sound/pci/riptide/riptide.c | 11 +- + sound/pci/rme96.c | 30 +- + sound/pci/sis7019.c | 13 +- + sound/pci/via82xx.c | 15 +- + sound/pci/via82xx_modem.c | 9 +- + sound/soc/pxa/pxa2xx-ac97.c | 6 +- + sound/spi/at73c213.c | 11 +- + sound/synth/emux/emux.c | 4 +- + sound/virtio/Makefile | 1 + + sound/virtio/virtio_card.c | 21 + + sound/virtio/virtio_card.h | 22 + + sound/virtio/virtio_kctl.c | 477 +++++++++++++++++++++ + 77 files changed, 1674 insertions(+), 592 deletions(-) + create mode 100644 sound/core/sound_kunit.c + create mode 100644 sound/pci/hda/hda_component.c + create mode 100644 sound/virtio/virtio_kctl.c +Merging ieee1394/for-next (41ebb53b1bff firewire: core: fix build failure due to the caller of fw_csr_string()) +$ git merge -m Merge branch 'for-next' of https://git.kernel.org/pub/scm/linux/kernel/git/ieee1394/linux1394.git ieee1394/for-next +Merge made by the 'ort' strategy. 
+ drivers/firewire/core-device.c | 18 +++++------------- + 1 file changed, 5 insertions(+), 13 deletions(-) +Merging sound-asoc/for-next (db38c4ba8be6 Merge remote-tracking branch 'asoc/for-6.9' into asoc-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git sound-asoc/for-next +Merge made by the 'ort' strategy. + .../ABI/testing/sysfs-bus-pci-devices-avs | 8 + + .../bindings/sound/atmel,asoc-wm8904.yaml | 84 + + .../bindings/sound/atmel,sam9x5-wm8731-audio.yaml | 76 + + .../bindings/sound/atmel-sam9x5-wm8731-audio.txt | 35 - + .../devicetree/bindings/sound/atmel-wm8904.txt | 55 - + .../bindings/sound/audio-graph-port.yaml | 2 +- + .../devicetree/bindings/sound/cirrus,cs35l45.yaml | 3 + + .../devicetree/bindings/sound/fsl,asrc.txt | 80 - + .../devicetree/bindings/sound/fsl,easrc.yaml | 4 +- + .../devicetree/bindings/sound/fsl,imx-asrc.yaml | 162 + + .../devicetree/bindings/sound/fsl,micfil.yaml | 14 +- + .../devicetree/bindings/sound/fsl,sai.yaml | 6 + + .../bindings/sound/infineon,peb2466.yaml | 2 +- + .../devicetree/bindings/sound/qcom,sm8250.yaml | 2 +- + .../devicetree/bindings/sound/qcom,wcd938x.yaml | 81 +- + .../bindings/sound/qcom,wcd939x-sdw.yaml | 69 + + .../devicetree/bindings/sound/qcom,wcd939x.yaml | 96 + + .../bindings/sound/qcom,wcd93xx-common.yaml | 95 + + .../devicetree/bindings/sound/samsung,tm2.yaml | 7 +- + drivers/soundwire/Makefile | 2 +- + drivers/soundwire/amd_init.c | 235 ++ + drivers/soundwire/amd_init.h | 13 + + drivers/soundwire/amd_manager.c | 47 +- + drivers/soundwire/amd_manager.h | 16 +- + drivers/soundwire/dmi-quirks.c | 8 + + include/linux/soundwire/sdw_amd.h | 83 +- + include/linux/spi/spi.h | 2 - + include/sound/cs42l42.h | 5 +- + include/sound/hda-mlink.h | 2 +- + include/sound/soc.h | 4 +- + include/sound/sof/dai-amd.h | 7 + + include/sound/sof/dai.h | 2 + + include/sound/tas2781.h | 1 - + include/uapi/sound/intel/avs/tokens.h | 9 + + include/uapi/sound/sof/tokens.h | 4 + + sound/pci/hda/tas2781_hda_i2c.c | 12 - + sound/soc/amd/Kconfig | 17 + + sound/soc/amd/Makefile | 2 +- + sound/soc/amd/acp/Kconfig | 7 + + sound/soc/amd/acp/Makefile | 2 + + sound/soc/amd/acp/acp-mach-common.c | 6 +- + sound/soc/amd/acp/acp-sof-mach.c | 26 +- + sound/soc/amd/acp/amd-sdw-acpi.c | 62 + + sound/soc/amd/ps/acp63.h | 89 +- + sound/soc/amd/ps/pci-ps.c | 567 +-- + sound/soc/atmel/mikroe-proto.c | 8 +- + sound/soc/codecs/Kconfig | 38 +- + sound/soc/codecs/Makefile | 9 + + sound/soc/codecs/cs35l56-shared.c | 1 + + sound/soc/codecs/cs42l42.c | 1 - + sound/soc/codecs/cs42l43-jack.c | 27 +- + sound/soc/codecs/cs42l43-sdw.c | 1 + + sound/soc/codecs/cs42l43.c | 82 +- + sound/soc/codecs/cs42l43.h | 25 +- + sound/soc/codecs/es8326.c | 92 +- + sound/soc/codecs/es8326.h | 5 +- + sound/soc/codecs/framer-codec.c | 413 +++ + sound/soc/codecs/lpass-tx-macro.c | 16 +- + sound/soc/codecs/lpass-va-macro.c | 57 + + sound/soc/codecs/max98363.c | 2 +- + sound/soc/codecs/max98373-sdw.c | 2 +- + sound/soc/codecs/nau8540.c | 112 +- + sound/soc/codecs/nau8540.h | 13 +- + sound/soc/codecs/rt1017-sdca-sdw.c | 2 +- + sound/soc/codecs/rt274.c | 2 +- + sound/soc/codecs/rt286.c | 2 +- + sound/soc/codecs/rt298.c | 2 +- + sound/soc/codecs/rt5514-spi.c | 2 +- + sound/soc/codecs/rt5645.c | 2 +- + sound/soc/codecs/rt5651.c | 2 +- + sound/soc/codecs/rt5659.c | 2 +- + sound/soc/codecs/rt5663.c | 2 +- + sound/soc/codecs/rt5665.c | 2 +- + sound/soc/codecs/rt5668.c | 2 +- + sound/soc/codecs/rt5682-i2c.c | 2 +- + sound/soc/codecs/rt5682s.c | 2 +- + 
sound/soc/codecs/rt712-sdca-dmic.c | 2 +- + sound/soc/codecs/rt712-sdca-sdw.c | 2 +- + sound/soc/codecs/rt722-sdca-sdw.c | 2 +- + sound/soc/codecs/tas2781-comlib.c | 1 - + sound/soc/codecs/tas2781-fmwlib.c | 2 +- + sound/soc/codecs/wcd-clsh-v2.h | 1 + + sound/soc/codecs/wcd-mbhc-v2.c | 95 +- + sound/soc/codecs/wcd-mbhc-v2.h | 3 + + sound/soc/codecs/wcd939x-sdw.c | 1551 ++++++++ + sound/soc/codecs/wcd939x.c | 3686 ++++++++++++++++++++ + sound/soc/codecs/wcd939x.h | 989 ++++++ + sound/soc/fsl/eukrea-tlv320.c | 8 +- + sound/soc/fsl/fsl_sai.c | 13 + + sound/soc/fsl/p1022_rdk.c | 33 +- + sound/soc/intel/avs/Makefile | 3 +- + sound/soc/intel/avs/avs.h | 4 + + sound/soc/intel/avs/core.c | 1 + + sound/soc/intel/avs/path.c | 33 + + sound/soc/intel/avs/sysfs.c | 35 + + sound/soc/intel/avs/topology.c | 164 +- + sound/soc/intel/avs/topology.h | 13 + + sound/soc/intel/boards/Kconfig | 1 + + sound/soc/intel/boards/sof_board_helpers.c | 249 +- + sound/soc/intel/boards/sof_board_helpers.h | 29 + + sound/soc/intel/boards/sof_cs42l42.c | 232 +- + sound/soc/intel/boards/sof_rt5682.c | 151 +- + sound/soc/intel/boards/sof_sdw.c | 106 +- + sound/soc/intel/boards/sof_sdw_common.h | 82 +- + sound/soc/intel/boards/sof_sdw_cs42l42.c | 35 +- + sound/soc/intel/boards/sof_sdw_cs42l43.c | 25 +- + sound/soc/intel/boards/sof_sdw_cs_amp.c | 3 +- + sound/soc/intel/boards/sof_sdw_maxim.c | 4 +- + sound/soc/intel/boards/sof_sdw_rt5682.c | 35 +- + sound/soc/intel/boards/sof_sdw_rt700.c | 35 +- + sound/soc/intel/boards/sof_sdw_rt711.c | 19 +- + sound/soc/intel/boards/sof_sdw_rt712_sdca.c | 39 +- + sound/soc/intel/boards/sof_sdw_rt715.c | 12 +- + sound/soc/intel/boards/sof_sdw_rt715_sdca.c | 12 +- + sound/soc/intel/boards/sof_sdw_rt_amp.c | 53 +- + .../soc/intel/boards/sof_sdw_rt_sdca_jack_common.c | 19 +- + sound/soc/intel/boards/sof_ssp_common.h | 8 + + sound/soc/intel/common/soc-acpi-intel-adl-match.c | 12 +- + sound/soc/intel/common/soc-acpi-intel-jsl-match.c | 10 +- + sound/soc/intel/common/soc-acpi-intel-lnl-match.c | 102 + + sound/soc/intel/common/soc-acpi-intel-mtl-match.c | 61 +- + sound/soc/intel/common/soc-acpi-intel-rpl-match.c | 4 +- + sound/soc/intel/common/soc-acpi-intel-tgl-match.c | 6 +- + sound/soc/meson/aiu.c | 19 +- + sound/soc/meson/aiu.h | 1 - + sound/soc/meson/t9015.c | 20 +- + sound/soc/pxa/Kconfig | 3 - + sound/soc/qcom/common.c | 2 +- + sound/soc/sh/rz-ssi.c | 2 +- + sound/soc/soc-core.c | 9 +- + sound/soc/sof/amd/Kconfig | 18 + + sound/soc/sof/amd/acp-common.c | 65 +- + sound/soc/sof/amd/acp-dsp-offset.h | 10 + + sound/soc/sof/amd/acp-loader.c | 34 +- + sound/soc/sof/amd/acp.c | 232 +- + sound/soc/sof/amd/acp.h | 26 +- + sound/soc/sof/amd/pci-acp63.c | 7 + + sound/soc/sof/core.c | 10 + + sound/soc/sof/debug.c | 8 +- + sound/soc/sof/fw-file-profile.c | 18 +- + sound/soc/sof/imx/imx8.c | 16 + + sound/soc/sof/imx/imx8m.c | 10 + + sound/soc/sof/imx/imx8ulp.c | 10 + + sound/soc/sof/intel/hda-common-ops.c | 1 + + sound/soc/sof/intel/hda-dai-ops.c | 51 +- + sound/soc/sof/intel/hda-dai.c | 17 +- + sound/soc/sof/intel/hda-dsp.c | 5 + + sound/soc/sof/intel/hda-stream.c | 9 + + sound/soc/sof/intel/hda.c | 80 +- + sound/soc/sof/intel/hda.h | 5 + + sound/soc/sof/intel/lnl.c | 24 +- + sound/soc/sof/ipc3-loader.c | 2 + + sound/soc/sof/ipc3-pcm.c | 25 + + sound/soc/sof/ipc3-topology.c | 40 + + sound/soc/sof/ipc4-pcm.c | 19 +- + sound/soc/sof/ipc4-priv.h | 4 + + sound/soc/sof/ipc4-topology.c | 28 +- + sound/soc/sof/ops.h | 9 + + sound/soc/sof/sof-audio.c | 8 +- + sound/soc/sof/sof-audio.h | 2 + + 
sound/soc/sof/sof-priv.h | 10 + + sound/soc/sof/topology.c | 30 +- + sound/soc/ti/j721e-evm.c | 4 +- + sound/soc/ti/omap-hdmi.c | 10 +- + 164 files changed, 10234 insertions(+), 1695 deletions(-) + create mode 100644 Documentation/ABI/testing/sysfs-bus-pci-devices-avs + create mode 100644 Documentation/devicetree/bindings/sound/atmel,asoc-wm8904.yaml + create mode 100644 Documentation/devicetree/bindings/sound/atmel,sam9x5-wm8731-audio.yaml + delete mode 100644 Documentation/devicetree/bindings/sound/atmel-sam9x5-wm8731-audio.txt + delete mode 100644 Documentation/devicetree/bindings/sound/atmel-wm8904.txt + delete mode 100644 Documentation/devicetree/bindings/sound/fsl,asrc.txt + create mode 100644 Documentation/devicetree/bindings/sound/fsl,imx-asrc.yaml + create mode 100644 Documentation/devicetree/bindings/sound/qcom,wcd939x-sdw.yaml + create mode 100644 Documentation/devicetree/bindings/sound/qcom,wcd939x.yaml + create mode 100644 Documentation/devicetree/bindings/sound/qcom,wcd93xx-common.yaml + create mode 100644 drivers/soundwire/amd_init.c + create mode 100644 drivers/soundwire/amd_init.h + create mode 100644 sound/soc/amd/acp/amd-sdw-acpi.c + create mode 100644 sound/soc/codecs/framer-codec.c + create mode 100644 sound/soc/codecs/wcd939x-sdw.c + create mode 100644 sound/soc/codecs/wcd939x.c + create mode 100644 sound/soc/codecs/wcd939x.h + create mode 100644 sound/soc/intel/avs/sysfs.c +Merging modules/modules-next (d1909c022173 module: Don't ignore errors from set_memory_XX()) +$ git merge -m Merge branch 'modules-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git modules/modules-next +Auto-merging init/main.c +Auto-merging kernel/module/main.c +Merge made by the 'ort' strategy. + arch/powerpc/include/asm/mmu.h | 9 +----- + include/linux/init.h | 4 --- + init/main.c | 21 +++++--------- + kernel/module/internal.h | 6 ++-- + kernel/module/main.c | 20 +++++++++++--- + kernel/module/strict_rwx.c | 63 +++++++++++++++++++++++++++--------------- + lib/test_kmod.c | 6 +++- + 7 files changed, 73 insertions(+), 56 deletions(-) +Merging input/next (d03f030115fe Input: gameport - make gameport_bus const) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/dtor/input.git input/next +Merge made by the 'ort' strategy. 
+ .../bindings/input/touchscreen/goodix,gt9916.yaml | 95 +++ + .../bindings/input/touchscreen/goodix.yaml | 5 +- + .../bindings/input/touchscreen/melfas,mms114.yaml | 6 +- + .../bindings/input/touchscreen/silead,gsl1680.yaml | 2 +- + drivers/input/gameport/gameport.c | 4 +- + drivers/input/input-leds.c | 8 +- + drivers/input/input.c | 14 +- + drivers/input/keyboard/bcm-keypad.c | 2 +- + drivers/input/keyboard/matrix_keypad.c | 170 ++--- + drivers/input/misc/88pm80x_onkey.c | 14 +- + drivers/input/mouse/Kconfig | 12 - + drivers/input/mouse/Makefile | 1 - + drivers/input/mouse/navpoint.c | 350 ---------- + drivers/input/rmi4/rmi_driver.c | 6 +- + drivers/input/touchscreen/Kconfig | 31 + + drivers/input/touchscreen/Makefile | 3 + + drivers/input/touchscreen/goodix_berlin.h | 24 + + drivers/input/touchscreen/goodix_berlin_core.c | 755 +++++++++++++++++++++ + drivers/input/touchscreen/goodix_berlin_i2c.c | 75 ++ + drivers/input/touchscreen/goodix_berlin_spi.c | 178 +++++ + include/linux/input/navpoint.h | 8 - + 21 files changed, 1246 insertions(+), 517 deletions(-) + create mode 100644 Documentation/devicetree/bindings/input/touchscreen/goodix,gt9916.yaml + delete mode 100644 drivers/input/mouse/navpoint.c + create mode 100644 drivers/input/touchscreen/goodix_berlin.h + create mode 100644 drivers/input/touchscreen/goodix_berlin_core.c + create mode 100644 drivers/input/touchscreen/goodix_berlin_i2c.c + create mode 100644 drivers/input/touchscreen/goodix_berlin_spi.c + delete mode 100644 include/linux/input/navpoint.h +Merging block/for-next (40192a566897 Merge branch 'for-6.9/block' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.dk/linux-block.git block/for-next +Auto-merging block/blk-iocost.c +Auto-merging block/blk-wbt.c +Auto-merging block/blk.h +CONFLICT (content): Merge conflict in block/blk.h +Auto-merging block/genhd.c +Auto-merging drivers/block/pktcdvd.c +Auto-merging drivers/block/sunvdc.c +Auto-merging drivers/block/virtio_blk.c +Auto-merging drivers/block/zram/zram_drv.c +Auto-merging drivers/md/bcache/super.c +Auto-merging drivers/md/dm.c +Auto-merging drivers/md/md.c +Auto-merging drivers/nvdimm/pmem.c +Auto-merging drivers/nvme/host/core.c +Auto-merging drivers/nvme/host/nvme.h +Auto-merging drivers/s390/block/dasd.c +Auto-merging drivers/s390/block/dasd_genhd.c +Auto-merging drivers/s390/block/dasd_int.h +Auto-merging drivers/s390/block/dasd_ioctl.c +Auto-merging drivers/s390/block/dcssblk.c +Auto-merging drivers/ufs/core/ufshcd.c +Auto-merging fs/btrfs/zoned.c +Auto-merging fs/f2fs/segment.c +Auto-merging fs/internal.h +Auto-merging fs/open.c +Auto-merging fs/zonefs/super.c +Auto-merging include/linux/blkdev.h +Auto-merging include/linux/sched.h +CONFLICT (content): Merge conflict in include/linux/sched.h +Resolved 'block/blk.h' using previous resolution. +Resolved 'include/linux/sched.h' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +[master 732d072f23d8] Merge branch 'for-next' of git://git.kernel.dk/linux-block.git +$ git diff -M --stat --summary HEAD^.. 
+ arch/m68k/emu/nfblock.c | 10 +- + arch/um/drivers/ubd_kern.c | 8 +- + arch/xtensa/platforms/iss/simdisk.c | 8 +- + block/bfq-cgroup.c | 14 +- + block/bfq-iosched.c | 28 +- + block/bio-integrity.c | 1 + + block/bio.c | 26 +- + block/blk-cgroup.c | 2 +- + block/blk-cgroup.h | 1 + + block/blk-core.c | 33 +- + block/blk-flush.c | 2 +- + block/blk-integrity.c | 1 + + block/blk-iocost.c | 8 +- + block/blk-iolatency.c | 6 +- + block/blk-mq.c | 148 ++++----- + block/blk-settings.c | 278 +++++++++++++---- + block/blk-sysfs.c | 59 ++-- + block/blk-throttle.c | 10 +- + block/blk-wbt.c | 6 +- + block/blk-zoned.c | 19 +- + block/blk.h | 72 ++++- + block/bsg-lib.c | 2 +- + block/genhd.c | 12 +- + block/t10-pi.c | 72 +++-- + drivers/block/amiflop.c | 2 +- + drivers/block/aoe/aoeblk.c | 15 +- + drivers/block/ataflop.c | 2 +- + drivers/block/brd.c | 26 +- + drivers/block/drbd/drbd_main.c | 6 +- + drivers/block/drbd/drbd_state.c | 24 +- + drivers/block/drbd/drbd_state_change.h | 8 +- + drivers/block/floppy.c | 17 +- + drivers/block/loop.c | 75 ++--- + drivers/block/mtip32xx/mtip32xx.c | 13 +- + drivers/block/n64cart.c | 12 +- + drivers/block/nbd.c | 19 +- + drivers/block/null_blk/main.c | 540 +++++++++------------------------ + drivers/block/null_blk/null_blk.h | 22 +- + drivers/block/null_blk/trace.h | 5 +- + drivers/block/null_blk/zoned.c | 25 +- + drivers/block/pktcdvd.c | 7 +- + drivers/block/ps3disk.c | 17 +- + drivers/block/ps3vram.c | 6 +- + drivers/block/rbd.c | 31 +- + drivers/block/rnbd/rnbd-clt.c | 64 ++-- + drivers/block/sunvdc.c | 18 +- + drivers/block/swim.c | 2 +- + drivers/block/swim3.c | 2 +- + drivers/block/ublk_drv.c | 90 +++--- + drivers/block/virtio_blk.c | 443 ++++++++++++++------------- + drivers/block/xen-blkfront.c | 2 +- + drivers/block/z2ram.c | 2 +- + drivers/block/zram/zram_drv.c | 51 ++-- + drivers/cdrom/gdrom.c | 14 +- + drivers/md/bcache/super.c | 48 +-- + drivers/md/dm-zoned-metadata.c | 5 +- + drivers/md/dm.c | 4 +- + drivers/md/md-linear.h | 17 -- + drivers/md/md-multipath.h | 32 -- + drivers/md/md.c | 55 ++-- + drivers/md/raid5.c | 6 +- + drivers/memstick/core/ms_block.c | 14 +- + drivers/memstick/core/mspro_block.c | 15 +- + drivers/mmc/core/queue.c | 105 ++++--- + drivers/mtd/mtd_blkdevs.c | 12 +- + drivers/mtd/ubi/block.c | 6 +- + drivers/nvdimm/btt.c | 14 +- + drivers/nvdimm/pmem.c | 14 +- + drivers/nvme/host/apple.c | 2 +- + drivers/nvme/host/core.c | 16 +- + drivers/nvme/host/multipath.c | 6 +- + drivers/nvme/host/nvme.h | 1 + + drivers/nvme/target/zns.c | 5 +- + drivers/s390/block/dasd.c | 106 +++---- + drivers/s390/block/dasd_3990_erp.c | 80 ++--- + drivers/s390/block/dasd_alias.c | 8 - + drivers/s390/block/dasd_devmap.c | 34 +-- + drivers/s390/block/dasd_diag.c | 4 - + drivers/s390/block/dasd_eckd.c | 157 ++++------ + drivers/s390/block/dasd_eer.c | 7 - + drivers/s390/block/dasd_erp.c | 9 +- + drivers/s390/block/dasd_fba.c | 55 ++-- + drivers/s390/block/dasd_genhd.c | 7 +- + drivers/s390/block/dasd_int.h | 29 -- + drivers/s390/block/dasd_ioctl.c | 6 - + drivers/s390/block/dasd_proc.c | 5 - + drivers/s390/block/dcssblk.c | 10 +- + drivers/s390/block/scm_blk.c | 17 +- + drivers/scsi/scsi_scan.c | 2 +- + drivers/ufs/core/ufshcd.c | 2 +- + fs/btrfs/zoned.c | 35 ++- + fs/f2fs/segment.c | 15 +- + fs/internal.h | 1 + + fs/open.c | 65 ++-- + fs/zonefs/super.c | 2 +- + include/linux/blk-integrity.h | 1 + + include/linux/blk-mq.h | 10 +- + include/linux/blk_types.h | 42 --- + include/linux/blkdev.h | 66 +++- + include/linux/io_uring_types.h | 130 ++++---- + 
include/linux/sched.h | 2 +- + include/trace/events/io_uring.h | 30 +- + include/uapi/linux/io_uring.h | 13 + + io_uring/Makefile | 3 +- + io_uring/cancel.c | 3 +- + io_uring/cancel.h | 10 + + io_uring/filetable.h | 2 +- + io_uring/io_uring.c | 229 +++++++------- + io_uring/io_uring.h | 77 ++++- + io_uring/kbuf.c | 16 +- + io_uring/kbuf.h | 41 ++- + io_uring/napi.c | 332 ++++++++++++++++++++ + io_uring/napi.h | 104 +++++++ + io_uring/opdef.c | 10 + + io_uring/poll.c | 14 +- + io_uring/register.c | 13 + + io_uring/rsrc.h | 2 - + io_uring/rw.c | 9 +- + io_uring/sqpoll.c | 42 ++- + io_uring/truncate.c | 48 +++ + io_uring/truncate.h | 4 + + io_uring/uring_cmd.c | 1 + + io_uring/xattr.c | 2 +- + kernel/sched/core.c | 6 +- + 124 files changed, 2578 insertions(+), 2103 deletions(-) + delete mode 100644 drivers/md/md-linear.h + delete mode 100644 drivers/md/md-multipath.h + create mode 100644 io_uring/napi.c + create mode 100644 io_uring/napi.h + create mode 100644 io_uring/truncate.c + create mode 100644 io_uring/truncate.h +Merging device-mapper/for-next (10e8baf7b3eb Merge branch 'dm-vdo-wip' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git device-mapper/for-next +Auto-merging MAINTAINERS +Auto-merging drivers/md/Kconfig +Auto-merging drivers/md/dm.c +Merge made by the 'ort' strategy. + Documentation/admin-guide/device-mapper/index.rst | 2 + + .../admin-guide/device-mapper/vdo-design.rst | 633 +++ + Documentation/admin-guide/device-mapper/vdo.rst | 388 ++ + MAINTAINERS | 8 + + drivers/md/Kconfig | 2 + + drivers/md/Makefile | 1 + + drivers/md/dm-bio-prison-v1.c | 2 +- + drivers/md/dm-bufio.c | 74 +- + drivers/md/dm-cache-policy-smq.c | 2 +- + drivers/md/dm-crypt.c | 103 +- + drivers/md/dm-dust.c | 2 +- + drivers/md/dm-ebs-target.c | 2 +- + drivers/md/dm-flakey.c | 2 +- + drivers/md/dm-integrity.c | 105 +- + drivers/md/dm-io.c | 23 +- + drivers/md/dm-ioctl.c | 2 +- + drivers/md/dm-kcopyd.c | 4 +- + drivers/md/dm-log-userspace-base.c | 2 +- + drivers/md/dm-log.c | 6 +- + drivers/md/dm-mpath.c | 2 +- + drivers/md/dm-ps-round-robin.c | 2 +- + drivers/md/dm-raid.c | 4 +- + drivers/md/dm-raid1.c | 6 +- + drivers/md/dm-region-hash.c | 2 +- + drivers/md/dm-snap-persistent.c | 4 +- + drivers/md/dm-thin.c | 22 +- + drivers/md/dm-vdo/Kconfig | 17 + + drivers/md/dm-vdo/Makefile | 55 + + drivers/md/dm-vdo/action-manager.c | 388 ++ + drivers/md/dm-vdo/action-manager.h | 110 + + drivers/md/dm-vdo/admin-state.c | 506 ++ + drivers/md/dm-vdo/admin-state.h | 178 + + drivers/md/dm-vdo/block-map.c | 3320 +++++++++++++ + drivers/md/dm-vdo/block-map.h | 394 ++ + drivers/md/dm-vdo/completion.c | 140 + + drivers/md/dm-vdo/completion.h | 152 + + drivers/md/dm-vdo/constants.h | 96 + + drivers/md/dm-vdo/cpu.h | 59 + + drivers/md/dm-vdo/data-vio.c | 2065 ++++++++ + drivers/md/dm-vdo/data-vio.h | 669 +++ + drivers/md/dm-vdo/dedupe.c | 3007 ++++++++++++ + drivers/md/dm-vdo/dedupe.h | 120 + + drivers/md/dm-vdo/dm-vdo-target.c | 2924 +++++++++++ + drivers/md/dm-vdo/dump.c | 278 ++ + drivers/md/dm-vdo/dump.h | 17 + + drivers/md/dm-vdo/encodings.c | 1485 ++++++ + drivers/md/dm-vdo/encodings.h | 1299 +++++ + drivers/md/dm-vdo/errors.c | 313 ++ + drivers/md/dm-vdo/errors.h | 80 + + drivers/md/dm-vdo/flush.c | 560 +++ + drivers/md/dm-vdo/flush.h | 44 + + drivers/md/dm-vdo/funnel-queue.c | 171 + + drivers/md/dm-vdo/funnel-queue.h | 110 + + drivers/md/dm-vdo/funnel-workqueue.c | 638 +++ + drivers/md/dm-vdo/funnel-workqueue.h | 51 + + 
drivers/md/dm-vdo/indexer/chapter-index.c | 292 ++ + drivers/md/dm-vdo/indexer/chapter-index.h | 61 + + drivers/md/dm-vdo/indexer/config.c | 378 ++ + drivers/md/dm-vdo/indexer/config.h | 124 + + drivers/md/dm-vdo/indexer/delta-index.c | 1988 ++++++++ + drivers/md/dm-vdo/indexer/delta-index.h | 278 ++ + drivers/md/dm-vdo/indexer/funnel-requestqueue.c | 279 ++ + drivers/md/dm-vdo/indexer/funnel-requestqueue.h | 31 + + drivers/md/dm-vdo/indexer/geometry.c | 201 + + drivers/md/dm-vdo/indexer/geometry.h | 140 + + drivers/md/dm-vdo/indexer/hash-utils.h | 66 + + drivers/md/dm-vdo/indexer/index-layout.c | 1769 +++++++ + drivers/md/dm-vdo/indexer/index-layout.h | 43 + + drivers/md/dm-vdo/indexer/index-page-map.c | 175 + + drivers/md/dm-vdo/indexer/index-page-map.h | 50 + + drivers/md/dm-vdo/indexer/index-session.c | 728 +++ + drivers/md/dm-vdo/indexer/index-session.h | 85 + + drivers/md/dm-vdo/indexer/index.c | 1397 ++++++ + drivers/md/dm-vdo/indexer/index.h | 83 + + drivers/md/dm-vdo/indexer/indexer.h | 353 ++ + drivers/md/dm-vdo/indexer/io-factory.c | 415 ++ + drivers/md/dm-vdo/indexer/io-factory.h | 64 + + drivers/md/dm-vdo/indexer/murmurhash3.c | 175 + + drivers/md/dm-vdo/indexer/murmurhash3.h | 15 + + drivers/md/dm-vdo/indexer/open-chapter.c | 428 ++ + drivers/md/dm-vdo/indexer/open-chapter.h | 79 + + drivers/md/dm-vdo/indexer/radix-sort.c | 332 ++ + drivers/md/dm-vdo/indexer/radix-sort.h | 26 + + drivers/md/dm-vdo/indexer/sparse-cache.c | 626 +++ + drivers/md/dm-vdo/indexer/sparse-cache.h | 46 + + drivers/md/dm-vdo/indexer/volume-index.c | 1281 +++++ + drivers/md/dm-vdo/indexer/volume-index.h | 193 + + drivers/md/dm-vdo/indexer/volume.c | 1695 +++++++ + drivers/md/dm-vdo/indexer/volume.h | 173 + + drivers/md/dm-vdo/int-map.c | 709 +++ + drivers/md/dm-vdo/int-map.h | 39 + + drivers/md/dm-vdo/io-submitter.c | 477 ++ + drivers/md/dm-vdo/io-submitter.h | 47 + + drivers/md/dm-vdo/logger.c | 239 + + drivers/md/dm-vdo/logger.h | 101 + + drivers/md/dm-vdo/logical-zone.c | 375 ++ + drivers/md/dm-vdo/logical-zone.h | 89 + + drivers/md/dm-vdo/memory-alloc.c | 438 ++ + drivers/md/dm-vdo/memory-alloc.h | 162 + + drivers/md/dm-vdo/message-stats.c | 772 +++ + drivers/md/dm-vdo/message-stats.h | 13 + + drivers/md/dm-vdo/numeric.h | 78 + + drivers/md/dm-vdo/packer.c | 784 +++ + drivers/md/dm-vdo/packer.h | 122 + + drivers/md/dm-vdo/permassert.c | 26 + + drivers/md/dm-vdo/permassert.h | 49 + + drivers/md/dm-vdo/physical-zone.c | 646 +++ + drivers/md/dm-vdo/physical-zone.h | 115 + + drivers/md/dm-vdo/priority-table.c | 226 + + drivers/md/dm-vdo/priority-table.h | 47 + + drivers/md/dm-vdo/recovery-journal.c | 1764 +++++++ + drivers/md/dm-vdo/recovery-journal.h | 316 ++ + drivers/md/dm-vdo/repair.c | 1756 +++++++ + drivers/md/dm-vdo/repair.h | 14 + + drivers/md/dm-vdo/slab-depot.c | 5095 ++++++++++++++++++++ + drivers/md/dm-vdo/slab-depot.h | 601 +++ + drivers/md/dm-vdo/statistics.h | 278 ++ + drivers/md/dm-vdo/status-codes.c | 123 + + drivers/md/dm-vdo/status-codes.h | 110 + + drivers/md/dm-vdo/string-utils.c | 22 + + drivers/md/dm-vdo/string-utils.h | 23 + + drivers/md/dm-vdo/thread-device.c | 34 + + drivers/md/dm-vdo/thread-device.h | 20 + + drivers/md/dm-vdo/thread-registry.c | 93 + + drivers/md/dm-vdo/thread-registry.h | 32 + + drivers/md/dm-vdo/thread-utils.c | 136 + + drivers/md/dm-vdo/thread-utils.h | 21 + + drivers/md/dm-vdo/time-utils.h | 28 + + drivers/md/dm-vdo/types.h | 393 ++ + drivers/md/dm-vdo/vdo.c | 1732 +++++++ + drivers/md/dm-vdo/vdo.h | 366 ++ + drivers/md/dm-vdo/vio.c | 501 ++ + 
drivers/md/dm-vdo/vio.h | 199 + + drivers/md/dm-vdo/wait-queue.c | 205 + + drivers/md/dm-vdo/wait-queue.h | 138 + + drivers/md/dm-verity-fec.c | 21 +- + drivers/md/dm-verity-target.c | 109 +- + drivers/md/dm-verity.h | 10 +- + drivers/md/dm-writecache.c | 10 +- + drivers/md/dm.c | 2 +- + drivers/md/persistent-data/dm-block-manager.c | 2 +- + include/linux/dm-bufio.h | 7 + + include/linux/dm-io.h | 3 +- + 143 files changed, 54307 insertions(+), 131 deletions(-) + create mode 100644 Documentation/admin-guide/device-mapper/vdo-design.rst + create mode 100644 Documentation/admin-guide/device-mapper/vdo.rst + create mode 100644 drivers/md/dm-vdo/Kconfig + create mode 100644 drivers/md/dm-vdo/Makefile + create mode 100644 drivers/md/dm-vdo/action-manager.c + create mode 100644 drivers/md/dm-vdo/action-manager.h + create mode 100644 drivers/md/dm-vdo/admin-state.c + create mode 100644 drivers/md/dm-vdo/admin-state.h + create mode 100644 drivers/md/dm-vdo/block-map.c + create mode 100644 drivers/md/dm-vdo/block-map.h + create mode 100644 drivers/md/dm-vdo/completion.c + create mode 100644 drivers/md/dm-vdo/completion.h + create mode 100644 drivers/md/dm-vdo/constants.h + create mode 100644 drivers/md/dm-vdo/cpu.h + create mode 100644 drivers/md/dm-vdo/data-vio.c + create mode 100644 drivers/md/dm-vdo/data-vio.h + create mode 100644 drivers/md/dm-vdo/dedupe.c + create mode 100644 drivers/md/dm-vdo/dedupe.h + create mode 100644 drivers/md/dm-vdo/dm-vdo-target.c + create mode 100644 drivers/md/dm-vdo/dump.c + create mode 100644 drivers/md/dm-vdo/dump.h + create mode 100644 drivers/md/dm-vdo/encodings.c + create mode 100644 drivers/md/dm-vdo/encodings.h + create mode 100644 drivers/md/dm-vdo/errors.c + create mode 100644 drivers/md/dm-vdo/errors.h + create mode 100644 drivers/md/dm-vdo/flush.c + create mode 100644 drivers/md/dm-vdo/flush.h + create mode 100644 drivers/md/dm-vdo/funnel-queue.c + create mode 100644 drivers/md/dm-vdo/funnel-queue.h + create mode 100644 drivers/md/dm-vdo/funnel-workqueue.c + create mode 100644 drivers/md/dm-vdo/funnel-workqueue.h + create mode 100644 drivers/md/dm-vdo/indexer/chapter-index.c + create mode 100644 drivers/md/dm-vdo/indexer/chapter-index.h + create mode 100644 drivers/md/dm-vdo/indexer/config.c + create mode 100644 drivers/md/dm-vdo/indexer/config.h + create mode 100644 drivers/md/dm-vdo/indexer/delta-index.c + create mode 100644 drivers/md/dm-vdo/indexer/delta-index.h + create mode 100644 drivers/md/dm-vdo/indexer/funnel-requestqueue.c + create mode 100644 drivers/md/dm-vdo/indexer/funnel-requestqueue.h + create mode 100644 drivers/md/dm-vdo/indexer/geometry.c + create mode 100644 drivers/md/dm-vdo/indexer/geometry.h + create mode 100644 drivers/md/dm-vdo/indexer/hash-utils.h + create mode 100644 drivers/md/dm-vdo/indexer/index-layout.c + create mode 100644 drivers/md/dm-vdo/indexer/index-layout.h + create mode 100644 drivers/md/dm-vdo/indexer/index-page-map.c + create mode 100644 drivers/md/dm-vdo/indexer/index-page-map.h + create mode 100644 drivers/md/dm-vdo/indexer/index-session.c + create mode 100644 drivers/md/dm-vdo/indexer/index-session.h + create mode 100644 drivers/md/dm-vdo/indexer/index.c + create mode 100644 drivers/md/dm-vdo/indexer/index.h + create mode 100644 drivers/md/dm-vdo/indexer/indexer.h + create mode 100644 drivers/md/dm-vdo/indexer/io-factory.c + create mode 100644 drivers/md/dm-vdo/indexer/io-factory.h + create mode 100644 drivers/md/dm-vdo/indexer/murmurhash3.c + create mode 100644 drivers/md/dm-vdo/indexer/murmurhash3.h + create 
mode 100644 drivers/md/dm-vdo/indexer/open-chapter.c + create mode 100644 drivers/md/dm-vdo/indexer/open-chapter.h + create mode 100644 drivers/md/dm-vdo/indexer/radix-sort.c + create mode 100644 drivers/md/dm-vdo/indexer/radix-sort.h + create mode 100644 drivers/md/dm-vdo/indexer/sparse-cache.c + create mode 100644 drivers/md/dm-vdo/indexer/sparse-cache.h + create mode 100644 drivers/md/dm-vdo/indexer/volume-index.c + create mode 100644 drivers/md/dm-vdo/indexer/volume-index.h + create mode 100644 drivers/md/dm-vdo/indexer/volume.c + create mode 100644 drivers/md/dm-vdo/indexer/volume.h + create mode 100644 drivers/md/dm-vdo/int-map.c + create mode 100644 drivers/md/dm-vdo/int-map.h + create mode 100644 drivers/md/dm-vdo/io-submitter.c + create mode 100644 drivers/md/dm-vdo/io-submitter.h + create mode 100644 drivers/md/dm-vdo/logger.c + create mode 100644 drivers/md/dm-vdo/logger.h + create mode 100644 drivers/md/dm-vdo/logical-zone.c + create mode 100644 drivers/md/dm-vdo/logical-zone.h + create mode 100644 drivers/md/dm-vdo/memory-alloc.c + create mode 100644 drivers/md/dm-vdo/memory-alloc.h + create mode 100644 drivers/md/dm-vdo/message-stats.c + create mode 100644 drivers/md/dm-vdo/message-stats.h + create mode 100644 drivers/md/dm-vdo/numeric.h + create mode 100644 drivers/md/dm-vdo/packer.c + create mode 100644 drivers/md/dm-vdo/packer.h + create mode 100644 drivers/md/dm-vdo/permassert.c + create mode 100644 drivers/md/dm-vdo/permassert.h + create mode 100644 drivers/md/dm-vdo/physical-zone.c + create mode 100644 drivers/md/dm-vdo/physical-zone.h + create mode 100644 drivers/md/dm-vdo/priority-table.c + create mode 100644 drivers/md/dm-vdo/priority-table.h + create mode 100644 drivers/md/dm-vdo/recovery-journal.c + create mode 100644 drivers/md/dm-vdo/recovery-journal.h + create mode 100644 drivers/md/dm-vdo/repair.c + create mode 100644 drivers/md/dm-vdo/repair.h + create mode 100644 drivers/md/dm-vdo/slab-depot.c + create mode 100644 drivers/md/dm-vdo/slab-depot.h + create mode 100644 drivers/md/dm-vdo/statistics.h + create mode 100644 drivers/md/dm-vdo/status-codes.c + create mode 100644 drivers/md/dm-vdo/status-codes.h + create mode 100644 drivers/md/dm-vdo/string-utils.c + create mode 100644 drivers/md/dm-vdo/string-utils.h + create mode 100644 drivers/md/dm-vdo/thread-device.c + create mode 100644 drivers/md/dm-vdo/thread-device.h + create mode 100644 drivers/md/dm-vdo/thread-registry.c + create mode 100644 drivers/md/dm-vdo/thread-registry.h + create mode 100644 drivers/md/dm-vdo/thread-utils.c + create mode 100644 drivers/md/dm-vdo/thread-utils.h + create mode 100644 drivers/md/dm-vdo/time-utils.h + create mode 100644 drivers/md/dm-vdo/types.h + create mode 100644 drivers/md/dm-vdo/vdo.c + create mode 100644 drivers/md/dm-vdo/vdo.h + create mode 100644 drivers/md/dm-vdo/vio.c + create mode 100644 drivers/md/dm-vdo/vio.h + create mode 100644 drivers/md/dm-vdo/wait-queue.c + create mode 100644 drivers/md/dm-vdo/wait-queue.h +Merging libata/for-next (1ab5b472493f Merge remote-tracking branch 'libata/for-6.9' into HEAD) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/libata/linux libata/for-next +Merge made by the 'ort' strategy. 
+ Documentation/devicetree/bindings/ata/ahci-mtk.txt | 51 --- + .../devicetree/bindings/ata/atmel-at91_cf.txt | 19 - + .../devicetree/bindings/ata/mediatek,mtk-ahci.yaml | 98 +++++ + drivers/ata/Kconfig | 5 +- + drivers/ata/ahci.c | 452 +++++++++++---------- + drivers/ata/ahci.h | 10 +- + drivers/ata/ahci_ceva.c | 125 +++--- + drivers/ata/libahci.c | 7 - + drivers/ata/libata-core.c | 4 + + drivers/ata/pata_parport/pata_parport.c | 2 +- + 10 files changed, 415 insertions(+), 358 deletions(-) + delete mode 100644 Documentation/devicetree/bindings/ata/ahci-mtk.txt + delete mode 100644 Documentation/devicetree/bindings/ata/atmel-at91_cf.txt + create mode 100644 Documentation/devicetree/bindings/ata/mediatek,mtk-ahci.yaml +Merging pcmcia/pcmcia-next (1bec7691b327 pcmcia: ds: make pcmcia_bus_type const) +$ git merge -m Merge branch 'pcmcia-next' of git://git.kernel.org/pub/scm/linux/kernel/git/brodo/linux.git pcmcia/pcmcia-next +Merge made by the 'ort' strategy. + drivers/pcmcia/cs_internal.h | 2 +- + drivers/pcmcia/ds.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) +Merging mmc/next (25e69172db8a mmc: davinci_mmc: Drop dangling variable) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc.git mmc/next +Auto-merging MAINTAINERS +Auto-merging drivers/mmc/core/queue.c +CONFLICT (content): Merge conflict in drivers/mmc/core/queue.c +Resolved 'drivers/mmc/core/queue.c' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +[master 7f4c36bf6cb4] Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc.git +$ git diff -M --stat --summary HEAD^.. + .../devicetree/bindings/mmc/fsl-imx-esdhc.yaml | 6 +- + .../devicetree/bindings/mmc/renesas,sdhi.yaml | 1 + + MAINTAINERS | 4 +- + drivers/memstick/core/memstick.c | 2 +- + drivers/mmc/core/block.c | 14 +-- + drivers/mmc/core/bus.c | 2 +- + drivers/mmc/core/host.c | 5 +- + drivers/mmc/core/queue.c | 3 - + drivers/mmc/core/sdio_bus.c | 2 +- + drivers/mmc/host/davinci_mmc.c | 59 ++++++------ + drivers/mmc/host/moxart-mmc.c | 90 +++++++++--------- + drivers/mmc/host/mvsdio.c | 71 ++++++++++---- + drivers/mmc/host/mxcmmc.c | 53 +++++++---- + drivers/mmc/host/omap.c | 53 +++++------ + drivers/mmc/host/renesas_sdhi.h | 3 +- + drivers/mmc/host/sdhci-esdhc-mcf.c | 12 ++- + drivers/mmc/host/sh_mmcif.c | 102 +++++++++++++-------- + drivers/mmc/host/wbsd.c | 2 - + 18 files changed, 272 insertions(+), 212 deletions(-) +Merging mfd/for-mfd-next (d5132d176d6f mfd: rc5t583: Convert to use maple tree register cache) +$ git merge -m Merge branch 'for-mfd-next' of git://git.kernel.org/pub/scm/linux/kernel/git/lee/mfd.git mfd/for-mfd-next +Auto-merging MAINTAINERS +Auto-merging drivers/spi/spi-cs42l43.c +Merge made by the 'ort' strategy. 
+ .../devicetree/bindings/mfd/atmel,hlcdc.yaml | 99 ++++++++++ + .../devicetree/bindings/mfd/atmel-hlcdc.txt | 56 ------ + Documentation/devicetree/bindings/mfd/iqs62x.yaml | 2 +- + .../devicetree/bindings/mfd/qcom,tcsr.yaml | 2 + + Documentation/devicetree/bindings/mfd/syscon.yaml | 2 + + MAINTAINERS | 6 + + drivers/mfd/ac100.c | 2 +- + drivers/mfd/as3711.c | 2 +- + drivers/mfd/as3722.c | 2 +- + drivers/mfd/axp20x.c | 4 +- + drivers/mfd/bcm590xx.c | 4 +- + drivers/mfd/bd9571mwv.c | 4 +- + drivers/mfd/cros_ec_dev.c | 9 + + drivers/mfd/cs42l43-i2c.c | 15 +- + drivers/mfd/cs42l43-sdw.c | 15 +- + drivers/mfd/cs42l43.c | 52 +++--- + drivers/mfd/cs42l43.h | 10 +- + drivers/mfd/da9052-core.c | 2 +- + drivers/mfd/da9055-core.c | 2 +- + drivers/mfd/da9062-core.c | 4 +- + drivers/mfd/da9063-i2c.c | 2 +- + drivers/mfd/da9150-core.c | 2 +- + drivers/mfd/intel-lpss-pci.c | 28 ++- + drivers/mfd/intel-lpss.c | 9 +- + drivers/mfd/intel-lpss.h | 14 +- + drivers/mfd/khadas-mcu.c | 2 +- + drivers/mfd/lochnagar-i2c.c | 4 +- + drivers/mfd/lpc_ich.c | 3 +- + drivers/mfd/mcp-core.c | 2 +- + drivers/mfd/omap-usb-host.c | 2 +- + drivers/mfd/rave-sp.c | 2 +- + drivers/mfd/rc5t583.c | 2 +- + drivers/mfd/rk8xx-spi.c | 2 +- + drivers/mfd/rn5t618.c | 2 +- + drivers/mfd/rohm-bd71828.c | 4 +- + drivers/mfd/rohm-bd718x7.c | 2 +- + drivers/mfd/rohm-bd9576.c | 2 +- + drivers/mfd/rsmu_i2c.c | 2 +- + drivers/mfd/si476x-prop.c | 2 +- + drivers/mfd/stmfx.c | 2 +- + drivers/mfd/stpmic1.c | 2 +- + drivers/mfd/wm5102-tables.c | 2 +- + drivers/mfd/wm5110-tables.c | 2 +- + drivers/mfd/wm831x-auxadc.c | 43 ++--- + drivers/mfd/wm8350-regmap.c | 2 +- + drivers/mfd/wm8400-core.c | 2 +- + drivers/mfd/wm97xx-core.c | 6 +- + drivers/spi/spi-cs42l43.c | 2 + + drivers/watchdog/Kconfig | 11 ++ + drivers/watchdog/Makefile | 1 + + drivers/watchdog/cros_ec_wdt.c | 204 +++++++++++++++++++++ + include/linux/mfd/cs42l43.h | 19 +- + include/linux/mfd/sun4i-gpadc.h | 4 +- + include/linux/platform_data/cros_ec_commands.h | 78 ++++---- + 54 files changed, 536 insertions(+), 226 deletions(-) + create mode 100644 Documentation/devicetree/bindings/mfd/atmel,hlcdc.yaml + delete mode 100644 Documentation/devicetree/bindings/mfd/atmel-hlcdc.txt + create mode 100644 drivers/watchdog/cros_ec_wdt.c +Merging backlight/for-backlight-next (770c0f4975fd dt-bindings: backlight: qcom-wled: Fix bouncing email addresses) +$ git merge -m Merge branch 'for-backlight-next' of git://git.kernel.org/pub/scm/linux/kernel/git/lee/backlight.git backlight/for-backlight-next +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. 
+ .../bindings/leds/backlight/kinetic,ktd2801.yaml | 46 ++++++++ + .../bindings/leds/backlight/qcom-wled.yaml | 4 +- + MAINTAINERS | 13 +++ + drivers/leds/Kconfig | 4 + + drivers/leds/Makefile | 3 + + drivers/leds/flash/Kconfig | 2 +- + drivers/leds/flash/leds-ktd2692.c | 116 +++++-------------- + drivers/leds/leds-expresswire.c | 68 +++++++++++ + drivers/video/backlight/Kconfig | 7 ++ + drivers/video/backlight/Makefile | 1 + + drivers/video/backlight/hx8357.c | 67 ++++++----- + drivers/video/backlight/ktd2801-backlight.c | 128 +++++++++++++++++++++ + drivers/video/backlight/ktz8866.c | 6 +- + drivers/video/backlight/mp3309c.c | 4 +- + include/linux/leds-expresswire.h | 36 ++++++ + 15 files changed, 374 insertions(+), 131 deletions(-) + create mode 100644 Documentation/devicetree/bindings/leds/backlight/kinetic,ktd2801.yaml + create mode 100644 drivers/leds/leds-expresswire.c + create mode 100644 drivers/video/backlight/ktd2801-backlight.c + create mode 100644 include/linux/leds-expresswire.h +Merging battery/for-next (a9b254892ce1 power: supply: axp288_fuel_gauge: Add STCK1A* Intel Compute Sticks to the deny-list) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-power-supply.git battery/for-next +Auto-merging drivers/power/supply/bq27xxx_battery_i2c.c +Merge made by the 'ort' strategy. + drivers/power/reset/as3722-poweroff.c | 30 +++--- + drivers/power/reset/atc260x-poweroff.c | 55 ++++------- + drivers/power/reset/axxia-reset.c | 16 ++- + drivers/power/reset/brcm-kona-reset.c | 11 +-- + drivers/power/reset/gemini-poweroff.c | 16 +-- + drivers/power/reset/msm-poweroff.c | 21 ++-- + drivers/power/reset/mt6323-poweroff.c | 26 ++--- + drivers/power/reset/regulator-poweroff.c | 36 +++---- + drivers/power/reset/restart-poweroff.c | 25 ++--- + drivers/power/reset/rmobile-reset.c | 35 ++----- + drivers/power/reset/syscon-poweroff.c | 66 ++++++------- + drivers/power/reset/tps65086-restart.c | 58 ++--------- + drivers/power/reset/xgene-reboot.c | 21 ++-- + drivers/power/supply/axp20x_usb_power.c | 147 +++++++++++++++++++++++++--- + drivers/power/supply/axp288_fuel_gauge.c | 10 +- + drivers/power/supply/bq27xxx_battery.c | 41 +++++--- + drivers/power/supply/bq27xxx_battery_i2c.c | 46 ++++----- + drivers/power/supply/da9030_battery.c | 6 +- + drivers/power/supply/da9052-battery.c | 4 +- + drivers/power/supply/da9150-charger.c | 72 ++++---------- + drivers/power/supply/ds2760_battery.c | 4 +- + drivers/power/supply/goldfish_battery.c | 24 ++--- + drivers/power/supply/lp8727_charger.c | 35 ++----- + drivers/power/supply/lp8788-charger.c | 21 ++-- + drivers/power/supply/max14577_charger.c | 8 +- + drivers/power/supply/max77693_charger.c | 10 +- + drivers/power/supply/max8925_power.c | 37 ++----- + drivers/power/supply/pcf50633-charger.c | 23 ++--- + drivers/power/supply/rt5033_battery.c | 14 +-- + drivers/power/supply/rx51_battery.c | 57 +++-------- + drivers/power/supply/tps65090-charger.c | 18 ++-- + drivers/power/supply/twl4030_madc_battery.c | 59 +++-------- + drivers/power/supply/wm831x_backup.c | 13 +-- + drivers/power/supply/wm831x_power.c | 24 ++--- + drivers/power/supply/wm8350_power.c | 30 ++---- + include/linux/power/bq27xxx_battery.h | 1 - + 36 files changed, 457 insertions(+), 663 deletions(-) +Merging regulator/for-next (32ca2f8f6696 Merge remote-tracking branch 'regulator/for-6.9' into regulator-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator.git regulator/for-next +Merge 
made by the 'ort' strategy. + .../bindings/regulator/mcp16502-regulator.txt | 144 ----------------- + .../bindings/regulator/microchip,mcp16502.yaml | 180 +++++++++++++++++++++ + .../regulator/qcom,usb-vbus-regulator.yaml | 9 +- + .../devicetree/bindings/regulator/ti,tps65132.yaml | 84 ++++++++++ + .../bindings/regulator/tps65132-regulator.txt | 46 ------ + drivers/regulator/core.c | 1 - + drivers/regulator/fixed-helper.c | 4 +- + drivers/regulator/lp873x-regulator.c | 3 +- + drivers/regulator/lp87565-regulator.c | 3 +- + drivers/regulator/pwm-regulator.c | 40 ++--- + drivers/regulator/qcom_smd-regulator.c | 19 +-- + 11 files changed, 304 insertions(+), 229 deletions(-) + delete mode 100644 Documentation/devicetree/bindings/regulator/mcp16502-regulator.txt + create mode 100644 Documentation/devicetree/bindings/regulator/microchip,mcp16502.yaml + create mode 100644 Documentation/devicetree/bindings/regulator/ti,tps65132.yaml + delete mode 100644 Documentation/devicetree/bindings/regulator/tps65132-regulator.txt +Merging security/next (1fc5baf574b7 Automated merge of 'dev' into 'next') +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/lsm.git security/next +Auto-merging fs/attr.c +Auto-merging fs/file_table.c +Auto-merging fs/namei.c +Auto-merging fs/nfsd/vfs.c +Auto-merging fs/open.c +Auto-merging fs/posix_acl.c +Auto-merging include/linux/lsm_hook_defs.h +Auto-merging include/linux/security.h +Auto-merging security/security.c +Auto-merging security/selinux/hooks.c +Merge made by the 'ort' strategy. + fs/attr.c | 5 +- + fs/file_table.c | 3 +- + fs/namei.c | 12 +- + fs/nfsd/vfs.c | 3 +- + fs/open.c | 1 - + fs/posix_acl.c | 5 +- + fs/xattr.c | 9 +- + include/linux/evm.h | 117 +-------- + include/linux/ima.h | 142 ----------- + include/linux/integrity.h | 27 --- + include/linux/lsm_hook_defs.h | 20 +- + include/linux/security.h | 59 +++++ + include/uapi/linux/lsm.h | 2 + + security/integrity/Makefile | 1 + + security/integrity/digsig_asymmetric.c | 23 -- + security/integrity/evm/Kconfig | 1 + + security/integrity/evm/evm.h | 19 ++ + security/integrity/evm/evm_crypto.c | 4 +- + security/integrity/evm/evm_main.c | 195 ++++++++++++--- + security/integrity/iint.c | 197 +-------------- + security/integrity/ima/Kconfig | 1 + + security/integrity/ima/Makefile | 2 +- + security/integrity/ima/ima.h | 148 ++++++++++-- + security/integrity/ima/ima_api.c | 23 +- + security/integrity/ima/ima_appraise.c | 66 ++++-- + security/integrity/ima/ima_iint.c | 142 +++++++++++ + security/integrity/ima/ima_init.c | 2 +- + security/integrity/ima/ima_main.c | 148 +++++++++--- + security/integrity/ima/ima_policy.c | 2 +- + security/integrity/integrity.h | 80 +------ + security/keys/key.c | 10 +- + security/security.c | 263 +++++++++++++-------- + security/selinux/hooks.c | 3 +- + security/smack/smack_lsm.c | 4 +- + .../testing/selftests/lsm/lsm_list_modules_test.c | 6 + + 35 files changed, 906 insertions(+), 839 deletions(-) + create mode 100644 security/integrity/ima/ima_iint.c +Merging apparmor/apparmor-next (8ead196be219 apparmor: Fix memory leak in unpack_profile()) +$ git merge -m Merge branch 'apparmor-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jj/linux-apparmor apparmor/apparmor-next +Already up to date. 
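Each "Merging <tree>/<branch>" stanza above is produced by the same mechanical loop: merge the tree's integration branch with a standardized message and record git's outcome, whether a clean 'ort' merge, a conflict, or nothing to do. A minimal sketch of that loop, assuming a made-up trees.list control file and remotes named after their trees (the real linux-next scripts are more involved):

  # Sketch only: merge each subsystem tree's branch in turn.
  while read name url branch; do
      git merge -m "Merge branch '$branch' of $url" "$name/$branch"
  done < trees.list

Trees with no new commits, like apparmor just above, simply report "Already up to date."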
+Merging integrity/next-integrity (85445b964290 integrity: eliminate unnecessary "Problem loading X.509 certificate" msg) +$ git merge -m Merge branch 'next-integrity' of git://git.kernel.org/pub/scm/linux/kernel/git/zohar/linux-integrity integrity/next-integrity +Merge made by the 'ort' strategy. + security/integrity/digsig.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) +Merging selinux/next (7c655bee5cd8 selinux: only filter copy-up xattrs following initialization) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/selinux.git selinux/next +Auto-merging security/selinux/hooks.c +Merge made by the 'ort' strategy. + security/selinux/hooks.c | 28 +++++++++++++++------------- + 1 file changed, 15 insertions(+), 13 deletions(-) +Merging smack/next (69b6d71052b5 Smack: use init_task_smack() in smack_cred_transfer()) +$ git merge -m Merge branch 'next' of git://github.com/cschaufler/smack-next smack/next +Auto-merging security/smack/smack_lsm.c +Merge made by the 'ort' strategy. + fs/ramfs/inode.c | 32 ++++++++++++- + security/smack/smack_lsm.c | 112 ++++++++++++++++++++++++--------------------- + 2 files changed, 92 insertions(+), 52 deletions(-) +Merging tomoyo/master (0bb80ecc33a8 Linux 6.6-rc1) +$ git merge -m Merge branch 'master' of https://scm.osdn.net/gitroot/tomoyo/tomoyo-test1.git tomoyo/master +Already up to date. +Merging tpmdd/next (4a25541b236f tpm: tis_i2c: Add compatible string nuvoton,npct75x) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd.git tpmdd/next +Auto-merging drivers/char/tpm/tpm_tis.c +Auto-merging drivers/char/tpm/tpm_tis_core.c +Merge made by the 'ort' strategy. + Documentation/devicetree/bindings/tpm/tcg,tpm_tis-spi.yaml | 1 + + drivers/char/tpm/tpm_ftpm_tee.c | 6 +++--- + drivers/char/tpm/tpm_tis.c | 1 + + drivers/char/tpm/tpm_tis_core.c | 3 +-- + drivers/char/tpm/tpm_tis_i2c.c | 2 ++ + drivers/char/tpm/tpm_tis_spi_main.c | 1 + + 6 files changed, 9 insertions(+), 5 deletions(-) +Merging watchdog/master (41bccc98fb79 Linux 6.8-rc2) +$ git merge -m Merge branch 'master' of git://www.linux-watchdog.org/linux-watchdog-next.git watchdog/master +Already up to date. +Merging iommu/next (05f64ad28da1 Merge branches 'arm/mediatek', 'arm/renesas', 'x86/amd' and 'core' into next) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git iommu/next +Auto-merging MAINTAINERS +Auto-merging drivers/iommu/Kconfig +Auto-merging drivers/iommu/intel/Kconfig +Merge made by the 'ort' strategy. 
+ MAINTAINERS | 1 - + drivers/iommu/Kconfig | 4 + + drivers/iommu/Makefile | 3 +- + drivers/iommu/amd/amd_iommu.h | 41 +- + drivers/iommu/amd/amd_iommu_types.h | 34 +- + drivers/iommu/amd/init.c | 12 +- + drivers/iommu/amd/io_pgtable_v2.c | 21 +- + drivers/iommu/amd/iommu.c | 636 +++++++++++------------- + drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 14 +- + drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 103 ++-- + drivers/iommu/intel/Kconfig | 1 + + drivers/iommu/intel/iommu.c | 28 +- + drivers/iommu/intel/iommu.h | 4 +- + drivers/iommu/intel/svm.c | 41 +- + drivers/iommu/io-pgfault.c | 467 +++++++++-------- + drivers/iommu/iommu-sva.c | 71 ++- + drivers/iommu/iommu-sva.h | 71 --- + drivers/iommu/iommu.c | 252 +--------- + drivers/iommu/iova.c | 143 +++--- + drivers/iommu/ipmmu-vmsa.c | 15 +- + drivers/iommu/mtk_iommu.c | 2 +- + drivers/iommu/mtk_iommu_v1.c | 4 +- + include/linux/iommu.h | 286 ++++++++--- + include/uapi/linux/iommu.h | 161 ------ + 24 files changed, 1044 insertions(+), 1371 deletions(-) + delete mode 100644 drivers/iommu/iommu-sva.h + delete mode 100644 include/uapi/linux/iommu.h +Merging audit/next (aa13b709084a audit: use KMEM_CACHE() instead of kmem_cache_create()) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/audit.git audit/next +Merge made by the 'ort' strategy. + kernel/audit.c | 4 +--- + kernel/auditfilter.c | 2 +- + 2 files changed, 2 insertions(+), 4 deletions(-) +Merging devicetree/for-next (2ff94f7ce292 docs: dt: writing-schema: document expectations on example DTS) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git devicetree/for-next +Auto-merging Documentation/devicetree/bindings/Makefile +Auto-merging Documentation/devicetree/bindings/trivial-devices.yaml +CONFLICT (content): Merge conflict in Documentation/devicetree/bindings/trivial-devices.yaml +Auto-merging Documentation/devicetree/bindings/vendor-prefixes.yaml +Auto-merging MAINTAINERS +Auto-merging drivers/of/base.c +Auto-merging drivers/of/property.c +Resolved 'Documentation/devicetree/bindings/trivial-devices.yaml' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +[master 4f6823c4bb27] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git +$ git diff -M --stat --summary HEAD^.. 
+ Documentation/devicetree/bindings/Makefile | 3 - + .../bindings/display/panel/visionox,r66451.yaml | 2 +- + .../devicetree/bindings/fpga/fpga-region.txt | 479 --------------------- + .../devicetree/bindings/fpga/fpga-region.yaml | 358 +++++++++++++++ + .../devicetree/bindings/gpio/mrvl-gpio.yaml | 2 +- + .../devicetree/bindings/i2c/i2c-demux-pinctrl.yaml | 3 +- + Documentation/devicetree/bindings/i2c/i2c-pxa.yaml | 2 +- + .../mediatek,mt6577-sysirq.yaml | 85 ++++ + .../interrupt-controller/mediatek,sysirq.txt | 44 -- + .../devicetree/bindings/misc/qcom,fastrpc.yaml | 2 + + .../devicetree/bindings/misc/xlnx,sd-fec.txt | 58 --- + .../devicetree/bindings/misc/xlnx,sd-fec.yaml | 140 ++++++ + .../devicetree/bindings/mux/mux-controller.yaml | 2 +- + .../devicetree/bindings/net/can/fsl,flexcan.yaml | 3 + + .../devicetree/bindings/rtc/sa1100-rtc.yaml | 2 +- + .../devicetree/bindings/submitting-patches.rst | 23 +- + .../devicetree/bindings/timer/mrvl,mmp-timer.yaml | 2 +- + .../devicetree/bindings/trivial-devices.yaml | 79 ++-- + .../devicetree/bindings/usb/cypress,hx3.yaml | 2 +- + .../devicetree/bindings/vendor-prefixes.yaml | 2 + + .../devicetree/bindings/writing-schema.rst | 30 +- + Documentation/misc-devices/xilinx_sdfec.rst | 2 +- + MAINTAINERS | 8 +- + drivers/of/base.c | 4 +- + drivers/of/property.c | 12 +- + include/dt-bindings/power/amlogic,c3-pwrc.h | 2 +- + include/linux/of_graph.h | 4 +- + 27 files changed, 694 insertions(+), 661 deletions(-) + delete mode 100644 Documentation/devicetree/bindings/fpga/fpga-region.txt + create mode 100644 Documentation/devicetree/bindings/fpga/fpga-region.yaml + create mode 100644 Documentation/devicetree/bindings/interrupt-controller/mediatek,mt6577-sysirq.yaml + delete mode 100644 Documentation/devicetree/bindings/interrupt-controller/mediatek,sysirq.txt + delete mode 100644 Documentation/devicetree/bindings/misc/xlnx,sd-fec.txt + create mode 100644 Documentation/devicetree/bindings/misc/xlnx,sd-fec.yaml +Merging dt-krzk/for-next (8c82b4eef297 ARM: dts: sti: minor whitespace cleanup around '=') +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-dt.git dt-krzk/for-next +Merge made by the 'ort' strategy. + arch/arm/boot/dts/marvell/dove-cubox.dts | 4 ++-- + arch/arm/boot/dts/marvell/mmp2-brownstone.dts | 2 +- + arch/arm/boot/dts/st/stih407-pinctrl.dtsi | 8 ++++---- + arch/arm/boot/dts/ti/davinci/da850.dtsi | 4 ++-- + 4 files changed, 9 insertions(+), 9 deletions(-) +Merging mailbox/for-next (cd795fb0c352 mailbox: mtk-cmdq: Add CMDQ driver support for mt8188) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jassibrar/mailbox.git mailbox/for-next +Already up to date. +Merging spi/for-next (55072343f1df Merge remote-tracking branch 'spi/for-6.9' into spi-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi.git spi/for-next +Auto-merging drivers/char/tpm/tpm_tis_spi_main.c +Auto-merging drivers/spi/spi-cs42l43.c +Auto-merging include/linux/spi/spi.h +Auto-merging include/media/v4l2-common.h +Merge made by the 'ort' strategy. 
+ .../devicetree/bindings/spi/samsung,spi.yaml | 2 + + .../devicetree/bindings/spi/spi-fsl-lpspi.yaml | 1 + + .../devicetree/bindings/spi/spi-nxp-fspi.yaml | 18 +- + Documentation/driver-api/driver-model/devres.rst | 2 +- + Documentation/spi/spi-summary.rst | 114 ++--- + drivers/char/tpm/tpm_tis_spi_main.c | 4 +- + drivers/fpga/ice40-spi.c | 4 +- + drivers/iio/adc/ad_sigma_delta.c | 14 +- + drivers/input/joystick/psxpad-spi.c | 4 +- + drivers/input/rmi4/rmi_spi.c | 2 +- + drivers/media/pci/mgb4/mgb4_core.c | 14 +- + drivers/media/pci/netup_unidvb/netup_unidvb_spi.c | 48 +- + drivers/media/usb/msi2500/msi2500.c | 38 +- + drivers/media/v4l2-core/v4l2-spi.c | 4 +- + drivers/misc/gehc-achc.c | 8 +- + drivers/mmc/host/mmc_spi.c | 6 +- + drivers/mtd/devices/mtd_dataflash.c | 2 +- + drivers/net/ethernet/micrel/ks8851_spi.c | 4 +- + drivers/net/ethernet/vertexcom/mse102x.c | 2 +- + drivers/net/ieee802154/ca8210.c | 2 +- + drivers/net/wireless/marvell/libertas/if_spi.c | 2 +- + drivers/platform/chrome/cros_ec_spi.c | 8 +- + drivers/spi/Kconfig | 2 +- + drivers/spi/spi-ath79.c | 4 +- + drivers/spi/spi-au1550.c | 2 +- + drivers/spi/spi-axi-spi-engine.c | 85 ++-- + drivers/spi/spi-bcm2835.c | 27 +- + drivers/spi/spi-bitbang.c | 64 +-- + drivers/spi/spi-butterfly.c | 6 +- + drivers/spi/spi-cadence-quadspi.c | 7 +- + drivers/spi/spi-cavium.c | 6 +- + drivers/spi/spi-cavium.h | 2 +- + drivers/spi/spi-cs42l43.c | 22 +- + drivers/spi/spi-davinci.c | 6 +- + drivers/spi/spi-dw-dma.c | 2 +- + drivers/spi/spi-fsl-dspi.c | 15 +- + drivers/spi/spi-fsl-lib.c | 14 +- + drivers/spi/spi-geni-qcom.c | 2 +- + drivers/spi/spi-gpio.c | 2 +- + drivers/spi/spi-intel.c | 34 +- + drivers/spi/spi-lm70llp.c | 6 +- + drivers/spi/spi-loopback-test.c | 4 +- + drivers/spi/spi-mt65xx.c | 5 + + drivers/spi/spi-nxp-fspi.c | 2 +- + drivers/spi/spi-oc-tiny.c | 6 +- + drivers/spi/spi-omap-uwire.c | 4 +- + drivers/spi/spi-pci1xxxx.c | 514 +++++++++++++++++++-- + drivers/spi/spi-rockchip.c | 13 +- + drivers/spi/spi-s3c64xx.c | 191 +++++--- + drivers/spi/spi-sh-sci.c | 10 +- + drivers/spi/spi-slave-mt27xx.c | 2 +- + drivers/spi/spi-stm32.c | 4 +- + drivers/spi/spi-xilinx.c | 4 +- + drivers/spi/spi-xtensa-xtfpga.c | 2 +- + drivers/spi/spi.c | 148 ++---- + drivers/spi/spidev.c | 2 +- + drivers/staging/fbtft/fbtft-core.c | 4 +- + drivers/staging/greybus/spilib.c | 66 +-- + drivers/usb/gadget/udc/max3420_udc.c | 2 +- + drivers/video/fbdev/mmp/hw/mmp_spi.c | 26 +- + include/linux/amba/pl022.h | 4 +- + include/linux/spi/pxa2xx_spi.h | 3 +- + include/linux/spi/spi.h | 32 +- + include/linux/spi/spi_bitbang.h | 2 +- + include/linux/spi/spi_gpio.h | 4 +- + include/media/v4l2-common.h | 6 +- + 66 files changed, 1061 insertions(+), 610 deletions(-) +Merging tip/master (c56ac217a3c0 Merge branch into tip/master: 'x86/vdso') +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git tip/master +Auto-merging Documentation/admin-guide/kernel-parameters.txt +Auto-merging MAINTAINERS +Auto-merging arch/x86/Kconfig +Auto-merging arch/x86/Makefile +Auto-merging arch/x86/include/asm/pgtable.h +Auto-merging arch/x86/kernel/Makefile +Auto-merging arch/x86/kernel/alternative.c +Auto-merging arch/x86/kernel/cpu/mshyperv.c +Auto-merging arch/x86/kernel/kvm.c +Auto-merging arch/x86/kernel/smp.c +Auto-merging arch/x86/mm/dump_pagetables.c +Auto-merging arch/x86/mm/fault.c +Auto-merging arch/x86/mm/tlb.c +Auto-merging arch/x86/net/bpf_jit_comp.c +Auto-merging drivers/iommu/amd/amd_iommu.h +Auto-merging drivers/iommu/amd/init.c 
+Auto-merging drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +Auto-merging drivers/perf/arm_smmuv3_pmu.c +Auto-merging include/linux/bitmap.h +Auto-merging kernel/sched/core.c +Auto-merging kernel/sched/fair.c +Auto-merging net/sched/sch_api.c +Auto-merging scripts/Makefile.lib +Auto-merging scripts/generate_rust_target.rs +Merge made by the 'ort' strategy. + Documentation/admin-guide/hw-vuln/spectre.rst | 8 +- + Documentation/admin-guide/kernel-parameters.txt | 15 +- + Documentation/arch/x86/amd-memory-encryption.rst | 16 +- + Documentation/arch/x86/mds.rst | 34 +- + Documentation/arch/x86/pti.rst | 6 +- + Documentation/arch/x86/x86_64/fred.rst | 96 ++ + Documentation/arch/x86/x86_64/index.rst | 1 + + Documentation/process/maintainer-tip.rst | 34 +- + Documentation/virt/coco/sev-guest.rst | 52 + + MAINTAINERS | 10 + + arch/x86/Kbuild | 2 + + arch/x86/Kconfig | 66 +- + arch/x86/Makefile | 14 +- + arch/x86/boot/compressed/acpi.c | 2 + + arch/x86/boot/compressed/cmdline.c | 2 + + arch/x86/boot/compressed/efi.c | 2 + + arch/x86/boot/compressed/efi.h | 9 - + arch/x86/boot/compressed/ident_map_64.c | 6 +- + arch/x86/boot/compressed/misc.c | 44 +- + arch/x86/boot/compressed/misc.h | 3 + + arch/x86/boot/compressed/pgtable_64.c | 1 + + arch/x86/boot/compressed/sev.c | 5 + + arch/x86/coco/core.c | 7 +- + arch/x86/configs/i386_defconfig | 2 +- + arch/x86/entry/Makefile | 5 +- + arch/x86/entry/calling.h | 115 ++- + arch/x86/entry/entry.S | 27 + + arch/x86/entry/entry_32.S | 9 +- + arch/x86/entry/entry_64.S | 40 +- + arch/x86/entry/entry_64_compat.S | 1 + + arch/x86/entry/entry_64_fred.S | 131 +++ + arch/x86/entry/entry_fred.c | 294 ++++++ + arch/x86/entry/thunk_32.S | 34 +- + arch/x86/entry/thunk_64.S | 33 - + arch/x86/entry/vdso/Makefile | 32 +- + arch/x86/entry/vsyscall/vsyscall_64.c | 2 +- + arch/x86/include/asm/asm-prototypes.h | 1 + + arch/x86/include/asm/asm.h | 14 + + arch/x86/include/asm/barrier.h | 2 +- + arch/x86/include/asm/coco.h | 8 +- + arch/x86/include/asm/cpufeatures.h | 5 +- + arch/x86/include/asm/current.h | 9 +- + arch/x86/include/asm/desc.h | 2 - + arch/x86/include/asm/disabled-features.h | 26 +- + arch/x86/include/asm/efi.h | 14 +- + arch/x86/include/asm/entry-common.h | 1 - + arch/x86/include/asm/extable_fixup_types.h | 4 +- + arch/x86/include/asm/fpu/sched.h | 10 +- + arch/x86/include/asm/fred.h | 97 ++ + arch/x86/include/asm/ia32.h | 4 +- + arch/x86/include/asm/idtentry.h | 88 +- + arch/x86/include/asm/iommu.h | 1 + + arch/x86/include/asm/kexec.h | 1 - + arch/x86/include/asm/kvm-x86-ops.h | 1 + + arch/x86/include/asm/kvm_host.h | 1 + + arch/x86/include/asm/kvmclock.h | 2 - + arch/x86/include/asm/linkage.h | 16 +- + arch/x86/include/asm/mem_encrypt.h | 16 +- + arch/x86/include/asm/msr-index.h | 24 +- + arch/x86/include/asm/msr.h | 18 + + arch/x86/include/asm/nospec-branch.h | 80 +- + arch/x86/include/asm/page.h | 6 +- + arch/x86/include/asm/pci.h | 13 - + arch/x86/include/asm/percpu.h | 191 +++- + arch/x86/include/asm/pgalloc.h | 2 +- + arch/x86/include/asm/pgtable-3level.h | 2 +- + arch/x86/include/asm/pgtable.h | 18 +- + arch/x86/include/asm/pgtable_64.h | 3 +- + arch/x86/include/asm/preempt.h | 2 +- + arch/x86/include/asm/processor-flags.h | 2 +- + arch/x86/include/asm/processor.h | 3 + + arch/x86/include/asm/pti.h | 2 +- + arch/x86/include/asm/ptrace.h | 104 +- + arch/x86/include/asm/resctrl.h | 90 ++ + arch/x86/include/asm/setup_data.h | 32 + + arch/x86/include/asm/sev.h | 41 +- + arch/x86/include/asm/static_call.h | 2 +- + arch/x86/include/asm/switch_to.h | 8 +- + 
arch/x86/include/asm/text-patching.h | 2 + + arch/x86/include/asm/thread_info.h | 12 +- + arch/x86/include/asm/trap_pf.h | 20 +- + arch/x86/include/asm/trapnr.h | 12 + + arch/x86/include/asm/uaccess_64.h | 11 +- + arch/x86/include/asm/vmx.h | 17 +- + arch/x86/include/asm/x86_init.h | 2 - + arch/x86/include/uapi/asm/bootparam.h | 72 +- + arch/x86/include/uapi/asm/processor-flags.h | 7 + + arch/x86/include/uapi/asm/setup_data.h | 83 ++ + arch/x86/kernel/Makefile | 1 + + arch/x86/kernel/acpi/wakeup_64.S | 24 +- + arch/x86/kernel/alternative.c | 23 +- + arch/x86/kernel/asm-offsets.c | 2 +- + arch/x86/kernel/callthunks.c | 32 +- + arch/x86/kernel/cpu/acrn.c | 4 +- + arch/x86/kernel/cpu/amd.c | 37 +- + arch/x86/kernel/cpu/bugs.c | 63 +- + arch/x86/kernel/cpu/common.c | 46 +- + arch/x86/kernel/cpu/cpuid-deps.c | 2 + + arch/x86/kernel/cpu/mce/core.c | 28 +- + arch/x86/kernel/cpu/mshyperv.c | 15 +- + arch/x86/kernel/cpu/mtrr/generic.c | 3 + + arch/x86/kernel/cpu/resctrl/core.c | 120 +-- + arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 48 +- + arch/x86/kernel/cpu/resctrl/internal.h | 75 +- + arch/x86/kernel/cpu/resctrl/monitor.c | 501 +++++++--- + arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 15 +- + arch/x86/kernel/cpu/resctrl/rdtgroup.c | 388 +++++--- + arch/x86/kernel/crash.c | 4 + + arch/x86/kernel/dumpstack.c | 2 +- + arch/x86/kernel/espfix_64.c | 8 + + arch/x86/kernel/fred.c | 59 ++ + arch/x86/kernel/ftrace.c | 3 +- + arch/x86/kernel/head_32.S | 4 +- + arch/x86/kernel/head_64.S | 39 +- + arch/x86/kernel/idt.c | 4 +- + arch/x86/kernel/irqinit.c | 7 +- + arch/x86/kernel/kprobes/opt.c | 2 +- + arch/x86/kernel/kvm.c | 2 +- + arch/x86/kernel/kvmclock.c | 4 +- + arch/x86/kernel/ldt.c | 8 +- + arch/x86/kernel/nmi.c | 51 +- + arch/x86/kernel/process_32.c | 7 +- + arch/x86/kernel/process_64.c | 74 +- + arch/x86/kernel/sev-shared.c | 116 ++- + arch/x86/kernel/sev.c | 19 +- + arch/x86/kernel/smp.c | 10 +- + arch/x86/kernel/static_call.c | 2 +- + arch/x86/kernel/traps.c | 78 +- + arch/x86/kernel/tsc.c | 34 +- + arch/x86/kernel/vmlinux.lds.S | 11 +- + arch/x86/kvm/lapic.c | 5 +- + arch/x86/kvm/mmu/mmu.c | 2 +- + arch/x86/kvm/mmu/mmu_internal.h | 2 +- + arch/x86/kvm/svm/nested.c | 2 +- + arch/x86/kvm/svm/sev.c | 37 +- + arch/x86/kvm/svm/svm.c | 19 +- + arch/x86/kvm/svm/svm.h | 1 + + arch/x86/kvm/svm/vmenter.S | 4 +- + arch/x86/kvm/vmx/run_flags.h | 7 +- + arch/x86/kvm/vmx/vmenter.S | 9 +- + arch/x86/kvm/vmx/vmx.c | 34 +- + arch/x86/lib/Makefile | 2 +- + arch/x86/lib/cmpxchg16b_emu.S | 12 +- + arch/x86/lib/cmpxchg8b_emu.S | 30 +- + arch/x86/lib/insn-eval.c | 6 +- + arch/x86/lib/retpoline.S | 41 +- + arch/x86/lib/x86-opcode-map.txt | 4 +- + arch/x86/mm/Makefile | 2 +- + arch/x86/mm/debug_pagetables.c | 4 +- + arch/x86/mm/dump_pagetables.c | 4 +- + arch/x86/mm/extable.c | 78 ++ + arch/x86/mm/fault.c | 37 +- + arch/x86/mm/mem_encrypt.c | 56 +- + arch/x86/mm/mem_encrypt_identity.c | 40 +- + arch/x86/mm/pgtable.c | 4 +- + arch/x86/mm/tlb.c | 10 +- + arch/x86/net/bpf_jit_comp.c | 4 +- + arch/x86/net/bpf_jit_comp32.c | 2 +- + arch/x86/platform/efi/efi.c | 5 + + arch/x86/platform/pvh/enlighten.c | 1 + + arch/x86/purgatory/Makefile | 2 +- + arch/x86/virt/svm/Makefile | 3 + + arch/x86/virt/svm/sev.c | 560 +++++++++++ + arch/x86/xen/enlighten_pvh.c | 1 + + arch/x86/xen/vga.c | 1 - + arch/x86/xen/xen-asm.S | 10 +- + drivers/base/platform-msi.c | 119 ++- + drivers/clocksource/arm_arch_timer.c | 6 +- + drivers/crypto/ccp/Kconfig | 2 +- + drivers/crypto/ccp/sev-dev.c | 1156 ++++++++++++++++++++-- + drivers/crypto/ccp/sev-dev.h 
| 5 + + drivers/dma/mv_xor_v2.c | 8 +- + drivers/dma/qcom/hidma.c | 6 +- + drivers/iommu/amd/amd_iommu.h | 1 - + drivers/iommu/amd/init.c | 129 ++- + drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 5 +- + drivers/irqchip/irq-bcm6345-l1.c | 2 +- + drivers/irqchip/irq-bcm7038-l1.c | 2 +- + drivers/irqchip/irq-gic-v3-its.c | 4 +- + drivers/irqchip/irq-gic-v3.c | 57 +- + drivers/irqchip/irq-gic.c | 27 +- + drivers/irqchip/irq-loongson-eiointc.c | 22 +- + drivers/irqchip/irq-sifive-plic.c | 8 +- + drivers/mailbox/bcm-flexrm-mailbox.c | 8 +- + drivers/pci/msi/irqdomain.c | 2 +- + drivers/perf/arm_smmuv3_pmu.c | 4 +- + drivers/ptp/ptp_kvm_common.c | 10 +- + drivers/ptp/ptp_kvm_x86.c | 4 +- + drivers/ufs/host/ufs-qcom.c | 8 +- + drivers/xen/events/events_base.c | 2 +- + include/linux/amd-iommu.h | 6 +- + include/linux/atomic/atomic-arch-fallback.h | 46 +- + include/linux/atomic/atomic-instrumented.h | 68 +- + include/linux/atomic/atomic-long.h | 24 +- + include/linux/bitmap.h | 3 + + include/linux/clocksource_ids.h | 3 + + include/linux/compiler-gcc.h | 2 +- + include/linux/compiler.h | 2 +- + include/linux/hrtimer.h | 119 +-- + include/linux/hrtimer_defs.h | 104 ++ + include/linux/indirect_call_wrapper.h | 2 +- + include/linux/irq.h | 2 +- + include/linux/irqdomain.h | 17 + + include/linux/irqdomain_defs.h | 2 + + include/linux/irqhandler.h | 2 +- + include/linux/jiffies.h | 15 +- + include/linux/module.h | 2 +- + include/linux/msi.h | 24 +- + include/linux/objtool.h | 2 +- + include/linux/psp-sev.h | 321 +++++- + include/linux/pti.h | 2 +- + include/linux/ptp_kvm.h | 4 +- + include/linux/resctrl.h | 48 +- + include/linux/tick.h | 9 +- + include/linux/timekeeping.h | 10 +- + include/net/netfilter/nf_tables_core.h | 2 +- + include/net/tc_wrapper.h | 2 +- + include/uapi/linux/psp-sev.h | 59 ++ + kernel/cpu.c | 9 +- + kernel/irq/irq_sim.c | 28 +- + kernel/irq/irqdesc.c | 112 ++- + kernel/irq/irqdomain.c | 28 +- + kernel/irq/manage.c | 109 +- + kernel/irq/msi.c | 192 +++- + kernel/sched/core.c | 4 +- + kernel/sched/fair.c | 2 - + kernel/time/clockevents.c | 2 +- + kernel/time/clocksource.c | 2 +- + kernel/time/hrtimer.c | 18 +- + kernel/time/tick-sched.c | 10 + + kernel/time/timekeeping.c | 33 +- + kernel/time/timer.c | 45 + + kernel/trace/ring_buffer.c | 2 +- + net/netfilter/Makefile | 2 +- + net/netfilter/nf_tables_core.c | 6 +- + net/netfilter/nft_ct.c | 4 +- + net/netfilter/nft_lookup.c | 2 +- + net/sched/sch_api.c | 2 +- + scripts/Makefile.lib | 8 +- + scripts/Makefile.vmlinux_o | 2 +- + scripts/atomic/kerneldoc/add_unless | 1 + + scripts/atomic/kerneldoc/cmpxchg | 1 + + scripts/atomic/kerneldoc/dec_if_positive | 1 + + scripts/atomic/kerneldoc/dec_unless_positive | 1 + + scripts/atomic/kerneldoc/inc_not_zero | 1 + + scripts/atomic/kerneldoc/inc_unless_negative | 1 + + scripts/atomic/kerneldoc/try_cmpxchg | 3 +- + scripts/generate_rust_target.rs | 2 +- + scripts/mod/modpost.c | 2 +- + tools/arch/x86/include/asm/cpufeatures.h | 3 + + tools/arch/x86/include/asm/disabled-features.h | 18 +- + tools/arch/x86/include/asm/msr-index.h | 13 +- + tools/arch/x86/lib/x86-opcode-map.txt | 4 +- + tools/objtool/arch/x86/decode.c | 19 +- + tools/objtool/arch/x86/special.c | 2 +- + tools/objtool/check.c | 4 +- + 256 files changed, 6717 insertions(+), 1860 deletions(-) + create mode 100644 Documentation/arch/x86/x86_64/fred.rst + create mode 100644 arch/x86/entry/entry_64_fred.S + create mode 100644 arch/x86/entry/entry_fred.c + create mode 100644 arch/x86/include/asm/fred.h + create mode 100644 
arch/x86/include/asm/setup_data.h + create mode 100644 arch/x86/include/uapi/asm/setup_data.h + create mode 100644 arch/x86/kernel/fred.c + create mode 100644 arch/x86/virt/svm/Makefile + create mode 100644 arch/x86/virt/svm/sev.c +Merging clockevents/timers/drivers/next (9256cec7b4f3 clocksource/drivers/arm_global_timer: Remove stray tab) +$ git merge -m Merge branch 'timers/drivers/next' of git://git.linaro.org/people/daniel.lezcano/linux.git clockevents/timers/drivers/next +Merge made by the 'ort' strategy. + .../bindings/timer/nxp,sysctr-timer.yaml | 4 +- + .../devicetree/bindings/timer/renesas,ostm.yaml | 2 +- + .../devicetree/bindings/timer/renesas,tmu.yaml | 18 ++- + .../bindings/timer/samsung,exynos4210-mct.yaml | 2 + + drivers/clocksource/arm_global_timer.c | 4 +- + drivers/clocksource/timer-imx-gpt.c | 3 +- + drivers/clocksource/timer-imx-sysctr.c | 121 +++++++++++++++------ + drivers/clocksource/timer-stm32.c | 4 +- + drivers/clocksource/timer-ti-32k.c | 2 +- + 9 files changed, 117 insertions(+), 43 deletions(-) +Merging edac/edac-for-next (4cc8411bb56d Merge edac-amd-atl into for-next) +$ git merge -m Merge branch 'edac-for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git edac/edac-for-next +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. + .../admin-guide/RAS/address-translation.rst | 24 + + .../ras.rst => admin-guide/RAS/error-decoding.rst} | 11 +- + Documentation/admin-guide/RAS/index.rst | 7 + + .../admin-guide/{ras.rst => RAS/main.rst} | 10 +- + Documentation/admin-guide/index.rst | 2 +- + Documentation/index.rst | 1 - + MAINTAINERS | 15 +- + drivers/edac/Kconfig | 1 + + drivers/edac/amd64_edac.c | 286 +------- + drivers/edac/i10nm_base.c | 1 + + drivers/edac/igen6_edac.c | 2 + + drivers/edac/synopsys_edac.c | 4 +- + drivers/edac/versal_edac.c | 193 ++++- + drivers/ras/Kconfig | 13 + + drivers/ras/Makefile | 3 + + drivers/ras/amd/atl/Kconfig | 21 + + drivers/ras/amd/atl/Makefile | 18 + + drivers/ras/amd/atl/access.c | 133 ++++ + drivers/ras/amd/atl/core.c | 225 ++++++ + drivers/ras/amd/atl/dehash.c | 500 +++++++++++++ + drivers/ras/amd/atl/denormalize.c | 719 ++++++++++++++++++ + drivers/ras/amd/atl/internal.h | 306 ++++++++ + drivers/ras/amd/atl/map.c | 682 +++++++++++++++++ + drivers/ras/amd/atl/reg_fields.h | 606 +++++++++++++++ + drivers/ras/amd/atl/system.c | 288 ++++++++ + drivers/ras/amd/atl/umc.c | 341 +++++++++ + drivers/ras/amd/fmpm.c | 812 +++++++++++++++++++++ + drivers/ras/ras.c | 31 + + include/linux/ras.h | 18 + + 29 files changed, 4946 insertions(+), 327 deletions(-) + create mode 100644 Documentation/admin-guide/RAS/address-translation.rst + rename Documentation/{RAS/ras.rst => admin-guide/RAS/error-decoding.rst} (73%) + create mode 100644 Documentation/admin-guide/RAS/index.rst + rename Documentation/admin-guide/{ras.rst => RAS/main.rst} (99%) + create mode 100644 drivers/ras/amd/atl/Kconfig + create mode 100644 drivers/ras/amd/atl/Makefile + create mode 100644 drivers/ras/amd/atl/access.c + create mode 100644 drivers/ras/amd/atl/core.c + create mode 100644 drivers/ras/amd/atl/dehash.c + create mode 100644 drivers/ras/amd/atl/denormalize.c + create mode 100644 drivers/ras/amd/atl/internal.h + create mode 100644 drivers/ras/amd/atl/map.c + create mode 100644 drivers/ras/amd/atl/reg_fields.h + create mode 100644 drivers/ras/amd/atl/system.c + create mode 100644 drivers/ras/amd/atl/umc.c + create mode 100644 drivers/ras/amd/fmpm.c +Merging ftrace/for-next (4af12c95cbe8 Merge probes/for-next) +$ git merge -m Merge branch 
'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git ftrace/for-next +Merge made by the 'ort' strategy. +Merging rcu/rcu/next (b4c7a9cd36e1 rcutorture: Enable RCU priority boosting for TREE09) +$ git merge -m Merge branch 'rcu/next' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git rcu/rcu/next +Auto-merging Documentation/admin-guide/kernel-parameters.txt +Auto-merging arch/x86/kernel/nmi.c +Auto-merging arch/x86/kernel/tsc.c +Auto-merging include/linux/sched.h +Auto-merging kernel/fork.c +Auto-merging kernel/trace/ftrace.c +Merge made by the 'ort' strategy. + Documentation/RCU/checklist.rst | 32 +- + Documentation/RCU/rcu_dereference.rst | 5 +- + Documentation/RCU/whatisRCU.rst | 25 +- + Documentation/admin-guide/kernel-parameters.rst | 1 + + Documentation/admin-guide/kernel-parameters.txt | 489 +++++++++++---------- + Documentation/atomic_t.txt | 4 +- + Documentation/litmus-tests/README | 45 ++ + .../atomic/cmpxchg-fail-ordered-1.litmus | 34 ++ + .../atomic/cmpxchg-fail-ordered-2.litmus | 30 ++ + .../atomic/cmpxchg-fail-unordered-1.litmus | 33 ++ + .../atomic/cmpxchg-fail-unordered-2.litmus | 30 ++ + arch/x86/kernel/nmi.c | 2 +- + arch/x86/kernel/tsc.c | 2 +- + fs/proc/bootconfig.c | 12 +- + include/linux/rcu_sync.h | 1 - + include/linux/rcupdate.h | 6 +- + include/linux/sched.h | 2 + + include/linux/srcutiny.h | 2 + + init/init_task.c | 1 + + kernel/context_tracking.c | 4 + + kernel/fork.c | 1 + + kernel/rcu/Kconfig | 15 +- + kernel/rcu/rcu.h | 13 +- + kernel/rcu/rcuscale.c | 6 +- + kernel/rcu/rcutorture.c | 13 +- + kernel/rcu/srcutiny.c | 31 +- + kernel/rcu/srcutree.c | 24 +- + kernel/rcu/sync.c | 16 - + kernel/rcu/tasks.h | 110 +++-- + kernel/rcu/tree.c | 235 ++++++---- + kernel/rcu/tree.h | 20 +- + kernel/rcu/tree_exp.h | 83 +--- + kernel/rcu/tree_nocb.h | 69 +-- + kernel/rcu/tree_plugin.h | 52 +-- + kernel/trace/ftrace.c | 13 +- + tools/testing/selftests/rcutorture/bin/torture.sh | 4 +- + .../selftests/rcutorture/configs/rcu/TREE09 | 5 +- + 37 files changed, 888 insertions(+), 582 deletions(-) + create mode 100644 Documentation/litmus-tests/atomic/cmpxchg-fail-ordered-1.litmus + create mode 100644 Documentation/litmus-tests/atomic/cmpxchg-fail-ordered-2.litmus + create mode 100644 Documentation/litmus-tests/atomic/cmpxchg-fail-unordered-1.litmus + create mode 100644 Documentation/litmus-tests/atomic/cmpxchg-fail-unordered-2.litmus +Merging kvm/next (687d8f4c3dea Merge branch 'kvm-kconfig') +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/virt/kvm/kvm.git kvm/next +Auto-merging arch/arm64/Kconfig +Auto-merging arch/arm64/kvm/Kconfig +Auto-merging arch/mips/Kconfig +Auto-merging arch/s390/Kconfig +Auto-merging arch/x86/Kconfig +Auto-merging arch/x86/include/asm/idtentry.h +Auto-merging arch/x86/kernel/idt.c +Auto-merging arch/x86/kvm/Kconfig +Merge made by the 'ort' strategy. 
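"Merge made by the 'ort' strategy" refers to Git's default merge backend since v2.34 (ort, "Ostensibly Recursive's Twin", the rewrite of the older 'recursive' strategy); nothing special was requested here, but the same merge could name it explicitly:

  # Equivalent explicit form of the merges in this log (URL elided, illustrative).
  git merge -s ort -m "Merge branch 'next' of <url>" kvm/next

The diffstat that follows is the usual file-level summary of what the merge brought in.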
+ arch/arm64/Kconfig | 1 - + arch/arm64/include/uapi/asm/kvm.h | 15 +- + arch/arm64/kvm/Kconfig | 3 +- + arch/loongarch/Kconfig | 1 - + arch/loongarch/include/uapi/asm/kvm.h | 2 - + arch/loongarch/kvm/Kconfig | 2 +- + arch/mips/Kconfig | 18 +- + arch/mips/include/uapi/asm/kvm.h | 2 - + arch/mips/kvm/Kconfig | 3 +- + arch/powerpc/include/uapi/asm/kvm.h | 45 +- + arch/powerpc/kvm/Kconfig | 1 - + arch/riscv/include/uapi/asm/kvm.h | 1 - + arch/riscv/kvm/Kconfig | 1 + + arch/s390/Kconfig | 1 - + arch/s390/include/uapi/asm/kvm.h | 315 +++++++++++++- + arch/s390/kvm/Kconfig | 1 - + arch/x86/Kconfig | 1 - + arch/x86/include/asm/hardirq.h | 2 +- + arch/x86/include/asm/idtentry.h | 2 +- + arch/x86/include/asm/irq.h | 2 +- + arch/x86/include/asm/irq_vectors.h | 2 +- + arch/x86/include/uapi/asm/kvm.h | 278 ++++++++++++- + arch/x86/include/uapi/asm/kvm_para.h | 2 +- + arch/x86/kernel/idt.c | 2 +- + arch/x86/kernel/irq.c | 4 +- + arch/x86/kvm/Kconfig | 4 +- + drivers/vfio/vfio.h | 2 +- + drivers/vfio/vfio_main.c | 4 +- + include/linux/bits.h | 8 +- + include/uapi/asm-generic/bitsperlong.h | 4 + + include/uapi/linux/bits.h | 15 + + include/uapi/linux/kvm.h | 689 +------------------------------ + scripts/gdb/linux/constants.py.in | 6 +- + scripts/gdb/linux/interrupts.py | 2 +- + tools/arch/x86/include/asm/irq_vectors.h | 2 +- + virt/kvm/Kconfig | 7 +- + virt/kvm/kvm_main.c | 2 +- + 37 files changed, 700 insertions(+), 752 deletions(-) + create mode 100644 include/uapi/linux/bits.h +Merging kvm-arm/next (9e00a15ec81e Merge branch kvm-arm64/vm-configuration into kvmarm/next) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git kvm-arm/next +Auto-merging arch/arm64/include/asm/cpufeature.h +Auto-merging arch/arm64/include/asm/kvm_emulate.h +Auto-merging arch/arm64/kernel/cpufeature.c +CONFLICT (content): Merge conflict in arch/arm64/kernel/cpufeature.c +Auto-merging arch/arm64/kernel/head.S +Auto-merging arch/arm64/kvm/Kconfig +Auto-merging arch/arm64/kvm/guest.c +Auto-merging arch/arm64/kvm/mmu.c +Auto-merging arch/arm64/tools/cpucaps +Auto-merging arch/arm64/tools/sysreg +Auto-merging tools/testing/selftests/kvm/aarch64/arch_timer.c +Auto-merging tools/testing/selftests/kvm/aarch64/hypercalls.c +Auto-merging tools/testing/selftests/kvm/aarch64/page_fault_test.c +Auto-merging tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c +Resolved 'arch/arm64/kernel/cpufeature.c' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +[master 0933fd858639] Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git +$ git diff -M --stat --summary HEAD^.. 
+ arch/arm64/include/asm/cpu.h | 1 + + arch/arm64/include/asm/cpufeature.h | 1 + + arch/arm64/include/asm/kvm_arm.h | 4 +- + arch/arm64/include/asm/kvm_emulate.h | 3 +- + arch/arm64/include/asm/kvm_host.h | 98 +++++++- + arch/arm64/include/asm/kvm_mmu.h | 46 ++-- + arch/arm64/include/asm/kvm_nested.h | 1 - + arch/arm64/include/asm/sysreg.h | 5 +- + arch/arm64/kernel/cpufeature.c | 105 +++++++- + arch/arm64/kernel/cpuinfo.c | 1 + + arch/arm64/kernel/head.S | 23 +- + arch/arm64/kvm/Kconfig | 1 - + arch/arm64/kvm/arm.c | 10 +- + arch/arm64/kvm/check-res-bits.h | 121 +++++++++ + arch/arm64/kvm/debug.c | 3 +- + arch/arm64/kvm/emulate-nested.c | 231 ++++++++++++----- + arch/arm64/kvm/guest.c | 7 +- + arch/arm64/kvm/hyp/aarch32.c | 4 +- + arch/arm64/kvm/hyp/include/hyp/switch.h | 130 +++++----- + arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 24 +- + arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 2 +- + arch/arm64/kvm/mmu.c | 2 +- + arch/arm64/kvm/nested.c | 274 ++++++++++++++++++++- + arch/arm64/kvm/pmu-emul.c | 15 +- + arch/arm64/kvm/sys_regs.c | 264 ++++++++++++++++---- + arch/arm64/kvm/sys_regs.h | 2 + + arch/arm64/kvm/vgic/vgic-init.c | 4 +- + arch/arm64/kvm/vgic/vgic-its.c | 4 +- + arch/arm64/kvm/vgic/vgic.c | 2 +- + arch/arm64/tools/cpucaps | 1 + + arch/arm64/tools/sysreg | 45 +++- + include/kvm/arm_pmu.h | 11 - + tools/testing/selftests/kvm/aarch64/arch_timer.c | 4 +- + .../selftests/kvm/aarch64/debug-exceptions.c | 2 +- + tools/testing/selftests/kvm/aarch64/hypercalls.c | 4 +- + .../selftests/kvm/aarch64/page_fault_test.c | 2 +- + tools/testing/selftests/kvm/aarch64/set_id_regs.c | 18 +- + .../selftests/kvm/aarch64/vpmu_counter_access.c | 12 +- + 38 files changed, 1221 insertions(+), 266 deletions(-) + create mode 100644 arch/arm64/kvm/check-res-bits.h +Merging kvms390/next (7b2411e79367 KVM: s390: fix virtual vs physical address confusion) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git kvms390/next +Auto-merging arch/s390/kvm/kvm-s390.c +Merge made by the 'ort' strategy. + arch/s390/kvm/kvm-s390.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) +Merging kvm-ppc/topic/ppc-kvm (41bccc98fb79 Linux 6.8-rc2) +$ git merge -m Merge branch 'topic/ppc-kvm' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git kvm-ppc/topic/ppc-kvm +Already up to date. +Merging kvm-riscv/riscv_kvm_next (f072b272aa27 RISC-V: KVM: Use correct restricted types) +$ git merge -m Merge branch 'riscv_kvm_next' of https://github.com/kvm-riscv/linux.git kvm-riscv/riscv_kvm_next +Already up to date. +Merging kvm-x86/next (ca19f5c9d3c3 Merge branches 'asyncpf', 'asyncpf_abi', 'fixes', 'generic', 'misc', 'mmu', 'pmu', 'svm', 'vmx' and 'xen') +$ git merge -m Merge branch 'next' of https://github.com/kvm-x86/linux.git kvm-x86/next +Auto-merging arch/s390/kvm/gaccess.c +Auto-merging arch/s390/kvm/kvm-s390.c +Auto-merging arch/x86/include/asm/kvm_host.h +Auto-merging arch/x86/include/uapi/asm/kvm_para.h +Auto-merging arch/x86/kernel/kvm.c +Auto-merging arch/x86/kvm/mmu/mmu.c +Auto-merging arch/x86/kvm/svm/sev.c +CONFLICT (content): Merge conflict in arch/x86/kvm/svm/sev.c +Auto-merging arch/x86/kvm/vmx/vmx.c +Auto-merging include/uapi/linux/kvm.h +CONFLICT (content): Merge conflict in include/uapi/linux/kvm.h +Auto-merging virt/kvm/kvm_main.c +Resolved 'arch/x86/kvm/svm/sev.c' using previous resolution. +Resolved 'include/uapi/linux/kvm.h' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. 
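The two "Resolved ... using previous resolution" lines above come from git rerere ("reuse recorded resolution"): a conflict hunk that was hand-resolved once in an earlier run is recorded and replayed automatically the next time the identical conflict appears, which is why the merge can be committed below without re-editing sev.c or kvm.h. A hedged sketch of that machinery, not part of the original transcript:

  # rerere must be enabled for resolutions to be recorded and replayed.
  git config rerere.enabled true
  # Staging a hand-fixed conflicted file records the resolution:
  git add arch/x86/kvm/svm/sev.c
  # On a later recurrence, review what rerere replayed before committing:
  git rerere diff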
+$ git commit --no-edit -v -a +[master 607833f0c9d3] Merge branch 'next' of https://github.com/kvm-x86/linux.git +$ git diff -M --stat --summary HEAD^.. + Documentation/virt/kvm/api.rst | 51 +- + Documentation/virt/kvm/x86/msr.rst | 19 +- + arch/s390/kvm/diag.c | 2 +- + arch/s390/kvm/gaccess.c | 14 +- + arch/s390/kvm/kvm-s390.c | 4 +- + arch/s390/kvm/priv.c | 4 +- + arch/s390/kvm/sigp.c | 2 +- + arch/x86/include/asm/kvm-x86-pmu-ops.h | 4 +- + arch/x86/include/asm/kvm_host.h | 1 + + arch/x86/include/uapi/asm/kvm_para.h | 1 - + arch/x86/kernel/kvm.c | 11 +- + arch/x86/kvm/emulate.c | 30 +- + arch/x86/kvm/kvm_emulate.h | 2 +- + arch/x86/kvm/mmu/mmu.c | 4 +- + arch/x86/kvm/pmu.c | 163 ++++-- + arch/x86/kvm/pmu.h | 57 +- + arch/x86/kvm/svm/pmu.c | 22 +- + arch/x86/kvm/svm/sev.c | 58 +- + arch/x86/kvm/trace.h | 10 +- + arch/x86/kvm/vmx/nested.c | 2 +- + arch/x86/kvm/vmx/pmu_intel.c | 222 +++----- + arch/x86/kvm/vmx/vmx.c | 2 +- + arch/x86/kvm/x86.c | 60 +- + arch/x86/kvm/xen.c | 173 ++++-- + include/linux/kvm_host.h | 57 +- + include/linux/kvm_types.h | 8 - + tools/testing/selftests/kvm/Makefile | 2 + + .../testing/selftests/kvm/include/kvm_util_base.h | 4 + + tools/testing/selftests/kvm/include/x86_64/pmu.h | 97 ++++ + .../selftests/kvm/include/x86_64/processor.h | 148 +++-- + tools/testing/selftests/kvm/lib/kvm_util.c | 62 ++- + tools/testing/selftests/kvm/lib/x86_64/pmu.c | 31 ++ + tools/testing/selftests/kvm/lib/x86_64/processor.c | 15 +- + .../selftests/kvm/x86_64/pmu_counters_test.c | 617 +++++++++++++++++++++ + .../selftests/kvm/x86_64/pmu_event_filter_test.c | 143 ++--- + .../kvm/x86_64/smaller_maxphyaddr_emulation_test.c | 2 +- + .../selftests/kvm/x86_64/userspace_msr_exit_test.c | 29 +- + .../selftests/kvm/x86_64/vmx_pmu_caps_test.c | 2 +- + .../testing/selftests/kvm/x86_64/xen_shinfo_test.c | 59 +- + virt/kvm/async_pf.c | 73 ++- + virt/kvm/kvm_main.c | 4 +- + virt/kvm/pfncache.c | 230 ++++---- + 42 files changed, 1801 insertions(+), 700 deletions(-) + create mode 100644 tools/testing/selftests/kvm/include/x86_64/pmu.h + create mode 100644 tools/testing/selftests/kvm/lib/x86_64/pmu.c + create mode 100644 tools/testing/selftests/kvm/x86_64/pmu_counters_test.c +$ git am -3 ../patches/0001-fixup-for-code-moving-to-arch-x86-include-uapi-asm-k.patch +Applying: fixup for code moving to arch/x86/include/uapi/asm/kvm.h +$ git reset HEAD^ +Unstaged changes after reset: +M arch/x86/include/uapi/asm/kvm.h +$ git add -A . +$ git commit -v -a --amend +[master fc1fe24d1637] Merge branch 'next' of https://github.com/kvm-x86/linux.git + Date: Wed Feb 21 13:04:52 2024 +1100 +Merging xen-tip/linux-next (fa765c4b4aed xen/events: close evtchn after mapping cleanup) +$ git merge -m Merge branch 'linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git xen-tip/linux-next +Already up to date. +Merging percpu/for-next (2d9ad81ef935 Merge branch 'for-6.8-fixes' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu.git percpu/for-next +Merge made by the 'ort' strategy. +Merging workqueues/for-next (fd0a68a2337b workqueue, irq_work: Build fix for !CONFIG_IRQ_WORK) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git workqueues/for-next +Auto-merging Documentation/core-api/workqueue.rst +Auto-merging init/Kconfig +Auto-merging init/main.c +Merge made by the 'ort' strategy. 
+ Documentation/core-api/workqueue.rst | 43 +- + include/linux/async.h | 1 + + include/linux/workqueue.h | 93 ++- + init/Kconfig | 2 +- + init/main.c | 1 + + kernel/async.c | 17 +- + kernel/softirq.c | 3 + + kernel/workqueue.c | 1351 ++++++++++++++++++++++++++++------ + rust/kernel/workqueue.rs | 6 +- + tools/workqueue/wq_dump.py | 104 ++- + 10 files changed, 1322 insertions(+), 299 deletions(-) +Merging drivers-x86/for-next (c5211eacf332 platform/x86: ideapad-laptop: support Fn+R dual-function key) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git drivers-x86/for-next +Merge made by the 'ort' strategy. + .../admin-guide/laptops/thinkpad-acpi.rst | 7 +- + drivers/platform/mellanox/mlxbf-bootctl.c | 14 +- + drivers/platform/mellanox/mlxbf-pmc.c | 210 ++++---- + .../platform/surface/surface_aggregator_registry.c | 7 + + drivers/platform/x86/Kconfig | 6 - + drivers/platform/x86/amd/Kconfig | 2 +- + drivers/platform/x86/amd/hsmp.c | 584 ++++++++++++++++----- + drivers/platform/x86/asus-wmi.c | 1 - + drivers/platform/x86/dell/Kconfig | 3 - + drivers/platform/x86/dell/dell-laptop.c | 2 - + drivers/platform/x86/dell/dell-wmi-privacy.c | 1 - + drivers/platform/x86/hp/hp-wmi.c | 71 ++- + drivers/platform/x86/huawei-wmi.c | 1 - + drivers/platform/x86/ibm_rtl.c | 2 +- + drivers/platform/x86/ideapad-laptop.c | 2 + + drivers/platform/x86/intel/ifs/load.c | 2 +- + drivers/platform/x86/intel/ifs/runtest.c | 101 ++-- + drivers/platform/x86/silicom-platform.c | 7 +- + drivers/platform/x86/thinkpad_acpi.c | 29 +- + drivers/platform/x86/wmi.c | 82 ++- + include/trace/events/intel_ifs.h | 12 +- + 21 files changed, 782 insertions(+), 364 deletions(-) +Merging chrome-platform/for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/chrome-platform/linux.git chrome-platform/for-next +Already up to date. +Merging chrome-platform-firmware/for-firmware-next (8a0a62941a04 firmware: coreboot: Replace tag with id table in driver struct) +$ git merge -m Merge branch 'for-firmware-next' of git://git.kernel.org/pub/scm/linux/kernel/git/chrome-platform/linux.git chrome-platform-firmware/for-firmware-next +Merge made by the 'ort' strategy. + drivers/firmware/google/cbmem.c | 8 +++++++- + drivers/firmware/google/coreboot_table.c | 22 ++++++++++++++++++++-- + drivers/firmware/google/coreboot_table.h | 3 ++- + drivers/firmware/google/framebuffer-coreboot.c | 8 +++++++- + drivers/firmware/google/memconsole-coreboot.c | 8 +++++++- + drivers/firmware/google/vpd.c | 8 +++++++- + include/linux/mod_devicetable.h | 10 ++++++++++ + scripts/mod/devicetable-offsets.c | 3 +++ + scripts/mod/file2alias.c | 10 ++++++++++ + 9 files changed, 73 insertions(+), 7 deletions(-) +Merging hsi/for-next (a0e35a173a86 hsi: hsi_core: make hsi_bus_type const) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-hsi.git hsi/for-next +Merge made by the 'ort' strategy. + drivers/hsi/hsi_core.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) +Merging leds-lj/for-leds-next (12ce20e02e53 leds: trigger: netdev: Fix kernel panic on interface rename trig notify) +$ git merge -m Merge branch 'for-leds-next' of git://git.kernel.org/pub/scm/linux/kernel/git/lee/leds.git leds-lj/for-leds-next +Auto-merging drivers/leds/Kconfig +Auto-merging drivers/leds/flash/Kconfig +Merge made by the 'ort' strategy. 
+ .../ABI/testing/sysfs-class-led-trigger-netdev | 12 + + .../ABI/testing/sysfs-class-led-trigger-tty | 14 +- + .../devicetree/bindings/leds/leds-qcom-lpg.yaml | 82 ++++- + drivers/leds/Kconfig | 4 +- + drivers/leds/flash/Kconfig | 4 +- + drivers/leds/led-class.c | 6 + + drivers/leds/led-triggers.c | 38 ++- + drivers/leds/leds-aw200xx.c | 2 +- + drivers/leds/leds-pca963x.c | 28 ++ + drivers/leds/leds-spi-byte.c | 11 +- + drivers/leds/leds.h | 1 - + drivers/leds/rgb/leds-qcom-lpg.c | 364 +++++++++++++++++++-- + drivers/leds/trigger/ledtrig-audio.c | 2 + + drivers/leds/trigger/ledtrig-default-on.c | 1 + + drivers/leds/trigger/ledtrig-netdev.c | 102 +++++- + drivers/leds/trigger/ledtrig-panic.c | 23 +- + drivers/staging/greybus/Kconfig | 2 +- + drivers/staging/greybus/light.c | 21 -- + include/dt-bindings/leds/common.h | 3 + + include/linux/led-class-flash.h | 24 -- + include/linux/led-class-multicolor.h | 29 -- + include/linux/leds.h | 19 -- + 22 files changed, 612 insertions(+), 180 deletions(-) +Merging ipmi/for-next (296455ade1fd Merge tag 'char-misc-6.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc) +$ git merge -m Merge branch 'for-next' of git://github.com/cminyard/linux-ipmi.git ipmi/for-next +Already up to date. +Merging driver-core/driver-core-next (07749061b837 Merge 6.8-rc5 into driver-core-next) +$ git merge -m Merge branch 'driver-core-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git driver-core/driver-core-next +Auto-merging drivers/base/cpu.c +Auto-merging kernel/ksysfs.c +Auto-merging kernel/workqueue.c +Merge made by the 'ort' strategy. + drivers/base/component.c | 4 ++-- + drivers/base/cpu.c | 2 +- + fs/kernfs/dir.c | 31 ++++++++++++++++++++----------- + fs/kernfs/file.c | 8 +++++--- + fs/kernfs/kernfs-internal.h | 2 ++ + include/linux/cpu.h | 2 +- + include/linux/kernfs.h | 10 ++++++---- + include/linux/kobject.h | 2 +- + kernel/ksysfs.c | 2 +- + kernel/workqueue.c | 2 +- + lib/kobject_uevent.c | 24 +++++++++++------------- + 11 files changed, 51 insertions(+), 38 deletions(-) +Merging usb/usb-next (3bf0514dc6f3 Revert "xhci: add helper to stop endpoint and wait for completion") +$ git merge -m Merge branch 'usb-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git usb/usb-next +Auto-merging Documentation/devicetree/bindings/regulator/qcom,usb-vbus-regulator.yaml +CONFLICT (content): Merge conflict in Documentation/devicetree/bindings/regulator/qcom,usb-vbus-regulator.yaml +Auto-merging Documentation/devicetree/bindings/soc/qcom/qcom,pmic-glink.yaml +Auto-merging Documentation/devicetree/bindings/sound/qcom,sm8250.yaml +Auto-merging arch/arm64/boot/dts/qcom/qcm6490-fairphone-fp5.dts +Auto-merging arch/arm64/boot/dts/qcom/sc7280.dtsi +Auto-merging drivers/usb/dwc3/gadget.c +Auto-merging drivers/usb/roles/class.c +CONFLICT (content): Merge conflict in drivers/usb/roles/class.c +Auto-merging drivers/usb/typec/tcpm/tcpm.c +Auto-merging tools/testing/selftests/Makefile +Resolved 'Documentation/devicetree/bindings/regulator/qcom,usb-vbus-regulator.yaml' using previous resolution. +Resolved 'drivers/usb/roles/class.c' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +[master 155a1d46300e] Merge branch 'usb-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git +$ git diff -M --stat --summary HEAD^.. 
+ Documentation/ABI/testing/configfs-usb-gadget-ffs | 12 +- + Documentation/ABI/testing/sysfs-class-usb_role | 6 + + .../regulator/qcom,usb-vbus-regulator.yaml | 1 + + .../bindings/soc/qcom/qcom,pmic-glink.yaml | 1 + + .../devicetree/bindings/sound/qcom,q6usb.yaml | 55 + + .../devicetree/bindings/sound/qcom,sm8250.yaml | 15 + + .../devicetree/bindings/usb/ci-hdrc-usb2.yaml | 2 +- + .../devicetree/bindings/usb/fcs,fsa4480.yaml | 12 +- + .../devicetree/bindings/usb/generic-ehci.yaml | 1 + + .../devicetree/bindings/usb/gpio-sbu-mux.yaml | 12 +- + .../devicetree/bindings/usb/ite,it5205.yaml | 72 ++ + .../devicetree/bindings/usb/mediatek,mtu3.yaml | 5 +- + .../devicetree/bindings/usb/microchip,usb5744.yaml | 2 - + .../devicetree/bindings/usb/nxp,ptn36502.yaml | 12 +- + .../devicetree/bindings/usb/onnn,nb7vpq904m.yaml | 13 +- + .../devicetree/bindings/usb/qcom,pmic-typec.yaml | 32 +- + .../bindings/usb/qcom,wcd939x-usbss.yaml | 12 +- + .../devicetree/bindings/usb/ti,am62-usb.yaml | 8 +- + .../devicetree/bindings/usb/usb-nop-xceiv.yaml | 11 +- + .../devicetree/bindings/usb/usb-switch.yaml | 67 + + Documentation/devicetree/bindings/usb/usb.yaml | 2 + + Documentation/usb/functionfs.rst | 36 + + Documentation/usb/gadget-testing.rst | 8 + + drivers/phy/Kconfig | 1 + + drivers/phy/Makefile | 1 + + drivers/phy/phy-core.c | 47 + + drivers/phy/realtek/Kconfig | 32 + + drivers/phy/realtek/Makefile | 3 + + drivers/phy/realtek/phy-rtk-usb2.c | 1312 ++++++++++++++++++++ + drivers/phy/realtek/phy-rtk-usb3.c | 748 +++++++++++ + drivers/platform/chrome/cros_ec_typec.c | 19 + + drivers/usb/core/Kconfig | 27 + + drivers/usb/core/driver.c | 8 +- + drivers/usb/core/endpoint.c | 2 +- + drivers/usb/core/hcd.c | 20 +- + drivers/usb/core/hub.c | 29 + + drivers/usb/core/message.c | 2 +- + drivers/usb/core/phy.c | 120 ++ + drivers/usb/core/phy.h | 3 + + drivers/usb/core/port.c | 2 +- + drivers/usb/core/sysfs.c | 22 +- + drivers/usb/core/usb.c | 2 +- + drivers/usb/core/usb.h | 8 +- + drivers/usb/dwc3/core.h | 1 + + drivers/usb/dwc3/dwc3-of-simple.c | 3 +- + drivers/usb/dwc3/ep0.c | 1 + + drivers/usb/dwc3/gadget.c | 91 +- + drivers/usb/dwc3/gadget.h | 1 + + drivers/usb/dwc3/host.c | 50 + + drivers/usb/gadget/Kconfig | 1 + + drivers/usb/gadget/function/f_fs.c | 533 +++++++- + drivers/usb/gadget/function/u_ether.c | 2 +- + drivers/usb/gadget/function/uvc_video.c | 123 +- + drivers/usb/gadget/udc/core.c | 7 +- + drivers/usb/host/ehci-orion.c | 18 +- + drivers/usb/host/xhci-caps.h | 85 ++ + drivers/usb/host/xhci-mem.c | 2 +- + drivers/usb/host/xhci-port.h | 176 +++ + drivers/usb/host/xhci-ring.c | 200 +-- + drivers/usb/host/xhci.c | 28 +- + drivers/usb/host/xhci.h | 265 +--- + drivers/usb/image/mdc800.c | 1 - + drivers/usb/mtu3/mtu3_host.c | 30 + + drivers/usb/phy/phy-generic.c | 55 +- + drivers/usb/phy/phy.c | 2 +- + drivers/usb/roles/class.c | 43 +- + drivers/usb/storage/freecom.c | 1 - + drivers/usb/storage/sddr55.c | 4 +- + drivers/usb/typec/altmodes/displayport.c | 162 ++- + drivers/usb/typec/bus.c | 102 ++ + drivers/usb/typec/class.c | 59 + + drivers/usb/typec/class.h | 1 + + drivers/usb/typec/mux/Kconfig | 10 + + drivers/usb/typec/mux/Makefile | 1 + + drivers/usb/typec/mux/it5205.c | 294 +++++ + drivers/usb/typec/pd.c | 30 +- + drivers/usb/typec/tcpm/fusb302.c | 2 +- + drivers/usb/typec/tcpm/qcom/Makefile | 3 +- + drivers/usb/typec/tcpm/qcom/qcom_pmic_typec.c | 256 +--- + drivers/usb/typec/tcpm/qcom/qcom_pmic_typec.h | 27 + + .../usb/typec/tcpm/qcom/qcom_pmic_typec_pdphy.c | 159 ++- + 
.../usb/typec/tcpm/qcom/qcom_pmic_typec_pdphy.h | 94 +- + .../typec/tcpm/qcom/qcom_pmic_typec_pdphy_stub.c | 80 ++ + drivers/usb/typec/tcpm/qcom/qcom_pmic_typec_port.c | 290 ++++- + drivers/usb/typec/tcpm/qcom/qcom_pmic_typec_port.h | 172 +-- + drivers/usb/typec/tcpm/tcpci.c | 26 +- + drivers/usb/typec/tcpm/tcpci_maxim.h | 1 + + drivers/usb/typec/tcpm/tcpci_maxim_core.c | 38 +- + drivers/usb/typec/tcpm/tcpm.c | 1028 ++++++++++++--- + drivers/usb/typec/tcpm/wcove.c | 2 +- + drivers/usb/typec/ucsi/ucsi.c | 49 +- + drivers/usb/typec/ucsi/ucsi.h | 67 +- + drivers/usb/typec/ucsi/ucsi_ccg.c | 92 +- + drivers/usb/typec/ucsi/ucsi_glink.c | 1 + + include/linux/phy/phy.h | 21 + + include/linux/usb/audio-v2.h | 4 +- + include/linux/usb/gadget.h | 2 + + include/linux/usb/pd.h | 1 + + include/linux/usb/pd_vdo.h | 8 +- + include/linux/usb/tcpci.h | 13 + + include/linux/usb/tcpm.h | 16 +- + include/linux/usb/typec.h | 7 + + include/linux/usb/typec_altmode.h | 30 + + include/uapi/linux/usb/ch9.h | 2 + + include/uapi/linux/usb/functionfs.h | 41 + + tools/testing/selftests/Makefile | 1 + + tools/testing/selftests/devices/Makefile | 4 + + .../devices/boards/Dell Inc.,XPS 13 9300.yaml | 40 + + .../selftests/devices/boards/google,spherion.yaml | 50 + + tools/testing/selftests/devices/ksft.py | 90 ++ + .../selftests/devices/test_discoverable_devices.py | 318 +++++ + 111 files changed, 6886 insertions(+), 1336 deletions(-) + create mode 100644 Documentation/devicetree/bindings/sound/qcom,q6usb.yaml + create mode 100644 Documentation/devicetree/bindings/usb/ite,it5205.yaml + create mode 100644 Documentation/devicetree/bindings/usb/usb-switch.yaml + create mode 100644 drivers/phy/realtek/Kconfig + create mode 100644 drivers/phy/realtek/Makefile + create mode 100644 drivers/phy/realtek/phy-rtk-usb2.c + create mode 100644 drivers/phy/realtek/phy-rtk-usb3.c + create mode 100644 drivers/usb/host/xhci-caps.h + create mode 100644 drivers/usb/host/xhci-port.h + create mode 100644 drivers/usb/typec/mux/it5205.c + create mode 100644 drivers/usb/typec/tcpm/qcom/qcom_pmic_typec.h + create mode 100644 drivers/usb/typec/tcpm/qcom/qcom_pmic_typec_pdphy_stub.c + create mode 100644 tools/testing/selftests/devices/Makefile + create mode 100644 tools/testing/selftests/devices/boards/Dell Inc.,XPS 13 9300.yaml + create mode 100644 tools/testing/selftests/devices/boards/google,spherion.yaml + create mode 100644 tools/testing/selftests/devices/ksft.py + create mode 100755 tools/testing/selftests/devices/test_discoverable_devices.py +Merging thunderbolt/next (b4734507ac55 thunderbolt: Improve DisplayPort tunnel setup process to be more robust) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git thunderbolt/next +Auto-merging drivers/thunderbolt/switch.c +Auto-merging drivers/thunderbolt/tb_regs.h +Auto-merging drivers/thunderbolt/usb4.c +Merge made by the 'ort' strategy. 
+ drivers/thunderbolt/domain.c | 15 +- + drivers/thunderbolt/icm.c | 2 +- + drivers/thunderbolt/lc.c | 45 +++ + drivers/thunderbolt/nhi.c | 11 +- + drivers/thunderbolt/nvm.c | 4 +- + drivers/thunderbolt/path.c | 13 + + drivers/thunderbolt/switch.c | 138 ++++++- + drivers/thunderbolt/tb.c | 855 ++++++++++++++++++++++++++---------------- + drivers/thunderbolt/tb.h | 17 +- + drivers/thunderbolt/tb_regs.h | 6 + + drivers/thunderbolt/tunnel.c | 96 ++--- + drivers/thunderbolt/tunnel.h | 6 + + drivers/thunderbolt/usb4.c | 43 ++- + drivers/thunderbolt/xdomain.c | 12 +- + 14 files changed, 833 insertions(+), 430 deletions(-) +Merging usb-serial/usb-next (54be6c6c5ae8 Linux 6.8-rc3) +$ git merge -m Merge branch 'usb-next' of git://git.kernel.org/pub/scm/linux/kernel/git/johan/usb-serial.git usb-serial/usb-next +Already up to date. +Merging tty/tty-next (1643281347f8 serial: pmac_zilog: Convert to platform remove callback returning void) +$ git merge -m Merge branch 'tty-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git tty/tty-next +Auto-merging drivers/bluetooth/btnxpuart.c +Auto-merging drivers/mfd/rave-sp.c +Auto-merging drivers/net/ethernet/qualcomm/qca_uart.c +Auto-merging drivers/tty/serial/stm32-usart.c +Merge made by the 'ort' strategy. + .../devicetree/bindings/serial/cdns,uart.yaml | 1 + + .../devicetree/bindings/serial/fsl-lpuart.yaml | 1 + + .../devicetree/bindings/serial/renesas,hscif.yaml | 1 + + .../devicetree/bindings/serial/samsung_uart.yaml | 2 + + Documentation/driver-api/tty/console.rst | 45 + + Documentation/driver-api/tty/index.rst | 1 + + arch/m68k/amiga/config.c | 2 +- + arch/m68k/hp300/config.c | 6 +- + drivers/bluetooth/btmtkuart.c | 4 +- + drivers/bluetooth/btnxpuart.c | 4 +- + drivers/bluetooth/hci_serdev.c | 4 +- + drivers/gnss/serial.c | 2 +- + drivers/gnss/sirf.c | 2 +- + drivers/greybus/gb-beagleplay.c | 6 +- + drivers/iio/chemical/pms7003.c | 4 +- + drivers/iio/chemical/scd30_serial.c | 4 +- + drivers/iio/chemical/sps30_serial.c | 4 +- + drivers/iio/imu/bno055/bno055_ser_core.c | 4 +- + drivers/input/keyboard/amikbd.c | 6 +- + drivers/mfd/rave-sp.c | 4 +- + drivers/net/ethernet/qualcomm/qca_uart.c | 2 +- + drivers/nfc/pn533/uart.c | 4 +- + drivers/nfc/s3fwrn5/uart.c | 4 +- + drivers/platform/chrome/cros_ec_uart.c | 4 +- + drivers/platform/surface/aggregator/core.c | 4 +- + drivers/tty/Kconfig | 7 +- + drivers/tty/amiserial.c | 6 +- + drivers/tty/goldfish.c | 5 +- + drivers/tty/hvc/hvc_iucv.c | 6 +- + drivers/tty/serdev/core.c | 2 +- + drivers/tty/serdev/serdev-ttyport.c | 10 +- + drivers/tty/serial/8250/8250_bcm7271.c | 17 +- + drivers/tty/serial/8250/8250_of.c | 44 +- + drivers/tty/serial/8250/8250_pci1xxxx.c | 160 +- + drivers/tty/serial/8250/8250_port.c | 31 +- + drivers/tty/serial/fsl_linflexuart.c | 1 - + drivers/tty/serial/jsm/jsm_cls.c | 1 - + drivers/tty/serial/lpc32xx_hs.c | 2 - + drivers/tty/serial/max310x.c | 327 ++--- + drivers/tty/serial/pmac_zilog.c | 9 +- + drivers/tty/serial/qcom_geni_serial.c | 27 +- + drivers/tty/serial/samsung_tty.c | 259 ++-- + drivers/tty/serial/serial_base_bus.c | 2 +- + drivers/tty/serial/serial_txx9.c | 3 +- + drivers/tty/serial/stm32-usart.c | 223 +-- + drivers/tty/serial/stm32-usart.h | 38 +- + drivers/tty/serial/xilinx_uartps.c | 236 ++- + drivers/tty/tty_buffer.c | 1 + + drivers/tty/vt/Makefile | 4 +- + drivers/tty/vt/selection.c | 43 +- + drivers/tty/vt/vt.c | 1531 +++++++++++--------- + drivers/tty/vt/vt_ioctl.c | 6 +- + drivers/video/console/dummycon.c | 38 +- + drivers/video/console/mdacon.c | 43 +- 
+ drivers/video/console/newport_con.c | 69 +- + drivers/video/console/sticon.c | 79 +- + drivers/video/console/vgacon.c | 152 +- + drivers/video/fbdev/core/bitblit.c | 13 +- + drivers/video/fbdev/core/fbcon.c | 123 +- + drivers/video/fbdev/core/fbcon.h | 4 +- + drivers/video/fbdev/core/fbcon_ccw.c | 13 +- + drivers/video/fbdev/core/fbcon_cw.c | 13 +- + drivers/video/fbdev/core/fbcon_ud.c | 13 +- + drivers/video/fbdev/core/tileblit.c | 4 +- + drivers/video/fbdev/tgafb.c | 2 +- + include/linux/console.h | 126 +- + include/linux/console_struct.h | 1 - + include/linux/selection.h | 48 +- + include/linux/serdev.h | 8 +- + include/linux/serial_8250.h | 6 + + include/linux/soc/qcom/geni-se.h | 1 + + include/linux/tty.h | 1 - + include/linux/vt_kern.h | 12 +- + include/uapi/linux/fb.h | 8 +- + include/uapi/linux/vesa.h | 18 + + lib/Kconfig.kgdb | 2 +- + sound/drivers/serial-generic.c | 4 +- + 77 files changed, 2252 insertions(+), 1675 deletions(-) + create mode 100644 Documentation/driver-api/tty/console.rst + create mode 100644 include/uapi/linux/vesa.h +Merging char-misc/char-misc-next (8d11c6d9b14f Merge 6.8-rc5 into char-misc-next) +$ git merge -m Merge branch 'char-misc-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git char-misc/char-misc-next +Merge made by the 'ort' strategy. + drivers/comedi/drivers/das08.c | 1 - + drivers/misc/eeprom/idt_89hpesx.c | 6 +++--- + drivers/misc/hpilo.c | 8 ++++---- + drivers/misc/mei/gsc-me.c | 5 +++++ + drivers/misc/mei/hdcp/Kconfig | 2 +- + drivers/misc/mei/hdcp/mei_hdcp.c | 14 ++++++++++++-- + drivers/misc/mei/pxp/Kconfig | 2 +- + drivers/misc/mei/pxp/mei_pxp.c | 14 ++++++++++++-- + 8 files changed, 38 insertions(+), 14 deletions(-) +Merging accel/habanalabs-next (570a7f66cc7a accel/habanalabs: modify pci health check) +$ git merge -m Merge branch 'habanalabs-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux.git accel/habanalabs-next +Merge made by the 'ort' strategy. + .../accel/habanalabs/common/command_submission.c | 3 +- + drivers/accel/habanalabs/common/debugfs.c | 18 +- + drivers/accel/habanalabs/common/device.c | 55 +++- + drivers/accel/habanalabs/common/firmware_if.c | 50 ++- + drivers/accel/habanalabs/common/habanalabs.h | 63 ++-- + drivers/accel/habanalabs/common/hw_queue.c | 17 + + drivers/accel/habanalabs/common/hwmon.c | 29 +- + drivers/accel/habanalabs/common/mmu/Makefile | 2 +- + drivers/accel/habanalabs/common/mmu/mmu.c | 223 ++++++++++++- + drivers/accel/habanalabs/common/mmu/mmu_v1.c | 354 +++----------------- + drivers/accel/habanalabs/common/mmu/mmu_v2.c | 338 +++++++++++++++++++ + drivers/accel/habanalabs/common/mmu/mmu_v2_hr.c | 24 +- + drivers/accel/habanalabs/common/security.c | 33 +- + drivers/accel/habanalabs/common/security.h | 3 +- + drivers/accel/habanalabs/gaudi/gaudi.c | 9 +- + drivers/accel/habanalabs/gaudi2/gaudi2.c | 365 +++++++++++++-------- + drivers/accel/habanalabs/gaudi2/gaudi2P.h | 15 +- + drivers/accel/habanalabs/goya/goya.c | 12 +- + drivers/accel/habanalabs/goya/goya_coresight.c | 3 +- + .../habanalabs/include/hw_ip/mmu/mmu_general.h | 2 + + 20 files changed, 1041 insertions(+), 577 deletions(-) + create mode 100644 drivers/accel/habanalabs/common/mmu/mmu_v2.c +Merging coresight/next (c099fdd218a0 coresight: tpdm: Fix build break due to uninitialised field) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/coresight/linux.git coresight/next +Merge made by the 'ort' strategy. 
+ .../ABI/testing/sysfs-bus-coresight-devices-tpdm | 87 ++++ + .../bindings/arm/qcom,coresight-tpdm.yaml | 35 ++ + drivers/hwtracing/coresight/Makefile | 20 + + drivers/hwtracing/coresight/coresight-cfg-afdo.c | 1 + + drivers/hwtracing/coresight/coresight-core.c | 496 ++------------------- + drivers/hwtracing/coresight/coresight-cti-core.c | 2 +- + drivers/hwtracing/coresight/coresight-etb10.c | 31 +- + drivers/hwtracing/coresight/coresight-etm-perf.c | 2 +- + drivers/hwtracing/coresight/coresight-etm.h | 2 - + drivers/hwtracing/coresight/coresight-etm3x-core.c | 27 +- + .../hwtracing/coresight/coresight-etm3x-sysfs.c | 4 +- + drivers/hwtracing/coresight/coresight-etm4x-core.c | 38 +- + drivers/hwtracing/coresight/coresight-etm4x.h | 1 - + drivers/hwtracing/coresight/coresight-funnel.c | 4 +- + drivers/hwtracing/coresight/coresight-priv.h | 9 +- + drivers/hwtracing/coresight/coresight-replicator.c | 2 +- + drivers/hwtracing/coresight/coresight-stm.c | 32 +- + drivers/hwtracing/coresight/coresight-sysfs.c | 391 ++++++++++++++++ + drivers/hwtracing/coresight/coresight-tmc-core.c | 4 +- + drivers/hwtracing/coresight/coresight-tmc-etf.c | 46 +- + drivers/hwtracing/coresight/coresight-tmc-etr.c | 33 +- + drivers/hwtracing/coresight/coresight-tmc.h | 2 - + drivers/hwtracing/coresight/coresight-tpda.c | 151 ++++--- + drivers/hwtracing/coresight/coresight-tpda.h | 6 + + drivers/hwtracing/coresight/coresight-tpdm.c | 459 +++++++++++++++++-- + drivers/hwtracing/coresight/coresight-tpdm.h | 114 +++++ + drivers/hwtracing/coresight/coresight-tpiu.c | 16 +- + drivers/hwtracing/coresight/ultrasoc-smb.c | 24 +- + drivers/hwtracing/coresight/ultrasoc-smb.h | 2 - + include/linux/coresight.h | 148 +++--- + 30 files changed, 1382 insertions(+), 807 deletions(-) +Merging fastrpc/for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/srini/fastrpc.git fastrpc/for-next +Already up to date. +Merging fpga/for-next (ff49b00e9621 fpga: dfl: make dfl_bus_type const) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/fpga/linux-fpga.git fpga/for-next +Merge made by the 'ort' strategy. + Documentation/driver-api/fpga/fpga-mgr.rst | 34 ++++++----- + drivers/fpga/dfl.c | 2 +- + drivers/fpga/fpga-mgr.c | 93 +++++++++++++++++++----------- + include/linux/fpga/fpga-mgr.h | 29 ++++++++-- + 4 files changed, 104 insertions(+), 54 deletions(-) +Merging icc/icc-next (b9a9c447277f Merge branch 'icc-fixes' into icc-next) +$ git merge -m Merge branch 'icc-next' of git://git.kernel.org/pub/scm/linux/kernel/git/djakov/icc.git icc/icc-next +Merge made by the 'ort' strategy. 
+ .../devicetree/bindings/interconnect/qcom,rpm.yaml | 3 + + drivers/interconnect/qcom/Kconfig | 9 + + drivers/interconnect/qcom/Makefile | 2 + + drivers/interconnect/qcom/msm8909.c | 1329 ++++++++++++++++++++ + drivers/interconnect/qcom/sa8775p.c | 56 +- + drivers/interconnect/qcom/sm6115.c | 12 +- + drivers/interconnect/qcom/sm8250.c | 2 +- + drivers/interconnect/qcom/sm8550.c | 574 --------- + drivers/interconnect/qcom/sm8550.h | 284 ++--- + drivers/interconnect/qcom/x1e80100.c | 327 +---- + include/dt-bindings/interconnect/qcom,msm8909.h | 93 ++ + .../dt-bindings/interconnect/qcom,x1e80100-rpmh.h | 24 - + 12 files changed, 1599 insertions(+), 1116 deletions(-) + create mode 100644 drivers/interconnect/qcom/msm8909.c + create mode 100644 include/dt-bindings/interconnect/qcom,msm8909.h +Merging iio/togreg (3cc5ebd3a2d6 iio: imu: bmi323: Add ACPI Match Table) +$ git merge -m Merge branch 'togreg' of git://git.kernel.org/pub/scm/linux/kernel/git/jic23/iio.git iio/togreg +Auto-merging MAINTAINERS +Auto-merging drivers/iio/accel/Kconfig +Auto-merging drivers/iio/adc/ad4130.c +Auto-merging drivers/iio/adc/ad_sigma_delta.c +Auto-merging drivers/iio/humidity/hdc3020.c +Auto-merging drivers/iio/industrialio-core.c +Auto-merging drivers/iio/light/hid-sensor-als.c +Auto-merging drivers/of/property.c +CONFLICT (content): Merge conflict in drivers/of/property.c +Resolved 'drivers/of/property.c' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +[master 78caa12d9fcb] Merge branch 'togreg' of git://git.kernel.org/pub/scm/linux/kernel/git/jic23/iio.git +$ git diff -M --stat --summary HEAD^.. + .../devicetree/bindings/iio/adc/adi,ad9467.yaml | 4 + + .../devicetree/bindings/iio/adc/adi,axi-adc.yaml | 8 +- + .../bindings/iio/adc/richtek,rtq6056.yaml | 9 +- + .../devicetree/bindings/iio/adc/ti,ads1298.yaml | 80 +++ + .../bindings/iio/afe/voltage-divider.yaml | 11 + + .../bindings/iio/frequency/adi,admfm2000.yaml | 127 ++++ + .../bindings/iio/humidity/ti,hdc3020.yaml | 3 + + .../devicetree/bindings/iio/imu/st,lsm6dsx.yaml | 4 +- + .../devicetree/bindings/iio/light/ams,as73211.yaml | 7 +- + .../bindings/iio/pressure/honeywell,hsc030pa.yaml | 3 + + .../iio/pressure/honeywell,mprls0025pa.yaml | 98 ++- + MAINTAINERS | 22 +- + drivers/iio/Kconfig | 9 + + drivers/iio/Makefile | 1 + + drivers/iio/accel/Kconfig | 8 +- + drivers/iio/accel/Makefile | 1 + + drivers/iio/accel/adxl367.c | 297 ++++---- + drivers/iio/accel/bmc150-accel-i2c.c | 15 +- + drivers/iio/accel/bmc150-accel-spi.c | 3 +- + drivers/iio/accel/bmi088-accel-i2c.c | 70 ++ + drivers/iio/accel/da280.c | 66 +- + drivers/iio/accel/kxcjk-1013.c | 33 +- + drivers/iio/accel/mma9551.c | 4 +- + drivers/iio/accel/mma9553.c | 4 +- + drivers/iio/accel/mxc4005.c | 5 +- + drivers/iio/accel/mxc6255.c | 4 +- + drivers/iio/accel/st_accel_i2c.c | 5 +- + drivers/iio/accel/stk8ba50.c | 4 +- + drivers/iio/adc/Kconfig | 15 +- + drivers/iio/adc/Makefile | 1 + + drivers/iio/adc/ad4130.c | 131 ++-- + drivers/iio/adc/ad7091r-base.c | 25 +- + drivers/iio/adc/ad9467.c | 267 ++++--- + drivers/iio/adc/ad_sigma_delta.c | 7 +- + drivers/iio/adc/adi-axi-adc.c | 407 +++-------- + drivers/iio/adc/max1363.c | 161 ++--- + drivers/iio/adc/rtq6056.c | 275 +++++++- + drivers/iio/adc/ti-adc108s102.c | 4 +- + drivers/iio/adc/ti-ads1015.c | 2 +- + drivers/iio/adc/ti-ads1298.c | 769 +++++++++++++++++++++ + drivers/iio/buffer/industrialio-buffer-dmaengine.c | 11 +- + .../iio/common/inv_sensors/inv_sensors_timestamp.c | 2 +- 
+ drivers/iio/dummy/iio_dummy_evgen.c | 2 - + drivers/iio/dummy/iio_simple_dummy.c | 193 +++--- + drivers/iio/frequency/Kconfig | 10 + + drivers/iio/frequency/Makefile | 1 + + drivers/iio/frequency/admfm2000.c | 282 ++++++++ + drivers/iio/gyro/bmg160_i2c.c | 4 +- + drivers/iio/health/afe4403.c | 65 +- + drivers/iio/health/afe4404.c | 65 +- + drivers/iio/humidity/hdc3020.c | 451 ++++++++---- + drivers/iio/humidity/hts221_i2c.c | 4 +- + drivers/iio/imu/adis16475.c | 8 +- + drivers/iio/imu/adis16480.c | 9 +- + drivers/iio/imu/bmi160/bmi160_i2c.c | 9 + + drivers/iio/imu/bmi323/bmi323_core.c | 78 +-- + drivers/iio/imu/bmi323/bmi323_i2c.c | 21 + + drivers/iio/imu/fxos8700_i2c.c | 3 +- + drivers/iio/imu/fxos8700_spi.c | 3 +- + drivers/iio/imu/kmx61.c | 2 +- + drivers/iio/imu/st_lsm6dsx/Kconfig | 31 +- + drivers/iio/imu/st_lsm6dsx/st_lsm6dsx.h | 2 + + drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_buffer.c | 28 +- + drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c | 33 +- + drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_i2c.c | 5 + + drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_spi.c | 5 + + drivers/iio/imu/st_lsm9ds0/st_lsm9ds0.h | 5 +- + drivers/iio/imu/st_lsm9ds0/st_lsm9ds0_core.c | 21 +- + drivers/iio/imu/st_lsm9ds0/st_lsm9ds0_i2c.c | 6 +- + drivers/iio/imu/st_lsm9ds0/st_lsm9ds0_spi.c | 4 +- + drivers/iio/industrialio-backend.c | 418 +++++++++++ + drivers/iio/industrialio-core.c | 6 +- + drivers/iio/industrialio-gts-helper.c | 15 +- + drivers/iio/light/Kconfig | 5 +- + drivers/iio/light/as73211.c | 142 +++- + drivers/iio/light/hid-sensor-als.c | 122 +++- + drivers/iio/light/jsa1212.c | 4 +- + drivers/iio/light/ltr501.c | 3 +- + drivers/iio/light/max44000.c | 6 +- + drivers/iio/light/rpr0521.c | 4 +- + drivers/iio/light/stk3310.c | 4 +- + drivers/iio/light/us5182d.c | 4 +- + drivers/iio/light/vcnl4000.c | 36 +- + drivers/iio/magnetometer/bmc150_magn_i2c.c | 3 +- + drivers/iio/magnetometer/bmc150_magn_spi.c | 3 +- + drivers/iio/magnetometer/mmc35240.c | 4 +- + drivers/iio/potentiometer/max5487.c | 4 +- + drivers/iio/pressure/Kconfig | 16 +- + drivers/iio/pressure/Makefile | 2 + + drivers/iio/pressure/hp206c.c | 6 +- + drivers/iio/pressure/hsc030pa.c | 49 +- + drivers/iio/pressure/hsc030pa.h | 7 + + drivers/iio/pressure/hsc030pa_i2c.c | 9 +- + drivers/iio/pressure/hsc030pa_spi.c | 7 +- + drivers/iio/pressure/mprls0025pa.c | 313 ++++----- + drivers/iio/pressure/mprls0025pa.h | 102 +++ + drivers/iio/pressure/mprls0025pa_i2c.c | 100 +++ + drivers/iio/pressure/mprls0025pa_spi.c | 92 +++ + drivers/iio/pressure/st_pressure_i2c.c | 5 +- + drivers/iio/proximity/sx9310.c | 114 ++- + drivers/iio/proximity/sx9324.c | 178 +++-- + drivers/iio/proximity/sx9360.c | 115 ++- + drivers/iio/test/Kconfig | 14 + + drivers/iio/test/Makefile | 1 + + drivers/iio/test/iio-test-gts.c | 513 ++++++++++++++ + drivers/of/property.c | 2 + + include/linux/hid-sensor-ids.h | 4 + + include/linux/iio/adc/adi-axi-adc.h | 68 -- + include/linux/iio/backend.h | 72 ++ + include/linux/iio/buffer-dmaengine.h | 3 + + include/linux/iio/iio.h | 30 +- + tools/iio/iio_utils.c | 2 +- + 112 files changed, 5051 insertions(+), 1888 deletions(-) + create mode 100644 Documentation/devicetree/bindings/iio/adc/ti,ads1298.yaml + create mode 100644 Documentation/devicetree/bindings/iio/frequency/adi,admfm2000.yaml + create mode 100644 drivers/iio/accel/bmi088-accel-i2c.c + create mode 100644 drivers/iio/adc/ti-ads1298.c + create mode 100644 drivers/iio/frequency/admfm2000.c + create mode 100644 drivers/iio/industrialio-backend.c + create mode 100644 
drivers/iio/pressure/mprls0025pa.h + create mode 100644 drivers/iio/pressure/mprls0025pa_i2c.c + create mode 100644 drivers/iio/pressure/mprls0025pa_spi.c + create mode 100644 drivers/iio/test/iio-test-gts.c + delete mode 100644 include/linux/iio/adc/adi-axi-adc.h + create mode 100644 include/linux/iio/backend.h +Merging phy-next/next (505dfc6ba84c phy: ti: tusb1210: Define device IDs) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/phy/linux-phy.git phy-next/next +Auto-merging MAINTAINERS +Auto-merging drivers/phy/qualcomm/phy-qcom-qmp-usb.c +CONFLICT (content): Merge conflict in drivers/phy/qualcomm/phy-qcom-qmp-usb.c +Resolved 'drivers/phy/qualcomm/phy-qcom-qmp-usb.c' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +[master a64c3886fd04] Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/phy/linux-phy.git +$ git diff -M --stat --summary HEAD^.. + .../bindings/phy/mediatek,mt8365-csi-rx.yaml | 79 ++ + .../bindings/phy/phy-cadence-torrent.yaml | 11 +- + .../bindings/phy/qcom,msm8998-qmp-usb3-phy.yaml | 184 ++++ + .../bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml | 6 + + .../bindings/phy/qcom,sc8280xp-qmp-ufs-phy.yaml | 48 +- + .../phy/qcom,sc8280xp-qmp-usb3-uni-phy.yaml | 22 - + .../bindings/phy/rockchip,rk3588-hdptx-phy.yaml | 91 ++ + MAINTAINERS | 7 + + drivers/phy/cadence/phy-cadence-torrent.c | 720 +++++++++++- + drivers/phy/marvell/phy-armada38x-comphy.c | 7 +- + drivers/phy/mediatek/Kconfig | 12 + + drivers/phy/mediatek/Makefile | 2 + + drivers/phy/mediatek/phy-mtk-mipi-csi-0-5-rx-reg.h | 62 ++ + drivers/phy/mediatek/phy-mtk-mipi-csi-0-5.c | 294 +++++ + drivers/phy/qualcomm/Makefile | 2 +- + drivers/phy/qualcomm/phy-qcom-edp.c | 3 +- + drivers/phy/qualcomm/phy-qcom-qmp-combo.c | 109 +- + drivers/phy/qualcomm/phy-qcom-qmp-common.h | 59 + + drivers/phy/qualcomm/phy-qcom-qmp-dp-com-v3.h | 18 + + drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v3.h | 21 + + drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v4.h | 19 + + drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v5.h | 13 + + drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v6.h | 13 + + drivers/phy/qualcomm/phy-qcom-qmp-dp-phy.h | 62 ++ + drivers/phy/qualcomm/phy-qcom-qmp-pcie-msm8996.c | 70 +- + drivers/phy/qualcomm/phy-qcom-qmp-pcie.c | 288 +++-- + drivers/phy/qualcomm/phy-qcom-qmp-pcs-pcie-v6.h | 2 + + drivers/phy/qualcomm/phy-qcom-qmp-pcs-pcie-v6_20.h | 2 + + drivers/phy/qualcomm/phy-qcom-qmp-pcs-sgmii.h | 20 + + drivers/phy/qualcomm/phy-qcom-qmp-pcs-ufs-v6.h | 2 + + drivers/phy/qualcomm/phy-qcom-qmp-pcs-v6_20.h | 1 + + drivers/phy/qualcomm/phy-qcom-qmp-qserdes-com-v6.h | 2 + + .../qualcomm/phy-qcom-qmp-qserdes-txrx-ufs-v6.h | 8 + + .../phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_20.h | 2 + + drivers/phy/qualcomm/phy-qcom-qmp-ufs.c | 305 +++--- + drivers/phy/qualcomm/phy-qcom-qmp-usb-legacy.c | 76 +- + drivers/phy/qualcomm/phy-qcom-qmp-usb.c | 422 +------ + drivers/phy/qualcomm/phy-qcom-qmp-usbc.c | 1149 ++++++++++++++++++++ + drivers/phy/qualcomm/phy-qcom-qmp.h | 101 +- + drivers/phy/qualcomm/phy-qcom-sgmii-eth.c | 417 +++---- + drivers/phy/rockchip/Kconfig | 8 + + drivers/phy/rockchip/Makefile | 1 + + drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c | 1028 +++++++++++++++++ + drivers/phy/ti/phy-gmii-sel.c | 24 + + drivers/phy/ti/phy-tusb1210.c | 57 +- + 45 files changed, 4582 insertions(+), 1267 deletions(-) + create mode 100644 Documentation/devicetree/bindings/phy/mediatek,mt8365-csi-rx.yaml + create mode 100644 
Documentation/devicetree/bindings/phy/qcom,msm8998-qmp-usb3-phy.yaml + create mode 100644 Documentation/devicetree/bindings/phy/rockchip,rk3588-hdptx-phy.yaml + create mode 100644 drivers/phy/mediatek/phy-mtk-mipi-csi-0-5-rx-reg.h + create mode 100644 drivers/phy/mediatek/phy-mtk-mipi-csi-0-5.c + create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-common.h + create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-dp-com-v3.h + create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v3.h + create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v4.h + create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v5.h + create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v6.h + create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-dp-phy.h + create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-pcs-sgmii.h + create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-usbc.c + create mode 100644 drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c +Merging soundwire/next (81a7d0c4d059 soundwire: bus_type: make sdw_bus_type const) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/soundwire.git soundwire/next +Merge made by the 'ort' strategy. + Documentation/driver-api/soundwire/stream.rst | 4 ++-- + drivers/soundwire/bus_type.c | 2 +- + drivers/soundwire/intel_auxdevice.c | 2 -- + include/linux/soundwire/sdw_type.h | 2 +- + 4 files changed, 4 insertions(+), 6 deletions(-) +Merging extcon/extcon-next (b401b621758e Linux 6.8-rc5) +$ git merge -m Merge branch 'extcon-next' of git://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/extcon.git extcon/extcon-next +Already up to date. +Merging gnss/gnss-next (54be6c6c5ae8 Linux 6.8-rc3) +$ git merge -m Merge branch 'gnss-next' of git://git.kernel.org/pub/scm/linux/kernel/git/johan/gnss.git gnss/gnss-next +Already up to date. +Merging vfio/next (78f70c02bdbc vfio/virtio: fix virtio-pci dependency) +$ git merge -m Merge branch 'next' of git://github.com/awilliam/linux-vfio.git vfio/next +Already up to date. +Merging w1/for-next (d97d263132a6 w1: w1-gpio: Convert to platform remove callback returning void) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-w1.git w1/for-next +Merge made by the 'ort' strategy. + .../devicetree/bindings/serial/serial.yaml | 2 +- + Documentation/devicetree/bindings/w1/w1-uart.yaml | 59 +++ + Documentation/w1/masters/index.rst | 1 + + Documentation/w1/masters/w1-uart.rst | 54 +++ + drivers/w1/masters/Kconfig | 10 + + drivers/w1/masters/Makefile | 1 + + drivers/w1/masters/mxc_w1.c | 6 +- + drivers/w1/masters/omap_hdq.c | 6 +- + drivers/w1/masters/sgi_w1.c | 6 +- + drivers/w1/masters/w1-gpio.c | 6 +- + drivers/w1/masters/w1-uart.c | 415 +++++++++++++++++++++ + drivers/w1/w1.c | 2 +- + 12 files changed, 550 insertions(+), 18 deletions(-) + create mode 100644 Documentation/devicetree/bindings/w1/w1-uart.yaml + create mode 100644 Documentation/w1/masters/w1-uart.rst + create mode 100644 drivers/w1/masters/w1-uart.c +Merging spmi/spmi-next (b85ea95d0864 Linux 6.7-rc1) +$ git merge -m Merge branch 'spmi-next' of git://git.kernel.org/pub/scm/linux/kernel/git/sboyd/spmi.git spmi/spmi-next +Already up to date. +Merging staging/staging-next (455c5e12a3b7 staging: gdm724x: constantify the struct device_type usage) +$ git merge -m Merge branch 'staging-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git staging/staging-next +Merge made by the 'ort' strategy. 
+ drivers/staging/Kconfig | 4 - + drivers/staging/Makefile | 2 - + drivers/staging/axis-fifo/axis-fifo.c | 7 +- + drivers/staging/board/Kconfig | 12 - + drivers/staging/board/Makefile | 4 - + drivers/staging/board/TODO | 2 - + drivers/staging/board/armadillo800eva.c | 88 - + drivers/staging/board/board.c | 204 -- + drivers/staging/board/board.h | 46 - + drivers/staging/board/kzm9d.c | 26 - + drivers/staging/emxx_udc/Kconfig | 11 - + drivers/staging/emxx_udc/Makefile | 2 - + drivers/staging/emxx_udc/TODO | 6 - + drivers/staging/emxx_udc/emxx_udc.c | 3223 -------------------- + drivers/staging/emxx_udc/emxx_udc.h | 554 ---- + drivers/staging/fieldbus/anybuss/arcx-anybus.c | 6 +- + drivers/staging/fieldbus/anybuss/host.c | 2 +- + drivers/staging/fieldbus/dev_core.c | 6 +- + drivers/staging/gdm724x/gdm_lte.c | 2 +- + drivers/staging/greybus/audio_manager.c | 8 +- + drivers/staging/greybus/authentication.c | 6 +- + drivers/staging/greybus/fw-download.c | 7 +- + drivers/staging/greybus/fw-management.c | 20 +- + drivers/staging/greybus/gbphy.c | 8 +- + drivers/staging/greybus/loopback.c | 6 +- + drivers/staging/greybus/raw.c | 6 +- + drivers/staging/greybus/vibrator.c | 6 +- + drivers/staging/rtl8192e/rtl8192e/r8192E_dev.c | 17 +- + drivers/staging/rtl8192e/rtl8192e/r8192E_phy.c | 1 - + drivers/staging/rtl8192e/rtl8192e/rtl_core.c | 54 +- + drivers/staging/rtl8192e/rtl8192e/rtl_dm.c | 6 +- + drivers/staging/rtl8192e/rtl8192e/rtl_ps.c | 4 +- + drivers/staging/rtl8192e/rtl8192e/rtl_wx.c | 4 +- + drivers/staging/rtl8192e/rtl819x_BAProc.c | 2 +- + drivers/staging/rtl8192e/rtl819x_HT.h | 6 +- + drivers/staging/rtl8192e/rtl819x_HTProc.c | 36 +- + drivers/staging/rtl8192e/rtl819x_Qos.h | 2 +- + drivers/staging/rtl8192e/rtl819x_TSProc.c | 6 +- + drivers/staging/rtl8192e/rtllib.h | 44 +- + drivers/staging/rtl8192e/rtllib_rx.c | 13 +- + drivers/staging/rtl8192e/rtllib_softmac.c | 182 +- + drivers/staging/rtl8192e/rtllib_softmac_wx.c | 4 +- + drivers/staging/rtl8192e/rtllib_tx.c | 18 +- + drivers/staging/rtl8192e/rtllib_wx.c | 2 +- + drivers/staging/rtl8723bs/core/rtw_ieee80211.c | 4 +- + drivers/staging/rtl8723bs/core/rtw_sta_mgt.c | 3 +- + drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c | 3 +- + .../vc04_services/interface/vchiq_arm/vchiq_bus.c | 2 +- + .../vc04_services/interface/vchiq_arm/vchiq_bus.h | 2 +- + drivers/staging/vme_user/vme.c | 2 +- + drivers/staging/vme_user/vme.h | 2 +- + drivers/staging/vt6655/card.c | 74 +- + drivers/staging/vt6655/rxtx.h | 1 - + 53 files changed, 282 insertions(+), 4486 deletions(-) + delete mode 100644 drivers/staging/board/Kconfig + delete mode 100644 drivers/staging/board/Makefile + delete mode 100644 drivers/staging/board/TODO + delete mode 100644 drivers/staging/board/armadillo800eva.c + delete mode 100644 drivers/staging/board/board.c + delete mode 100644 drivers/staging/board/board.h + delete mode 100644 drivers/staging/board/kzm9d.c + delete mode 100644 drivers/staging/emxx_udc/Kconfig + delete mode 100644 drivers/staging/emxx_udc/Makefile + delete mode 100644 drivers/staging/emxx_udc/TODO + delete mode 100644 drivers/staging/emxx_udc/emxx_udc.c + delete mode 100644 drivers/staging/emxx_udc/emxx_udc.h +Merging counter-next/counter-next (b6dce0452a02 counter: fix privdata alignment) +$ git merge -m Merge branch 'counter-next' of git://git.kernel.org/pub/scm/linux/kernel/git/wbg/counter.git counter-next/counter-next +Auto-merging drivers/counter/counter-core.c +Merge made by the 'ort' strategy. 
+ drivers/counter/counter-core.c | 2 +- + include/linux/counter.h | 1 - + 2 files changed, 1 insertion(+), 2 deletions(-) +Merging mux/for-next (44c026a73be8 Linux 6.4-rc3) +$ git merge -m Merge branch 'for-next' of https://gitlab.com/peda-linux/mux.git mux/for-next +Already up to date. +Merging dmaengine/next (35b78e2eef2d dt-bindings: renesas,rcar-dmac: Add r8a779h0 support) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/dmaengine.git dmaengine/next +Auto-merging drivers/dma/fsl-edma-common.c +Auto-merging drivers/dma/fsl-edma-common.h +Auto-merging drivers/dma/fsl-edma-main.c +Auto-merging drivers/dma/idxd/idxd.h +Merge made by the 'ort' strategy. + .../bindings/dma/allwinner,sun50i-a64-dma.yaml | 12 +- + .../devicetree/bindings/dma/fsl,edma.yaml | 2 + + .../devicetree/bindings/dma/marvell,mmp-dma.yaml | 72 +++++ + .../bindings/dma/mediatek,mt7622-hsdma.yaml | 63 +++++ + Documentation/devicetree/bindings/dma/mmp-dma.txt | 81 ------ + .../devicetree/bindings/dma/mtk-hsdma.txt | 33 --- + .../devicetree/bindings/dma/renesas,rcar-dmac.yaml | 1 + + drivers/dma/Kconfig | 14 +- + drivers/dma/bestcomm/sram.c | 5 - + drivers/dma/fsl-edma-common.c | 101 ++++--- + drivers/dma/fsl-edma-common.h | 159 ++++++++++- + drivers/dma/fsl-edma-main.c | 19 +- + drivers/dma/idxd/bus.c | 2 +- + drivers/dma/idxd/idxd.h | 2 +- + drivers/dma/mcf-edma-main.c | 2 +- + drivers/dma/pl330.c | 1 + + drivers/dma/ti/k3-psil-j721s2.c | 73 +++++ + drivers/dma/ti/k3-udma-glue.c | 314 +++++++++++++++------ + drivers/dma/xilinx/xilinx_dma.c | 6 + + include/linux/dma/k3-udma-glue.h | 10 + + 20 files changed, 688 insertions(+), 284 deletions(-) + create mode 100644 Documentation/devicetree/bindings/dma/marvell,mmp-dma.yaml + create mode 100644 Documentation/devicetree/bindings/dma/mediatek,mt7622-hsdma.yaml + delete mode 100644 Documentation/devicetree/bindings/dma/mmp-dma.txt + delete mode 100644 Documentation/devicetree/bindings/dma/mtk-hsdma.txt +Merging cgroup/for-next (8d4c171f451d docs: cgroup-v1: add missing code-block tags) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git cgroup/for-next +Merge made by the 'ort' strategy. + Documentation/admin-guide/cgroup-v1/hugetlb.rst | 20 ++++++++++++-------- + 1 file changed, 12 insertions(+), 8 deletions(-) +Merging scsi/for-next (d970d094663a Merge branch 'fixes' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi.git scsi/for-next +Auto-merging drivers/scsi/scsi_scan.c +Auto-merging drivers/ufs/core/ufshcd.c +Auto-merging drivers/ufs/host/ufs-qcom.c +Merge made by the 'ort' strategy. 
+ drivers/message/fusion/mptfc.c | 4 +- + drivers/scsi/3w-9xxx.c | 44 +-- + drivers/scsi/3w-sas.c | 36 +-- + drivers/scsi/3w-xxxx.c | 44 +-- + drivers/scsi/53c700.c | 2 +- + drivers/scsi/Kconfig | 9 + + drivers/scsi/aacraid/aachba.c | 6 +- + drivers/scsi/ch.c | 27 +- + drivers/scsi/device_handler/scsi_dh_hp_sw.c | 49 ++- + drivers/scsi/device_handler/scsi_dh_rdac.c | 84 +++--- + drivers/scsi/fcoe/fcoe_sysfs.c | 4 +- + drivers/scsi/fnic/fnic_attrs.c | 7 +- + drivers/scsi/fnic/fnic_scsi.c | 4 +- + drivers/scsi/hisi_sas/hisi_sas_main.c | 26 +- + drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 8 +- + drivers/scsi/ibmvscsi/ibmvfc.c | 22 +- + drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c | 24 +- + drivers/scsi/isci/init.c | 2 +- + drivers/scsi/lpfc/lpfc.h | 94 +++--- + drivers/scsi/lpfc/lpfc_attr.c | 107 +++---- + drivers/scsi/lpfc/lpfc_bsg.c | 8 +- + drivers/scsi/lpfc/lpfc_ct.c | 154 ++++------ + drivers/scsi/lpfc/lpfc_debugfs.c | 14 +- + drivers/scsi/lpfc/lpfc_els.c | 446 ++++++++++++---------------- + drivers/scsi/lpfc/lpfc_hbadisc.c | 350 +++++++++------------- + drivers/scsi/lpfc/lpfc_hw4.h | 4 +- + drivers/scsi/lpfc/lpfc_init.c | 137 +++++---- + drivers/scsi/lpfc/lpfc_mbox.c | 10 +- + drivers/scsi/lpfc/lpfc_nportdisc.c | 91 +++--- + drivers/scsi/lpfc/lpfc_nvme.c | 20 +- + drivers/scsi/lpfc/lpfc_nvmet.c | 14 +- + drivers/scsi/lpfc/lpfc_scsi.c | 10 +- + drivers/scsi/lpfc/lpfc_sli.c | 56 ++-- + drivers/scsi/lpfc/lpfc_version.h | 6 +- + drivers/scsi/lpfc/lpfc_vport.c | 69 ++--- + drivers/scsi/megaraid.c | 2 +- + drivers/scsi/mpi3mr/mpi3mr_os.c | 12 +- + drivers/scsi/mpt3sas/mpt3sas_base.c | 113 ++++--- + drivers/scsi/mpt3sas/mpt3sas_base.h | 8 +- + drivers/scsi/mpt3sas/mpt3sas_ctl.c | 54 ++++ + drivers/scsi/mpt3sas/mpt3sas_ctl.h | 10 + + drivers/scsi/mpt3sas/mpt3sas_scsih.c | 1 + + drivers/scsi/pm8001/pm8001_ctl.c | 6 +- + drivers/scsi/scsi_debug.c | 4 +- + drivers/scsi/scsi_devinfo.c | 6 +- + drivers/scsi/scsi_lib.c | 124 +++++++- + drivers/scsi/scsi_lib_test.c | 330 ++++++++++++++++++++ + drivers/scsi/scsi_priv.h | 2 +- + drivers/scsi/scsi_scan.c | 105 ++++--- + drivers/scsi/scsi_sysfs.c | 2 +- + drivers/scsi/scsi_transport_iscsi.c | 4 +- + drivers/scsi/scsi_transport_spi.c | 35 +-- + drivers/scsi/sd.c | 220 +++++++++----- + drivers/scsi/ses.c | 66 ++-- + drivers/scsi/sr.c | 38 +-- + drivers/target/loopback/tcm_loop.c | 2 +- + drivers/ufs/core/ufs-mcq.c | 12 +- + drivers/ufs/core/ufs-sysfs.c | 49 +++ + drivers/ufs/core/ufshcd.c | 90 ++++-- + drivers/ufs/host/ufs-mediatek.c | 90 ++++-- + drivers/ufs/host/ufs-mediatek.h | 7 +- + drivers/ufs/host/ufs-qcom.c | 28 +- + include/scsi/scsi_device.h | 48 +++ + include/scsi/scsi_host.h | 6 +- + include/ufs/ufshcd.h | 7 + + include/ufs/ufshci.h | 3 + + 66 files changed, 2117 insertions(+), 1359 deletions(-) + create mode 100644 drivers/scsi/scsi_lib_test.c +Merging scsi-mkp/for-next (9f3dbcb5632d scsi: csiostor: Avoid function pointer casts) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git scsi-mkp/for-next +Merge made by the 'ort' strategy. + drivers/scsi/csiostor/csio_defs.h | 18 ++++++++++++++++-- + drivers/scsi/csiostor/csio_lnode.c | 8 ++++---- + drivers/scsi/csiostor/csio_lnode.h | 13 ------------- + drivers/scsi/qla1280.c | 1 - + 4 files changed, 20 insertions(+), 20 deletions(-) +Merging vhost/linux-next (f16d65124380 vdpa/mlx5: Add mkey leak detection) +$ git merge -m Merge branch 'linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git vhost/linux-next +Already up to date. 
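+(Throughout this log, each non-trivial merge is followed by
+"git diff -M --stat --summary HEAD^..", which reports exactly what the merge
+brought in: "HEAD^.." is short for "HEAD^..HEAD", i.e. the merge result
+against its first parent; -M enables rename detection, so moved files appear
+as renames rather than delete/create pairs; --stat prints the per-file change
+counts; and --summary adds the "create mode"/"delete mode"/rename lines. The
+same command with the long option spelled out, as a sketch:
+$ git diff --find-renames --stat --summary HEAD^..HEAD)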
+Merging rpmsg/for-next (929654e8f1ad Merge branches 'rpmsg-next' and 'rproc-next' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/remoteproc/linux.git rpmsg/for-next +Auto-merging drivers/remoteproc/imx_dsp_rproc.c +CONFLICT (content): Merge conflict in drivers/remoteproc/imx_dsp_rproc.c +Auto-merging drivers/remoteproc/imx_rproc.c +Resolved 'drivers/remoteproc/imx_dsp_rproc.c' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +[master 95f2f32abd1d] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/remoteproc/linux.git +$ git diff -M --stat --summary HEAD^.. + .../bindings/remoteproc/qcom,sm8550-pas.yaml | 45 +++++- + drivers/remoteproc/imx_dsp_rproc.c | 11 +- + drivers/remoteproc/imx_rproc.c | 16 +-- + drivers/remoteproc/qcom_q6v5_pas.c | 150 +++++++++++++++----- + drivers/remoteproc/remoteproc_core.c | 29 +++- + drivers/remoteproc/remoteproc_virtio.c | 6 +- + drivers/remoteproc/st_remoteproc.c | 15 +- + drivers/remoteproc/stm32_rproc.c | 10 +- + drivers/remoteproc/ti_k3_dsp_remoteproc.c | 156 +++++++-------------- + drivers/rpmsg/rpmsg_char.c | 12 +- + drivers/rpmsg/rpmsg_core.c | 2 +- + drivers/rpmsg/rpmsg_ctrl.c | 12 +- + 12 files changed, 271 insertions(+), 193 deletions(-) +Merging gpio/for-next (0bb80ecc33a8 Linux 6.6-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-gpio.git gpio/for-next +Already up to date. +Merging gpio-brgl/gpio/for-next (56c608c9e773 gpio: Add ChromeOS EC GPIO driver) +$ git merge -m Merge branch 'gpio/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git gpio-brgl/gpio/for-next +Auto-merging Documentation/userspace-api/index.rst +CONFLICT (content): Merge conflict in Documentation/userspace-api/index.rst +Auto-merging MAINTAINERS +Auto-merging drivers/gpio/gpiolib.c +Auto-merging include/linux/gpio/driver.h +Resolved 'Documentation/userspace-api/index.rst' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +[master 4c7aea75f232] Merge branch 'gpio/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git +$ git diff -M --stat --summary HEAD^.. 
+ Documentation/ABI/obsolete/sysfs-gpio | 4 +- + Documentation/ABI/testing/gpio-cdev | 9 +- + Documentation/admin-guide/gpio/gpio-mockup.rst | 8 + + Documentation/admin-guide/gpio/index.rst | 6 +- + Documentation/admin-guide/gpio/obsolete.rst | 13 + + .../devicetree/bindings/gpio/gpio-mvebu.yaml | 2 +- + .../devicetree/bindings/gpio/gpio-pca9570.yaml | 3 + + .../bindings/gpio/renesas,rcar-gpio.yaml | 1 + + .../devicetree/bindings/mfd/google,cros-ec.yaml | 7 + + Documentation/driver-api/gpio/consumer.rst | 10 +- + Documentation/userspace-api/gpio/chardev.rst | 116 +++ + Documentation/userspace-api/gpio/chardev_v1.rst | 131 +++ + Documentation/userspace-api/gpio/error-codes.rst | 79 ++ + .../userspace-api/gpio/gpio-get-chipinfo-ioctl.rst | 41 + + .../gpio/gpio-get-lineevent-ioctl.rst | 84 ++ + .../gpio/gpio-get-linehandle-ioctl.rst | 125 +++ + .../userspace-api/gpio/gpio-get-lineinfo-ioctl.rst | 54 ++ + .../gpio/gpio-get-lineinfo-unwatch-ioctl.rst | 49 ++ + .../gpio/gpio-get-lineinfo-watch-ioctl.rst | 74 ++ + .../gpio/gpio-handle-get-line-values-ioctl.rst | 56 ++ + .../gpio/gpio-handle-set-config-ioctl.rst | 63 ++ + .../gpio/gpio-handle-set-line-values-ioctl.rst | 48 ++ + .../gpio/gpio-lineevent-data-read.rst | 84 ++ + .../gpio/gpio-lineinfo-changed-read.rst | 87 ++ + .../userspace-api/gpio/gpio-v2-get-line-ioctl.rst | 152 ++++ + .../gpio/gpio-v2-get-lineinfo-ioctl.rst | 50 ++ + .../gpio/gpio-v2-get-lineinfo-watch-ioctl.rst | 67 ++ + .../userspace-api/gpio/gpio-v2-line-event-read.rst | 83 ++ + .../gpio/gpio-v2-line-get-values-ioctl.rst | 51 ++ + .../gpio/gpio-v2-line-set-config-ioctl.rst | 58 ++ + .../gpio/gpio-v2-line-set-values-ioctl.rst | 47 ++ + .../gpio/gpio-v2-lineinfo-changed-read.rst | 81 ++ + Documentation/userspace-api/gpio/index.rst | 18 + + Documentation/userspace-api/gpio/obsolete.rst | 11 + + .../{admin-guide => userspace-api}/gpio/sysfs.rst | 31 +- + Documentation/userspace-api/index.rst | 1 + + MAINTAINERS | 1 + + drivers/gpio/Kconfig | 15 +- + drivers/gpio/Makefile | 1 + + drivers/gpio/gpio-cros-ec.c | 209 +++++ + drivers/gpio/gpio-eic-sprd.c | 10 +- + drivers/gpio/gpiolib-acpi.c | 2 +- + drivers/gpio/gpiolib-cdev.c | 96 ++- + drivers/gpio/gpiolib-legacy.c | 12 + + drivers/gpio/gpiolib-of.c | 15 +- + drivers/gpio/gpiolib-sysfs.c | 152 ++-- + drivers/gpio/gpiolib.c | 927 +++++++++++---------- + drivers/gpio/gpiolib.h | 87 +- + include/linux/gpio/driver.h | 27 +- + include/uapi/linux/gpio.h | 63 +- + 50 files changed, 2766 insertions(+), 655 deletions(-) + create mode 100644 Documentation/admin-guide/gpio/obsolete.rst + create mode 100644 Documentation/userspace-api/gpio/chardev.rst + create mode 100644 Documentation/userspace-api/gpio/chardev_v1.rst + create mode 100644 Documentation/userspace-api/gpio/error-codes.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-get-chipinfo-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-get-lineevent-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-get-linehandle-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-get-lineinfo-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-get-lineinfo-unwatch-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-get-lineinfo-watch-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-handle-get-line-values-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-handle-set-config-ioctl.rst + create mode 100644 
Documentation/userspace-api/gpio/gpio-handle-set-line-values-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-lineevent-data-read.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-lineinfo-changed-read.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-v2-get-line-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-v2-get-lineinfo-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-v2-get-lineinfo-watch-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-v2-line-event-read.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-v2-line-get-values-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-v2-line-set-config-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-v2-line-set-values-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-v2-lineinfo-changed-read.rst + create mode 100644 Documentation/userspace-api/gpio/index.rst + create mode 100644 Documentation/userspace-api/gpio/obsolete.rst + rename Documentation/{admin-guide => userspace-api}/gpio/sysfs.rst (87%) + create mode 100644 drivers/gpio/gpio-cros-ec.c +Merging gpio-intel/for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/andy/linux-gpio-intel.git gpio-intel/for-next +Already up to date. +Merging pinctrl/for-next (b3b8c7865c27 Merge branch 'devel' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-pinctrl.git pinctrl/for-next +Merge made by the 'ort' strategy. + .../bindings/pinctrl/amlogic,meson-pinctrl-a1.yaml | 2 +- + .../pinctrl/amlogic,meson-pinctrl-g12a-aobus.yaml | 2 +- + .../amlogic,meson-pinctrl-g12a-periphs.yaml | 2 +- + .../pinctrl/amlogic,meson8-pinctrl-aobus.yaml | 2 +- + .../pinctrl/amlogic,meson8-pinctrl-cbus.yaml | 2 +- + .../devicetree/bindings/pinctrl/cirrus,madera.yaml | 3 +- + .../bindings/pinctrl/cypress,cy8c95x0.yaml | 24 +- + .../bindings/pinctrl/nuvoton,npcm845-pinctrl.yaml | 2 - + .../bindings/pinctrl/nuvoton,wpcm450-pinctrl.yaml | 3 +- + .../pinctrl/nvidia,tegra234-pinmux-aon.yaml | 7 +- + .../pinctrl/nvidia,tegra234-pinmux-common.yaml | 82 +++--- + .../bindings/pinctrl/nvidia,tegra234-pinmux.yaml | 7 +- + .../devicetree/bindings/pinctrl/pincfg-node.yaml | 2 +- + ...nx,zynq-pinctrl.yaml => xlnx,pinctrl-zynq.yaml} | 6 +- + arch/riscv/boot/dts/renesas/r9a07g043f.dtsi | 4 + + drivers/pinctrl/cirrus/pinctrl-cs42l43.c | 18 +- + drivers/pinctrl/mediatek/pinctrl-mt7981.c | 24 +- + drivers/pinctrl/mediatek/pinctrl-mt7986.c | 2 +- + drivers/pinctrl/mediatek/pinctrl-mt8186.c | 1 - + drivers/pinctrl/mediatek/pinctrl-mt8192.c | 1 - + drivers/pinctrl/nuvoton/pinctrl-wpcm450.c | 2 +- + drivers/pinctrl/pinctrl-mcp23s08.c | 15 +- + drivers/pinctrl/pinctrl-st.c | 3 +- + drivers/pinctrl/pinctrl-zynqmp.c | 8 +- + drivers/pinctrl/renesas/core.c | 4 +- + drivers/pinctrl/renesas/pfc-r8a779g0.c | 14 + + drivers/pinctrl/renesas/pinctrl-rzg2l.c | 309 +++++++++++++++++---- + drivers/pinctrl/stm32/pinctrl-stm32mp257.c | 2 +- + 28 files changed, 406 insertions(+), 147 deletions(-) + rename Documentation/devicetree/bindings/pinctrl/{xlnx,zynq-pinctrl.yaml => xlnx,pinctrl-zynq.yaml} (98%) +Merging pinctrl-intel/for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/intel.git pinctrl-intel/for-next +Already up to date. 
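+(Merges reported as "Already up to date.", such as pinctrl-intel above and
+several others in this log, mean the branch head being merged is already an
+ancestor of HEAD, so there is nothing to merge and no merge commit is
+created. One way to test that condition up front, as a sketch; the
+"pinctrl-intel/for-next" ref follows the remote naming used in this log:
+$ git merge-base --is-ancestor pinctrl-intel/for-next HEAD && echo already merged)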
+Merging pinctrl-renesas/renesas-pinctrl (97191e536c37 pinctrl: renesas: r8a779h0: Add Audio pins, groups, functions) +$ git merge -m Merge branch 'renesas-pinctrl' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-drivers.git pinctrl-renesas/renesas-pinctrl +Merge made by the 'ort' strategy. + .../devicetree/bindings/pinctrl/renesas,pfc.yaml | 1 + + drivers/pinctrl/renesas/Kconfig | 5 + + drivers/pinctrl/renesas/Makefile | 1 + + drivers/pinctrl/renesas/core.c | 6 + + drivers/pinctrl/renesas/pfc-r8a779h0.c | 3967 ++++++++++++++++++++ + drivers/pinctrl/renesas/sh_pfc.h | 1 + + 6 files changed, 3981 insertions(+) + create mode 100644 drivers/pinctrl/renesas/pfc-r8a779h0.c +Merging pinctrl-samsung/for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/samsung.git pinctrl-samsung/for-next +Already up to date. +Merging pwm/pwm/for-next (801de0882d8a pwm: dwc: simplify error handling) +$ git merge -m Merge branch 'pwm/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ukleinek/linux.git pwm/pwm/for-next +Auto-merging Documentation/driver-api/driver-model/devres.rst +Auto-merging drivers/gpu/drm/bridge/ti-sn65dsi86.c +Auto-merging drivers/leds/rgb/leds-qcom-lpg.c +Merge made by the 'ort' strategy. + .../devicetree/bindings/pwm/atmel,hlcdc-pwm.yaml | 35 + + .../devicetree/bindings/pwm/atmel-hlcdc-pwm.txt | 29 - + .../devicetree/bindings/pwm/marvell,pxa-pwm.yaml | 51 ++ + .../bindings/pwm/mediatek,mt2712-pwm.yaml | 1 + + Documentation/devicetree/bindings/pwm/pxa-pwm.txt | 30 - + Documentation/driver-api/driver-model/devres.rst | 1 + + Documentation/driver-api/pwm.rst | 11 +- + drivers/gpio/gpio-mvebu.c | 18 +- + drivers/gpu/drm/bridge/ti-sn65dsi86.c | 32 +- + drivers/leds/rgb/leds-qcom-lpg.c | 16 +- + drivers/pinctrl/intel/pinctrl-intel.c | 6 +- + drivers/pwm/core.c | 734 +++++++++++---------- + drivers/pwm/pwm-ab8500.c | 36 +- + drivers/pwm/pwm-apple.c | 18 +- + drivers/pwm/pwm-atmel-hlcdc.c | 42 +- + drivers/pwm/pwm-atmel-tcb.c | 32 +- + drivers/pwm/pwm-atmel.c | 47 +- + drivers/pwm/pwm-bcm-iproc.c | 19 +- + drivers/pwm/pwm-bcm-kona.c | 23 +- + drivers/pwm/pwm-bcm2835.c | 22 +- + drivers/pwm/pwm-berlin.c | 29 +- + drivers/pwm/pwm-brcmstb.c | 17 +- + drivers/pwm/pwm-clk.c | 27 +- + drivers/pwm/pwm-clps711x.c | 28 +- + drivers/pwm/pwm-crc.c | 22 +- + drivers/pwm/pwm-cros-ec.c | 59 +- + drivers/pwm/pwm-dwc-core.c | 26 +- + drivers/pwm/pwm-dwc.c | 77 ++- + drivers/pwm/pwm-dwc.h | 14 +- + drivers/pwm/pwm-ep93xx.c | 21 +- + drivers/pwm/pwm-fsl-ftm.c | 49 +- + drivers/pwm/pwm-hibvt.c | 70 +- + drivers/pwm/pwm-img.c | 60 +- + drivers/pwm/pwm-imx-tpm.c | 34 +- + drivers/pwm/pwm-imx1.c | 20 +- + drivers/pwm/pwm-imx27.c | 35 +- + drivers/pwm/pwm-intel-lgm.c | 17 +- + drivers/pwm/pwm-iqs620a.c | 30 +- + drivers/pwm/pwm-jz4740.c | 36 +- + drivers/pwm/pwm-keembay.c | 17 +- + drivers/pwm/pwm-lp3943.c | 17 +- + drivers/pwm/pwm-lpc18xx-sct.c | 34 +- + drivers/pwm/pwm-lpc32xx.c | 21 +- + drivers/pwm/pwm-lpss-pci.c | 10 +- + drivers/pwm/pwm-lpss-platform.c | 10 +- + drivers/pwm/pwm-lpss.c | 38 +- + drivers/pwm/pwm-lpss.h | 1 - + drivers/pwm/pwm-mediatek.c | 38 +- + drivers/pwm/pwm-meson.c | 57 +- + drivers/pwm/pwm-microchip-core.c | 17 +- + drivers/pwm/pwm-mtk-disp.c | 25 +- + drivers/pwm/pwm-mxs.c | 32 +- + drivers/pwm/pwm-ntxec.c | 14 +- + drivers/pwm/pwm-omap-dmtimer.c | 47 +- + drivers/pwm/pwm-pca9685.c | 161 ++--- + drivers/pwm/pwm-pxa.c | 25 +- + drivers/pwm/pwm-raspberrypi-poe.c | 20 +- + 
drivers/pwm/pwm-rcar.c | 27 +- + drivers/pwm/pwm-renesas-tpu.c | 20 +- + drivers/pwm/pwm-rockchip.c | 24 +- + drivers/pwm/pwm-rz-mtu3.c | 60 +- + drivers/pwm/pwm-samsung.c | 94 +-- + drivers/pwm/pwm-sifive.c | 30 +- + drivers/pwm/pwm-sl28cpld.c | 13 +- + drivers/pwm/pwm-spear.c | 18 +- + drivers/pwm/pwm-sprd.c | 58 +- + drivers/pwm/pwm-sti.c | 70 +- + drivers/pwm/pwm-stm32-lp.c | 31 +- + drivers/pwm/pwm-stm32.c | 56 +- + drivers/pwm/pwm-stmpe.c | 58 +- + drivers/pwm/pwm-sun4i.c | 100 +-- + drivers/pwm/pwm-sunplus.c | 17 +- + drivers/pwm/pwm-tegra.c | 50 +- + drivers/pwm/pwm-tiecap.c | 55 +- + drivers/pwm/pwm-tiehrpwm.c | 72 +- + drivers/pwm/pwm-twl-led.c | 55 +- + drivers/pwm/pwm-twl.c | 50 +- + drivers/pwm/pwm-visconti.c | 17 +- + drivers/pwm/pwm-vt8500.c | 43 +- + drivers/pwm/pwm-xilinx.c | 34 +- + drivers/pwm/sysfs.c | 4 +- + drivers/staging/greybus/pwm.c | 133 ++-- + include/linux/platform_data/x86/pwm-lpss.h | 4 +- + include/linux/pwm.h | 51 +- + 84 files changed, 1910 insertions(+), 1792 deletions(-) + create mode 100644 Documentation/devicetree/bindings/pwm/atmel,hlcdc-pwm.yaml + delete mode 100644 Documentation/devicetree/bindings/pwm/atmel-hlcdc-pwm.txt + create mode 100644 Documentation/devicetree/bindings/pwm/marvell,pxa-pwm.yaml + delete mode 100644 Documentation/devicetree/bindings/pwm/pxa-pwm.txt +Merging ktest/for-next (7dc8e24f0e09 ktest: Restore stty setting at first in dodie) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-ktest.git ktest/for-next +Already up to date. +Merging kselftest/next (6f1a214d446b selftests: sched: Fix spelling mistake "hiearchy" -> "hierarchy") +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git kselftest/next +Auto-merging Documentation/dev-tools/kselftest.rst +Auto-merging MAINTAINERS +Auto-merging arch/s390/configs/debug_defconfig +Auto-merging arch/s390/configs/defconfig +Auto-merging lib/Kconfig.debug +Auto-merging lib/Makefile +Auto-merging tools/testing/selftests/Makefile +Auto-merging tools/testing/selftests/dt/test_unprobed_devices.sh +Auto-merging tools/testing/selftests/lib.mk +Auto-merging tools/testing/selftests/livepatch/functions.sh +Merge made by the 'ort' strategy. 
+ Documentation/dev-tools/kselftest.rst | 4 + + MAINTAINERS | 2 +- + arch/s390/configs/debug_defconfig | 1 - + arch/s390/configs/defconfig | 1 - + lib/Kconfig.debug | 22 -- + lib/Makefile | 2 - + lib/livepatch/Makefile | 14 - + tools/testing/selftests/Makefile | 2 + + tools/testing/selftests/dt/Makefile | 2 +- + .../testing/selftests/dt/test_unprobed_devices.sh | 6 +- + tools/testing/selftests/ftrace/ftracetest | 2 +- + .../ftrace/test.d/00basic/test_ownership.tc | 2 +- + .../selftests/ftrace/test.d/ftrace/func_hotplug.tc | 42 +++ + .../ftrace/test.d/trigger/trigger-hist-mod.tc | 2 +- + .../selftests/futex/functional/futex_requeue_pi.c | 13 +- + .../selftests/{dt => kselftest}/ktap_helpers.sh | 47 ++- + tools/testing/selftests/lib.mk | 26 +- + tools/testing/selftests/livepatch/Makefile | 5 +- + tools/testing/selftests/livepatch/README | 25 +- + tools/testing/selftests/livepatch/config | 1 - + tools/testing/selftests/livepatch/functions.sh | 34 +-- + .../testing/selftests/livepatch/test-callbacks.sh | 50 +-- + tools/testing/selftests/livepatch/test-ftrace.sh | 6 +- + .../testing/selftests/livepatch/test-livepatch.sh | 10 +- + .../selftests/livepatch/test-shadow-vars.sh | 2 +- + tools/testing/selftests/livepatch/test-state.sh | 18 +- + tools/testing/selftests/livepatch/test-syscall.sh | 53 ++++ + tools/testing/selftests/livepatch/test-sysfs.sh | 6 +- + .../selftests/livepatch/test_klp-call_getpid.c | 44 +++ + .../selftests/livepatch/test_modules/Makefile | 20 ++ + .../test_modules}/test_klp_atomic_replace.c | 0 + .../test_modules}/test_klp_callbacks_busy.c | 0 + .../test_modules}/test_klp_callbacks_demo.c | 0 + .../test_modules}/test_klp_callbacks_demo2.c | 0 + .../test_modules}/test_klp_callbacks_mod.c | 0 + .../livepatch/test_modules}/test_klp_livepatch.c | 0 + .../livepatch/test_modules}/test_klp_shadow_vars.c | 0 + .../livepatch/test_modules}/test_klp_state.c | 0 + .../livepatch/test_modules}/test_klp_state2.c | 0 + .../livepatch/test_modules}/test_klp_state3.c | 0 + .../livepatch/test_modules/test_klp_syscall.c | 116 +++++++ + tools/testing/selftests/mqueue/setting | 1 + + tools/testing/selftests/power_supply/Makefile | 4 + + tools/testing/selftests/power_supply/helpers.sh | 178 +++++++++++ + .../power_supply/test_power_supply_properties.sh | 114 +++++++ + tools/testing/selftests/resctrl/cache.c | 289 ++++++------------ + tools/testing/selftests/resctrl/cat_test.c | 339 +++++++++++++-------- + tools/testing/selftests/resctrl/cmt_test.c | 80 +++-- + tools/testing/selftests/resctrl/fill_buf.c | 136 ++++----- + tools/testing/selftests/resctrl/mba_test.c | 30 +- + tools/testing/selftests/resctrl/mbm_test.c | 32 +- + tools/testing/selftests/resctrl/resctrl.h | 135 ++++++-- + tools/testing/selftests/resctrl/resctrl_tests.c | 213 +++++-------- + tools/testing/selftests/resctrl/resctrl_val.c | 138 +++++---- + tools/testing/selftests/resctrl/resctrlfs.c | 323 +++++++++++++------- + tools/testing/selftests/sched/cs_prctl_test.c | 2 +- + .../selftests/thermal/intel/power_floor/.gitignore | 1 + + .../thermal/intel/workload_hint/.gitignore | 1 + + tools/testing/selftests/uevent/.gitignore | 1 + + 59 files changed, 1705 insertions(+), 892 deletions(-) + delete mode 100644 lib/livepatch/Makefile + create mode 100644 tools/testing/selftests/ftrace/test.d/ftrace/func_hotplug.tc + rename tools/testing/selftests/{dt => kselftest}/ktap_helpers.sh (66%) + create mode 100755 tools/testing/selftests/livepatch/test-syscall.sh + create mode 100644 tools/testing/selftests/livepatch/test_klp-call_getpid.c + 
create mode 100644 tools/testing/selftests/livepatch/test_modules/Makefile + rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_atomic_replace.c (100%) + rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_callbacks_busy.c (100%) + rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_callbacks_demo.c (100%) + rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_callbacks_demo2.c (100%) + rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_callbacks_mod.c (100%) + rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_livepatch.c (100%) + rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_shadow_vars.c (100%) + rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_state.c (100%) + rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_state2.c (100%) + rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_state3.c (100%) + create mode 100644 tools/testing/selftests/livepatch/test_modules/test_klp_syscall.c + create mode 100644 tools/testing/selftests/mqueue/setting + create mode 100644 tools/testing/selftests/power_supply/Makefile + create mode 100644 tools/testing/selftests/power_supply/helpers.sh + create mode 100755 tools/testing/selftests/power_supply/test_power_supply_properties.sh + create mode 100644 tools/testing/selftests/thermal/intel/power_floor/.gitignore + create mode 100644 tools/testing/selftests/thermal/intel/workload_hint/.gitignore + create mode 100644 tools/testing/selftests/uevent/.gitignore +Merging kunit/test (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'test' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git kunit/test +Already up to date. +Merging kunit-next/kunit (08c454e26daa kunit: Mark filter* params as rw) +$ git merge -m Merge branch 'kunit' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git kunit-next/kunit +Merge made by the 'ort' strategy. + lib/kunit/executor.c | 6 +++--- + tools/testing/kunit/kunit_kernel.py | 1 + + 2 files changed, 4 insertions(+), 3 deletions(-) +Merging livepatching/for-next (602bf1830798 Merge branch 'for-6.7' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/livepatching/livepatching livepatching/for-next +Merge made by the 'ort' strategy. +Merging rtc/rtc-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'rtc-next' of git://git.kernel.org/pub/scm/linux/kernel/git/abelloni/linux.git rtc/rtc-next +Already up to date. +Merging nvdimm/libnvdimm-for-next (bc22374c96d9 device-dax: make dax_bus_type const) +$ git merge -m Merge branch 'libnvdimm-for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git nvdimm/libnvdimm-for-next +Auto-merging drivers/dax/bus.c +Merge made by the 'ort' strategy. + drivers/dax/bus.c | 2 +- + drivers/nvdimm/Kconfig | 2 +- + drivers/nvdimm/bus.c | 2 +- + 3 files changed, 3 insertions(+), 3 deletions(-) +Merging at24/at24/for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'at24/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git at24/at24/for-next +Already up to date. 
+Merging ntb/ntb-next (9341b37ec17a ntb_perf: Fix printk format) +$ git merge -m Merge branch 'ntb-next' of https://github.com/jonmason/ntb.git ntb/ntb-next +Merge made by the 'ort' strategy. + drivers/ntb/hw/intel/ntb_hw_gen1.c | 2 +- + drivers/ntb/test/ntb_perf.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) +Merging seccomp/for-next/seccomp (56af94aace8a samples: user-trap: fix strict-aliasing warning) +$ git merge -m Merge branch 'for-next/seccomp' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git seccomp/for-next/seccomp +Auto-merging tools/testing/selftests/seccomp/seccomp_benchmark.c +Merge made by the 'ort' strategy. + samples/seccomp/user-trap.c | 8 +++-- + .../testing/selftests/seccomp/seccomp_benchmark.c | 38 ++++++++++++++++++-- + tools/testing/selftests/seccomp/seccomp_bpf.c | 41 ++++++++++++++++------ + 3 files changed, 73 insertions(+), 14 deletions(-) +Merging fsi/next (c5eeb63edac9 fsi: Fix panic on scom file read) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/joel/fsi.git fsi/next +Merge made by the 'ort' strategy. + drivers/fsi/fsi-sbefifo.c | 9 ++++++++- + drivers/fsi/i2cr-scom.c | 11 ++++++++++- + 2 files changed, 18 insertions(+), 2 deletions(-) +Merging slimbus/for-next (04b945e4cf81 slimbus: qcom-ngd-ctrl: Make QMI message rules const) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/srini/slimbus.git slimbus/for-next +Merge made by the 'ort' strategy. + drivers/slimbus/qcom-ngd-ctrl.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) +Merging nvmem/for-next (2c8df24cc166 nvmem: mtk-efuse: Drop NVMEM device name) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/srini/nvmem.git nvmem/for-next +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. + .../bindings/nvmem/layouts/fixed-cell.yaml | 22 +-- + .../devicetree/bindings/nvmem/nvmem-provider.yaml | 18 ++ + .../bindings/nvmem/xlnx,zynqmp-nvmem.txt | 46 ----- + .../bindings/nvmem/xlnx,zynqmp-nvmem.yaml | 42 ++++ + MAINTAINERS | 8 + + drivers/firmware/xilinx/zynqmp.c | 25 +++ + drivers/nvmem/mtk-efuse.c | 20 +- + drivers/nvmem/zynqmp_nvmem.c | 213 ++++++++++++++++++--- + include/linux/firmware/xlnx-zynqmp.h | 8 + + 9 files changed, 313 insertions(+), 89 deletions(-) + create mode 100644 Documentation/devicetree/bindings/nvmem/nvmem-provider.yaml + delete mode 100644 Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.txt + create mode 100644 Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.yaml +Merging xarray/main (2a15de80dd0f idr: fix param name in idr_alloc_cyclic() doc) +$ git merge -m Merge branch 'main' of git://git.infradead.org/users/willy/xarray.git xarray/main +Already up to date. +Merging hyperv/hyperv-next (ce9ecca0238b Linux 6.6-rc2) +$ git merge -m Merge branch 'hyperv-next' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git hyperv/hyperv-next +Already up to date. +Merging auxdisplay/for-next (34ddc83dc720 auxdisplay: linedisp: Add support for overriding character mapping) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/andy/linux-auxdisplay.git auxdisplay/for-next +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. 
+ .../bindings/auxdisplay/arm,versatile-lcd.yaml | 4 +- + .../bindings/auxdisplay/hit,hd44780.yaml | 62 +++--- + .../bindings/auxdisplay/holtek,ht16k33.yaml | 50 ++--- + .../bindings/auxdisplay/img,ascii-lcd.yaml | 4 +- + MAINTAINERS | 8 +- + drivers/auxdisplay/ht16k33.c | 7 +- + drivers/auxdisplay/img-ascii-lcd.c | 23 ++- + drivers/auxdisplay/line-display.c | 157 +++++++++++++-- + drivers/auxdisplay/line-display.h | 52 ++++- + drivers/auxdisplay/panel.c | 216 +++++++++------------ + 10 files changed, 375 insertions(+), 208 deletions(-) +Merging kgdb/kgdb/for-next (4f41d30cd6dc kdb: Fix a potential buffer overflow in kdb_local()) +$ git merge -m Merge branch 'kgdb/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/danielt/linux.git kgdb/kgdb/for-next +Already up to date. +Merging hmm/hmm (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git hmm/hmm +Already up to date. +Merging cfi/cfi/next (06c2afb862f9 Linux 6.5-rc1) +$ git merge -m Merge branch 'cfi/next' of git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git cfi/cfi/next +Already up to date. +Merging mhi/mhi-next (ceeb64f41fe6 bus: mhi: host: Add tracing support) +$ git merge -m Merge branch 'mhi-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mani/mhi.git mhi/mhi-next +Merge made by the 'ort' strategy. + drivers/bus/mhi/common.h | 38 +++--- + drivers/bus/mhi/ep/main.c | 5 +- + drivers/bus/mhi/host/boot.c | 11 +- + drivers/bus/mhi/host/init.c | 79 ++++++------ + drivers/bus/mhi/host/internal.h | 50 ++++++- + drivers/bus/mhi/host/main.c | 19 ++- + drivers/bus/mhi/host/pm.c | 27 +++- + drivers/bus/mhi/host/trace.h | 280 ++++++++++++++++++++++++++++++++++++++++ + include/linux/mhi.h | 2 - + 9 files changed, 423 insertions(+), 88 deletions(-) + create mode 100644 drivers/bus/mhi/host/trace.h +Merging memblock/for-next (2159bd4e9057 memblock: Return NUMA_NO_NODE instead of -1 to improve code readability) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git memblock/for-next +Already up to date. +Merging cxl/next (73bf93edeeea cxl/core: use sysfs_emit() for attr's _show()) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git cxl/next +Already up to date. +Merging zstd/zstd-next (3f832dfb8a8e zstd: fix g_debuglevel export warning) +$ git merge -m Merge branch 'zstd-next' of https://github.com/terrelln/linux.git zstd/zstd-next +Merge made by the 'ort' strategy. 
+ include/linux/zstd.h | 2 +- + include/linux/zstd_errors.h | 23 +- + include/linux/zstd_lib.h | 697 ++++++++-- + lib/zstd/Makefile | 2 +- + lib/zstd/common/allocations.h | 56 + + lib/zstd/common/bits.h | 149 ++ + lib/zstd/common/bitstream.h | 53 +- + lib/zstd/common/compiler.h | 14 +- + lib/zstd/common/cpu.h | 3 +- + lib/zstd/common/debug.c | 5 +- + lib/zstd/common/debug.h | 3 +- + lib/zstd/common/entropy_common.c | 42 +- + lib/zstd/common/error_private.c | 12 +- + lib/zstd/common/error_private.h | 3 +- + lib/zstd/common/fse.h | 89 +- + lib/zstd/common/fse_decompress.c | 94 +- + lib/zstd/common/huf.h | 234 +--- + lib/zstd/common/mem.h | 2 +- + lib/zstd/common/portability_macros.h | 26 +- + lib/zstd/common/zstd_common.c | 38 +- + lib/zstd/common/zstd_deps.h | 16 +- + lib/zstd/common/zstd_internal.h | 99 +- + lib/zstd/compress/clevels.h | 3 +- + lib/zstd/compress/fse_compress.c | 59 +- + lib/zstd/compress/hist.c | 3 +- + lib/zstd/compress/hist.h | 3 +- + lib/zstd/compress/huf_compress.c | 372 +++-- + lib/zstd/compress/zstd_compress.c | 1758 +++++++++++++++++------- + lib/zstd/compress/zstd_compress_internal.h | 333 +++-- + lib/zstd/compress/zstd_compress_literals.c | 155 ++- + lib/zstd/compress/zstd_compress_literals.h | 25 +- + lib/zstd/compress/zstd_compress_sequences.c | 7 +- + lib/zstd/compress/zstd_compress_sequences.h | 3 +- + lib/zstd/compress/zstd_compress_superblock.c | 47 +- + lib/zstd/compress/zstd_compress_superblock.h | 3 +- + lib/zstd/compress/zstd_cwksp.h | 149 +- + lib/zstd/compress/zstd_double_fast.c | 129 +- + lib/zstd/compress/zstd_double_fast.h | 6 +- + lib/zstd/compress/zstd_fast.c | 578 ++++++-- + lib/zstd/compress/zstd_fast.h | 6 +- + lib/zstd/compress/zstd_lazy.c | 518 +++---- + lib/zstd/compress/zstd_lazy.h | 7 +- + lib/zstd/compress/zstd_ldm.c | 11 +- + lib/zstd/compress/zstd_ldm.h | 3 +- + lib/zstd/compress/zstd_ldm_geartab.h | 3 +- + lib/zstd/compress/zstd_opt.c | 187 +-- + lib/zstd/compress/zstd_opt.h | 3 +- + lib/zstd/decompress/huf_decompress.c | 772 +++++++---- + lib/zstd/decompress/zstd_ddict.c | 9 +- + lib/zstd/decompress/zstd_ddict.h | 3 +- + lib/zstd/decompress/zstd_decompress.c | 259 +++- + lib/zstd/decompress/zstd_decompress_block.c | 283 ++-- + lib/zstd/decompress/zstd_decompress_block.h | 8 +- + lib/zstd/decompress/zstd_decompress_internal.h | 7 +- + lib/zstd/decompress_sources.h | 2 +- + lib/zstd/zstd_common_module.c | 5 +- + lib/zstd/zstd_compress_module.c | 2 +- + lib/zstd/zstd_decompress_module.c | 4 +- + 58 files changed, 4791 insertions(+), 2596 deletions(-) + create mode 100644 lib/zstd/common/allocations.h + create mode 100644 lib/zstd/common/bits.h +Merging efi/next (841c35169323 Linux 6.8-rc4) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git efi/next +Already up to date. +Merging unicode/for-next (367122c529f3 libfs: Attempt exact-match comparison first during casefolded lookup) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/krisman/unicode.git unicode/for-next +Auto-merging fs/libfs.c +Auto-merging include/linux/fs.h +Merge made by the 'ort' strategy. 
+ fs/libfs.c | 40 +++++++++++++++++++++++----------------- + fs/overlayfs/params.c | 13 ++++++++++--- + include/linux/fs.h | 9 +++++++++ + 3 files changed, 42 insertions(+), 20 deletions(-) +Merging slab/slab/for-next (7d2ec24bd8a5 Merge branch 'slab/for-6.9/optimize-get-freelist' into slab/for-next) +$ git merge -m Merge branch 'slab/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab.git slab/slab/for-next +Auto-merging Documentation/admin-guide/kernel-parameters.txt +Auto-merging mm/slab_common.c +Merge made by the 'ort' strategy. + Documentation/admin-guide/kernel-parameters.txt | 79 +++++++++++-------------- + Documentation/mm/slub.rst | 60 +++++++++---------- + drivers/misc/lkdtm/heap.c | 2 +- + mm/Kconfig.debug | 6 +- + mm/slab.h | 6 +- + mm/slab_common.c | 17 +++--- + mm/slub.c | 75 +++++++++++------------ + 7 files changed, 117 insertions(+), 128 deletions(-) +Merging random/master (1f719a2f3fa6 Merge tag 'net-6.8-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/crng/random.git random/master +Already up to date. +Merging landlock/next (28c2be13a1e0 landlock: Document IOCTL support) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/mic/linux.git landlock/next +Auto-merging tools/testing/kunit/configs/all_tests.config +Merge made by the 'ort' strategy. + Documentation/userspace-api/landlock.rst | 121 +++++- + include/uapi/linux/landlock.h | 55 ++- + samples/landlock/sandboxer.c | 13 +- + security/landlock/.kunitconfig | 4 + + security/landlock/Kconfig | 15 + + security/landlock/common.h | 2 + + security/landlock/fs.c | 465 +++++++++++++++++++++- + security/landlock/fs.h | 3 + + security/landlock/limits.h | 11 +- + security/landlock/ruleset.h | 2 +- + security/landlock/syscalls.c | 37 +- + tools/testing/kunit/configs/all_tests.config | 1 + + tools/testing/selftests/landlock/base_test.c | 2 +- + tools/testing/selftests/landlock/common.h | 39 +- + tools/testing/selftests/landlock/fs_test.c | 561 ++++++++++++++++++++++++++- + 15 files changed, 1246 insertions(+), 85 deletions(-) + create mode 100644 security/landlock/.kunitconfig +Merging rust/rust-next (e3c3d34507c7 docs: rust: Add description of Rust documentation test as KUnit ones) +$ git merge -m Merge branch 'rust-next' of https://github.com/Rust-for-Linux/linux.git rust/rust-next +Auto-merging Documentation/process/changes.rst +CONFLICT (content): Merge conflict in Documentation/process/changes.rst +Auto-merging rust/kernel/workqueue.rs +Auto-merging scripts/min-tool-version.sh +Resolved 'Documentation/process/changes.rst' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +[master 5445329723c3] Merge branch 'rust-next' of https://github.com/Rust-for-Linux/linux.git +$ git diff -M --stat --summary HEAD^.. 
+ Documentation/process/changes.rst | 2 +- + Documentation/rust/general-information.rst | 24 ----- + Documentation/rust/index.rst | 1 + + Documentation/rust/testing.rst | 135 +++++++++++++++++++++++++++++ + rust/alloc/alloc.rs | 9 +- + rust/alloc/boxed.rs | 20 +++-- + rust/alloc/lib.rs | 7 +- + rust/alloc/raw_vec.rs | 19 +++- + rust/alloc/vec/mod.rs | 16 ++-- + rust/bindings/bindings_helper.h | 5 +- + rust/kernel/allocator.rs | 2 +- + rust/kernel/error.rs | 10 +-- + rust/kernel/init.rs | 22 ++--- + rust/kernel/ioctl.rs | 6 +- + rust/kernel/lib.rs | 5 +- + rust/kernel/str.rs | 8 +- + rust/kernel/sync.rs | 5 +- + rust/kernel/sync/arc.rs | 30 +++---- + rust/kernel/sync/condvar.rs | 110 ++++++++++++++++++----- + rust/kernel/sync/lock.rs | 19 ++-- + rust/kernel/sync/lock/mutex.rs | 3 +- + rust/kernel/sync/lock/spinlock.rs | 5 +- + rust/kernel/sync/locked_by.rs | 7 +- + rust/kernel/task.rs | 24 ++++- + rust/kernel/time.rs | 20 +++++ + rust/kernel/types.rs | 3 + + rust/kernel/workqueue.rs | 78 ++++++++--------- + scripts/min-tool-version.sh | 2 +- + 28 files changed, 423 insertions(+), 174 deletions(-) + create mode 100644 Documentation/rust/testing.rst + create mode 100644 rust/kernel/time.rs +Merging sysctl/sysctl-next (cec030ec414e MAINTAINERS: Update sysctl tree location) +$ git merge -m Merge branch 'sysctl-next' of git://git.kernel.org/pub/scm/linux/kernel/git/sysctl/sysctl.git sysctl/sysctl-next +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. + MAINTAINERS | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) +Merging execve/for-next/execve (15fd1dc3dadb fs: binfmt_elf_efpic: don't use missing interpreter's properties) +$ git merge -m Merge branch 'for-next/execve' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git execve/for-next/execve +Merge made by the 'ort' strategy. + fs/binfmt_elf_fdpic.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) +Merging bitmap/bitmap-for-next (071ad962baf5 bitmap: Step down as a reviewer) +$ git merge -m Merge branch 'bitmap-for-next' of https://github.com/norov/linux.git bitmap/bitmap-for-next +Auto-merging MAINTAINERS +Auto-merging arch/x86/kvm/hyperv.c +Auto-merging drivers/block/null_blk/main.c +CONFLICT (content): Merge conflict in drivers/block/null_blk/main.c +Auto-merging drivers/infiniband/ulp/rtrs/rtrs-clt.c +Auto-merging drivers/iommu/arm/arm-smmu/arm-smmu.h +Auto-merging drivers/media/usb/em28xx/em28xx-cards.c +Auto-merging drivers/net/ethernet/sfc/rx_common.c +Auto-merging drivers/net/wireless/realtek/rtw88/pci.c +Auto-merging drivers/net/wireless/realtek/rtw89/pci.c +Auto-merging drivers/pci/controller/pci-hyperv.c +Auto-merging drivers/perf/alibaba_uncore_drw_pmu.c +Auto-merging drivers/perf/arm-cci.c +Auto-merging drivers/perf/arm-ccn.c +Auto-merging drivers/perf/arm_dmc620_pmu.c +Auto-merging drivers/perf/arm_pmuv3.c +Auto-merging drivers/scsi/mpi3mr/mpi3mr_os.c +Auto-merging drivers/scsi/scsi_lib.c +Auto-merging drivers/tty/nozomi.c +Auto-merging drivers/tty/serial/sc16is7xx.c +CONFLICT (content): Merge conflict in drivers/tty/serial/sc16is7xx.c +Auto-merging drivers/usb/class/cdc-acm.c +Auto-merging include/linux/cpumask.h +Auto-merging kernel/sched/sched.h +Auto-merging kernel/watch_queue.c +Auto-merging lib/sbitmap.c +Auto-merging sound/pci/hda/hda_codec.c +Recorded preimage for 'drivers/block/null_blk/main.c' +Resolved 'drivers/tty/serial/sc16is7xx.c' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. 
+$ git commit --no-edit -v -a +Recorded resolution for 'drivers/block/null_blk/main.c'. +[master efac291baba8] Merge branch 'bitmap-for-next' of https://github.com/norov/linux.git +$ git diff -M --stat --summary HEAD^.. + MAINTAINERS | 1 - + arch/m68k/include/asm/mmu_context.h | 11 +- + arch/microblaze/include/asm/mmu_context_mm.h | 11 +- + arch/mips/sgi-ip30/ip30-irq.c | 12 +- + arch/powerpc/mm/book3s32/mmu_context.c | 10 +- + arch/powerpc/platforms/pasemi/dma_lib.c | 41 +--- + arch/powerpc/platforms/powernv/pci-sriov.c | 12 +- + arch/sh/boards/mach-x3proto/ilsel.c | 4 +- + arch/sparc/kernel/pci_msi.c | 9 +- + arch/x86/kvm/hyperv.c | 36 ++-- + drivers/dma/idxd/perfmon.c | 8 +- + drivers/infiniband/ulp/rtrs/rtrs-clt.c | 15 +- + drivers/iommu/arm/arm-smmu/arm-smmu.h | 10 +- + drivers/iommu/msm_iommu.c | 18 +- + drivers/isdn/mISDN/core.c | 9 +- + drivers/media/radio/radio-shark.c | 5 +- + drivers/media/radio/radio-shark2.c | 5 +- + drivers/media/usb/cx231xx/cx231xx-cards.c | 16 +- + drivers/media/usb/em28xx/em28xx-cards.c | 37 ++-- + drivers/net/ethernet/rocker/rocker_ofdpa.c | 11 +- + drivers/net/ethernet/sfc/rx_common.c | 4 +- + drivers/net/ethernet/sfc/siena/rx_common.c | 4 +- + drivers/net/ethernet/sfc/siena/siena_sriov.c | 14 +- + drivers/net/wireless/ath/ath10k/snoc.c | 9 +- + drivers/net/wireless/realtek/rtw88/pci.c | 5 +- + drivers/net/wireless/realtek/rtw89/pci.c | 5 +- + drivers/pci/controller/pci-hyperv.c | 7 +- + drivers/perf/alibaba_uncore_drw_pmu.c | 10 +- + drivers/perf/arm-cci.c | 24 +-- + drivers/perf/arm-ccn.c | 10 +- + drivers/perf/arm_dmc620_pmu.c | 9 +- + drivers/perf/arm_pmuv3.c | 8 +- + drivers/scsi/mpi3mr/mpi3mr_os.c | 21 +- + drivers/scsi/qedi/qedi_main.c | 9 +- + drivers/scsi/scsi_lib.c | 7 +- + drivers/tty/nozomi.c | 5 +- + drivers/usb/class/cdc-acm.c | 5 +- + include/linux/cpumask.h | 12 ++ + include/linux/find.h | 301 ++++++++++++++++++++++++++- + kernel/sched/sched.h | 14 +- + kernel/watch_queue.c | 6 +- + lib/find_bit.c | 85 ++++++++ + lib/sbitmap.c | 46 +--- + lib/test_bitmap.c | 61 ++++++ + net/bluetooth/cmtp/core.c | 10 +- + net/smc/smc_wr.c | 10 +- + sound/pci/hda/hda_codec.c | 7 +- + sound/usb/caiaq/audio.c | 13 +- + 48 files changed, 617 insertions(+), 385 deletions(-) +Merging hte/for-next (b85ea95d0864 Linux 6.7-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/pateldipen1984/linux.git hte/for-next +Already up to date. +Merging kspp/for-next/kspp (f0f427340429 leaking_addresses: Provide mechanism to scan binary files) +$ git merge -m Merge branch 'for-next/kspp' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git kspp/for-next/kspp +Auto-merging MAINTAINERS +Auto-merging arch/arm/Kconfig +Auto-merging arch/arm64/Kconfig +Auto-merging arch/mips/Kconfig +Auto-merging arch/parisc/Kconfig +Auto-merging arch/powerpc/Kconfig +Auto-merging arch/riscv/Kconfig +Auto-merging arch/s390/Kconfig +Auto-merging arch/x86/Kconfig +Auto-merging arch/x86/kvm/mmu/mmu.c +Auto-merging fs/namei.c +Auto-merging include/linux/compiler_types.h +Auto-merging include/linux/string.h +Auto-merging kernel/printk/printk.c +Auto-merging lib/Kconfig.ubsan +Auto-merging lib/Makefile +Auto-merging lib/test_ubsan.c +Auto-merging scripts/Makefile.lib +Auto-merging scripts/Makefile.ubsan +Merge made by the 'ort' strategy. 
+ Documentation/dev-tools/ubsan.rst | 28 +++----- + MAINTAINERS | 18 +++++ + arch/arm/Kconfig | 2 +- + arch/arm/include/asm/word-at-a-time.h | 3 +- + arch/arm64/Kconfig | 2 +- + arch/arm64/include/asm/word-at-a-time.h | 3 +- + arch/mips/Kconfig | 2 +- + arch/parisc/Kconfig | 2 +- + arch/powerpc/Kconfig | 2 +- + arch/powerpc/include/asm/word-at-a-time.h | 4 +- + arch/riscv/Kconfig | 2 +- + arch/riscv/include/asm/word-at-a-time.h | 3 +- + arch/s390/Kconfig | 2 +- + arch/s390/include/asm/word-at-a-time.h | 3 +- + arch/sh/include/asm/word-at-a-time.h | 2 + + arch/um/drivers/net_kern.c | 2 +- + arch/um/drivers/vector_kern.c | 2 +- + arch/um/drivers/vector_user.c | 4 +- + arch/um/include/shared/user.h | 3 +- + arch/um/os-Linux/drivers/ethertap_user.c | 2 +- + arch/um/os-Linux/drivers/tuntap_user.c | 2 +- + arch/um/os-Linux/umid.c | 6 +- + arch/x86/Kconfig | 2 +- + arch/x86/include/asm/word-at-a-time.h | 3 +- + arch/x86/kvm/mmu/mmu.c | 1 + + drivers/misc/lkdtm/bugs.c | 3 +- + drivers/misc/lkdtm/core.c | 22 +++--- + drivers/misc/vmw_vmci/vmci_datagram.c | 7 +- + fs/namei.c | 2 +- + include/asm-generic/word-at-a-time.h | 3 +- + include/linux/fortify-string.h | 22 +----- + include/linux/kernel.h | 38 +---------- + include/linux/overflow.h | 101 ++++++++++++++++++++++++---- + include/linux/string.h | 78 +++++++++++++++++++-- + include/linux/string_choices.h | 11 +++ + include/linux/wordpart.h | 42 ++++++++++++ + kernel/configs/hardening.config | 7 +- + kernel/printk/printk.c | 11 --- + lib/Kconfig.ubsan | 13 +--- + lib/Makefile | 1 + + lib/overflow_kunit.c | 67 ++++++++++++++++-- + lib/string.c | 25 ++++--- + lib/string_helpers.c | 34 ---------- + lib/test_ubsan.c | 4 +- + scripts/Makefile.lib | 2 +- + scripts/Makefile.ubsan | 2 +- + scripts/coccinelle/api/string_choices.cocci | 41 +++++++++++ + scripts/coccinelle/misc/struct_size.cocci | 74 ++++++++++++++++++++ + scripts/leaking_addresses.pl | 53 +++++++++++++++ + 49 files changed, 554 insertions(+), 214 deletions(-) + create mode 100644 include/linux/wordpart.h + create mode 100644 scripts/coccinelle/api/string_choices.cocci + create mode 100644 scripts/coccinelle/misc/struct_size.cocci +Merging kspp-gustavo/for-next/kspp (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-next/kspp' of git://git.kernel.org/pub/scm/linux/kernel/git/gustavoars/linux.git kspp-gustavo/for-next/kspp +Already up to date. +Merging nolibc/nolibc (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'nolibc' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git nolibc/nolibc +Already up to date. +Merging tsm/tsm-next (f4738f56d1dc virt: tdx-guest: Add Quote generation support using TSM_REPORTS) +$ git merge -m Merge branch 'tsm-next' of git://git.kernel.org/pub/scm/linux/kernel/git/djbw/linux tsm/tsm-next +Already up to date. +Merging iommufd/for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd.git iommufd/for-next +Already up to date. +Merging header_cleanup/header_cleanup (5f4c01f1e3c7 spinlock: Fix failing build for PREEMPT_RT) +$ git merge -m Merge branch 'header_cleanup' of https://evilpiepirate.org/git/bcachefs.git header_cleanup/header_cleanup +Already up to date. 
diff --git a/localversion-next b/localversion-next new file mode 100644 index 00000000000000..55a7997c812013 --- /dev/null +++ b/localversion-next @@ -0,0 +1 @@ +-next-20240221 From d125291f2fffec8571cbfeddaa59df48f0dd2a61 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:59 -0700 Subject: [PATCH 1374/1406] KVM: Add hugepage support for dedicated guest memory Extended guest_memfd to allow backing guest memory with hugepages. This is done as a best-effort by default until a better-defined mechanism is put in place that can provide better control/assurances to userspace about hugepage allocations. When reporting the max order when KVM gets a pfn from guest_memfd, force order-0 pages if the hugepage is not fully contained by the memslot binding, e.g. if userspace requested hugepages but punches a hole in the memslot bindings in order to emulate x86's VGA hole. Link: https://lore.kernel.org/kvm/20231027182217.3615211-1-seanjc@google.com/T/#mccbd3e8bf9897f0ddbf864e6318d6f2f208b269c Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-18-seanjc@google.com> [Allow even with CONFIG_TRANSPARENT_HUGEPAGE; dropped momentarily due to uneasiness about the API. - Paolo] Signed-off-by: Paolo Bonzini [mdr: based on discussion in the Link regarding original patch, make the following set of changes: - For now, don't introduce an opt-in flag to enable hugepage support. By default, just make a best-effort for PMD_ORDER allocations so that there are no false assurances to userspace that they'll get hugepages. It's better at least than the current guarantee that they will get 4K pages every time. A more proper opt-in interface can then improve on things later. - Pass GFP_NOWARN to alloc_pages() so failures are not disruptive to normal operations - Drop size checks during creation time. Instead just avoid huge allocations if they extend beyond end of the memfd. - Drop hugepage-related unit tests since everything is now handled transparently to userspace anyway. - Update commit message accordingly.] Signed-off-by: Michael Roth --- virt/kvm/guest_memfd.c | 63 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 7 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 0f4e0cf4f158b1..4860140c32a611 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -13,14 +13,46 @@ struct kvm_gmem { struct list_head entry; }; -static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) +static struct folio *kvm_gmem_get_huge_folio(struct inode *inode, pgoff_t index, + unsigned int order) { + pgoff_t npages = 1UL << order; + pgoff_t huge_index = round_down(index, npages); + struct address_space *mapping = inode->i_mapping; + gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NOWARN; + loff_t size = i_size_read(inode); struct folio *folio; - /* TODO: Support huge pages. 
*/ - folio = filemap_grab_folio(inode->i_mapping, index); - if (IS_ERR_OR_NULL(folio)) + /* Make sure hugepages would be fully-contained by inode */ + if ((huge_index + npages) * PAGE_SIZE > size) + return NULL; + + if (filemap_range_has_page(mapping, (loff_t)huge_index << PAGE_SHIFT, + (loff_t)(huge_index + npages - 1) << PAGE_SHIFT)) + return NULL; + + folio = filemap_alloc_folio(gfp, order); + if (!folio) + return NULL; + + if (filemap_add_folio(mapping, folio, huge_index, gfp)) { + folio_put(folio); return NULL; + } + + return folio; +} + +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) +{ + struct folio *folio; + + folio = kvm_gmem_get_huge_folio(inode, index, PMD_ORDER); + if (!folio) { + folio = filemap_grab_folio(inode->i_mapping, index); + if (IS_ERR_OR_NULL(folio)) + return NULL; + } /* * Use the up-to-date flag to track whether or not the memory has been @@ -360,6 +392,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) inode->i_mode |= S_IFREG; inode->i_size = size; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); + mapping_set_large_folios(inode->i_mapping); mapping_set_unmovable(inode->i_mapping); /* Unmovable mappings are supposed to be marked unevictable as well. */ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); @@ -485,7 +518,7 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t *pfn, int *max_order) { - pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff; + pgoff_t index, huge_index; struct kvm_gmem *gmem; struct folio *folio; struct page *page; @@ -498,6 +531,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gmem = file->private_data; + index = gfn - slot->base_gfn + slot->gmem.pgoff; if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) { r = -EIO; goto out_fput; @@ -517,9 +551,24 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, page = folio_file_page(folio, index); *pfn = page_to_pfn(page); - if (max_order) - *max_order = 0; + if (!max_order) + goto success; + + *max_order = compound_order(compound_head(page)); + if (!*max_order) + goto success; + /* + * The folio can be mapped with a hugepage if and only if the folio is + * fully contained by the range the memslot is bound to. Note, the + * caller is responsible for handling gfn alignment, this only deals + * with the file binding. + */ + huge_index = ALIGN(index, 1ull << *max_order); + if (huge_index < ALIGN(slot->gmem.pgoff, 1ull << *max_order) || + huge_index + (1ull << *max_order) > slot->gmem.pgoff + slot->npages) + *max_order = 0; +success: r = 0; out_unlock: From f4f43eb41281e9df9159c27106f775a593997dfb Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Sun, 8 Oct 2023 20:22:54 -0500 Subject: [PATCH 1375/1406] mm: Introduce AS_INACCESSIBLE for encrypted/confidential memory filemap users like guest_memfd may use page cache pages to allocate/manage memory that is only intended to be accessed by guests via hardware protections like encryption. Writes to memory of this sort in common paths like truncation may cause unexpected behavior such as writing garbage instead of zeros when attempting to zero pages, or worse, triggering hardware protections that are considered fatal as far as the kernel is concerned.
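As a quick standalone illustration of the containment rule in the guest_memfd hugepage patch above (kvm_gmem_get_huge_folio() only attempts a PMD-sized folio when the aligned range fits inside the inode), the sketch below mirrors that math in plain C; huge_fits() and the sizes used are invented for illustration and are not part of the patch:

#include <stdio.h>
#include <stdbool.h>

#define PAGE_SIZE 4096UL
#define PMD_ORDER 9	/* 512 pages == 2M on x86-64, assumed here */

/* Mirror of the kvm_gmem_get_huge_folio() containment check: a hugepage
 * at 'index' is only attempted when the round_down-aligned 512-page
 * range lies entirely within the inode's size. */
static bool huge_fits(unsigned long index, unsigned long i_size)
{
	unsigned long npages = 1UL << PMD_ORDER;
	unsigned long huge_index = index & ~(npages - 1);	/* round_down */

	return (huge_index + npages) * PAGE_SIZE <= i_size;
}

int main(void)
{
	/* 4M file: indices 0..1023; a hugepage at index 700 spans pages
	 * 512..1023, which still fits, so it is allowed ... */
	printf("%d\n", huge_fits(700, 4UL << 20));	/* prints 1 */
	/* ... but with a 3M file the same 2M range would run past EOF,
	 * so the allocation falls back to order-0 pages. */
	printf("%d\n", huge_fits(700, 3UL << 20));	/* prints 0 */
	return 0;
}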
Introduce a new address_space flag, AS_INACCESSIBLE, and use this initially to prevent zero'ing of pages during truncation, with the understanding that it is up to the owner of the mapping to handle this specially if needed. Link: https://lore.kernel.org/lkml/ZR9LYhpxTaTk6PJX@google.com/ Cc: Matthew Wilcox Suggested-by: Sean Christopherson Signed-off-by: Michael Roth --- include/linux/pagemap.h | 1 + mm/truncate.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2df35e65557d27..f879c1d54da7a0 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -207,6 +207,7 @@ enum mapping_flags { AS_STABLE_WRITES, /* must wait for writeback before modifying folio contents */ AS_UNMOVABLE, /* The mapping cannot be moved, ever */ + AS_INACCESSIBLE, /* Do not attempt direct R/W access to the mapping */ }; /** diff --git a/mm/truncate.c b/mm/truncate.c index 725b150e47ac4c..c501338c7ebddc 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -233,7 +233,8 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) * doing a complex calculation here, and then doing the zeroing * anyway if the page split fails. */ - folio_zero_range(folio, offset, length); + if (!(folio->mapping->flags & AS_INACCESSIBLE)) + folio_zero_range(folio, offset, length); if (folio_has_private(folio)) folio_invalidate(folio, offset, length); From 60ebf3d1be1aa86183e4e08a337677567d0fdcd2 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Sun, 8 Oct 2023 20:59:41 -0500 Subject: [PATCH 1376/1406] KVM: Use AS_INACCESSIBLE when creating guest_memfd inode truncate_inode_pages_range() may attempt to zero pages before truncating them, and this will occur before arch-specific invalidations can be triggered via .invalidate_folio/.free_folio hooks via kvm_gmem_aops. For AMD SEV-SNP this would result in an RMP #PF being generated by the hardware, which is currently treated as fatal (and even if specifically allowed for, would not result in anything other than garbage being written to guest pages due to encryption). On Intel TDX this would also result in undesirable behavior. Set the AS_INACCESSIBLE flag to prevent the MM from attempting unexpected accesses of this sort during operations like truncation. This may also in some cases yield a decent performance improvement for guest_memfd userspace implementations that hole-punch ranges immediately after private->shared conversions via KVM_SET_MEMORY_ATTRIBUTES, since the current implementation of truncate_inode_pages_range() always ends up zero'ing an entire 4K range if it is backed by a 2M folio.
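The two AS_INACCESSIBLE patches above boil down to a simple opt-out handshake: the mapping owner sets a flag at creation time, and generic truncation skips zeroing whenever the flag is present. A freestanding sketch of that pattern follows; the flag value, struct, and truncate_partial() helper are stand-ins, not the kernel's API:

#include <stdio.h>
#include <string.h>

#define MY_AS_INACCESSIBLE (1u << 0)	/* stand-in for the real flag bit */

struct mapping { unsigned int flags; unsigned char data[4096]; };

/* Generic-code side: only scrub the range if the owner has not declared
 * the memory inaccessible (e.g. guest-encrypted pages). Owner-specific
 * invalidation hooks would still run afterwards. */
static void truncate_partial(struct mapping *m, size_t off, size_t len)
{
	if (!(m->flags & MY_AS_INACCESSIBLE))
		memset(m->data + off, 0, len);	/* folio_zero_range() analogue */
}

int main(void)
{
	struct mapping guest = { .flags = MY_AS_INACCESSIBLE };
	struct mapping plain = { .flags = 0 };

	memset(guest.data, 0xaa, sizeof(guest.data));
	memset(plain.data, 0xaa, sizeof(plain.data));
	truncate_partial(&guest, 0, 16);	/* left untouched */
	truncate_partial(&plain, 0, 16);	/* zeroed */
	printf("guest=%02x plain=%02x\n", guest.data[0], plain.data[0]);
	return 0;	/* prints: guest=aa plain=00 */
}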
Link: https://lore.kernel.org/lkml/ZR9LYhpxTaTk6PJX@google.com/ Suggested-by: Sean Christopherson Signed-off-by: Michael Roth --- virt/kvm/guest_memfd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 4860140c32a611..ff692e1cd64bb6 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -389,6 +389,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) inode->i_private = (void *)(unsigned long)flags; inode->i_op = &kvm_gmem_iops; inode->i_mapping->a_ops = &kvm_gmem_aops; + inode->i_mapping->flags |= AS_INACCESSIBLE; inode->i_mode |= S_IFREG; inode->i_size = size; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); From aa052a2c9e0e97aa0baeadc15183948eb7f153b0 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Sun, 21 May 2023 20:24:54 -0500 Subject: [PATCH 1377/1406] KVM: x86: Add gmem hook for initializing memory guest_memfd pages are generally expected to be in some arch-defined initial state prior to using them for guest memory. For SEV-SNP this initial state is 'private', or 'guest-owned', and requires additional operations to move these pages into a 'private' state by updating the corresponding entries in the RMP table. Allow for an arch-defined hook to handle updates of this sort, and go ahead and implement one for x86 so KVM implementations like AMD SVM can register a kvm_x86_ops callback to handle these updates for SEV-SNP guests. The preparation callback is always called when allocating/grabbing folios via gmem, and it is up to the architecture to keep track of whether or not the pages are already in the expected state (e.g. the RMP table in the case of SEV-SNP). In some cases, it is necessary to defer the preparation of the pages to handle things like in-place encryption of initial guest memory payloads before marking these pages as 'private'/'guest-owned', so also add a helper that performs the same function as kvm_gmem_get_pfn(), but allows the preparation callback to be bypassed so that pages can be accessed beforehand.
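The gmem_prepare hook described above follows KVM's optional-callback convention (KVM_X86_OP_OPTIONAL_RET0): generic code invokes the arch pointer only when a backend has registered one, and a missing callback behaves as a successful no-op. A freestanding sketch of that dispatch shape follows, with invented names (vm_ops, snp_prepare) standing in for the real kvm_x86_ops plumbing:

#include <stdio.h>

/* stand-in for the kvm_x86_ops slot; OPTIONAL_RET0 semantics mean an
 * unset callback acts as if it returned 0 */
struct vm_ops {
	int (*gmem_prepare)(unsigned long pfn, unsigned long gfn, int max_order);
};

static int arch_gmem_prepare(struct vm_ops *ops, unsigned long pfn,
			     unsigned long gfn, int max_order)
{
	return ops->gmem_prepare ? ops->gmem_prepare(pfn, gfn, max_order) : 0;
}

/* an SEV-SNP-like backend would flip the page to guest-owned here,
 * e.g. by updating its RMP entry */
static int snp_prepare(unsigned long pfn, unsigned long gfn, int max_order)
{
	printf("make pfn %#lx private for gfn %#lx (order %d)\n",
	       pfn, gfn, max_order);
	return 0;
}

int main(void)
{
	struct vm_ops snp = { .gmem_prepare = snp_prepare };
	struct vm_ops plain = { 0 };

	arch_gmem_prepare(&snp, 0x1234, 0x100, 9);	/* invokes backend */
	arch_gmem_prepare(&plain, 0x1234, 0x100, 9);	/* no-op, returns 0 */
	return 0;
}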
Link: https://lore.kernel.org/lkml/ZLqVdvsF11Ddo7Dq@google.com/ Signed-off-by: Michael Roth --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 6 ++++ include/linux/kvm_host.h | 14 ++++++++ virt/kvm/Kconfig | 4 +++ virt/kvm/guest_memfd.c | 56 +++++++++++++++++++++++++++--- 6 files changed, 77 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index ab24ce2079889b..5e6b58439100b7 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -139,6 +139,7 @@ KVM_X86_OP(vcpu_deliver_sipi_vector) KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); KVM_X86_OP_OPTIONAL(get_untagged_addr) KVM_X86_OP_OPTIONAL(alloc_apic_backing_page) +KVM_X86_OP_OPTIONAL_RET0(gmem_prepare) #undef KVM_X86_OP #undef KVM_X86_OP_OPTIONAL diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 829b175cf825a3..bcb389ba38cf9f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1798,6 +1798,7 @@ struct kvm_x86_ops { gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags); void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu); + int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); }; struct kvm_x86_nested_ops { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 86d88bc7a6d091..0ac4a4c8f780ce 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13523,6 +13523,12 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_arch_no_poll); +#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE +int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order) +{ + return static_call(kvm_x86_gmem_prepare)(kvm, pfn, gfn, max_order); +} +#endif int kvm_spec_ctrl_test_value(u64 value) { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 18e28610749eca..14090d858a6885 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2406,9 +2406,19 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ #ifdef CONFIG_KVM_PRIVATE_MEM +int __kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, int *max_order, bool prep); int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t *pfn, int *max_order); #else +static inline int __kvm_gmem_get_pfn(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn, + kvm_pfn_t *pfn, int *max_order, bool prep) +{ + KVM_BUG_ON(1, kvm); + return -EIO; +} + static inline int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t *pfn, int *max_order) @@ -2418,4 +2428,8 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm, } #endif /* CONFIG_KVM_PRIVATE_MEM */ +#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE +int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); +#endif + #endif diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 29b73eedfe741a..ca870157b2ed14 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -109,3 +109,7 @@ config KVM_GENERIC_PRIVATE_MEM select KVM_GENERIC_MEMORY_ATTRIBUTES select KVM_PRIVATE_MEM bool + +config HAVE_KVM_GMEM_PREPARE + bool + depends on KVM_PRIVATE_MEM diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index ff692e1cd64bb6..2689ccf95d63ce 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -43,7 +43,40 @@ static struct folio 
*kvm_gmem_get_huge_folio(struct inode *inode, pgoff_t index, return folio; } -static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) +static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio) +{ +#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE + struct list_head *gmem_list = &inode->i_mapping->i_private_list; + struct kvm_gmem *gmem; + + list_for_each_entry(gmem, gmem_list, entry) { + struct kvm_memory_slot *slot; + struct kvm *kvm = gmem->kvm; + struct page *page; + kvm_pfn_t pfn; + gfn_t gfn; + int rc; + + slot = xa_load(&gmem->bindings, index); + if (!slot) + continue; + + page = folio_file_page(folio, index); + pfn = page_to_pfn(page); + gfn = slot->base_gfn + index - slot->gmem.pgoff; + rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, compound_order(compound_head(page))); + if (rc) { + pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx, error %d.\n", + index, rc); + return rc; + } + } + +#endif + return 0; +} + +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool prep) { struct folio *folio; @@ -73,6 +106,12 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) folio_mark_uptodate(folio); } + if (prep && kvm_gmem_prepare_folio(inode, index, folio)) { + folio_unlock(folio); + folio_put(folio); + return NULL; + } + /* * Ignore accessed, referenced, and dirty flags. The memory is * unevictable and there is no storage to write back to. @@ -177,7 +216,7 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len) break; } - folio = kvm_gmem_get_folio(inode, index); + folio = kvm_gmem_get_folio(inode, index, true); if (!folio) { r = -ENOMEM; break; @@ -516,8 +555,8 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) fput(file); } -int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t *pfn, int *max_order) +int __kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, int *max_order, bool prep) { pgoff_t index, huge_index; struct kvm_gmem *gmem; @@ -538,7 +577,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, goto out_fput; } - folio = kvm_gmem_get_folio(file_inode(file), index); + folio = kvm_gmem_get_folio(file_inode(file), index, prep); if (!folio) { r = -ENOMEM; goto out_fput; @@ -579,4 +618,11 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, return r; } +EXPORT_SYMBOL_GPL(__kvm_gmem_get_pfn); + +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, int *max_order) +{ + return __kvm_gmem_get_pfn(kvm, slot, gfn, pfn, max_order, true); +} EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); From 90fe9a63c305c0b0715a89b2a5a60e81c1b36c20 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Fri, 9 Jun 2023 08:53:37 -0500 Subject: [PATCH 1378/1406] KVM: x86: Add gmem hook for invalidating memory In some cases, like with SEV-SNP, guest memory needs to be updated in a platform-specific manner before it can be safely freed back to the host. Wire up arch-defined hooks to the .free_folio kvm_gmem_aops callback to allow for special handling of this sort when freeing memory in response to FALLOC_FL_PUNCH_HOLE operations and when releasing the inode, and go ahead and define an arch-specific hook for x86 since it will be needed for handling memory used for SEV-SNP guests. 
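The invalidation hook above is the mirror image on the free path: before an order-N folio is released back to the host, the arch backend is handed the pfn range [pfn, pfn + 2^N) to restore to host-owned state. A freestanding sketch of that range computation, with invented names:

#include <stdio.h>

/* stand-in for kvm_arch_gmem_invalidate(): an SNP-like backend would walk
 * this pfn range and return each page to the hypervisor-owned state */
static void gmem_invalidate(unsigned long start, unsigned long end)
{
	for (unsigned long pfn = start; pfn < end; pfn++)
		printf("reclaim pfn %#lx\n", pfn);
}

/* mirrors the kvm_gmem_free_folio() shape: the range covers the folio */
static void free_folio(unsigned long head_pfn, int order)
{
	gmem_invalidate(head_pfn, head_pfn + (1UL << order));
}

int main(void)
{
	free_folio(0x1000, 0);	/* single 4K page -> one pfn */
	free_folio(0x2000, 2);	/* order-2 folio  -> four pfns */
	return 0;
}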
Signed-off-by: Michael Roth --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 7 +++++++ include/linux/kvm_host.h | 4 ++++ virt/kvm/Kconfig | 4 ++++ virt/kvm/guest_memfd.c | 14 ++++++++++++++ 6 files changed, 31 insertions(+) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 5e6b58439100b7..c4b7b0db7be320 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -140,6 +140,7 @@ KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); KVM_X86_OP_OPTIONAL(get_untagged_addr) KVM_X86_OP_OPTIONAL(alloc_apic_backing_page) KVM_X86_OP_OPTIONAL_RET0(gmem_prepare) +KVM_X86_OP_OPTIONAL(gmem_invalidate) #undef KVM_X86_OP #undef KVM_X86_OP_OPTIONAL diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bcb389ba38cf9f..04293822497c96 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1799,6 +1799,7 @@ struct kvm_x86_ops { gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags); void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu); int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); + void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end); }; struct kvm_x86_nested_ops { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0ac4a4c8f780ce..0583f8ba154c80 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13530,6 +13530,13 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord } #endif +#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) +{ + static_call_cond(kvm_x86_gmem_invalidate)(start, end); +} +#endif + int kvm_spec_ctrl_test_value(u64 value) { /* diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 14090d858a6885..256f3f91901694 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2432,4 +2432,8 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm, int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); #endif +#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); +#endif + #endif diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index ca870157b2ed14..754c6c923427f9 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -113,3 +113,7 @@ config KVM_GENERIC_PRIVATE_MEM config HAVE_KVM_GMEM_PREPARE bool depends on KVM_PRIVATE_MEM + +config HAVE_KVM_GMEM_INVALIDATE + bool + depends on KVM_PRIVATE_MEM diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 2689ccf95d63ce..4ede25dd8d140c 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -369,10 +369,24 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol return MF_DELAYED; } +#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +static void kvm_gmem_free_folio(struct folio *folio) +{ + struct page *page = folio_page(folio, 0); + kvm_pfn_t pfn = page_to_pfn(page); + int order = folio_order(folio); + + kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order)); +} +#endif + static const struct address_space_operations kvm_gmem_aops = { .dirty_folio = noop_dirty_folio, .migrate_folio = kvm_gmem_migrate_folio, .error_remove_folio = kvm_gmem_error_folio, +#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE + .free_folio = kvm_gmem_free_folio, +#endif }; static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path, From a345169daf1a2843b2a8fb6474101ffb50c673c4 Mon 
Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Thu, 12 Oct 2023 04:59:29 -0500 Subject: [PATCH 1379/1406] KVM: x86/mmu: Pass around full 64-bit error code for KVM page faults In some cases the full 64-bit error code for the KVM page fault will be needed to determine things like whether or not a fault was for a private or shared guest page, so update related code to accept the full 64-bit value so it can be plumbed all the way through to where it is needed. The accessors of fault->error_code are changed as follows: - FNAME(page_fault): change to explicitly use lower_32_bits() since that is no longer done in kvm_mmu_page_fault() - kvm_mmu_page_fault(): explicit mask with PFERR_RSVD_MASK, PFERR_NESTED_GUEST_PAGE - mmutrace: changed u32 -> u64 Signed-off-by: Isaku Yamahata Link: https://lore.kernel.org/kvm/20230612042559.375660-1-michael.roth@amd.com/T/#mbd0b20c9a2cf50319d5d2a27b63f73c772112076 [mdr: drop references/changes to code not in current gmem tree, update commit message] Signed-off-by: Michael Roth --- arch/x86/kvm/mmu/mmu.c | 3 +-- arch/x86/kvm/mmu/mmu_internal.h | 4 ++-- arch/x86/kvm/mmu/mmutrace.h | 2 +- arch/x86/kvm/mmu/paging_tmpl.h | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 08991c25f9e36d..5d9ffbe0029573 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5814,8 +5814,7 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err } if (r == RET_PF_INVALID) { - r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, - lower_32_bits(error_code), false, + r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false, &emulation_type); if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 5390a591a5718c..49b428cca04e4d 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -190,7 +190,7 @@ static inline bool is_nx_huge_page_enabled(struct kvm *kvm) struct kvm_page_fault { /* arguments to kvm_mmu_do_page_fault. */ const gpa_t addr; - const u32 error_code; + const u64 error_code; const bool prefetch; /* Derived from error_code. */ @@ -280,7 +280,7 @@ enum { }; static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 err, bool prefetch, int *emulation_type) + u64 err, bool prefetch, int *emulation_type) { struct kvm_page_fault fault = { .addr = cr2_or_gpa, diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index ae86820cef697a..195d98bc8de85e 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -260,7 +260,7 @@ TRACE_EVENT( TP_STRUCT__entry( __field(int, vcpu_id) __field(gpa_t, cr2_or_gpa) - __field(u32, error_code) + __field(u64, error_code) __field(u64 *, sptep) __field(u64, old_spte) __field(u64, new_spte) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 4d4e98fe4f3548..c418f3b1cfca4b 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -787,7 +787,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * The bit needs to be cleared before walking guest page tables. */ r = FNAME(walk_addr)(&walker, vcpu, fault->addr, - fault->error_code & ~PFERR_RSVD_MASK); + lower_32_bits(fault->error_code) & ~PFERR_RSVD_MASK); /* * The page is not mapped by the guest. Let the guest handle it. 
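A quick standalone check of why the widening in the patch above matters: fault metadata such as PFERR_GUEST_ENC (bit 34 in this series) sits above bit 31, so it silently disappears if the error code is ever narrowed to 32 bits. The constant below is taken from the series; the rest is illustrative:

#include <stdio.h>
#include <stdint.h>

#define PFERR_GUEST_ENC_MASK (1ULL << 34)	/* encrypted-guest access bit */

int main(void)
{
	uint64_t err = PFERR_GUEST_ENC_MASK | 0x2;	/* ENC + write fault */
	uint32_t truncated = (uint32_t)err;		/* the old u32 plumbing */

	/* full 64-bit code keeps the bit ... */
	printf("64-bit: ENC=%d\n", !!(err & PFERR_GUEST_ENC_MASK));	/* 1 */
	/* ... a lower_32_bits()-style truncation loses it */
	printf("32-bit: ENC=%d\n",
	       !!((uint64_t)truncated & PFERR_GUEST_ENC_MASK));	/* 0 */
	return 0;
}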
From 98ab5eba182296a3db55288d351ff61e271b2343 Mon Sep 17 00:00:00 2001
From: Michael Roth
Date: Fri, 7 Jul 2023 07:02:36 -0500
Subject: [PATCH 1380/1406] KVM: x86: Add KVM_X86_SNP_VM vm_type

In some cases, such as detecting whether a page fault should be handled as a
private fault or not, KVM will need to handle things differently versus the
existing KVM_X86_SW_PROTECTED_VM type.

Add a new KVM_X86_SNP_VM to allow for this, along with a helper to query the
vm_type.

Signed-off-by: Michael Roth
---
 arch/x86/include/asm/kvm_host.h | 2 ++
 arch/x86/include/uapi/asm/kvm.h | 1 +
 arch/x86/kvm/x86.c | 20 +++++++++++++++++---
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 04293822497c96..3bd229d9130e57 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2144,6 +2144,8 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
 #define kvm_arch_has_private_mem(kvm) false
 #endif

+bool kvm_is_vm_type(struct kvm *kvm, unsigned long type);
+
 static inline u16 kvm_read_ldt(void)
 {
 	u16 ldt;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index ad29984d5e398d..e1bd264563d9e4 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -833,5 +833,6 @@ struct kvm_hyperv_eventfd {

 #define KVM_X86_DEFAULT_VM 0
 #define KVM_X86_SW_PROTECTED_VM 1
+#define KVM_X86_SNP_VM 3

 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0583f8ba154c80..242fdee79c6166 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4578,9 +4578,21 @@ static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,

 static bool kvm_is_vm_type_supported(unsigned long type)
 {
-	return type == KVM_X86_DEFAULT_VM ||
-	       (type == KVM_X86_SW_PROTECTED_VM &&
-	        IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_enabled);
+	if (type == KVM_X86_DEFAULT_VM)
+		return true;
+	else if (type == KVM_X86_SW_PROTECTED_VM &&
+		 IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_enabled)
+		return true;
+	else if (type == KVM_X86_SNP_VM &&
+		 IS_ENABLED(CONFIG_KVM_AMD_SEV) && tdp_enabled)
+		return true;
+
+	return false;
+}
+
+bool kvm_is_vm_type(struct kvm *kvm, unsigned long type)
+{
+	return kvm->arch.vm_type == type;
 }

 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
@@ -4784,6 +4796,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = BIT(KVM_X86_DEFAULT_VM);
 		if (kvm_is_vm_type_supported(KVM_X86_SW_PROTECTED_VM))
 			r |= BIT(KVM_X86_SW_PROTECTED_VM);
+		if (kvm_is_vm_type_supported(KVM_X86_SNP_VM))
+			r |= BIT(KVM_X86_SNP_VM);
 		break;
 	default:
 		break;

From 3393aae4565613805a3f14d2147584e6c8f002e8 Mon Sep 17 00:00:00 2001
From: Brijesh Singh
Date: Tue, 26 Apr 2022 18:46:34 +0000
Subject: [PATCH 1381/1406] KVM: x86: Define RMP page fault error bits for #NPF

When SEV-SNP is enabled globally, the hardware places restrictions on all
memory accesses based on the RMP entry, whether the hypervisor or a VM
performs the accesses. When hardware encounters an RMP access violation
during a guest access, it will cause a #VMEXIT(NPF) with a number of
additional bits set to indicate the reasons for the #NPF. Define those here.

See APM2 section 16.36.10 for more details.
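As a rough sketch of how these bits can be consumed once defined
(illustrative only; the helper names below are made up for this example, and
the actual in-tree consumers are added later in this series):

	static bool npf_is_rmp_violation(u64 error_code)
	{
		/* Hardware flagged an RMP access violation for this #NPF. */
		return !!(error_code & PFERR_GUEST_RMP_MASK);
	}

	static bool npf_is_private_access(u64 error_code)
	{
		/* The guest made the access with the encryption bit set. */
		return !!(error_code & PFERR_GUEST_ENC_MASK);
	}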
Signed-off-by: Brijesh Singh
Signed-off-by: Ashish Kalra
[mdr: add some additional details to commit message]
Signed-off-by: Michael Roth
---
 arch/x86/include/asm/kvm_host.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3bd229d9130e57..18aaac7f56b119 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -262,9 +262,13 @@ enum x86_intercept_stage;
 #define PFERR_FETCH_BIT 4
 #define PFERR_PK_BIT 5
 #define PFERR_SGX_BIT 15
+#define PFERR_GUEST_RMP_BIT 31
 #define PFERR_GUEST_FINAL_BIT 32
 #define PFERR_GUEST_PAGE_BIT 33
 #define PFERR_IMPLICIT_ACCESS_BIT 48
+#define PFERR_GUEST_ENC_BIT 34
+#define PFERR_GUEST_SIZEM_BIT 35
+#define PFERR_GUEST_VMPL_BIT 36

 #define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT)
 #define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT)
@@ -276,6 +280,10 @@ enum x86_intercept_stage;
 #define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT)
 #define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT)
 #define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT)
+#define PFERR_GUEST_RMP_MASK BIT_ULL(PFERR_GUEST_RMP_BIT)
+#define PFERR_GUEST_ENC_MASK BIT_ULL(PFERR_GUEST_ENC_BIT)
+#define PFERR_GUEST_SIZEM_MASK BIT_ULL(PFERR_GUEST_SIZEM_BIT)
+#define PFERR_GUEST_VMPL_MASK BIT_ULL(PFERR_GUEST_VMPL_BIT)

 #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \
 				 PFERR_WRITE_MASK | \

From 98a06e6b2108bcd0ea714769588e14d7e085714b Mon Sep 17 00:00:00 2001
From: Michael Roth
Date: Thu, 9 Feb 2023 20:55:01 -0600
Subject: [PATCH 1382/1406] KVM: x86: Determine shared/private faults based on
 vm_type

For KVM_X86_SNP_VM, only the PFERR_GUEST_ENC_MASK flag is needed to determine
whether an #NPF is due to a private or shared access by the guest. Implement
that handling here. Also add handling needed to deal with SNP guests which in
some cases will make MMIO accesses with the encryption bit set.

Signed-off-by: Michael Roth
---
 arch/x86/kvm/mmu/mmu.c | 12 ++++++++++--
 arch/x86/kvm/mmu/mmu_internal.h | 20 +++++++++++++++++++-
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 5d9ffbe0029573..a626552d051e65 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4332,6 +4332,7 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
 static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 {
 	struct kvm_memory_slot *slot = fault->slot;
+	bool private_fault = fault->is_private;
 	bool async;

 	/*
@@ -4361,12 +4362,19 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 		return RET_PF_EMULATE;
 	}

-	if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
+	/*
+	 * In some cases SNP guests will make MMIO accesses with the encryption
+	 * bit set. Handle these via the normal MMIO fault path.
+ */ + if (!slot && private_fault && kvm_is_vm_type(vcpu->kvm, KVM_X86_SNP_VM)) + private_fault = false; + + if (private_fault != kvm_mem_is_private(vcpu->kvm, fault->gfn)) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT; } - if (fault->is_private) + if (private_fault) return kvm_faultin_pfn_private(vcpu, fault); async = false; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 49b428cca04e4d..4665d2ce996c48 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -251,6 +251,24 @@ struct kvm_page_fault { int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); +static bool kvm_mmu_fault_is_private(struct kvm *kvm, gpa_t gpa, u64 err) +{ + bool private_fault = false; + + if (kvm_is_vm_type(kvm, KVM_X86_SNP_VM)) { + private_fault = !!(err & PFERR_GUEST_ENC_MASK); + } else if (kvm_is_vm_type(kvm, KVM_X86_SW_PROTECTED_VM)) { + /* + * This handling is for gmem self-tests and guests that treat + * userspace as the authority on whether a fault should be + * private or not. + */ + private_fault = kvm_mem_is_private(kvm, gpa >> PAGE_SHIFT); + } + + return private_fault; +} + /* * Return values of handle_mmio_page_fault(), mmu.page_fault(), fast_page_fault(), * and of course kvm_mmu_do_page_fault(). @@ -298,7 +316,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .max_level = KVM_MAX_HUGEPAGE_LEVEL, .req_level = PG_LEVEL_4K, .goal_level = PG_LEVEL_4K, - .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT), + .is_private = kvm_mmu_fault_is_private(vcpu->kvm, cr2_or_gpa, err), }; int r; From 978a7a6353447918e7ce21f425684bb289f0c910 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Tue, 14 Feb 2023 12:12:15 -0600 Subject: [PATCH 1383/1406] KVM: SEV: Select KVM_GENERIC_PRIVATE_MEM when CONFIG_KVM_AMD_SEV=y SEV-SNP relies on restricted/protected memory support to run guests, so make sure to enable that support via the CONFIG_KVM_GENERIC_PRIVATE_MEM config option. Signed-off-by: Michael Roth --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 3a7d317784ca83..4322fe608c3fa3 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -121,6 +121,7 @@ config KVM_AMD_SEV default y depends on KVM_AMD && X86_64 depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) + select KVM_GENERIC_PRIVATE_MEM help Provides support for launching Encrypted VMs (SEV) and Encrypted VMs with Encrypted State (SEV-ES) on AMD processors. From 953aefc783cb5379c9f2e67f72698c209ac100f0 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Tue, 26 Apr 2022 18:11:10 +0000 Subject: [PATCH 1384/1406] KVM: SEV: Add support to handle AP reset MSR protocol Add support for AP Reset Hold being invoked using the GHCB MSR protocol, available in version 2 of the GHCB specification. 
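A rough sketch of the resulting MSR-protocol exchange (illustrative only; the
encoding follows the GHCB spec, where GHCBInfo lives in bits 11:0 of the GHCB
MSR, 0xC001_0130, and GHCBData in bits 63:12):

	/* Guest AP requests a reset hold: GHCBInfo = 0x006, no data */
	wrmsr(GHCB_MSR, GHCB_MSR_AP_RESET_HOLD_REQ);

	/* Hypervisor response while the AP is parked:
	 *   GHCBInfo = 0x007, GHCBData = 0 (no SIPI yet)
	 * Hypervisor response once a SIPI is delivered:
	 *   GHCBInfo = 0x007, GHCBData != 0, telling the AP to proceed
	 *   with the CS:RIP from the SIPI vector.
	 */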
Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra Signed-off-by: Michael Roth --- arch/x86/include/asm/sev-common.h | 6 ++-- arch/x86/kvm/svm/sev.c | 56 ++++++++++++++++++++++++++----- arch/x86/kvm/svm/svm.h | 1 + 3 files changed, 53 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index b463fcbd4b9070..01261f7054ad7b 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -54,8 +54,10 @@ (((unsigned long)fn) << 32)) /* AP Reset Hold */ -#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006 -#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007 +#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006 +#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007 +#define GHCB_MSR_AP_RESET_HOLD_RESULT_POS 12 +#define GHCB_MSR_AP_RESET_HOLD_RESULT_MASK GENMASK_ULL(51, 0) /* GHCB GPA Register */ #define GHCB_MSR_REG_GPA_REQ 0x012 diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index cf332c7dbc1651..b164edf1b943e1 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -65,6 +65,10 @@ module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444); #define sev_es_debug_swap_enabled false #endif /* CONFIG_KVM_AMD_SEV */ +#define AP_RESET_HOLD_NONE 0 +#define AP_RESET_HOLD_NAE_EVENT 1 +#define AP_RESET_HOLD_MSR_PROTO 2 + static u8 sev_enc_bit; static DECLARE_RWSEM(sev_deactivate_lock); static DEFINE_MUTEX(sev_bitmap_lock); @@ -2610,6 +2614,9 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) void sev_es_unmap_ghcb(struct vcpu_svm *svm) { + /* Clear any indication that the vCPU is in a type of AP Reset Hold */ + svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE; + if (!svm->sev_es.ghcb) return; @@ -2821,6 +2828,22 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) GHCB_MSR_INFO_POS); break; } + case GHCB_MSR_AP_RESET_HOLD_REQ: + svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO; + ret = kvm_emulate_ap_reset_hold(&svm->vcpu); + + /* + * Preset the result to a non-SIPI return and then only set + * the result to non-zero when delivering a SIPI. + */ + set_ghcb_msr_bits(svm, 0, + GHCB_MSR_AP_RESET_HOLD_RESULT_MASK, + GHCB_MSR_AP_RESET_HOLD_RESULT_POS); + + set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP, + GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; case GHCB_MSR_TERM_REQ: { u64 reason_set, reason_code; @@ -2920,6 +2943,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = 1; break; case SVM_VMGEXIT_AP_HLT_LOOP: + svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT; ret = kvm_emulate_ap_reset_hold(vcpu); break; case SVM_VMGEXIT_AP_JUMP_TABLE: { @@ -3163,15 +3187,31 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) return; } - /* - * Subsequent SIPI: Return from an AP Reset Hold VMGEXIT, where - * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a - * non-zero value. - */ - if (!svm->sev_es.ghcb) - return; + /* Subsequent SIPI */ + switch (svm->sev_es.ap_reset_hold_type) { + case AP_RESET_HOLD_NAE_EVENT: + /* + * Return from an AP Reset Hold VMGEXIT, where the guest will + * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value. + */ + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1); + break; + case AP_RESET_HOLD_MSR_PROTO: + /* + * Return from an AP Reset Hold VMGEXIT, where the guest will + * set the CS and RIP. Set GHCB data field to a non-zero value. 
+ */ + set_ghcb_msr_bits(svm, 1, + GHCB_MSR_AP_RESET_HOLD_RESULT_MASK, + GHCB_MSR_AP_RESET_HOLD_RESULT_POS); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1); + set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP, + GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + default: + break; + } } struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 7f1fbd874c4582..eecb2b744d795b 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -197,6 +197,7 @@ struct vcpu_sev_es_state { u8 valid_bitmap[16]; struct kvm_host_map ghcb_map; bool received_first_sipi; + unsigned int ap_reset_hold_type; /* SEV-ES scratch area support */ u64 sw_scratch; From c48b2b208f0241c06ed92a544296fc9bddb78569 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 26 Apr 2022 18:13:09 +0000 Subject: [PATCH 1385/1406] KVM: SEV: Add GHCB handling for Hypervisor Feature Support requests Version 2 of the GHCB specification introduced advertisement of features that are supported by the Hypervisor. Now that KVM supports version 2 of the GHCB specification, bump the maximum supported protocol version. Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra Signed-off-by: Michael Roth --- arch/x86/include/asm/sev-common.h | 2 ++ arch/x86/kvm/svm/sev.c | 12 ++++++++++++ arch/x86/kvm/svm/svm.h | 3 ++- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 01261f7054ad7b..5a8246dd532f8d 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -101,6 +101,8 @@ enum psc_op { /* GHCB Hypervisor Feature Request/Response */ #define GHCB_MSR_HV_FT_REQ 0x080 #define GHCB_MSR_HV_FT_RESP 0x081 +#define GHCB_MSR_HV_FT_POS 12 +#define GHCB_MSR_HV_FT_MASK GENMASK_ULL(51, 0) #define GHCB_MSR_HV_FT_RESP_VAL(v) \ /* GHCBData[63:12] */ \ (((u64)(v) & GENMASK_ULL(63, 12)) >> 12) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index b164edf1b943e1..7e541b69fde1cc 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2584,6 +2584,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) case SVM_VMGEXIT_AP_HLT_LOOP: case SVM_VMGEXIT_AP_JUMP_TABLE: case SVM_VMGEXIT_UNSUPPORTED_EVENT: + case SVM_VMGEXIT_HV_FEATURES: break; default: reason = GHCB_ERR_INVALID_EVENT; @@ -2844,6 +2845,12 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); break; + case GHCB_MSR_HV_FT_REQ: + set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED, + GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS); + set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP, + GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); + break; case GHCB_MSR_TERM_REQ: { u64 reason_set, reason_code; @@ -2968,6 +2975,11 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = 1; break; } + case SVM_VMGEXIT_HV_FEATURES: + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_HV_FT_SUPPORTED); + + ret = 1; + break; case SVM_VMGEXIT_UNSUPPORTED_EVENT: vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index eecb2b744d795b..d0f8167ada7cc6 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -665,9 +665,10 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu); /* sev.c */ -#define GHCB_VERSION_MAX 1ULL +#define GHCB_VERSION_MAX 2ULL #define GHCB_VERSION_MIN 1ULL +#define GHCB_HV_FT_SUPPORTED GHCB_HV_FT_SNP extern unsigned int max_sev_asid; From 
d8744c49be18051e673a23e5d0c9a5cf3bf1fedd Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 26 Apr 2022 18:20:53 +0000 Subject: [PATCH 1386/1406] KVM: SEV: Add initial SEV-SNP support The next generation of SEV is called SEV-SNP (Secure Nested Paging). SEV-SNP builds upon existing SEV and SEV-ES functionality while adding new hardware-based security protection. SEV-SNP adds strong memory encryption and integrity protection to help prevent malicious hypervisor-based attacks such as data replay, memory re-mapping, and more, to create an isolated execution environment. Implement some initial infrastructure in KVM to check/report when SNP is enabled on the system. Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra [mdr: commit fixups, use similar ASID reporting as with SEV/SEV-ES] Signed-off-by: Michael Roth --- arch/x86/kvm/svm/sev.c | 10 ++++++++++ arch/x86/kvm/svm/svm.h | 8 ++++++++ 2 files changed, 18 insertions(+) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 7e541b69fde1cc..89f7042eb53350 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -59,10 +59,13 @@ module_param_named(sev_es, sev_es_enabled, bool, 0444); /* enable/disable SEV-ES DebugSwap support */ static bool sev_es_debug_swap_enabled = true; module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444); + +static bool sev_snp_enabled; #else #define sev_enabled false #define sev_es_enabled false #define sev_es_debug_swap_enabled false +#define sev_snp_enabled false #endif /* CONFIG_KVM_AMD_SEV */ #define AP_RESET_HOLD_NONE 0 @@ -2195,6 +2198,7 @@ void __init sev_hardware_setup(void) { #ifdef CONFIG_KVM_AMD_SEV unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; + bool sev_snp_supported = false; bool sev_es_supported = false; bool sev_supported = false; @@ -2275,6 +2279,7 @@ void __init sev_hardware_setup(void) sev_es_asid_count = min_sev_asid - 1; WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count)); sev_es_supported = true; + sev_snp_supported = sev_snp_enabled && cpu_feature_enabled(X86_FEATURE_SEV_SNP); out: if (boot_cpu_has(X86_FEATURE_SEV)) @@ -2287,12 +2292,17 @@ void __init sev_hardware_setup(void) pr_info("SEV-ES %s (ASIDs %u - %u)\n", sev_es_supported ? "enabled" : "disabled", min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); + if (boot_cpu_has(X86_FEATURE_SEV_SNP)) + pr_info("SEV-SNP %s (ASIDs %u - %u)\n", + sev_snp_supported ? "enabled" : "disabled", + min_sev_asid > 1 ? 
1 : 0, min_sev_asid - 1);

 	sev_enabled = sev_supported;
 	sev_es_enabled = sev_es_supported;
 	if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) ||
 	    !cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP))
 		sev_es_debug_swap_enabled = false;
+	sev_snp_enabled = sev_snp_supported;
 #endif
 }

diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index d0f8167ada7cc6..a3e27c82866b2c 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -79,6 +79,7 @@ enum {
 struct kvm_sev_info {
 	bool active;		/* SEV enabled guest */
 	bool es_active;		/* SEV-ES enabled guest */
+	bool snp_active;	/* SEV-SNP enabled guest */
 	unsigned int asid;	/* ASID used for this guest */
 	unsigned int handle;	/* SEV firmware handle */
 	int fd;			/* SEV device fd */
@@ -341,6 +342,13 @@ static __always_inline bool sev_es_guest(struct kvm *kvm)
 #endif
 }

+static __always_inline bool sev_snp_guest(struct kvm *kvm)
+{
+	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+	return sev_es_guest(kvm) && sev->snp_active;
+}
+
 static inline void vmcb_mark_all_dirty(struct vmcb *vmcb)
 {
 	vmcb->control.clean = 0;

From aef3234f4127d0001c8219811e112469ceff7a42 Mon Sep 17 00:00:00 2001
From: Brijesh Singh
Date: Sat, 17 Feb 2024 10:06:30 -0600
Subject: [PATCH 1387/1406] *KVM: SEV: Add KVM_SNP_INIT command

TODO: move to using common KVM_SEV_INIT2 interface. The flag discovery
mechanism also needs to be changed since it provides no mechanism for
discovery outside of trying to trigger the error path by guessing at
unimplemented flags. We may want to leverage something similar to the
KVM_SEV_INIT2 VMSA feature machinery for this as well, which involves a KVM
device ioctl to return the feature bitmap.

The KVM_SNP_INIT command is used by the hypervisor to initialize the SEV-SNP
platform context. In a typical workflow, this command should be the first
command issued. When creating an SEV-SNP guest, the VMM must use this command
instead of the KVM_SEV_INIT or KVM_SEV_ES_INIT.

The flags value must be zero; it will be extended in future SNP support to
communicate optional features (such as restricted INT injection, etc.).

Co-developed-by: Pavan Kumar Paluri
Signed-off-by: Pavan Kumar Paluri
Signed-off-by: Brijesh Singh
Signed-off-by: Ashish Kalra
Signed-off-by: Michael Roth
---
 .../virt/kvm/x86/amd-memory-encryption.rst | 21 ++++++++++
 arch/x86/include/asm/svm.h | 1 +
 arch/x86/include/uapi/asm/kvm.h | 8 ++++
 arch/x86/kvm/svm/sev.c | 39 ++++++++++++++++++-
 arch/x86/kvm/svm/svm.h | 4 ++
 5 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst
index 995780088eb231..9172f4a15832b5 100644
--- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst
+++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst
@@ -434,6 +434,27 @@ issued by the hypervisor to make the guest ready for execution.

 Returns: 0 on success, -negative on error

+18. KVM_SNP_INIT
+----------------
+
+The KVM_SNP_INIT command can be used by the hypervisor to initialize SEV-SNP
+context. In a typical workflow, this command should be the first command issued.
+
+Parameters (in/out): struct kvm_snp_init
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_snp_init {
+                __u64 flags;
+        };
+
+The flags bitmap allows for future extensions but is currently unused.
+
+If any bits specified via flags are not supported, -EOPNOTSUPP is returned
+and the supported flags are returned.
+ References ========== diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 87a7b917d30ea9..ba8ce15b27d787 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -286,6 +286,7 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_ #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) #define SVM_SEV_FEAT_DEBUG_SWAP BIT(5) +#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0) struct vmcb_seg { u16 selector; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index e1bd264563d9e4..fa2a5aa5cd2fac 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -689,6 +689,9 @@ enum sev_cmd_id { /* Guest Migration Extension */ KVM_SEV_SEND_CANCEL, + /* SNP-specific commands */ + KVM_SEV_SNP_INIT, + KVM_SEV_NR_MAX, }; @@ -785,6 +788,11 @@ struct kvm_sev_receive_update_data { __u32 trans_len; }; +/* TODO: use a common struct via KVM_SEV_INIT2 */ +struct kvm_snp_init { + __u64 flags; +}; + #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 89f7042eb53350..9b5233dd2bf836 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -258,6 +258,25 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) sev_decommission(handle); } +static int verify_snp_init_flags(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_snp_init params; + int ret = 0; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + return -EFAULT; + + if (params.flags & ~SEV_SNP_SUPPORTED_FLAGS) + ret = -EOPNOTSUPP; + + params.flags = SEV_SNP_SUPPORTED_FLAGS; + + if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) + ret = -EFAULT; + + return ret; +} + static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; @@ -271,11 +290,18 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) return -EINVAL; sev->active = true; - sev->es_active = argp->id == KVM_SEV_ES_INIT; + sev->es_active = (argp->id == KVM_SEV_ES_INIT || argp->id == KVM_SEV_SNP_INIT); + sev->snp_active = argp->id == KVM_SEV_SNP_INIT; ret = sev_asid_new(sev); if (ret) goto e_no_asid; + if (sev->snp_active) { + ret = verify_snp_init_flags(kvm, argp); + if (ret) + goto e_free; + } + init_args.probe = false; ret = sev_platform_init(&init_args); if (ret) @@ -293,6 +319,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) sev_asid_free(sev); sev->asid = 0; e_no_asid: + sev->snp_active = false; sev->es_active = false; sev->active = false; return ret; @@ -631,6 +658,10 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) if (sev_es_debug_swap_enabled) save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP; + /* Enable the SEV-SNP feature */ + if (sev_snp_guest(svm->vcpu.kvm)) + save->sev_features |= SVM_SEV_FEAT_SNP_ACTIVE; + pr_debug("Virtual Machine Save Area (VMSA):\n"); print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); @@ -1889,6 +1920,12 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) } switch (sev_cmd.id) { + case KVM_SEV_SNP_INIT: + if (!sev_snp_enabled) { + r = -ENOTTY; + goto out; + } + fallthrough; case KVM_SEV_ES_INIT: if (!sev_es_enabled) { r = -ENOTTY; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index a3e27c82866b2c..07a9eb5b6ce56e 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -76,6 +76,9 
@@ enum { /* TPR and CR2 are always written before VMRUN */ #define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2)) +/* Supported init feature flags */ +#define SEV_SNP_SUPPORTED_FLAGS 0x0 + struct kvm_sev_info { bool active; /* SEV enabled guest */ bool es_active; /* SEV-ES enabled guest */ @@ -91,6 +94,7 @@ struct kvm_sev_info { struct list_head mirror_entry; /* Use as a list entry of mirrors */ struct misc_cg *misc_cg; /* For misc cgroup accounting */ atomic_t migration_in_progress; + u64 snp_init_flags; }; struct kvm_svm { From b9f457b82ea46f76f34f1a926e97f85c8670c476 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Sat, 17 Feb 2024 10:19:45 -0600 Subject: [PATCH 1388/1406] KVM: SEV: Add KVM_SEV_SNP_LAUNCH_START command KVM_SEV_SNP_LAUNCH_START begins the launch process for an SEV-SNP guest. The command initializes a cryptographic digest context used to construct the measurement of the guest. If the guest is expected to be migrated, the command also binds a migration agent (MA) to the guest. For more information see the SEV-SNP specification. Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra [mdr: hold sev_deactivate_lock when calling SEV_CMD_SNP_DECOMMISSION] Signed-off-by: Michael Roth --- .../virt/kvm/x86/amd-memory-encryption.rst | 19 +++ arch/x86/include/uapi/asm/kvm.h | 6 + arch/x86/kvm/svm/sev.c | 144 +++++++++++++++++- arch/x86/kvm/svm/svm.h | 1 + 4 files changed, 167 insertions(+), 3 deletions(-) diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst index 9172f4a15832b5..eaabc0cfc8a7ea 100644 --- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst @@ -455,6 +455,25 @@ The flags bitmap allows for future extensions but is currently unused. If the any bits specified via flags are not supported then -EOPNOTSUPP is returned, and the supported flags are returned. +19. KVM_SEV_SNP_LAUNCH_START +---------------------------- + +The KVM_SNP_LAUNCH_START command is used for creating the memory encryption +context for the SEV-SNP guest. + +Parameters (in): struct kvm_sev_snp_launch_start + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_snp_launch_start { + __u64 policy; /* Guest policy to use. */ + __u8 gosvw[16]; /* Guest OS visible workarounds. */ + }; + +See the SEV-SNP specification for further detail on the launch input. + References ========== diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index fa2a5aa5cd2fac..01d1895372981e 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -691,6 +691,7 @@ enum sev_cmd_id { /* SNP-specific commands */ KVM_SEV_SNP_INIT, + KVM_SEV_SNP_LAUNCH_START, KVM_SEV_NR_MAX, }; @@ -793,6 +794,11 @@ struct kvm_snp_init { __u64 flags; }; +struct kvm_sev_snp_launch_start { + __u64 policy; + __u8 gosvw[16]; +} + #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 9b5233dd2bf836..b703e3388a648a 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "mmu.h" #include "x86.h" @@ -72,6 +73,10 @@ static bool sev_snp_enabled; #define AP_RESET_HOLD_NAE_EVENT 1 #define AP_RESET_HOLD_MSR_PROTO 2 +/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". 
 */
+#define SNP_POLICY_MASK_SMT BIT_ULL(16)
+#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20)
+
 static u8 sev_enc_bit;
 static DECLARE_RWSEM(sev_deactivate_lock);
 static DEFINE_MUTEX(sev_bitmap_lock);
@@ -82,6 +87,8 @@ static unsigned int nr_asids;
 static unsigned long *sev_asid_bitmap;
 static unsigned long *sev_reclaim_asid_bitmap;

+static int snp_decommission_context(struct kvm *kvm);
+
 struct enc_region {
 	struct list_head list;
 	unsigned long npages;
@@ -108,12 +115,17 @@ static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
 	down_write(&sev_deactivate_lock);

 	wbinvd_on_all_cpus();
-	ret = sev_guest_df_flush(&error);
+
+	if (sev_snp_enabled)
+		ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error);
+	else
+		ret = sev_guest_df_flush(&error);

 	up_write(&sev_deactivate_lock);

 	if (ret)
-		pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error);
+		pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n",
+		       sev_snp_enabled ? "-SNP" : "", ret, error);

 	return ret;
 }
@@ -1896,6 +1908,94 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
 	return ret;
 }

+/*
+ * The guest context contains all the information, keys and metadata
+ * associated with the guest that the firmware tracks to implement SEV
+ * and SNP features. The firmware stores the guest context in a
+ * hypervisor-provided page via the SNP_GCTX_CREATE command.
+ */
+static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+	struct sev_data_snp_addr data = {};
+	void *context;
+	int rc;
+
+	/* Allocate memory for context page */
+	context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT);
+	if (!context)
+		return NULL;
+
+	data.address = __psp_pa(context);
+	rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error);
+	if (rc) {
+		snp_free_firmware_page(context);
+		return NULL;
+	}
+
+	return context;
+}
+
+static int snp_bind_asid(struct kvm *kvm, int *error)
+{
+	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+	struct sev_data_snp_activate data = {0};
+
+	data.gctx_paddr = __psp_pa(sev->snp_context);
+	data.asid = sev_get_asid(kvm);
+	return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error);
+}
+
+static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+	struct sev_data_snp_launch_start start = {0};
+	struct kvm_sev_snp_launch_start params;
+	int rc;
+
+	if (!sev_snp_guest(kvm))
+		return -ENOTTY;
+
+	if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+		return -EFAULT;
+
+	/* Don't allow userspace to allocate memory for more than 1 SNP context.
 */
+	if (sev->snp_context)
+		return -EINVAL;
+
+	if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET) {
+		pr_warn("SEV-SNP hypervisor does not support limiting guests to a single socket.\n");
+		return -EINVAL;
+	}
+
+	if (!(params.policy & SNP_POLICY_MASK_SMT)) {
+		pr_warn("SEV-SNP hypervisor does not support limiting guests to a single SMT thread.\n");
+		return -EINVAL;
+	}
+
+	sev->snp_context = snp_context_create(kvm, argp);
+	if (!sev->snp_context)
+		return -ENOTTY;
+
+	start.gctx_paddr = __psp_pa(sev->snp_context);
+	start.policy = params.policy;
+	memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw));
+	rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error);
+	if (rc)
+		goto e_free_context;
+
+	sev->fd = argp->sev_fd;
+	rc = snp_bind_asid(kvm, &argp->error);
+	if (rc)
+		goto e_free_context;
+
+	return 0;
+
+e_free_context:
+	snp_decommission_context(kvm);
+
+	return rc;
+}
+
 int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
@@ -1986,6 +2086,9 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
 	case KVM_SEV_RECEIVE_FINISH:
 		r = sev_receive_finish(kvm, &sev_cmd);
 		break;
+	case KVM_SEV_SNP_LAUNCH_START:
+		r = snp_launch_start(kvm, &sev_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;
@@ -2178,6 +2281,33 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
 	return ret;
 }

+static int snp_decommission_context(struct kvm *kvm)
+{
+	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+	struct sev_data_snp_addr data = {};
+	int ret;
+
+	/* If context is not created then do nothing */
+	if (!sev->snp_context)
+		return 0;
+
+	data.address = __sme_pa(sev->snp_context);
+	down_write(&sev_deactivate_lock);
+	ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL);
+	if (WARN_ONCE(ret, "failed to release guest context")) {
+		up_write(&sev_deactivate_lock);
+		return ret;
+	}
+
+	up_write(&sev_deactivate_lock);
+
+	/* free the context page now */
+	snp_free_firmware_page(sev->snp_context);
+	sev->snp_context = NULL;
+
+	return 0;
+}
+
 void sev_vm_destroy(struct kvm *kvm)
 {
 	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -2219,7 +2349,15 @@ void sev_vm_destroy(struct kvm *kvm)
 		}
 	}

-	sev_unbind_asid(kvm, sev->handle);
+	if (sev_snp_guest(kvm)) {
+		if (snp_decommission_context(kvm)) {
+			WARN_ONCE(1, "Failed to free SNP guest context, leaking asid!\n");
+			return;
+		}
+	} else {
+		sev_unbind_asid(kvm, sev->handle);
+	}
+
 	sev_asid_free(sev);
 }

diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 07a9eb5b6ce56e..9c633173b779e6 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -95,6 +95,7 @@ struct kvm_sev_info {
 	struct misc_cg *misc_cg; /* For misc cgroup accounting */
 	atomic_t migration_in_progress;
 	u64 snp_init_flags;
+	void *snp_context;      /* SNP guest context page */
 };

 struct kvm_svm {

From ba2af1ff6172a2ba944065a2f020c043f62e1dde Mon Sep 17 00:00:00 2001
From: Brijesh Singh
Date: Sat, 17 Feb 2024 10:37:37 -0600
Subject: [PATCH 1389/1406] *KVM: SEV: Add KVM_SEV_SNP_LAUNCH_UPDATE command

The KVM_SEV_SNP_LAUNCH_UPDATE command can be used to insert data into the
guest's memory. The data is encrypted with the cryptographic context created
with KVM_SEV_SNP_LAUNCH_START. In addition to inserting data, it can insert
two special pages into the guest's memory: the secrets page and the CPUID
page.

While terminating the guest, reclaim the guest pages added to the RMP table.
If the reclaim fails, the pages are no longer safe to release back to the
system, so leak them instead.

For more information see the SEV-SNP specification.

Co-developed-by: Michael Roth
Signed-off-by: Michael Roth
Signed-off-by: Brijesh Singh
Signed-off-by: Ashish Kalra
---
 .../virt/kvm/x86/amd-memory-encryption.rst | 31 +++
 arch/x86/include/uapi/asm/kvm.h | 17 +-
 arch/x86/kvm/svm/sev.c | 184 ++++++++++++++++++
 3 files changed, 231 insertions(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst
index eaabc0cfc8a7ea..4a14246c8abbde 100644
--- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst
+++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst
@@ -474,6 +474,37 @@ Returns: 0 on success, -negative on error

 See the SEV-SNP specification for further detail on the launch input.

+20. KVM_SEV_SNP_LAUNCH_UPDATE
+-----------------------------
+
+The KVM_SEV_SNP_LAUNCH_UPDATE command is used for encrypting a memory region.
+It also calculates a measurement of the memory contents. The measurement is a
+signature of the memory contents that can be sent to the guest owner as an
+attestation that the memory was encrypted correctly by the firmware.
+
+Parameters (in): struct kvm_sev_snp_launch_update
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_snp_launch_update {
+                __u64 gfn_start;  /* Guest page number to load/encrypt data into. */
+                __u64 uaddr;      /* Userspace address of data to be loaded/encrypted. */
+                __u32 len;        /* Length of memory region. */
+                __u8 type;        /* The type of the guest page being initialized. */
+        };
+
+where the allowed values for type are::
+
+        #define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1
+        #define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3
+        #define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4
+        #define KVM_SEV_SNP_PAGE_TYPE_SECRETS 0x5
+        #define KVM_SEV_SNP_PAGE_TYPE_CPUID 0x6
+
+See the SEV-SNP spec for further details on how each page type is used/measured.
+
 References
 ==========

diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 01d1895372981e..988dc157ba564b 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -692,6 +692,7 @@ enum sev_cmd_id {
 	/* SNP-specific commands */
 	KVM_SEV_SNP_INIT,
 	KVM_SEV_SNP_LAUNCH_START,
+	KVM_SEV_SNP_LAUNCH_UPDATE,

 	KVM_SEV_NR_MAX,
 };
@@ -797,7 +798,21 @@ struct kvm_snp_init {
 struct kvm_sev_snp_launch_start {
 	__u64 policy;
 	__u8 gosvw[16];
-}
+};
+
+/* Kept in sync with firmware values for simplicity. */
+#define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1
+#define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3
+#define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4
+#define KVM_SEV_SNP_PAGE_TYPE_SECRETS 0x5
+#define KVM_SEV_SNP_PAGE_TYPE_CPUID 0x6
+
+struct kvm_sev_snp_launch_update {
+	__u64 gfn_start;
+	__u64 uaddr;
+	__u32 len;
+	__u8 type;
+};

 #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
 #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index b703e3388a648a..e053df3c501537 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -253,6 +253,36 @@ static void sev_decommission(unsigned int handle)
 	sev_guest_decommission(&decommission, NULL);
 }

+static int snp_page_reclaim(u64 pfn)
+{
+	struct sev_data_snp_page_reclaim data = {0};
+	int err, rc;
+
+	data.paddr = __sme_set(pfn << PAGE_SHIFT);
+	rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err);
+	if (rc) {
+		/*
+		 * If the reclaim failed, then the page is no longer safe
+		 * to use.
+ */ + snp_leak_pages(pfn, 1); + } + + return rc; +} + +static int host_rmp_make_shared(u64 pfn, enum pg_level level, bool leak) +{ + int rc; + + rc = rmp_make_shared(pfn, level); + if (rc && leak) + snp_leak_pages(pfn, + page_level_size(level) >> PAGE_SHIFT); + + return rc; +} + static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) { struct sev_data_deactivate deactivate; @@ -1996,6 +2026,157 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) return rc; } +static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_snp_launch_update data = {0}; + struct kvm_sev_snp_launch_update params; + int *error = &argp->error; + int i, n = 0, ret = 0; + unsigned int npages; + kvm_pfn_t *pfns; + gfn_t gfn; + + if (!sev_snp_guest(kvm) || !sev->snp_context) + return -EINVAL; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + return -EFAULT; + + if (!IS_ALIGNED(params.len, PAGE_SIZE) || + (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL && + params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO && + params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED && + params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS && + params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID)) + return -EINVAL; + + npages = params.len / PAGE_SIZE; + pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL_ACCOUNT); + if (!pfns) + return -ENOMEM; + + data.gctx_paddr = __psp_pa(sev->snp_context); + + pr_debug("%s: GFN range 0x%llx-0x%llx type %d\n", __func__, + params.gfn_start, params.gfn_start + npages, params.type); + + /* + * For each GFN that's being prepared as part of the initial guest + * state, the following pre-conditions are verified: + * + * 1) The backing memslot is a valid private memslot. + * 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES + * beforehand. + * 3) The PFN of the guest_memfd has not already been set to private + * in the RMP table. + * + * The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page + * faults if there's a race between a fault and an attribute update via + * KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized + * here. However, kvm->slots_lock guards against both this as well as + * concurrent memslot updates occurring while these checks are being + * performed, so use that here to make it easier to reason about the + * initial expected state and better guard against unexpected + * situations. 
 */
+	mutex_lock(&kvm->slots_lock);
+
+	for (gfn = params.gfn_start, i = 0; gfn < params.gfn_start + npages; gfn++, i++) {
+		struct kvm_memory_slot *memslot;
+		int order, level;
+		bool assigned;
+		void *kvaddr;
+
+		if (!kvm_mem_is_private(kvm, gfn)) {
+			pr_debug("Failed to ensure GFN 0x%llx has private memory attribute set\n",
+				 gfn);
+			ret = -EINVAL;
+			goto e_release;
+		}
+
+		memslot = gfn_to_memslot(kvm, gfn);
+		if (!kvm_slot_can_be_private(memslot)) {
+			ret = -EINVAL;
+			goto e_release;
+		}
+
+		ret = __kvm_gmem_get_pfn(kvm, memslot, gfn, &pfns[i], &order, false);
+		if (ret) {
+			ret = -EINVAL;
+			goto e_release;
+		}
+
+		n++;
+		ret = snp_lookup_rmpentry((u64)pfns[i], &assigned, &level);
+		if (ret || assigned) {
+			pr_debug("Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n",
+				 gfn, ret, assigned);
+			ret = -EINVAL;
+			goto e_release;
+		}
+
+		kvaddr = pfn_to_kaddr(pfns[i]);
+		if (!virt_addr_valid(kvaddr)) {
+			ret = -EINVAL;
+			goto e_release;
+		}
+
+		ret = copy_from_user(kvaddr, (void __user *)params.uaddr + i * PAGE_SIZE,
+				     PAGE_SIZE);
+		if (ret)
+			goto e_release;
+
+		ret = rmp_make_private(pfns[i], gfn << PAGE_SHIFT, PG_LEVEL_4K,
+				       sev_get_asid(kvm), true);
+		if (ret)
+			goto e_release;
+
+		data.address = __sme_set(pfns[i] << PAGE_SHIFT);
+		data.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K);
+		data.page_type = params.type;
+		ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
+				      &data, error);
+		if (ret) {
+			pr_debug("SEV-SNP launch update failed, ret: 0x%x, fw_error: 0x%x\n",
+				 ret, *error);
+			snp_page_reclaim(pfns[i]);
+
+			/*
+			 * When invalid CPUID function entries are detected,
+			 * firmware writes the expected values into page and
+			 * leaves it unencrypted so it can be used for debugging
+			 * and error-reporting.
+			 *
+			 * Copy this page back into the source buffer so
+			 * userspace can use this information to provide
+			 * information on which CPUID leaves/fields failed CPUID
+			 * validation.
+			 */
+			if (params.type == KVM_SEV_SNP_PAGE_TYPE_CPUID &&
+			    *error == SEV_RET_INVALID_PARAM) {
+				host_rmp_make_shared(pfns[i], PG_LEVEL_4K, true);
+				if (copy_to_user((void __user *)params.uaddr, kvaddr, PAGE_SIZE))
+					pr_debug("Failed to write CPUID page back to userspace\n");
+			}
+
+			goto e_release;
+		}
+	}
+
+e_release:
+	for (i = 0; i < n; i++) {
+		if (ret)
+			host_rmp_make_shared(pfns[i], PG_LEVEL_4K, true);
+
+		kvm_release_pfn_dirty(pfns[i]);
+	}
+
+	mutex_unlock(&kvm->slots_lock);
+	kvfree(pfns);
+	return ret;
+}
+
 int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
@@ -2089,6 +2270,9 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
 	case KVM_SEV_SNP_LAUNCH_START:
 		r = snp_launch_start(kvm, &sev_cmd);
 		break;
+	case KVM_SEV_SNP_LAUNCH_UPDATE:
+		r = snp_launch_update(kvm, &sev_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;

From 005e2ac69c5142ec95d67e8fda448c64eb009347 Mon Sep 17 00:00:00 2001
From: Michael Roth
Date: Tue, 20 Feb 2024 18:45:12 -0600
Subject: [PATCH 1390/1406] *KVM: SEV: Add KVM_SEV_SNP_LAUNCH_FINISH command

TODO: see if ID block / Author Key enabled bits can be set automatically
rather than exposed directly via KVM API. See if there are other
opportunities to architect things in user-facing APIs.

The KVM_SEV_SNP_LAUNCH_FINISH command finalizes the cryptographic digest and
stores it as the measurement of the guest at launch.

While finalizing the launch flow, it also issues the LAUNCH_UPDATE command to
encrypt the VMSA pages.
If it's an SNP guest, the VMSA was added to the RMP table as a guest-owned
page and also removed from the kernel direct map, so flush it later, after it
has been transitioned back to hypervisor state and restored in the direct map.

Signed-off-by: Brijesh Singh
Signed-off-by: Harald Hoyer
Signed-off-by: Ashish Kalra
[mdr: always measure BSP first to get consistent launch measurements]
Signed-off-by: Michael Roth
---
 .../virt/kvm/x86/amd-memory-encryption.rst | 25 ++++
 arch/x86/include/uapi/asm/kvm.h | 14 ++
 arch/x86/kvm/svm/sev.c | 136 ++++++++++++++++++
 3 files changed, 175 insertions(+)

diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst
index 4a14246c8abbde..5903476dde6cb3 100644
--- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst
+++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst
@@ -505,6 +505,31 @@ where the allowed values for type are::

 See the SEV-SNP spec for further details on how each page type is used/measured.

+21. KVM_SEV_SNP_LAUNCH_FINISH
+-----------------------------
+
+After completion of the SNP guest launch flow, the KVM_SEV_SNP_LAUNCH_FINISH
+command can be issued to make the guest ready for execution.
+
+Parameters (in): struct kvm_sev_snp_launch_finish
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_snp_launch_finish {
+                __u64 id_block_uaddr;
+                __u64 id_auth_uaddr;
+                __u8 id_block_en;
+                __u8 auth_key_en;
+                __u8 host_data[32];
+                __u8 pad[6];
+        };
+
+See the SEV-SNP specification for further details on the SNP_LAUNCH_FINISH
+input parameters.
+
 References
 ==========

diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 988dc157ba564b..900ebdb2c1ab61 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -693,6 +693,7 @@ enum sev_cmd_id {
 	KVM_SEV_SNP_INIT,
 	KVM_SEV_SNP_LAUNCH_START,
 	KVM_SEV_SNP_LAUNCH_UPDATE,
+	KVM_SEV_SNP_LAUNCH_FINISH,

 	KVM_SEV_NR_MAX,
 };
@@ -814,6 +815,19 @@ struct kvm_sev_snp_launch_update {
 	__u8 type;
 };

+#define KVM_SEV_SNP_ID_BLOCK_SIZE 96
+#define KVM_SEV_SNP_ID_AUTH_SIZE 4096
+#define KVM_SEV_SNP_FINISH_DATA_SIZE 32
+
+struct kvm_sev_snp_launch_finish {
+	__u64 id_block_uaddr;
+	__u64 id_auth_uaddr;
+	__u8 id_block_en;
+	__u8 auth_key_en;
+	__u8 host_data[KVM_SEV_SNP_FINISH_DATA_SIZE];
+	__u8 pad[6];
+};
+
 #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
 #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index e053df3c501537..6bb3042a1c0433 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -77,6 +77,8 @@ static bool sev_snp_enabled;
 #define SNP_POLICY_MASK_SMT BIT_ULL(16)
 #define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20)

+#define INITIAL_VMSA_GPA 0xFFFFFFFFF000
+
 static u8 sev_enc_bit;
 static DECLARE_RWSEM(sev_deactivate_lock);
 static DEFINE_MUTEX(sev_bitmap_lock);
@@ -755,6 +757,7 @@ static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	if (!sev_es_guest(kvm))
 		return -ENOTTY;

+	/* Handle remaining vCPUs.
*/ kvm_for_each_vcpu(i, vcpu, kvm) { ret = mutex_lock_killable(&vcpu->mutex); if (ret) @@ -2177,6 +2180,123 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) return ret; } +static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_snp_launch_update data = {}; + bool boot_vcpu_handled = false; + struct kvm_vcpu *vcpu; + unsigned long i; + int ret; + + data.gctx_paddr = __psp_pa(sev->snp_context); + data.page_type = SNP_PAGE_TYPE_VMSA; + +handle_remaining_vcpus: + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vcpu_svm *svm = to_svm(vcpu); + u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; + + /* Handle boot vCPU first to ensure consistent measurement of initial state. */ + if (!boot_vcpu_handled && vcpu->vcpu_id != 0) + continue; + + if (boot_vcpu_handled && vcpu->vcpu_id == 0) + continue; + + /* Perform some pre-encryption checks against the VMSA */ + ret = sev_es_sync_vmsa(svm); + if (ret) + return ret; + + /* Transition the VMSA page to a firmware state. */ + ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true); + if (ret) + return ret; + + /* Issue the SNP command to encrypt the VMSA */ + data.address = __sme_pa(svm->sev_es.vmsa); + ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, + &data, &argp->error); + if (ret) { + snp_page_reclaim(pfn); + return ret; + } + + svm->vcpu.arch.guest_state_protected = true; + + if (!boot_vcpu_handled) { + boot_vcpu_handled = true; + goto handle_remaining_vcpus; + } + } + + return 0; +} + +static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_snp_launch_finish params; + struct sev_data_snp_launch_finish *data; + void *id_block = NULL, *id_auth = NULL; + int ret; + + if (!sev_snp_guest(kvm)) + return -ENOTTY; + + if (!sev->snp_context) + return -EINVAL; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + return -EFAULT; + + /* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. 
 */
+	ret = snp_launch_update_vmsa(kvm, argp);
+	if (ret)
+		return ret;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+	if (!data)
+		return -ENOMEM;
+
+	if (params.id_block_en) {
+		id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE);
+		if (IS_ERR(id_block)) {
+			ret = PTR_ERR(id_block);
+			goto e_free;
+		}
+
+		data->id_block_en = 1;
+		data->id_block_paddr = __sme_pa(id_block);
+
+		id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE);
+		if (IS_ERR(id_auth)) {
+			ret = PTR_ERR(id_auth);
+			goto e_free_id_block;
+		}
+
+		data->id_auth_paddr = __sme_pa(id_auth);
+
+		if (params.auth_key_en)
+			data->auth_key_en = 1;
+	}
+
+	memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE);
+	data->gctx_paddr = __psp_pa(sev->snp_context);
+	ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error);
+
+	kfree(id_auth);
+
+e_free_id_block:
+	kfree(id_block);
+
+e_free:
+	kfree(data);
+
+	return ret;
+}
+
 int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
@@ -2273,6 +2393,9 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
 	case KVM_SEV_SNP_LAUNCH_UPDATE:
 		r = snp_launch_update(kvm, &sev_cmd);
 		break;
+	case KVM_SEV_SNP_LAUNCH_FINISH:
+		r = snp_launch_finish(kvm, &sev_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;
@@ -2749,11 +2872,24 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu)

 	svm = to_svm(vcpu);

+	/*
+	 * If it's an SNP guest, then the VMSA was marked in the RMP table as
+	 * a guest-owned page. Transition the page to hypervisor state before
+	 * releasing it back to the system.
+	 */
+	if (sev_snp_guest(vcpu->kvm)) {
+		u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
+
+		if (host_rmp_make_shared(pfn, PG_LEVEL_4K, true))
+			goto skip_vmsa_free;
+	}
+
 	if (vcpu->arch.guest_state_protected)
 		sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);

 	__free_page(virt_to_page(svm->sev_es.vmsa));

+skip_vmsa_free:
 	if (svm->sev_es.ghcb_sa_free)
 		kvfree(svm->sev_es.ghcb_sa);
 }

From f0d9a52d4e19b3a0c2477c35e3d445ef24ae91fd Mon Sep 17 00:00:00 2001
From: Brijesh Singh
Date: Tue, 26 Apr 2022 18:54:45 +0000
Subject: [PATCH 1391/1406] KVM: SEV: Add support to handle GHCB GPA register
 VMGEXIT

SEV-SNP guests are required to perform a GHCB GPA registration. Before using
a GHCB GPA for a vCPU the first time, a guest must register the vCPU GHCB
GPA. If the hypervisor can work with the guest-requested GPA, it must respond
back with the same GPA; otherwise, it returns -1.

On VMGEXIT, verify that the GHCB GPA matches the registered value. If a
mismatch is detected, abort the guest.
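A rough sketch of the registration handshake (illustrative only; the request
value 0x012 is defined in the diff below, the response value 0x013 and the
exact bit layout follow the GHCB spec, and respond() is shorthand for setting
the GHCB MSR return value):

	/* Guest: GHCBData[63:12] = GFN of the GHCB, GHCBInfo[11:0] = 0x012 */
	wrmsr(GHCB_MSR, (gfn << 12) | GHCB_MSR_REG_GPA_REQ);

	/* Hypervisor: record the GPA, then echo the GFN back with 0x013 */
	svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn);
	respond((gfn << 12) | GHCB_MSR_REG_GPA_RESP);

	/* Any other response value tells the guest the GPA was rejected. */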
Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra Signed-off-by: Michael Roth --- arch/x86/include/asm/sev-common.h | 8 ++++++++ arch/x86/kvm/svm/sev.c | 27 +++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.h | 7 +++++++ 3 files changed, 42 insertions(+) diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 5a8246dd532f8d..1006bfffe07aa7 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -59,6 +59,14 @@ #define GHCB_MSR_AP_RESET_HOLD_RESULT_POS 12 #define GHCB_MSR_AP_RESET_HOLD_RESULT_MASK GENMASK_ULL(51, 0) +/* Preferred GHCB GPA Request */ +#define GHCB_MSR_PREF_GPA_REQ 0x010 +#define GHCB_MSR_GPA_VALUE_POS 12 +#define GHCB_MSR_GPA_VALUE_MASK GENMASK_ULL(51, 0) + +#define GHCB_MSR_PREF_GPA_RESP 0x011 +#define GHCB_MSR_PREF_GPA_NONE 0xfffffffffffff + /* GHCB GPA Register */ #define GHCB_MSR_REG_GPA_REQ 0x012 #define GHCB_MSR_REG_GPA_REQ_VAL(v) \ diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 6bb3042a1c0433..8f88537eb3759d 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3356,6 +3356,26 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP, GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); break; + case GHCB_MSR_PREF_GPA_REQ: + set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK, + GHCB_MSR_GPA_VALUE_POS); + set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + case GHCB_MSR_REG_GPA_REQ: { + u64 gfn; + + gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK, + GHCB_MSR_GPA_VALUE_POS); + + svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn); + + set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK, + GHCB_MSR_GPA_VALUE_POS); + set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + } case GHCB_MSR_TERM_REQ: { u64 reason_set, reason_code; @@ -3419,6 +3439,13 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb); sev_es_sync_from_ghcb(svm); + + /* SEV-SNP guest requires that the GHCB GPA must be registered */ + if (sev_snp_guest(svm->vcpu.kvm) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) { + vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa); + return -EINVAL; + } + ret = sev_es_validate_vmgexit(svm); if (ret) return ret; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 9c633173b779e6..2bee24017baef7 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -211,6 +211,8 @@ struct vcpu_sev_es_state { u32 ghcb_sa_len; bool ghcb_sa_sync; bool ghcb_sa_free; + + u64 ghcb_registered_gpa; }; struct vcpu_svm { @@ -354,6 +356,11 @@ static __always_inline bool sev_snp_guest(struct kvm *kvm) return sev_es_guest(kvm) && sev->snp_active; } +static inline bool ghcb_gpa_is_registered(struct vcpu_svm *svm, u64 val) +{ + return svm->sev_es.ghcb_registered_gpa == val; +} + static inline void vmcb_mark_all_dirty(struct vmcb *vmcb) { vmcb->control.clean = 0; From 37ba9cbb4bfcffc7720744495429dcf8bd9fd9e1 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 24 May 2022 12:19:48 -0500 Subject: [PATCH 1392/1406] KVM: SEV: Add support to handle MSR based Page State Change VMGEXIT SEV-SNP VMs can ask the hypervisor to change the page state in the RMP table to be private or shared using the Page State Change MSR protocol as defined in the GHCB specification. 
When using gmem, private/shared memory is allocated through separate pools,
and KVM relies on userspace issuing a KVM_SET_MEMORY_ATTRIBUTES KVM ioctl to
tell the KVM MMU whether a particular GFN should be backed by private memory
or not.

Forward these page state change requests to userspace so that it can issue
the expected KVM ioctls. The KVM MMU will handle updating the RMP entries
when it is ready to map a private page into a guest.

Define a new KVM_EXIT_VMGEXIT for exits of this type, and structure it so
that it can be extended for other cases where VMGEXITs need some level of
handling in userspace.

Co-developed-by: Michael Roth
Signed-off-by: Michael Roth
Signed-off-by: Brijesh Singh
Signed-off-by: Ashish Kalra
---
 Documentation/virt/kvm/api.rst | 33 +++++++++++++++++++++++++++
 arch/x86/include/asm/sev-common.h | 6 ++++++
 arch/x86/kvm/svm/sev.c | 33 +++++++++++++++++++++++++++
 include/uapi/linux/kvm.h | 17 ++++++++++++++
 4 files changed, 89 insertions(+)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index bd93cafd3e4e3e..2afe3147e705b6 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -7060,6 +7060,39 @@ Please note that the kernel is allowed to use the kvm_run structure as the
 primary storage for certain register types. Therefore, the kernel may use the
 values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set.

+::
+
+  /* KVM_EXIT_VMGEXIT */
+  struct kvm_user_vmgexit {
+  #define KVM_USER_VMGEXIT_PSC_MSR 1
+  	__u32 type; /* KVM_USER_VMGEXIT_* type */
+  	union {
+  		struct {
+  			__u64 gpa;
+  #define KVM_USER_VMGEXIT_PSC_MSR_OP_PRIVATE 1
+  #define KVM_USER_VMGEXIT_PSC_MSR_OP_SHARED 2
+  			__u8 op;
+  			__u32 ret;
+  		} psc_msr;
+  	};
+  };
+
+If exit reason is KVM_EXIT_VMGEXIT then it indicates that an SEV-SNP guest
+has issued a VMGEXIT instruction (as documented by the AMD Architecture
+Programmer's Manual (APM)) to the hypervisor that needs to be serviced by
+userspace. These are generally handled by the host kernel, but in some
+cases some aspects of handling a VMGEXIT are handled by userspace.
+
+A kvm_user_vmgexit structure is defined to encapsulate the data to be
+sent to or returned by userspace. The type field defines the specific type
+of exit that needs to be serviced, and that type is used as a discriminator
+to determine which union type should be used for input/output.
+
+For the KVM_USER_VMGEXIT_PSC_MSR type, the psc_msr union type is used. The
+kernel will supply the 'gpa' and 'op' fields, and userspace is expected to
+update the private/shared state of the GPA using the corresponding
+KVM_SET_MEMORY_ATTRIBUTES ioctl. The 'ret' field is to be set to 0 by
+userspace on success, or some non-zero value on failure.

 6.
Capabilities that can be enabled on vCPUs ============================================ diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 1006bfffe07aa7..6d68db812de1cd 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -101,11 +101,17 @@ enum psc_op { /* GHCBData[11:0] */ \ GHCB_MSR_PSC_REQ) +#define GHCB_MSR_PSC_REQ_TO_GFN(msr) (((msr) & GENMASK_ULL(51, 12)) >> 12) +#define GHCB_MSR_PSC_REQ_TO_OP(msr) (((msr) & GENMASK_ULL(55, 52)) >> 52) + #define GHCB_MSR_PSC_RESP 0x015 #define GHCB_MSR_PSC_RESP_VAL(val) \ /* GHCBData[63:32] */ \ (((u64)(val) & GENMASK_ULL(63, 32)) >> 32) +/* Set highest bit as a generic error response */ +#define GHCB_MSR_PSC_RESP_ERROR (BIT_ULL(63) | GHCB_MSR_PSC_RESP) + /* GHCB Hypervisor Feature Request/Response */ #define GHCB_MSR_HV_FT_REQ 0x080 #define GHCB_MSR_HV_FT_RESP 0x081 diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 8f88537eb3759d..350c97ba4098ea 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3278,6 +3278,36 @@ static void set_ghcb_msr(struct vcpu_svm *svm, u64 value) svm->vmcb->control.ghcb_gpa = value; } +static int snp_complete_psc_msr(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 vmm_ret = vcpu->run->vmgexit.psc_msr.ret; + + set_ghcb_msr(svm, (vmm_ret << 32) | GHCB_MSR_PSC_RESP); + + return 1; /* resume guest */ +} + +static int snp_begin_psc_msr(struct kvm_vcpu *vcpu, u64 ghcb_msr) +{ + u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr)); + u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr); + struct vcpu_svm *svm = to_svm(vcpu); + + if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) { + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); + return 1; /* resume guest */ + } + + vcpu->run->exit_reason = KVM_EXIT_VMGEXIT; + vcpu->run->vmgexit.type = KVM_USER_VMGEXIT_PSC_MSR; + vcpu->run->vmgexit.psc_msr.gpa = gpa; + vcpu->run->vmgexit.psc_msr.op = op; + vcpu->arch.complete_userspace_io = snp_complete_psc_msr; + + return 0; /* forward request to userspace */ +} + static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -3376,6 +3406,9 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) GHCB_MSR_INFO_POS); break; } + case GHCB_MSR_PSC_REQ: + ret = snp_begin_psc_msr(vcpu, control->ghcb_gpa); + break; case GHCB_MSR_TERM_REQ: { u64 reason_set, reason_code; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 2190adbe30027c..54b81e46a9fa62 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -135,6 +135,20 @@ struct kvm_xen_exit { } u; }; +struct kvm_user_vmgexit { +#define KVM_USER_VMGEXIT_PSC_MSR 1 + __u32 type; /* KVM_USER_VMGEXIT_* type */ + union { + struct { + __u64 gpa; +#define KVM_USER_VMGEXIT_PSC_MSR_OP_PRIVATE 1 +#define KVM_USER_VMGEXIT_PSC_MSR_OP_SHARED 2 + __u8 op; + __u32 ret; + } psc_msr; + }; +}; + #define KVM_S390_GET_SKEYS_NONE 1 #define KVM_S390_SKEYS_MAX 1048576 @@ -178,6 +192,7 @@ struct kvm_xen_exit { #define KVM_EXIT_NOTIFY 37 #define KVM_EXIT_LOONGARCH_IOCSR 38 #define KVM_EXIT_MEMORY_FAULT 39 +#define KVM_EXIT_VMGEXIT 40 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -433,6 +448,8 @@ struct kvm_run { __u64 gpa; __u64 size; } memory_fault; + /* KVM_EXIT_VMGEXIT */ + struct kvm_user_vmgexit vmgexit; /* Fix the size of the union. 
*/ char padding[256]; }; From 9ca02d70c043d2cfa56cc9d822513d09bef17345 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 13 Dec 2022 17:14:41 -0600 Subject: [PATCH 1393/1406] KVM: SEV: Add support to handle Page State Change VMGEXIT SEV-SNP VMs can ask the hypervisor to change the page state in the RMP table to be private or shared using the Page State Change NAE event as defined in the GHCB specification version 2. Forward these requests to userspace as KVM_EXIT_VMGEXITs, similar to how it is done for requests that don't use a GHCB page. Co-developed-by: Michael Roth Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra --- Documentation/virt/kvm/api.rst | 14 ++++++++++++++ arch/x86/kvm/svm/sev.c | 16 ++++++++++++++++ include/uapi/linux/kvm.h | 5 +++++ 3 files changed, 35 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 2afe3147e705b6..003369c2f5eaf2 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7065,6 +7065,7 @@ values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set. /* KVM_EXIT_VMGEXIT */ struct kvm_user_vmgexit { #define KVM_USER_VMGEXIT_PSC_MSR 1 + #define KVM_USER_VMGEXIT_PSC 2 __u32 type; /* KVM_USER_VMGEXIT_* type */ union { struct { @@ -7074,9 +7075,14 @@ values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set. __u8 op; __u32 ret; } psc_msr; + struct { + __u64 shared_gpa; + __u64 ret; + } psc; }; }; + If exit reason is KVM_EXIT_VMGEXIT then it indicates that an SEV-SNP guest has issued a VMGEXIT instruction (as documented by the AMD Architecture Programmer's Manual (APM)) to the hypervisor that needs to be serviced by @@ -7094,6 +7100,14 @@ update the private/shared state of the GPA using the corresponding KVM_SET_MEMORY_ATTRIBUTES ioctl. The 'ret' field is to be set to 0 by userspace on success, or some non-zero value on failure. +For the KVM_USER_VMGEXIT_PSC type, the psc union type is used. The kernel +will supply the GPA of the Page State Structure defined in the GHCB spec. +Userspace will process this structure as defined by the GHCB, and issue +KVM_SET_MEMORY_ATTRIBUTES ioctls to set the GPAs therein to the expected +private/shared state. Userspace will return a value in 'ret' that is in +agreement with the GHCB-defined return values that the guest will expect +in the SW_EXITINFO2 field of the GHCB in response to these requests. + 6.
Capabilities that can be enabled on vCPUs ============================================ diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 350c97ba4098ea..ea62fedd2ec80e 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3090,6 +3090,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) case SVM_VMGEXIT_AP_JUMP_TABLE: case SVM_VMGEXIT_UNSUPPORTED_EVENT: case SVM_VMGEXIT_HV_FEATURES: + case SVM_VMGEXIT_PSC: break; default: reason = GHCB_ERR_INVALID_EVENT; @@ -3308,6 +3309,15 @@ static int snp_begin_psc_msr(struct kvm_vcpu *vcpu, u64 ghcb_msr) return 0; /* forward request to userspace */ } +static int snp_complete_psc(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, vcpu->run->vmgexit.psc.ret); + + return 1; /* resume guest */ +} + static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -3545,6 +3555,12 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = 1; break; + case SVM_VMGEXIT_PSC: + vcpu->run->exit_reason = KVM_EXIT_VMGEXIT; + vcpu->run->vmgexit.type = KVM_USER_VMGEXIT_PSC; + vcpu->run->vmgexit.psc.shared_gpa = svm->sev_es.sw_scratch; + vcpu->arch.complete_userspace_io = snp_complete_psc; + break; case SVM_VMGEXIT_UNSUPPORTED_EVENT: vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 54b81e46a9fa62..e33c48bfbd67d9 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -137,6 +137,7 @@ struct kvm_xen_exit { struct kvm_user_vmgexit { #define KVM_USER_VMGEXIT_PSC_MSR 1 +#define KVM_USER_VMGEXIT_PSC 2 __u32 type; /* KVM_USER_VMGEXIT_* type */ union { struct { @@ -146,6 +147,10 @@ struct kvm_user_vmgexit { __u8 op; __u32 ret; } psc_msr; + struct { + __u64 shared_gpa; + __u64 ret; + } psc; }; }; From 5bf87c32579c40e1850e743d7d87ed42edd42f8c Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 26 Apr 2022 19:14:13 +0000 Subject: [PATCH 1394/1406] KVM: x86: Export kvm_zap_gfn_range() for SNP use While resolving the RMP page fault, there may be cases where the page level between the RMP entry and TDP does not match and the 2M RMP entry must be split into 4K RMP entries. Or a 2M TDP page may need to be broken into multiple 4K pages. To keep the RMP and TDP page level in sync, zap the gfn range after splitting the pages in the RMP entry. The zap should force the TDP mapping to be rebuilt with the new page level.
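To make the intended flow concrete, a condensed sketch of the call sequence (the actual handler, handle_rmp_page_fault(), is added in the next patch):

	/* Split the 2M RMP entry covering pfn into 512 4K RMP entries... */
	psmash(pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1));
	/* ...then zap the range so the TDP mapping is rebuilt at 4K */
	kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD);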
Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra Signed-off-by: Michael Roth --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.h | 2 -- arch/x86/kvm/mmu/mmu.c | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 18aaac7f56b119..c84179eb194250 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1926,6 +1926,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); +void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 60f21bb4c27b19..df4d2c137a6782 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -252,8 +252,6 @@ static inline bool kvm_mmu_honors_guest_mtrrs(struct kvm *kvm) return __kvm_mmu_honors_guest_mtrrs(kvm_arch_has_noncoherent_dma(kvm)); } -void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); - int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); int kvm_mmu_post_init_vm(struct kvm *kvm); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a626552d051e65..6db0c217d47362 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6703,6 +6703,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, return need_tlb_flush; } +EXPORT_SYMBOL_GPL(kvm_zap_gfn_range); static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) From b7025f7a84d6af8f8b1f9b81d4d2c1c9504fb601 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 19 Dec 2023 19:26:43 -0600 Subject: [PATCH 1395/1406] KVM: SEV: Add support to handle RMP nested page faults When SEV-SNP is enabled in the guest, the hardware places restrictions on all memory accesses based on the contents of the RMP table. When the hardware encounters an RMP check failure caused by a guest memory access, it raises a #NPF. The error code contains additional information on the access type. See the APM volume 2 for additional information. When using gmem, RMP faults resulting from mismatches between the state in the RMP table vs. what the guest expects via its page table result in KVM_EXIT_MEMORY_FAULTs being forwarded to userspace to handle. This means the only expected case that needs to be handled in the kernel is when the page size of the entry in the RMP table is larger than the mapping in the nested page table, in which case a PSMASH instruction needs to be issued to split the large RMP entry into individual 4K entries so that subsequent accesses can succeed. Signed-off-by: Brijesh Singh Co-developed-by: Michael Roth Signed-off-by: Michael Roth Signed-off-by: Ashish Kalra --- arch/x86/include/asm/sev.h | 3 ++ arch/x86/kvm/svm/sev.c | 92 ++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.c | 21 +++++++-- arch/x86/kvm/svm/svm.h | 1 + 4 files changed, 113 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index d7b27cb34c2b0d..c37f5a80fa7fc3 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -91,6 +91,9 @@ extern bool handle_vc_boot_ghcb(struct pt_regs *regs); /* RMUPDATE detected 4K page and 2MB page overlap.
*/ #define RMPUPDATE_FAIL_OVERLAP 4 +/* PSMASH failed due to concurrent access by another CPU */ +#define PSMASH_FAIL_INUSE 3 + /* RMP page size */ #define RMP_PG_SIZE_4K 0 #define RMP_PG_SIZE_2M 1 diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index ea62fedd2ec80e..c5567179353fb7 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3279,6 +3279,13 @@ static void set_ghcb_msr(struct vcpu_svm *svm, u64 value) svm->vmcb->control.ghcb_gpa = value; } +static int snp_rmptable_psmash(kvm_pfn_t pfn) +{ + pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1); + + return psmash(pfn); +} + static int snp_complete_psc_msr(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3838,3 +3845,88 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) return p; } + +void handle_rmp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) +{ + struct kvm_memory_slot *slot; + struct kvm *kvm = vcpu->kvm; + int order, rmp_level, ret; + bool assigned; + kvm_pfn_t pfn; + gfn_t gfn; + + gfn = gpa >> PAGE_SHIFT; + + /* + * The only time RMP faults occur for shared pages is when the guest is + * triggering an RMP fault for an implicit page-state change from + * shared->private. Implicit page-state changes are forwarded to + * userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults + * for shared pages should not end up here. + */ + if (!kvm_mem_is_private(kvm, gfn)) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, size-mismatch for non-private GPA 0x%llx\n", + gpa); + return; + } + + slot = gfn_to_memslot(kvm, gfn); + if (!kvm_slot_can_be_private(slot)) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n", + gpa); + return; + } + + ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &order); + if (ret) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, no private backing page for GPA 0x%llx\n", + gpa); + return; + } + + ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); + if (ret || !assigned) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n", + gpa, pfn, ret); + goto out; + } + + /* + * There are 2 cases where a PSMASH may be needed to resolve an #NPF + * with PFERR_GUEST_RMP_BIT set: + * + * 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM + * bit set if the guest issues them with a smaller granularity than + * what is indicated by the page-size bit in the 2MB-aligned RMP + * entry for the PFN that backs the GPA. + * + * 2) Guest access via NPT can trigger an #NPF if the NPT mapping is + * smaller than what is indicated by the 2MB-aligned RMP entry for + * the PFN that backs the GPA. + * + * In both these cases, the corresponding 2M RMP entry needs to + * be PSMASH'd to 512 4K RMP entries. If the RMP entry is already + * split into 4K RMP entries, then this is likely a spurious case which + * can occur when there are concurrent accesses by the guest to a 2MB + * GPA range that is backed by a 2MB-aligned PFN whose RMP entry is in + * the process of being PSMASH'd into 4K entries. These cases should + * resolve automatically on subsequent accesses, so just ignore them + * here.
+ */ + if (rmp_level == PG_LEVEL_4K) { + pr_debug_ratelimited("%s: Spurious RMP fault for GPA 0x%llx, error_code 0x%llx", + __func__, gpa, error_code); + goto out; + } + + pr_debug_ratelimited("%s: Splitting 2M RMP entry for GPA 0x%llx, error_code 0x%llx", + __func__, gpa, error_code); + ret = snp_rmptable_psmash(pfn); + if (ret && ret != PSMASH_FAIL_INUSE) + pr_err_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n", + gpa, pfn, ret); + + kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD); +out: + put_page(pfn_to_page(pfn)); +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 272d5ed37ce77c..33dcf87f2871b5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2051,15 +2051,28 @@ static int pf_interception(struct kvm_vcpu *vcpu) static int npf_interception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + int rc; u64 fault_address = svm->vmcb->control.exit_info_2; u64 error_code = svm->vmcb->control.exit_info_1; trace_kvm_page_fault(vcpu, fault_address, error_code); - return kvm_mmu_page_fault(vcpu, fault_address, error_code, - static_cpu_has(X86_FEATURE_DECODEASSISTS) ? - svm->vmcb->control.insn_bytes : NULL, - svm->vmcb->control.insn_len); + rc = kvm_mmu_page_fault(vcpu, fault_address, error_code, + static_cpu_has(X86_FEATURE_DECODEASSISTS) ? + svm->vmcb->control.insn_bytes : NULL, + svm->vmcb->control.insn_len); + + /* + * rc == 0 indicates a userspace exit is needed to handle page + * transitions, so do that first before updating the RMP table. + */ + if (error_code & PFERR_GUEST_RMP_MASK) { + if (rc == 0) + return rc; + handle_rmp_page_fault(vcpu, fault_address, error_code); + } + + return rc; } static int db_interception(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 2bee24017baef7..fb98d88d812470 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -717,6 +717,7 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa); void sev_es_unmap_ghcb(struct vcpu_svm *svm); struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu); +void handle_rmp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code); /* vmenter.S */ From 79fc5059617289c43e7ce160b2d5fbf4c351af9c Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Tue, 26 Apr 2022 19:19:40 +0000 Subject: [PATCH 1396/1406] KVM: SEV: Use a VMSA physical address variable for populating VMCB In preparation to support SEV-SNP AP Creation, use a variable that holds the VMSA physical address rather than converting the virtual address. This will allow SEV-SNP AP Creation to set the new physical address that will be used should the vCPU reset path be taken. Signed-off-by: Tom Lendacky Signed-off-by: Ashish Kalra Signed-off-by: Michael Roth --- arch/x86/kvm/svm/sev.c | 3 +-- arch/x86/kvm/svm/svm.c | 9 ++++++++- arch/x86/kvm/svm/svm.h | 1 + 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index c5567179353fb7..4074b727795a87 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3662,8 +3662,7 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) * the VMSA will be NULL if this vCPU is the destination for intrahost * migration, and will be copied later. 
*/ - if (svm->sev_es.vmsa) - svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); + svm->vmcb->control.vmsa_pa = svm->sev_es.vmsa_pa; /* Can't intercept CR register access, HV can't modify CR registers */ svm_clr_intercept(svm, INTERCEPT_CR0_READ); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 33dcf87f2871b5..e4cad6aa37c1e9 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1459,9 +1459,16 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); svm_switch_vmcb(svm, &svm->vmcb01); - if (vmsa_page) + if (vmsa_page) { svm->sev_es.vmsa = page_address(vmsa_page); + /* + * Do not include the encryption mask on the VMSA physical + * address since hardware will access it using the guest key. + */ + svm->sev_es.vmsa_pa = __pa(svm->sev_es.vmsa); + } + svm->guest_state_loaded = false; return 0; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index fb98d88d812470..4ef41f4d4ee6f6 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -202,6 +202,7 @@ struct vcpu_sev_es_state { struct ghcb *ghcb; u8 valid_bitmap[16]; struct kvm_host_map ghcb_map; + hpa_t vmsa_pa; bool received_first_sipi; unsigned int ap_reset_hold_type; From 5da5395b9d0feca83d303924c057239b54b2f43d Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Tue, 26 Apr 2022 19:21:40 +0000 Subject: [PATCH 1397/1406] KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. The handler sets the vCPU to the KVM_MP_STATE_UNINITIALIZED state, so that any errors will leave the vCPU as not runnable. Any previous VMSA pages that were installed as part of an SEV-SNP AP Creation NAE event are un-pinned. If a new VMSA is to be installed, the VMSA guest page is pinned and set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is left as KVM_MP_STATE_UNINITIALIZED to prevent it from being run.
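For reference, a sketch of how the handler added below unpacks the NAE event (field layout per the GHCB specification):

	request  = lower_32_bits(control->exit_info_1); /* CREATE, CREATE_ON_INIT, or DESTROY */
	apic_id  = upper_32_bits(control->exit_info_1); /* APIC ID of the target vCPU */
	vmsa_gpa = control->exit_info_2;                /* GPA of the new VMSA (CREATE* only) */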
Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra [mdr: add handling for gmem] Signed-off-by: Michael Roth --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/asm/svm.h | 5 + arch/x86/kvm/svm/sev.c | 219 ++++++++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.c | 3 + arch/x86/kvm/svm/svm.h | 8 +- arch/x86/kvm/x86.c | 11 ++ 6 files changed, 246 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c84179eb194250..a165a9f225234d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -121,6 +121,7 @@ KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_TLB_FLUSH \ KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE KVM_ARCH_REQ(34) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index ba8ce15b27d787..4b73cf5e9de003 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -287,6 +287,11 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_ #define SVM_SEV_FEAT_DEBUG_SWAP BIT(5) #define SVM_SEV_FEAT_SNP_ACTIVE BIT(0) +#define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3) +#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4) +#define SVM_SEV_FEAT_INT_INJ_MODES \ + (SVM_SEV_FEAT_RESTRICTED_INJECTION | \ + SVM_SEV_FEAT_ALTERNATE_INJECTION) struct vmcb_seg { u16 selector; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 4074b727795a87..8ea1f02f4e08ce 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -658,6 +658,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) static int sev_es_sync_vmsa(struct vcpu_svm *svm) { + struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info; struct sev_es_save_area *save = svm->sev_es.vmsa; /* Check some debug related fields before encrypting the VMSA */ @@ -706,6 +707,12 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) if (sev_snp_guest(svm->vcpu.kvm)) save->sev_features |= SVM_SEV_FEAT_SNP_ACTIVE; + /* + * Save the VMSA synced SEV features. For now, they are the same for + * all vCPUs, so just save each time. 
+ */ + sev->sev_features = save->sev_features; + pr_debug("Virtual Machine Save Area (VMSA):\n"); print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); @@ -3085,6 +3092,11 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) if (!kvm_ghcb_sw_scratch_is_valid(svm)) goto vmgexit_err; break; + case SVM_VMGEXIT_AP_CREATION: + if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY) + if (!kvm_ghcb_rax_is_valid(svm)) + goto vmgexit_err; + break; case SVM_VMGEXIT_NMI_COMPLETE: case SVM_VMGEXIT_AP_HLT_LOOP: case SVM_VMGEXIT_AP_JUMP_TABLE: @@ -3325,6 +3337,202 @@ static int snp_complete_psc(struct kvm_vcpu *vcpu) return 1; /* resume guest */ } +static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + hpa_t cur_pa; + + WARN_ON(!mutex_is_locked(&svm->sev_es.snp_vmsa_mutex)); + + /* Save off the current VMSA PA for later checks */ + cur_pa = svm->sev_es.vmsa_pa; + + /* Mark the vCPU as offline and not runnable */ + vcpu->arch.pv.pv_unhalted = false; + vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + + /* Clear use of the VMSA */ + svm->sev_es.vmsa_pa = INVALID_PAGE; + svm->vmcb->control.vmsa_pa = INVALID_PAGE; + + /* + * svm->sev_es.vmsa holds the virtual address of the VMSA initially + * allocated by the host. If the guest specified a new VMSA via + * AP_CREATION, it will have been pinned to avoid future issues + * with things like page migration support. Make sure to un-pin it + * before switching to a newer guest-specified VMSA. + */ + if (cur_pa != __pa(svm->sev_es.vmsa) && VALID_PAGE(cur_pa)) + kvm_release_pfn_dirty(__phys_to_pfn(cur_pa)); + + if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) { + gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa); + struct kvm_memory_slot *slot; + kvm_pfn_t pfn; + + slot = gfn_to_memslot(vcpu->kvm, gfn); + if (!slot) + return -EINVAL; + + /* + * The new VMSA will be private guest memory, so + * retrieve the PFN from the gmem backend, and leave the ref + * count of the associated folio elevated to ensure it won't + * ever be migrated. + */ + if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, NULL)) + return -EINVAL; + + /* Use the new VMSA */ + svm->sev_es.vmsa_pa = pfn_to_hpa(pfn); + svm->vmcb->control.vmsa_pa = svm->sev_es.vmsa_pa; + + /* Mark the vCPU as runnable */ + vcpu->arch.pv.pv_unhalted = false; + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + + svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; + } + + /* + * When replacing the VMSA during SEV-SNP AP creation, + * mark the VMCB dirty so that full state is always reloaded. + */ + vmcb_mark_all_dirty(svm->vmcb); + + return 0; +} + +/* + * Invoked as part of svm_vcpu_reset() processing of an init event.
+ */ +void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int ret; + + if (!sev_snp_guest(vcpu->kvm)) + return; + + mutex_lock(&svm->sev_es.snp_vmsa_mutex); + + if (!svm->sev_es.snp_ap_create) + goto unlock; + + svm->sev_es.snp_ap_create = false; + + ret = __sev_snp_update_protected_guest_state(vcpu); + if (ret) + vcpu_unimpl(vcpu, "snp: AP state update on init failed\n"); + +unlock: + mutex_unlock(&svm->sev_es.snp_vmsa_mutex); +} + +static int sev_snp_ap_creation(struct vcpu_svm *svm) +{ + struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info; + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_vcpu *target_vcpu; + struct vcpu_svm *target_svm; + unsigned int request; + unsigned int apic_id; + bool kick; + int ret; + + request = lower_32_bits(svm->vmcb->control.exit_info_1); + apic_id = upper_32_bits(svm->vmcb->control.exit_info_1); + + /* Validate the APIC ID */ + target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id); + if (!target_vcpu) { + vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n", + apic_id); + return -EINVAL; + } + + ret = 0; + + target_svm = to_svm(target_vcpu); + + /* + * The target vCPU is valid, so the vCPU will be kicked unless the + * request is for CREATE_ON_INIT. For any errors at this stage, the + * kick will place the vCPU in an non-runnable state. + */ + kick = true; + + mutex_lock(&target_svm->sev_es.snp_vmsa_mutex); + + target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; + target_svm->sev_es.snp_ap_create = true; + + /* Interrupt injection mode shouldn't change for AP creation */ + if (request < SVM_VMGEXIT_AP_DESTROY) { + u64 sev_features; + + sev_features = vcpu->arch.regs[VCPU_REGS_RAX]; + sev_features ^= sev->sev_features; + if (sev_features & SVM_SEV_FEAT_INT_INJ_MODES) { + vcpu_unimpl(vcpu, "vmgexit: invalid AP injection mode [%#lx] from guest\n", + vcpu->arch.regs[VCPU_REGS_RAX]); + ret = -EINVAL; + goto out; + } + } + + switch (request) { + case SVM_VMGEXIT_AP_CREATE_ON_INIT: + kick = false; + fallthrough; + case SVM_VMGEXIT_AP_CREATE: + if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) { + vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n", + svm->vmcb->control.exit_info_2); + ret = -EINVAL; + goto out; + } + + /* + * Malicious guest can RMPADJUST a large page into VMSA which + * will hit the SNP erratum where the CPU will incorrectly signal + * an RMP violation #PF if a hugepage collides with the RMP entry + * of VMSA page, reject the AP CREATE request if VMSA address from + * guest is 2M aligned. 
+ */ + if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) { + vcpu_unimpl(vcpu, + "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n", + svm->vmcb->control.exit_info_2); + ret = -EINVAL; + goto out; + } + + target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2; + break; + case SVM_VMGEXIT_AP_DESTROY: + break; + default: + vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n", + request); + ret = -EINVAL; + break; + } + +out: + if (kick) { + if (target_vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED) + target_vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + + kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); + kvm_vcpu_kick(target_vcpu); + } + + mutex_unlock(&target_svm->sev_es.snp_vmsa_mutex); + + return ret; +} + static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -3568,6 +3776,15 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) vcpu->run->vmgexit.psc.shared_gpa = svm->sev_es.sw_scratch; vcpu->arch.complete_userspace_io = snp_complete_psc; break; + case SVM_VMGEXIT_AP_CREATION: + ret = sev_snp_ap_creation(svm); + if (ret) { + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT); + } + + ret = 1; + break; case SVM_VMGEXIT_UNSUPPORTED_EVENT: vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", @@ -3734,6 +3951,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm) set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX, GHCB_VERSION_MIN, sev_enc_bit)); + + mutex_init(&svm->sev_es.snp_vmsa_mutex); } void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e4cad6aa37c1e9..65561551d9308e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1398,6 +1398,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) svm->spec_ctrl = 0; svm->virt_spec_ctrl = 0; + if (init_event) + sev_snp_init_protected_guest_state(vcpu); + init_vmcb(vcpu); if (!init_event) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 4ef41f4d4ee6f6..d953ae41c619a2 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -96,6 +96,7 @@ struct kvm_sev_info { atomic_t migration_in_progress; u64 snp_init_flags; void *snp_context; /* SNP guest context page */ + u64 sev_features; /* Features set at VMSA creation */ }; struct kvm_svm { @@ -214,6 +215,10 @@ struct vcpu_sev_es_state { bool ghcb_sa_free; u64 ghcb_registered_gpa; + + struct mutex snp_vmsa_mutex; /* Used to handle concurrent updates of VMSA. 
*/ + gpa_t snp_vmsa_gpa; + bool snp_ap_create; }; struct vcpu_svm { @@ -689,7 +694,7 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu); #define GHCB_VERSION_MAX 2ULL #define GHCB_VERSION_MIN 1ULL -#define GHCB_HV_FT_SUPPORTED GHCB_HV_FT_SNP +#define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION) extern unsigned int max_sev_asid; @@ -719,6 +724,7 @@ void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa); void sev_es_unmap_ghcb(struct vcpu_svm *svm); struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu); void handle_rmp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code); +void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); /* vmenter.S */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 242fdee79c6166..3d091de11744ec 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10869,6 +10869,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) static_call(kvm_x86_update_cpu_dirty_logging)(vcpu); + + if (kvm_check_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) { + kvm_vcpu_reset(vcpu, true); + if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) { + r = 1; + goto out; + } + } } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win || @@ -13083,6 +13091,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_PMI, vcpu)) return true; + if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) + return true; + if (kvm_arch_interrupt_allowed(vcpu) && (kvm_cpu_has_interrupt(vcpu) || kvm_guest_apic_has_interrupt(vcpu))) From 014b38961370c6286eebabcc2159b623090d220c Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Mon, 18 Sep 2023 09:31:40 -0500 Subject: [PATCH 1398/1406] KVM: SEV: Add support for GHCB-based termination requests GHCB version 2 adds support for a GHCB-based termination request that a guest can issue when it reaches an error state and wishes to inform the hypervisor that it should be terminated. Implement support for that similarly to GHCB MSR-based termination requests that are already available to SEV-ES guests via earlier versions of the GHCB protocol. See 'Termination Request' in the 'Invoking VMGEXIT' section of the GHCB specification for more details. 
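For illustration, a VMM could consume the resulting exit roughly as follows (a sketch, assuming a mmap()'d kvm_run structure named 'run'):

	if (run->exit_reason == KVM_EXIT_SYSTEM_EVENT &&
	    run->system_event.type == KVM_SYSTEM_EVENT_SEV_TERM) {
		/* data[0] carries the GHCB GPA, per the handler below */
		fprintf(stderr, "guest requested termination, GHCB GPA %#llx\n",
			(unsigned long long)run->system_event.data[0]);
		exit(1);
	}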
Signed-off-by: Michael Roth --- arch/x86/kvm/svm/sev.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 8ea1f02f4e08ce..d8d5147772e8b9 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3103,6 +3103,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) case SVM_VMGEXIT_UNSUPPORTED_EVENT: case SVM_VMGEXIT_HV_FEATURES: case SVM_VMGEXIT_PSC: + case SVM_VMGEXIT_TERM_REQUEST: break; default: reason = GHCB_ERR_INVALID_EVENT; @@ -3785,6 +3786,14 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = 1; break; + case SVM_VMGEXIT_TERM_REQUEST: + pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n", + control->exit_info_1, control->exit_info_2); + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; + vcpu->run->system_event.ndata = 1; + vcpu->run->system_event.data[0] = control->ghcb_gpa; + break; case SVM_VMGEXIT_UNSUPPORTED_EVENT: vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", From 14fd433e1f856a016843a1964ac894359a41cc5d Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Tue, 24 May 2022 12:13:10 -0500 Subject: [PATCH 1399/1406] KVM: SEV: Implement gmem hook for initializing private pages This will handle RMP table updates and direct map changes needed to put a page into a private state before mapping it into an SEV-SNP guest. Signed-off-by: Michael Roth --- arch/x86/kvm/Kconfig | 1 + arch/x86/kvm/svm/sev.c | 98 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.c | 2 + arch/x86/kvm/svm/svm.h | 1 + virt/kvm/guest_memfd.c | 4 +- 5 files changed, 104 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 4322fe608c3fa3..d4d658cfcfd833 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -122,6 +122,7 @@ config KVM_AMD_SEV depends on KVM_AMD && X86_64 depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) select KVM_GENERIC_PRIVATE_MEM + select HAVE_KVM_GMEM_PREPARE help Provides support for launching Encrypted VMs (SEV) and Encrypted VMs with Encrypted State (SEV-ES) on AMD processors. 
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index d8d5147772e8b9..fbd6c5078ebdc2 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4157,3 +4157,101 @@ void handle_rmp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) out: put_page(pfn_to_page(pfn)); } + +static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end) +{ + kvm_pfn_t pfn = start; + + while (pfn < end) { + int ret, rmp_level; + bool assigned; + + ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); + if (ret) { + pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n", + pfn, start, end, rmp_level, ret); + return false; + } + + if (assigned) { + pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n", + __func__, pfn, start, end, rmp_level); + return false; + } + + pfn++; + } + + return true; +} + +static u8 max_level_for_order(int order) +{ + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) + return PG_LEVEL_2M; + + return PG_LEVEL_4K; +} + +static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order) +{ + kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD); + + /* + * If this is a large folio, and the entire 2M range containing the + * PFN is currently shared, then the entire 2M-aligned range can be + * set to private via a single 2M RMP entry. + */ + if (max_level_for_order(order) > PG_LEVEL_4K && + is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD)) + return true; + + return false; +} + +int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + kvm_pfn_t pfn_aligned; + gfn_t gfn_aligned; + int level, rc; + bool assigned; + + if (!sev_snp_guest(kvm)) + return 0; + + rc = snp_lookup_rmpentry(pfn, &assigned, &level); + if (rc) { + pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n", + gfn, pfn, rc); + return -ENOENT; + } + + if (assigned) { + pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n", + __func__, gfn, pfn, max_order, level); + return 0; + } + + if (is_large_rmp_possible(kvm, pfn, max_order)) { + level = PG_LEVEL_2M; + pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD); + gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD); + } else { + level = PG_LEVEL_4K; + pfn_aligned = pfn; + gfn_aligned = gfn; + } + + rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false); + if (rc) { + pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n", + gfn, pfn, level, rc); + return -EINVAL; + } + + pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n", + __func__, gfn, pfn, pfn_aligned, max_order, level); + + return 0; +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 65561551d9308e..7e8158be610c58 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5065,6 +5065,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons, .alloc_apic_backing_page = svm_alloc_apic_backing_page, + + .gmem_prepare = sev_gmem_prepare, }; /* diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index d953ae41c619a2..9ece9612dbb9a5 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -725,6 +725,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm); struct page *snp_safe_alloc_page(struct kvm_vcpu 
*vcpu); void handle_rmp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code); void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); +int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); /* vmenter.S */ diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 4ede25dd8d140c..8542327d09f7df 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -66,8 +66,8 @@ static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct fol gfn = slot->base_gfn + index - slot->gmem.pgoff; rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, compound_order(compound_head(page))); if (rc) { - pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx, error %d.\n", - index, rc); + pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n", + index, gfn, pfn, rc); return rc; } } From bbb77223e4d8cdd88d33f5990004e86ddb3530f2 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 19 Oct 2022 08:53:33 -0500 Subject: [PATCH 1400/1406] KVM: SEV: Implement gmem hook for invalidating private pages Implement a platform hook to do the work of restoring the direct map entries of gmem-managed pages and transitioning the corresponding RMP table entries back to the default shared/hypervisor-owned state. Signed-off-by: Michael Roth --- arch/x86/kvm/Kconfig | 1 + arch/x86/kvm/svm/sev.c | 63 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.c | 1 + arch/x86/kvm/svm/svm.h | 2 ++ 4 files changed, 67 insertions(+) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index d4d658cfcfd833..f2d148a3a43ca0 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -123,6 +123,7 @@ config KVM_AMD_SEV depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) select KVM_GENERIC_PRIVATE_MEM select HAVE_KVM_GMEM_PREPARE + select HAVE_KVM_GMEM_INVALIDATE help Provides support for launching Encrypted VMs (SEV) and Encrypted VMs with Encrypted State (SEV-ES) on AMD processors. diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index fbd6c5078ebdc2..071c4f66fca44b 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4255,3 +4255,66 @@ int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) return 0; } + +void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) +{ + kvm_pfn_t pfn; + + pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end); + + for (pfn = start; pfn < end;) { + bool use_2m_update = false; + int rc, rmp_level; + bool assigned; + + rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); + if (rc) { + pr_debug_ratelimited("SEV: Failed to retrieve RMP entry for PFN 0x%llx error %d\n", + pfn, rc); + goto next_pfn; + } + + if (!assigned) + goto next_pfn; + + use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) && + end >= (pfn + PTRS_PER_PMD) && + rmp_level > PG_LEVEL_4K; + + /* + * If an unaligned PFN corresponds to a 2M region assigned as a + * large page in the RMP table, PSMASH the region into individual + * 4K RMP entries before attempting to convert a 4K sub-page. + */ + if (!use_2m_update && rmp_level > PG_LEVEL_4K) { + rc = snp_rmptable_psmash(pfn); + if (rc) + pr_err_ratelimited("SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n", + pfn, rc); + } + + rc = rmp_make_shared(pfn, use_2m_update ?
PG_LEVEL_2M : PG_LEVEL_4K); + if (WARN_ON_ONCE(rc)) { + pr_err_ratelimited("SEV: Failed to update RMP entry for PFN 0x%llx error %d\n", + pfn, rc); + goto next_pfn; + } + + /* + * SEV-ES avoids host/guest cache coherency issues through + * WBINVD hooks issued via MMU notifiers during run-time, and + * KVM's VM destroy path at shutdown. Those MMU notifier events + * don't cover gmem since there is no requirement to map pages + * to a HVA in order to use them for a running guest. While the + * shutdown path would still likely cover things for SNP guests, + * userspace may also free gmem pages during run-time via + * hole-punching operations on the guest_memfd, so flush the + * cache entries for these pages before free'ing them back to + * the host. + */ + clflush_cache_range(__va(pfn_to_hpa(pfn)), + use_2m_update ? PMD_SIZE : PAGE_SIZE); +next_pfn: + pfn += use_2m_update ? PTRS_PER_PMD : 1; + } +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 7e8158be610c58..e35f1f08c7a20b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5067,6 +5067,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .alloc_apic_backing_page = svm_alloc_apic_backing_page, .gmem_prepare = sev_gmem_prepare, + .gmem_invalidate = sev_gmem_invalidate, }; /* diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 9ece9612dbb9a5..a56109e100ac05 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -726,6 +726,8 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu); void handle_rmp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code); void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); +void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); +int sev_gmem_max_level(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, u8 *max_level); /* vmenter.S */ From bb3903e583326b51b40d2185d3ca4b15bf7943bb Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Tue, 29 Aug 2023 10:38:47 -0500 Subject: [PATCH 1401/1406] KVM: x86: Add gmem hook for determining max NPT mapping level In the case of SEV-SNP, whether or not a 2MB page can be mapped via a 2MB mapping in the guest's nested page table depends on whether or not any subpages within the range have already been initialized as private in the RMP table. The existing mixed-attribute tracking in KVM is insufficient here, for instance: - gmem allocates 2MB page - guest issues PVALIDATE on 2MB page - guest later converts a subpage to shared - SNP host code issues PSMASH to split 2MB RMP mapping to 4K - KVM MMU splits NPT mapping to 4K At this point there are no mixed attributes, and KVM would normally allow for 2MB NPT mappings again, but this is actually not allowed because the RMP table mappings are 4K and cannot be promoted on the hypervisor side, so the NPT mappings must still be limited to 4K to match this. Add a hook to determine the max NPT mapping size in situations like this. 
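A sketch of the resulting contract in the private fault path (mirroring the kvm_faultin_pfn_private() change below): the hook may only lower the mapping level derived from the gmem allocation order, never raise it:

	max_level = kvm_max_level_for_order(max_order);
	r = static_call(kvm_x86_gmem_max_level)(vcpu->kvm, fault->pfn,
						fault->gfn, &max_level);
	if (!r)
		fault->max_level = min(max_level, fault->max_level);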
Signed-off-by: Michael Roth --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu/mmu.c | 12 ++++++++++-- arch/x86/kvm/svm/sev.c | 27 +++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.c | 1 + 5 files changed, 40 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index c4b7b0db7be320..b0a174213dad0e 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -140,6 +140,7 @@ KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); KVM_X86_OP_OPTIONAL(get_untagged_addr) KVM_X86_OP_OPTIONAL(alloc_apic_backing_page) KVM_X86_OP_OPTIONAL_RET0(gmem_prepare) +KVM_X86_OP_OPTIONAL_RET0(gmem_max_level) KVM_X86_OP_OPTIONAL(gmem_invalidate) #undef KVM_X86_OP diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a165a9f225234d..4df86a4ac45882 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1809,6 +1809,7 @@ struct kvm_x86_ops { void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu); int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end); + int (*gmem_max_level)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, u8 *max_level); }; struct kvm_x86_nested_ops { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6db0c217d47362..c379bc92d07f01 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4309,6 +4309,7 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { int max_order, r; + u8 max_level; if (!kvm_slot_can_be_private(fault->slot)) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); @@ -4322,8 +4323,15 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, return r; } - fault->max_level = min(kvm_max_level_for_order(max_order), - fault->max_level); + max_level = kvm_max_level_for_order(max_order); + r = static_call(kvm_x86_gmem_max_level)(vcpu->kvm, fault->pfn, + fault->gfn, &max_level); + if (r) { + kvm_release_pfn_clean(fault->pfn); + return r; + } + + fault->max_level = min(max_level, fault->max_level); fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY); return RET_PF_CONTINUE; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 071c4f66fca44b..c9a55f58e19cc8 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4318,3 +4318,30 @@ void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) pfn += use_2m_update ? 
PTRS_PER_PMD : 1; } } + +int sev_gmem_max_level(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, u8 *max_level) +{ + int level, rc; + bool assigned; + + if (!sev_snp_guest(kvm)) + return 0; + + rc = snp_lookup_rmpentry(pfn, &assigned, &level); + if (rc) { + pr_err_ratelimited("SEV: RMP entry not found: GFN %llx PFN %llx level %d error %d\n", + gfn, pfn, level, rc); + return -ENOENT; + } + + if (!assigned) { + pr_err_ratelimited("SEV: RMP entry is not assigned: GFN %llx PFN %llx level %d\n", + gfn, pfn, level); + return -EINVAL; + } + + if (level < *max_level) + *max_level = level; + + return 0; +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e35f1f08c7a20b..b2832c381d4217 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5067,6 +5067,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .alloc_apic_backing_page = svm_alloc_apic_backing_page, .gmem_prepare = sev_gmem_prepare, + .gmem_max_level = sev_gmem_max_level, .gmem_invalidate = sev_gmem_invalidate, }; From 895f4fc29f25ccd64dc5155d320d7413ee489d22 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Thu, 24 Aug 2023 20:48:25 +0000 Subject: [PATCH 1402/1406] KVM: SEV: Avoid WBINVD for HVA-based MMU notifications for SNP With SNP/guest_memfd, private/encrypted memory should not be mappable, and MMU notifications for HVA-mapped memory will only be relevant to unencrypted guest memory. Therefore, the rationale behind issuing a wbinvd_on_all_cpus() in sev_guest_memory_reclaimed() should not apply for SNP guests and can be ignored. Signed-off-by: Ashish Kalra [mdr: Add some clarifications in commit] Signed-off-by: Michael Roth --- arch/x86/kvm/svm/sev.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index c9a55f58e19cc8..46f73d3f6d037e 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2864,7 +2864,14 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) void sev_guest_memory_reclaimed(struct kvm *kvm) { - if (!sev_guest(kvm)) + /* + * With SNP+gmem, private/encrypted memory should be + * unreachable via the hva-based mmu notifiers. Additionally, + * for shared->private translations, H/W coherency will ensure + * first guest access to the page would clear out any existing + * dirty copies of that cacheline. + */ + if (!sev_guest(kvm) || sev_snp_guest(kvm)) return; wbinvd_on_all_cpus(); From a7bf5564918eab457db09959b46d02a24eeb804c Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Thu, 14 Sep 2023 23:17:45 -0500 Subject: [PATCH 1403/1406] KVM: SVM: Add module parameter to enable SEV-SNP Add a module parameter that can be used to enable or disable the SEV-SNP feature. Now that KVM contains the support for SNP, set the GHCB hypervisor feature flag to indicate that SNP is supported.
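For example (illustrative usage; the parameter is read-only at runtime given the 0444 permissions), SNP support can be disabled with "modprobe kvm-amd sev_snp=0", or with "kvm-amd.sev_snp=0" on the kernel command line when kvm-amd is built in, and the current value can be read back from /sys/module/kvm_amd/parameters/sev_snp.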
Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra --- arch/x86/kvm/svm/sev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 46f73d3f6d037e..d5769a7186b3ec 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -61,7 +61,8 @@ module_param_named(sev_es, sev_es_enabled, bool, 0444); static bool sev_es_debug_swap_enabled = true; module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444); -static bool sev_snp_enabled; +static bool sev_snp_enabled = true; +module_param_named(sev_snp, sev_snp_enabled, bool, 0444); #else #define sev_enabled false #define sev_es_enabled false From b9b008586451422d72ae56bc6afa87fa4c00fbca Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 13 Dec 2022 17:45:20 -0600 Subject: [PATCH 1404/1406] KVM: SEV: Provide support for SNP_GUEST_REQUEST NAE event Version 2 of GHCB specification added support for the SNP Guest Request Message NAE event. The event allows for an SEV-SNP guest to make requests to the SEV-SNP firmware through the hypervisor using the SNP_GUEST_REQUEST API defined in the SEV-SNP firmware specification. This is used by guests primarily to request attestation reports from firmware. There are other request types available as well, but the specifics of what guest requests are being made are opaque to the hypervisor, which only serves as a proxy for the guest requests and firmware responses. Implement handling for these events. Co-developed-by: Alexey Kardashevskiy Signed-off-by: Alexey Kardashevskiy Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra [mdr: ensure FW command failures are indicated to guest, drop extended request handling to be re-written as separate patch, massage commit] Signed-off-by: Michael Roth --- arch/x86/kvm/svm/sev.c | 83 ++++++++++++++++++++++++++++++++++ include/uapi/linux/sev-guest.h | 9 ++++ 2 files changed, 92 insertions(+) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index d5769a7186b3ec..2ffc1334ef33e5 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -3112,6 +3113,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) case SVM_VMGEXIT_HV_FEATURES: case SVM_VMGEXIT_PSC: case SVM_VMGEXIT_TERM_REQUEST: + case SVM_VMGEXIT_GUEST_REQUEST: break; default: reason = GHCB_ERR_INVALID_EVENT; @@ -3542,6 +3544,83 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm) return ret; } +static bool snp_setup_guest_buf(struct kvm *kvm, struct sev_data_snp_guest_request *data, + gpa_t req_gpa, gpa_t resp_gpa) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + kvm_pfn_t req_pfn, resp_pfn; + + if (!IS_ALIGNED(req_gpa, PAGE_SIZE) || !IS_ALIGNED(resp_gpa, PAGE_SIZE)) + return false; + + req_pfn = gfn_to_pfn(kvm, gpa_to_gfn(req_gpa)); + if (is_error_noslot_pfn(req_pfn)) + return false; + + resp_pfn = gfn_to_pfn(kvm, gpa_to_gfn(resp_gpa)); + if (is_error_noslot_pfn(resp_pfn)) + return false; + + if (rmp_make_private(resp_pfn, 0, PG_LEVEL_4K, 0, true)) + return false; + + data->gctx_paddr = __psp_pa(sev->snp_context); + data->req_paddr = __sme_set(req_pfn << PAGE_SHIFT); + data->res_paddr = __sme_set(resp_pfn << PAGE_SHIFT); + + return true; +} + +static bool snp_cleanup_guest_buf(struct sev_data_snp_guest_request *data) +{ + u64 pfn = __sme_clr(data->res_paddr) >> PAGE_SHIFT; + + if (snp_page_reclaim(pfn)) + return false; + + if (rmp_make_shared(pfn, PG_LEVEL_4K)) + return false; + + return true; +} + +static
bool __snp_handle_guest_req(struct kvm *kvm, gpa_t req_gpa, gpa_t resp_gpa, + sev_ret_code *fw_err) +{ + struct sev_data_snp_guest_request data = {0}; + struct kvm_sev_info *sev; + bool ret = true; + + if (!sev_snp_guest(kvm)) + return false; + + sev = &to_kvm_svm(kvm)->sev_info; + + if (!snp_setup_guest_buf(kvm, &data, req_gpa, resp_gpa)) + return false; + + if (sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, fw_err)) + ret = false; + + if (!snp_cleanup_guest_buf(&data)) + ret = false; + + return ret; +} + +static void snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm *kvm = vcpu->kvm; + sev_ret_code fw_err = 0; + int vmm_ret = 0; + + if (!__snp_handle_guest_req(kvm, req_gpa, resp_gpa, &fw_err)) + vmm_ret = SNP_GUEST_VMM_ERR_GENERIC; + + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(vmm_ret, fw_err)); +} + static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -3802,6 +3881,10 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) vcpu->run->system_event.ndata = 1; vcpu->run->system_event.data[0] = control->ghcb_gpa; break; + case SVM_VMGEXIT_GUEST_REQUEST: + snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2); + ret = 1; + break; case SVM_VMGEXIT_UNSUPPORTED_EVENT: vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", diff --git a/include/uapi/linux/sev-guest.h b/include/uapi/linux/sev-guest.h index 154a87a1eca978..7bd78e2585697b 100644 --- a/include/uapi/linux/sev-guest.h +++ b/include/uapi/linux/sev-guest.h @@ -89,8 +89,17 @@ struct snp_ext_report_req { #define SNP_GUEST_FW_ERR_MASK GENMASK_ULL(31, 0) #define SNP_GUEST_VMM_ERR_SHIFT 32 #define SNP_GUEST_VMM_ERR(x) (((u64)x) << SNP_GUEST_VMM_ERR_SHIFT) +#define SNP_GUEST_FW_ERR(x) ((x) & SNP_GUEST_FW_ERR_MASK) +#define SNP_GUEST_ERR(vmm_err, fw_err) (SNP_GUEST_VMM_ERR(vmm_err) | \ + SNP_GUEST_FW_ERR(fw_err)) +/* + * The GHCB spec only formally defines INVALID_LEN/BUSY VMM errors, but define + * a GENERIC error code such that it won't ever conflict with GHCB-defined + * errors if any get added in the future. + */ #define SNP_GUEST_VMM_ERR_INVALID_LEN 1 #define SNP_GUEST_VMM_ERR_BUSY 2 +#define SNP_GUEST_VMM_ERR_GENERIC BIT(31) #endif /* __UAPI_LINUX_SEV_GUEST_H_ */ From 701c89ec8749aa33d8388d6c2eb290df6d2f0b0e Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 27 Dec 2023 15:41:56 -0600 Subject: [PATCH 1405/1406] crypto: ccp: Add the SNP_SET_CONFIG_{START,END} commands These commands can be used to create a transaction such that commands that update the reported TCB, such as SNP_SET_CONFIG/SNP_COMMIT, and updates to userspace-supplied certificates, can be handled atomically relative to any extended guest requests issued by any SNP guests while the updates are taking place. Without this interface, there is a risk that a guest will be given certificate information that does not correspond to the VCEK/VLEK used to sign a particular attestation report unless all the running guests are paused in advance, which would cause disruption to all guests in the system even if no attestation requests are being made. Even then, care is needed to ensure that KVM does not pass along certificate information that was fetched from userspace in advance of the guest being paused. 
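To make the intended usage concrete, a rough userspace sequence might look like the following (pseudocode; the exact ioctl plumbing through /dev/sev is elided, and the wrapper names are illustrative):

	struct sev_user_data_snp_config_transaction txn;

	snp_set_config_start(&txn);  /* extended guest requests now get "retry" */
	/* update the reported TCB via SNP_SET_CONFIG/SNP_COMMIT, refresh certs */
	snp_set_config_end(&txn);    /* normal servicing resumes */
	/* if the start/end transaction IDs differ, redo the whole update */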
This interface also provides some versatility with how similar firmware maintenance activity can be handled in the future without passing unnecessary management complexity on to userspace. Signed-off-by: Michael Roth --- Documentation/virt/coco/sev-guest.rst | 37 +++++++++++++++++++-- arch/x86/include/asm/sev.h | 4 +++ arch/x86/virt/svm/sev.c | 43 ++++++++++++++++++++++++ drivers/crypto/ccp/sev-dev.c | 47 +++++++++++++++++++++++++++ include/uapi/linux/psp-sev.h | 12 +++++++ 5 files changed, 141 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/coco/sev-guest.rst b/Documentation/virt/coco/sev-guest.rst index e1eaf6a830ce4d..cfcd6d8e626373 100644 --- a/Documentation/virt/coco/sev-guest.rst +++ b/Documentation/virt/coco/sev-guest.rst @@ -128,8 +128,6 @@ the SEV-SNP specification for further details. The SNP_GET_EXT_REPORT ioctl is similar to the SNP_GET_REPORT. The difference is related to the additional certificate data that is returned with the report. -The certificate data returned is being provided by the hypervisor through the -SNP_SET_EXT_CONFIG. The ioctl uses the SNP_GUEST_REQUEST (MSG_REPORT_REQ) command provided by the SEV-SNP firmware to get the attestation report. @@ -176,6 +174,41 @@ to SNP_CONFIG command defined in the SEV-SNP spec. The current values of the firmware parameters affected by this command can be queried via SNP_PLATFORM_STATUS. +2.7 SNP_SET_CONFIG_START / SNP_SET_CONFIG_END +--------------------------------------------- +:Technology: sev-snp +:Type: hypervisor ioctl cmd +:Parameters (out): struct sev_user_data_snp_config_transaction +:Returns (out): 0 on success, -negative on error + +When requesting attestation reports, SNP guests have the option of issuing +an extended guest request which allows host userspace to supply additional +certificate data that can be used to validate the signature used to sign +the attestation report. This signature is generated using a key that is +derived from the reported TCB that can be set via the SNP_SET_CONFIG and +SNP_COMMIT ioctls, so the accompanying certificate data needs to be kept in +sync with the changes made to the reported TCB via these ioctls. + +To allow for this, SNP_SET_CONFIG_START can be issued prior to performing +any updates to the reported TCB or certificate data that will be fetched +from userspace. Any attestation report requests via extended guest requests +that are in-progress, or received after SNP_SET_CONFIG_START is issued, will +result in the guest receiving a GHCB-defined error message instructing it to +retry the request. Once the updates are completed on the host, +SNP_SET_CONFIG_END must be issued to resume normal servicing of extended +guest requests. + +In general, hosts should never have more than 1 outstanding +SNP_SET_CONFIG_{START,END} transaction in flight at any point in time, and +attempting to issue SNP_SET_CONFIG_START will fail if a transaction is +already in progress. However, there may be occasions where a transaction +needs to be aborted via SNP_SET_CONFIG_END due to unexpected activity in +userspace such as timeouts, crashes, etc. To allow callers of +SNP_SET_CONFIG_{START,END} to detect such a situation, each ioctl will return +a transaction ID in the response so the caller can monitor whether the +start/end IDs both match. If they don't, the caller should assume the +transaction has been invalidated and retry the full update sequence. + 3.
SEV-SNP CPUID Enforcement ============================ diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index c37f5a80fa7fc3..554108fdfbfff5 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -271,6 +271,8 @@ int psmash(u64 pfn); int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable); int rmp_make_shared(u64 pfn, enum pg_level level); void snp_leak_pages(u64 pfn, unsigned int npages); +int snp_config_transaction_start(u64 *transaction_id); +void snp_config_transaction_end(u64 *transaction_id); #else static inline bool snp_probe_rmptable_info(void) { return false; } static inline int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) { return -ENODEV; } @@ -283,6 +285,8 @@ static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 as } static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV; } static inline void snp_leak_pages(u64 pfn, unsigned int npages) {} +static inline int snp_config_transaction_start(u64 *transaction_id) { return 0; } +static inline void snp_config_transaction_end(u64 *transaction_id) {} #endif #endif diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index cffe1157a90acf..39e55caaa0bc63 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -70,6 +70,11 @@ static DEFINE_SPINLOCK(snp_leaked_pages_list_lock); static unsigned long snp_nr_leaked_pages; +/* For synchronizing TCB updates with extended guest requests */ +static DEFINE_MUTEX(snp_transaction_lock); +static u64 snp_transaction_id; +static bool snp_transaction_pending; + #undef pr_fmt #define pr_fmt(fmt) "SEV-SNP: " fmt @@ -558,3 +563,41 @@ void snp_leak_pages(u64 pfn, unsigned int npages) spin_unlock(&snp_leaked_pages_list_lock); } EXPORT_SYMBOL_GPL(snp_leak_pages); + +int snp_config_transaction_start(u64 *transaction_id) +{ + mutex_lock(&snp_transaction_lock); + + if (snp_transaction_pending) { + mutex_unlock(&snp_transaction_lock); + return -EBUSY; + } + + /* + * The actual transaction ID update will happen when + * snp_config_transaction_end() is called, so return + * the *anticipated* transaction ID that will be + * returned by snp_config_transaction_end(). This is to + * ensure that unbalanced/aborted transactions will + * be noticeable when the caller that started the + * transaction calls snp_config_transaction_end().
+ */ + *transaction_id = snp_transaction_id + 1; + snp_transaction_pending = true; + + mutex_unlock(&snp_transaction_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(snp_config_transaction_start); + +void snp_config_transaction_end(u64 *transaction_id) +{ + mutex_lock(&snp_transaction_lock); + + snp_transaction_pending = false; + *transaction_id = ++snp_transaction_id; + + mutex_unlock(&snp_transaction_lock); +} +EXPORT_SYMBOL_GPL(snp_config_transaction_end); diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index f44efbb89c346a..f1ee0f83f61388 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -2027,6 +2027,47 @@ static int sev_ioctl_do_snp_set_config(struct sev_issue_cmd *argp, bool writable return __sev_do_cmd_locked(SEV_CMD_SNP_CONFIG, &config, &argp->error); } +static int sev_ioctl_do_snp_set_config_start(struct sev_issue_cmd *argp, bool writable) +{ + struct sev_user_data_snp_config_transaction transaction = {0}; + struct sev_device *sev = psp_master->sev_data; + int ret; + + if (!sev->snp_initialized || !argp->data) + return -EINVAL; + + if (!writable) + return -EPERM; + + ret = snp_config_transaction_start(&transaction.id); + if (ret) + return ret; + + if (copy_to_user((void __user *)argp->data, &transaction, sizeof(transaction))) + return -EFAULT; + + return 0; +} + +static int sev_ioctl_do_snp_set_config_end(struct sev_issue_cmd *argp, bool writable) +{ + struct sev_user_data_snp_config_transaction transaction = {0}; + struct sev_device *sev = psp_master->sev_data; + + if (!sev->snp_initialized || !argp->data) + return -EINVAL; + + if (!writable) + return -EPERM; + + snp_config_transaction_end(&transaction.id); + + if (copy_to_user((void __user *)argp->data, &transaction, sizeof(transaction))) + return -EFAULT; + + return 0; +} + static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) { void __user *argp = (void __user *)arg; @@ -2087,6 +2128,12 @@ static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) case SNP_SET_CONFIG: ret = sev_ioctl_do_snp_set_config(&input, writable); break; + case SNP_SET_CONFIG_START: + ret = sev_ioctl_do_snp_set_config_start(&input, writable); + break; + case SNP_SET_CONFIG_END: + ret = sev_ioctl_do_snp_set_config_end(&input, writable); + break; default: ret = -EINVAL; goto out; diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h index b7a2c2ee35b7e8..2fd05e6f702d0f 100644 --- a/include/uapi/linux/psp-sev.h +++ b/include/uapi/linux/psp-sev.h @@ -31,6 +31,8 @@ enum { SNP_PLATFORM_STATUS, SNP_COMMIT, SNP_SET_CONFIG, + SNP_SET_CONFIG_START, + SNP_SET_CONFIG_END, SEV_MAX, }; @@ -214,6 +216,16 @@ struct sev_user_data_snp_config { __u8 rsvd1[52]; } __packed; +/** + * struct sev_user_data_snp_config_transaction - metadata for config transactions + * + * @id: the ID of the transaction started/ended by a call to SNP_SET_CONFIG_START + * or SNP_SET_CONFIG_END, respectively. + */ +struct sev_user_data_snp_config_transaction { + __u64 id; /* Out */ +} __packed; + /** * struct sev_issue_cmd - SEV ioctl parameters * From cc2568386ccb5b0d7a46b35dc2fad3412d3aad26 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 27 Dec 2023 11:07:27 -0600 Subject: [PATCH 1406/1406] KVM: SEV: Provide support for SNP_EXTENDED_GUEST_REQUEST NAE event Version 2 of the GHCB specification added support for the SNP Extended Guest Request Message NAE event.
This event serves a nearly identical purpose to the previously-added SNP_GUEST_REQUEST event, but additionally allows the guest to supply a buffer to be filled with certificate data, which is used mainly to verify the signature of the attestation report returned by firmware. This certificate data is supplied by userspace, so unlike with SNP_GUEST_REQUEST events, SNP_EXTENDED_GUEST_REQUEST events are first forwarded to userspace via a KVM_EXIT_VMGEXIT exit type, and the firmware request is only made afterward. Implement handling for these events. Since there is a potential for race conditions where the userspace-supplied certificate data may be out-of-sync relative to the reported TCB that firmware will use when signing attestation reports, make use of the transaction/synchronization mechanisms added by the SNP_SET_CONFIG_{START,END} SEV device ioctls such that the guest will be told to retry the request when an update to the reported TCB or userspace-supplied certificates may have occurred or is in progress while an extended guest request is being processed. Signed-off-by: Michael Roth --- Documentation/virt/kvm/api.rst | 26 ++++++++++++ arch/x86/include/asm/sev.h | 4 ++ arch/x86/kvm/svm/sev.c | 75 ++++++++++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.h | 3 ++ arch/x86/virt/svm/sev.c | 21 ++++++++++ include/uapi/linux/kvm.h | 6 +++ 6 files changed, 135 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 003369c2f5eaf2..d79d2f1e29fbf9 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7066,6 +7066,7 @@ values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set. struct kvm_user_vmgexit { #define KVM_USER_VMGEXIT_PSC_MSR 1 #define KVM_USER_VMGEXIT_PSC 2 + #define KVM_USER_VMGEXIT_EXT_GUEST_REQ 3 __u32 type; /* KVM_USER_VMGEXIT_* type */ union { struct { @@ -7079,6 +7080,11 @@ values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set. __u64 shared_gpa; __u64 ret; } psc; + struct { + __u64 data_gpa; + __u64 data_npages; + __u32 ret; + } ext_guest_req; }; }; @@ -7108,6 +7114,26 @@ private/shared state. Userspace will return a value in 'ret' that is in agreement with the GHCB-defined return values that the guest will expect in the SW_EXITINFO2 field of the GHCB in response to these requests. +For the KVM_USER_VMGEXIT_EXT_GUEST_REQ type, the ext_guest_req union type +is used. The kernel will supply in 'data_gpa' the value the guest supplies +via the RAX field of the GHCB when issuing extended guest requests. +'data_npages' will similarly contain the value the guest supplies in RBX +denoting the number of shared pages available to write the certificate +data into. + + - If the supplied number of pages is sufficient, userspace should write + the certificate data blob (in the format defined by the GHCB spec) in + the address indicated by 'data_gpa' and set 'ret' to 0. + + - If the number of pages supplied is not sufficient, userspace must write + the required number of pages in 'data_npages' and then set 'ret' to 1. + + - If userspace is temporarily unable to handle the request, 'ret' should + be set to 2 to inform the guest to retry later. + + - If some other error occurred, userspace should set 'ret' to a non-zero + value that is distinct from the specific return values mentioned above. + 6.
Capabilities that can be enabled on vCPUs ============================================ diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 554108fdfbfff5..6465187a74854c 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -273,6 +273,8 @@ int rmp_make_shared(u64 pfn, enum pg_level level); void snp_leak_pages(u64 pfn, unsigned int npages); int snp_config_transaction_start(u64 *transaction_id); void snp_config_transaction_end(u64 *transaction_id); +u64 snp_config_transaction_get_id(void); +bool snp_config_transaction_is_stale(u64 transaction_id); #else static inline bool snp_probe_rmptable_info(void) { return false; } static inline int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) { return -ENODEV; } @@ -287,6 +289,8 @@ static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV static inline void snp_leak_pages(u64 pfn, unsigned int npages) {} static inline int snp_config_transaction_start(u64 *transaction_id) { return 0; } static inline void snp_config_transaction_end(u64 *transaction_id) { return 0; } +static inline u64 snp_config_transaction_get_id(void) { return 0; } +static inline bool snp_config_transaction_is_stale(u64 transaction_id) { return false; } #endif #endif diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 2ffc1334ef33e5..681ab67785471f 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3114,6 +3114,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) case SVM_VMGEXIT_PSC: case SVM_VMGEXIT_TERM_REQUEST: case SVM_VMGEXIT_GUEST_REQUEST: + case SVM_VMGEXIT_EXT_GUEST_REQUEST: break; default: reason = GHCB_ERR_INVALID_EVENT; @@ -3621,6 +3622,77 @@ static void snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(vmm_ret, fw_err)); } +static int snp_complete_ext_guest_req(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control; + struct kvm *kvm = vcpu->kvm; + sev_ret_code fw_err = 0; + int vmm_ret; + + vmm_ret = vcpu->run->vmgexit.ext_guest_req.ret; + if (vmm_ret) { + if (vmm_ret == SNP_GUEST_VMM_ERR_INVALID_LEN) + vcpu->arch.regs[VCPU_REGS_RBX] = + vcpu->run->vmgexit.ext_guest_req.data_npages; + goto abort_request; + } + + control = &svm->vmcb->control; + + if (!__snp_handle_guest_req(kvm, control->exit_info_1, control->exit_info_2, + &fw_err)) + vmm_ret = SNP_GUEST_VMM_ERR_GENERIC; + + /* + * Give errors related to stale transactions precedence to provide more + * potential options for servicing firmware while guests are running. 
+ */ + if (snp_config_transaction_is_stale(svm->snp_transaction_id)) + vmm_ret = SNP_GUEST_VMM_ERR_BUSY; + +abort_request: + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(vmm_ret, fw_err)); + + return 1; /* resume guest */ +} + +static int snp_begin_ext_guest_req(struct kvm_vcpu *vcpu) +{ + int vmm_ret = SNP_GUEST_VMM_ERR_GENERIC; + struct vcpu_svm *svm = to_svm(vcpu); + unsigned long data_npages; + sev_ret_code fw_err = 0; + gpa_t data_gpa; + + if (!sev_snp_guest(vcpu->kvm)) + goto abort_request; + + data_gpa = vcpu->arch.regs[VCPU_REGS_RAX]; + data_npages = vcpu->arch.regs[VCPU_REGS_RBX]; + + if (!IS_ALIGNED(data_gpa, PAGE_SIZE)) + goto abort_request; + + svm->snp_transaction_id = snp_config_transaction_get_id(); + if (snp_config_transaction_is_stale(svm->snp_transaction_id)) { + vmm_ret = SNP_GUEST_VMM_ERR_BUSY; + goto abort_request; + } + + vcpu->run->exit_reason = KVM_EXIT_VMGEXIT; + vcpu->run->vmgexit.type = KVM_USER_VMGEXIT_EXT_GUEST_REQ; + vcpu->run->vmgexit.ext_guest_req.data_gpa = data_gpa; + vcpu->run->vmgexit.ext_guest_req.data_npages = data_npages; + vcpu->arch.complete_userspace_io = snp_complete_ext_guest_req; + + return 0; /* forward request to userspace */ + +abort_request: + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(vmm_ret, fw_err)); + return 1; /* resume guest */ +} + static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -3885,6 +3957,9 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2); ret = 1; break; + case SVM_VMGEXIT_EXT_GUEST_REQUEST: + ret = snp_begin_ext_guest_req(vcpu); + break; case SVM_VMGEXIT_UNSUPPORTED_EVENT: vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index a56109e100ac05..a2ac6dc3a79a58 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -307,6 +307,9 @@ struct vcpu_svm { /* Guest GIF value, used when vGIF is not enabled */ bool guest_gif; + + /* Transaction ID associated with SNP config updates */ + u64 snp_transaction_id; }; struct svm_cpu_data { diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index 39e55caaa0bc63..5480600560f8e0 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -601,3 +601,24 @@ void snp_config_transaction_end(u64 *transaction_id) mutex_unlock(&snp_transaction_lock); } EXPORT_SYMBOL_GPL(snp_config_transaction_end); + +u64 snp_config_transaction_get_id(void) +{ + return snp_transaction_id; +} +EXPORT_SYMBOL_GPL(snp_config_transaction_get_id); + +bool snp_config_transaction_is_stale(u64 transaction_id) +{ + bool stale; + + mutex_lock(&snp_transaction_lock); + + stale = (snp_transaction_pending || + transaction_id != snp_transaction_id); + + mutex_unlock(&snp_transaction_lock); + + return stale; +} +EXPORT_SYMBOL_GPL(snp_config_transaction_is_stale); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index e33c48bfbd67d9..585de3a2591ee7 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -138,6 +138,7 @@ struct kvm_xen_exit { struct kvm_user_vmgexit { #define KVM_USER_VMGEXIT_PSC_MSR 1 #define KVM_USER_VMGEXIT_PSC 2 +#define KVM_USER_VMGEXIT_EXT_GUEST_REQ 3 __u32 type; /* KVM_USER_VMGEXIT_* type */ union { struct { @@ -151,6 +152,11 @@ struct kvm_user_vmgexit { __u64 shared_gpa; __u64 ret; } psc; + struct { + __u64 data_gpa; + __u64 data_npages; + __u32 ret; + } ext_guest_req; }; };
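To round out the picture, the userspace side of the KVM_USER_VMGEXIT_EXT_GUEST_REQ contract documented in api.rst above might be handled along the lines of the sketch below. This is not part of the patch: it assumes the KVM_EXIT_VMGEXIT exit reason and the kvm_run 'vmgexit' union member introduced earlier in this series, and certs_blob()/write_guest() are hypothetical VMM helpers for fetching the current GHCB-format certificate blob and writing guest memory:

	/* Minimal sketch, not a definitive implementation. */
	#include <stddef.h>
	#include <linux/kvm.h>

	const void *certs_blob(size_t *size);			/* hypothetical */
	int write_guest(__u64 gpa, const void *buf, size_t len);	/* hypothetical */

	static void handle_ext_guest_req(struct kvm_run *run)
	{
		__u64 npages = run->vmgexit.ext_guest_req.data_npages;
		const void *blob;
		size_t size;
		__u64 need;

		blob = certs_blob(&size);
		if (!blob) {
			/* Temporarily unable to service: tell the guest to retry. */
			run->vmgexit.ext_guest_req.ret = 2;
			return;
		}

		need = (size + 4095) / 4096;	/* 4K pages needed for the blob */
		if (npages < need) {
			/* Buffer too small: report how many pages are required. */
			run->vmgexit.ext_guest_req.data_npages = need;
			run->vmgexit.ext_guest_req.ret = 1;
			return;
		}

		if (write_guest(run->vmgexit.ext_guest_req.data_gpa, blob, size)) {
			/* Some other error: any value distinct from 0/1/2. */
			run->vmgexit.ext_guest_req.ret = 3;
			return;
		}

		run->vmgexit.ext_guest_req.ret = 0;	/* success */
	}

A handler like this would be invoked from the VMM run loop when run->exit_reason is KVM_EXIT_VMGEXIT and run->vmgexit.type is KVM_USER_VMGEXIT_EXT_GUEST_REQ; once 'ret' is set and the vCPU is re-entered, KVM completes the exit via snp_complete_ext_guest_req().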